In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [42]:
df = pd.read_csv('datasets/US-pumpkins.csv')
df.head()

Unnamed: 0,City Name,Type,Package,Variety,Sub Variety,Grade,Date,Low Price,High Price,Mostly Low,...,Unit of Sale,Quality,Condition,Appearance,Storage,Crop,Repack,Trans Mode,Unnamed: 24,Unnamed: 25
0,BALTIMORE,,24 inch bins,,,,4/29/17,270.0,280.0,270.0,...,,,,,,,E,,,
1,BALTIMORE,,24 inch bins,,,,5/6/17,270.0,280.0,270.0,...,,,,,,,E,,,
2,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,...,,,,,,,N,,,
3,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,...,,,,,,,N,,,
4,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,11/5/16,90.0,100.0,90.0,...,,,,,,,N,,,


In [43]:
df.isnull().sum()

City Name             0
Type               1712
Package               0
Variety               5
Sub Variety        1461
Grade              1757
Date                  0
Low Price             0
High Price            0
Mostly Low          103
Mostly High         103
Origin                3
Origin District    1626
Item Size           279
Color               616
Environment        1757
Unit of Sale       1595
Quality            1757
Condition          1757
Appearance         1757
Storage            1757
Crop               1757
Repack                0
Trans Mode         1757
Unnamed: 24        1757
Unnamed: 25        1654
dtype: int64

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   City Name        1757 non-null   object 
 1   Type             45 non-null     object 
 2   Package          1757 non-null   object 
 3   Variety          1752 non-null   object 
 4   Sub Variety      296 non-null    object 
 5   Grade            0 non-null      float64
 6   Date             1757 non-null   object 
 7   Low Price        1757 non-null   float64
 8   High Price       1757 non-null   float64
 9   Mostly Low       1654 non-null   float64
 10  Mostly High      1654 non-null   float64
 11  Origin           1754 non-null   object 
 12  Origin District  131 non-null    object 
 13  Item Size        1478 non-null   object 
 14  Color            1141 non-null   object 
 15  Environment      0 non-null      float64
 16  Unit of Sale     162 non-null    object 
 17  Quality       

In [45]:
object_columns = ['Variety', 'Origin', 'Item Size', 'Color']
for column in object_columns:
    df[column] = df[column].fillna(df[column].mode()[0])

float_columns = ['Mostly Low', 'Mostly High']
for column in float_columns:
    df[column] = df[column].fillna(df[column].median())

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   City Name        1757 non-null   object 
 1   Type             45 non-null     object 
 2   Package          1757 non-null   object 
 3   Variety          1757 non-null   object 
 4   Sub Variety      296 non-null    object 
 5   Grade            0 non-null      float64
 6   Date             1757 non-null   object 
 7   Low Price        1757 non-null   float64
 8   High Price       1757 non-null   float64
 9   Mostly Low       1757 non-null   float64
 10  Mostly High      1757 non-null   float64
 11  Origin           1757 non-null   object 
 12  Origin District  131 non-null    object 
 13  Item Size        1757 non-null   object 
 14  Color            1757 non-null   object 
 15  Environment      0 non-null      float64
 16  Unit of Sale     162 non-null    object 
 17  Quality       

In [47]:
df.drop(columns = ['Grade', 'Type', 'Origin District', 'Quality', 'Condition', 
                   'Appearance', 'Storage', 'Crop', 'Trans Mode', 'Sub Variety', 
                   'Unit of Sale', 'Environment'], inplace = True)

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   City Name    1757 non-null   object 
 1   Package      1757 non-null   object 
 2   Variety      1757 non-null   object 
 3   Date         1757 non-null   object 
 4   Low Price    1757 non-null   float64
 5   High Price   1757 non-null   float64
 6   Mostly Low   1757 non-null   float64
 7   Mostly High  1757 non-null   float64
 8   Origin       1757 non-null   object 
 9   Item Size    1757 non-null   object 
 10  Color        1757 non-null   object 
 11  Repack       1757 non-null   object 
 12  Unnamed: 24  0 non-null      float64
 13  Unnamed: 25  103 non-null    object 
dtypes: float64(5), object(9)
memory usage: 192.3+ KB


In [49]:
x = df.drop(columns = ['Color'])
y = df['Color']

In [50]:
from sklearn.preprocessing import LabelEncoder

In [51]:
for column in x.columns:
    if x[column].dtype == 'object':
        le = LabelEncoder()
        x[column] = le.fit_transform(x[column]).astype(float)  # Convert to float

In [52]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [53]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   City Name    1757 non-null   float64
 1   Package      1757 non-null   float64
 2   Variety      1757 non-null   float64
 3   Date         1757 non-null   float64
 4   Low Price    1757 non-null   float64
 5   High Price   1757 non-null   float64
 6   Mostly Low   1757 non-null   float64
 7   Mostly High  1757 non-null   float64
 8   Origin       1757 non-null   float64
 9   Item Size    1757 non-null   float64
 10  Repack       1757 non-null   float64
 11  Unnamed: 24  0 non-null      float64
 12  Unnamed: 25  1757 non-null   float64
dtypes: float64(13)
memory usage: 178.6 KB


In [54]:
model = DecisionTreeClassifier(criterion = 'gini', random_state = 42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [57]:
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, zero_division = 0)

In [58]:
print('accuracy:', accuracy, '\n')
print('classification report: \n', class_report, '\n')

accuracy: 0.8977272727272727 

classification report: 
               precision    recall  f1-score   support

      ORANGE       0.92      0.96      0.94       306
     STRIPED       0.00      0.00      0.00         0
       WHITE       0.68      0.46      0.55        46

    accuracy                           0.90       352
   macro avg       0.53      0.47      0.50       352
weighted avg       0.89      0.90      0.89       352
 

