In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('US-Pumpkins.csv')
df.head()

Unnamed: 0,City Name,Type,Package,Variety,Sub Variety,Grade,Date,Low Price,High Price,Mostly Low,...,Unit of Sale,Quality,Condition,Appearance,Storage,Crop,Repack,Trans Mode,Unnamed: 24,Unnamed: 25
0,BALTIMORE,,24 inch bins,,,,4/29/17,270.0,280.0,270.0,...,,,,,,,E,,,
1,BALTIMORE,,24 inch bins,,,,5/6/17,270.0,280.0,270.0,...,,,,,,,E,,,
2,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,...,,,,,,,N,,,
3,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,...,,,,,,,N,,,
4,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,11/5/16,90.0,100.0,90.0,...,,,,,,,N,,,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   City Name        1757 non-null   object 
 1   Type             45 non-null     object 
 2   Package          1757 non-null   object 
 3   Variety          1752 non-null   object 
 4   Sub Variety      296 non-null    object 
 5   Grade            0 non-null      float64
 6   Date             1757 non-null   object 
 7   Low Price        1757 non-null   float64
 8   High Price       1757 non-null   float64
 9   Mostly Low       1654 non-null   float64
 10  Mostly High      1654 non-null   float64
 11  Origin           1754 non-null   object 
 12  Origin District  131 non-null    object 
 13  Item Size        1478 non-null   object 
 14  Color            1141 non-null   object 
 15  Environment      0 non-null      float64
 16  Unit of Sale     162 non-null    object 
 17  Quality       

In [4]:
df.isnull().sum()

City Name             0
Type               1712
Package               0
Variety               5
Sub Variety        1461
Grade              1757
Date                  0
Low Price             0
High Price            0
Mostly Low          103
Mostly High         103
Origin                3
Origin District    1626
Item Size           279
Color               616
Environment        1757
Unit of Sale       1595
Quality            1757
Condition          1757
Appearance         1757
Storage            1757
Crop               1757
Repack                0
Trans Mode         1757
Unnamed: 24        1757
Unnamed: 25        1654
dtype: int64

In [5]:
null_per = df.isnull().mean() * 100
print(null_per)

City Name            0.000000
Type                97.438816
Package              0.000000
Variety              0.284576
Sub Variety         83.153102
Grade              100.000000
Date                 0.000000
Low Price            0.000000
High Price           0.000000
Mostly Low           5.862265
Mostly High          5.862265
Origin               0.170746
Origin District     92.544109
Item Size           15.879340
Color               35.059761
Environment        100.000000
Unit of Sale        90.779738
Quality            100.000000
Condition          100.000000
Appearance         100.000000
Storage            100.000000
Crop               100.000000
Repack               0.000000
Trans Mode         100.000000
Unnamed: 24        100.000000
Unnamed: 25         94.137735
dtype: float64


In [6]:
df.drop(columns=null_per[null_per == 100].index, inplace=True)

In [7]:
null_per = df.isnull().mean() * 100
print(null_per)

City Name           0.000000
Type               97.438816
Package             0.000000
Variety             0.284576
Sub Variety        83.153102
Date                0.000000
Low Price           0.000000
High Price          0.000000
Mostly Low          5.862265
Mostly High         5.862265
Origin              0.170746
Origin District    92.544109
Item Size          15.879340
Color              35.059761
Unit of Sale       90.779738
Repack              0.000000
Unnamed: 25        94.137735
dtype: float64


In [8]:
float_cols = df.select_dtypes(include=['float']).columns
for col in float_cols:
   median_value = df[col].median()  
   df[col].fillna(median_value, inplace=True)  

In [9]:
object_cols = df.select_dtypes(include=['object']).columns

df.fillna({col: df[col].mode()[0] for col in object_cols}, inplace=True)

In [10]:
df.isnull().sum()

City Name          0
Type               0
Package            0
Variety            0
Sub Variety        0
Date               0
Low Price          0
High Price         0
Mostly Low         0
Mostly High        0
Origin             0
Origin District    0
Item Size          0
Color              0
Unit of Sale       0
Repack             0
Unnamed: 25        0
dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   City Name        1757 non-null   object 
 1   Type             1757 non-null   object 
 2   Package          1757 non-null   object 
 3   Variety          1757 non-null   object 
 4   Sub Variety      1757 non-null   object 
 5   Date             1757 non-null   object 
 6   Low Price        1757 non-null   float64
 7   High Price       1757 non-null   float64
 8   Mostly Low       1757 non-null   float64
 9   Mostly High      1757 non-null   float64
 10  Origin           1757 non-null   object 
 11  Origin District  1757 non-null   object 
 12  Item Size        1757 non-null   object 
 13  Color            1757 non-null   object 
 14  Unit of Sale     1757 non-null   object 
 15  Repack           1757 non-null   object 
 16  Unnamed: 25      1757 non-null   object 
dtypes: float64(4),

In [12]:
df.head()

Unnamed: 0,City Name,Type,Package,Variety,Sub Variety,Date,Low Price,High Price,Mostly Low,Mostly High,Origin,Origin District,Item Size,Color,Unit of Sale,Repack,Unnamed: 25
0,BALTIMORE,Organic,24 inch bins,HOWDEN TYPE,FLAT TYPE,4/29/17,270.0,280.0,270.0,280.0,MARYLAND,QUEBEC,lge,ORANGE,PER BIN,E,LOWER.
1,BALTIMORE,Organic,24 inch bins,HOWDEN TYPE,FLAT TYPE,5/6/17,270.0,280.0,270.0,280.0,MARYLAND,QUEBEC,lge,ORANGE,PER BIN,E,LOWER.
2,BALTIMORE,Organic,24 inch bins,HOWDEN TYPE,FLAT TYPE,9/24/16,160.0,160.0,160.0,160.0,DELAWARE,QUEBEC,med,ORANGE,PER BIN,N,LOWER.
3,BALTIMORE,Organic,24 inch bins,HOWDEN TYPE,FLAT TYPE,9/24/16,160.0,160.0,160.0,160.0,VIRGINIA,QUEBEC,med,ORANGE,PER BIN,N,LOWER.
4,BALTIMORE,Organic,24 inch bins,HOWDEN TYPE,FLAT TYPE,11/5/16,90.0,100.0,90.0,100.0,MARYLAND,QUEBEC,lge,ORANGE,PER BIN,N,LOWER.


In [13]:
for col in df.columns:
    unique_values = df[col].unique()
    if len(unique_values) < 5:  
        print(f"Column: {col}")
        
        print(unique_values)
        print("-" * 40)

Column: Type
['Organic']
----------------------------------------
Column: Sub Variety
['FLAT TYPE' 'ROUND TYPE']
----------------------------------------
Column: Color
['ORANGE' 'WHITE' 'STRIPED']
----------------------------------------
Column: Unit of Sale
['PER BIN' 'EACH' 'PER LB' 'SHELLACKED']
----------------------------------------
Column: Repack
['E' 'N']
----------------------------------------
Column: Unnamed: 25
['LOWER.' 'STEADY.' 'ABOUT STEADY.' 'MINIATURE LOWER, OTHERS STEADY.']
----------------------------------------


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   City Name        1757 non-null   object 
 1   Type             1757 non-null   object 
 2   Package          1757 non-null   object 
 3   Variety          1757 non-null   object 
 4   Sub Variety      1757 non-null   object 
 5   Date             1757 non-null   object 
 6   Low Price        1757 non-null   float64
 7   High Price       1757 non-null   float64
 8   Mostly Low       1757 non-null   float64
 9   Mostly High      1757 non-null   float64
 10  Origin           1757 non-null   object 
 11  Origin District  1757 non-null   object 
 12  Item Size        1757 non-null   object 
 13  Color            1757 non-null   object 
 14  Unit of Sale     1757 non-null   object 
 15  Repack           1757 non-null   object 
 16  Unnamed: 25      1757 non-null   object 
dtypes: float64(4),

In [15]:
encoder = LabelEncoder()

In [16]:
for col in object_cols:
    df[col] = encoder.fit_transform(df[col])

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   City Name        1757 non-null   int32  
 1   Type             1757 non-null   int32  
 2   Package          1757 non-null   int32  
 3   Variety          1757 non-null   int32  
 4   Sub Variety      1757 non-null   int32  
 5   Date             1757 non-null   int32  
 6   Low Price        1757 non-null   float64
 7   High Price       1757 non-null   float64
 8   Mostly Low       1757 non-null   float64
 9   Mostly High      1757 non-null   float64
 10  Origin           1757 non-null   int32  
 11  Origin District  1757 non-null   int32  
 12  Item Size        1757 non-null   int32  
 13  Color            1757 non-null   int32  
 14  Unit of Sale     1757 non-null   int32  
 15  Repack           1757 non-null   int32  
 16  Unnamed: 25      1757 non-null   int32  
dtypes: float64(4),

In [18]:
X = df.drop('Color', axis=1)
y = df['Color']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [20]:
model = DecisionTreeClassifier(criterion='gini', random_state=42)

In [21]:
model.fit(X_train, y_train)

In [22]:
y_pred = model.predict(X_test)

In [23]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

class_report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{class_report}')

Accuracy: 0.9005681818181818
Confusion Matrix:
[[303   1  16]
 [  1   0   1]
 [ 16   0  14]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       320
           1       0.00      0.00      0.00         2
           2       0.45      0.47      0.46        30

    accuracy                           0.90       352
   macro avg       0.47      0.47      0.47       352
weighted avg       0.90      0.90      0.90       352

