In [1]:
!kaggle datasets download -d gorororororo23/plant-growth-data-classification --quiet

Dataset URL: https://www.kaggle.com/datasets/gorororororo23/plant-growth-data-classification
License(s): other


In [2]:
# import necessary libraries
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('/content/plant-growth-data-classification.zip')

In [4]:
df.head()

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity,Growth_Milestone
0,loam,5.192294,bi-weekly,chemical,31.719602,61.591861,0
1,sandy,4.033133,weekly,organic,28.919484,52.422276,1
2,loam,8.892769,bi-weekly,none,23.179059,44.660539,0
3,loam,8.241144,bi-weekly,none,18.465886,46.433227,0
4,sandy,8.374043,bi-weekly,organic,18.128741,63.625923,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Soil_Type         193 non-null    object 
 1   Sunlight_Hours    193 non-null    float64
 2   Water_Frequency   193 non-null    object 
 3   Fertilizer_Type   193 non-null    object 
 4   Temperature       193 non-null    float64
 5   Humidity          193 non-null    float64
 6   Growth_Milestone  193 non-null    int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 10.7+ KB


### Visualizing the data

In [6]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [7]:
px.histogram(df, x='Water_Frequency', title='Water_frequency vs Growth Milestone', color='Growth_Milestone', color_discrete_sequence=['Red', 'Green'])

In [8]:
px.histogram(df, x="Soil_Type", title='Soil type vs. Growth Milestone', color='Growth_Milestone')

In [9]:
px.histogram(df, x='Fertilizer_Type', title='Fertilizer type vs. Growth Milestone', color='Growth_Milestone')

In [10]:
# maintaining gap
px.histogram(df, x='Sunlight_Hours', title='Sunlight hours vs. Growth Milestone', color='Growth_Milestone')

In [11]:
# plotting temp vs humdity
px.scatter(df, x='Temperature', y='Humidity', title='Temperature vs. Humidity', color='Growth_Milestone')

In [12]:
px.scatter(df, x='Sunlight_Hours', y='Humidity', title='Sunlight hours vs. Humidity', color='Growth_Milestone')

identifying the feature and target column and
splitting the dataset into training and testing data

In [13]:
from sklearn.model_selection import train_test_split
inputs = df.drop('Growth_Milestone', axis=1)
target = df['Growth_Milestone']
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2, random_state=42)

target

0      0
1      1
2      0
3      0
4      0
      ..
188    0
189    1
190    0
191    1
192    0
Name: Growth_Milestone, Length: 193, dtype: int64


 identify the numeric and categorical column



In [14]:
# identify the numeric and categorical column
numeric_cols = inputs.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = inputs.select_dtypes(include=['object']).columns
print(numeric_cols)
print(categorical_cols)

Index(['Sunlight_Hours', 'Temperature', 'Humidity'], dtype='object')
Index(['Soil_Type', 'Water_Frequency', 'Fertilizer_Type'], dtype='object')


encode the categorical column using onehot encoder

In [15]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X_train[categorical_cols])


In [16]:
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
encoded_cols

['Soil_Type_clay',
 'Soil_Type_loam',
 'Soil_Type_sandy',
 'Water_Frequency_bi-weekly',
 'Water_Frequency_daily',
 'Water_Frequency_weekly',
 'Fertilizer_Type_chemical',
 'Fertilizer_Type_none',
 'Fertilizer_Type_organic']

In [17]:
all_cols= list(numeric_cols)+ encoded_cols

In [18]:
# add encoded columns
X_train[encoded_cols] = encoder.transform(X_train[categorical_cols])
X_test[encoded_cols] = encoder.transform(X_test[categorical_cols])

In [19]:
encoder.transform(X_train[categorical_cols])

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [20]:
X_train.head()

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity,Soil_Type_clay,Soil_Type_loam,Soil_Type_sandy,Water_Frequency_bi-weekly,Water_Frequency_daily,Water_Frequency_weekly,Fertilizer_Type_chemical,Fertilizer_Type_none,Fertilizer_Type_organic
82,clay,6.936717,bi-weekly,none,32.168255,59.182806,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
109,loam,4.789,bi-weekly,chemical,31.6,68.9,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
163,loam,9.043,weekly,organic,26.4,70.1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
35,clay,6.462298,bi-weekly,organic,27.517198,34.175036,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
136,loam,6.832,daily,none,31.7,47.8,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [21]:
X_test.head()

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity,Soil_Type_clay,Soil_Type_loam,Soil_Type_sandy,Water_Frequency_bi-weekly,Water_Frequency_daily,Water_Frequency_weekly,Fertilizer_Type_chemical,Fertilizer_Type_none,Fertilizer_Type_organic
45,loam,8.822032,bi-weekly,organic,26.509484,35.940896,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
137,sandy,5.612,weekly,organic,19.2,61.6,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
76,clay,7.016074,bi-weekly,organic,34.810103,52.782838,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
144,clay,8.317,weekly,none,27.5,60.3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
113,sandy,6.356,daily,none,30.8,63.7,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


Scaling the numeric columns

In [22]:
# scaling numeric columns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(df[numeric_cols])

X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


In [23]:
# X_train.head()
X_test.head()

Unnamed: 0,Soil_Type,Sunlight_Hours,Water_Frequency,Fertilizer_Type,Temperature,Humidity,Soil_Type_clay,Soil_Type_loam,Soil_Type_sandy,Water_Frequency_bi-weekly,Water_Frequency_daily,Water_Frequency_weekly,Fertilizer_Type_chemical,Fertilizer_Type_none,Fertilizer_Type_organic
45,loam,0.814332,bi-weekly,organic,0.576717,0.109477,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
137,sandy,0.26848,weekly,organic,0.203976,0.632273,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
76,clay,0.507237,bi-weekly,organic,1.0,0.452626,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
144,clay,0.728453,weekly,none,0.627228,0.605786,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
113,sandy,0.394994,daily,none,0.795508,0.67506,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [24]:
# using decision tree classsifier
from sklearn.tree import DecisionTreeClassifier

# importing GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV


In [25]:
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_features': ['auto', 'sqrt', 'log2'],
    'random_state': [42, 100, 1 ,50, 70]
}


In [26]:
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    verbose=1
)


In [27]:
grid_search.fit(X_train[all_cols], y_train)


Fitting 5 folds for each of 3240 candidates, totalling 16200 fits


In [28]:
print("Best Parameters:")
print(grid_search.best_params_)
print("Best Score:")
print(grid_search.best_score_)


Best Parameters:
{'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 50, 'splitter': 'random'}
Best Score:
0.6359139784946237


### Evaluation of model

In [29]:
# importing accuracy_score, classification_report, confusion_matrix for evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [30]:
best_model = grid_search.best_estimator_
y_preds = best_model.predict(X_test[all_cols])

In [31]:
accuracy = accuracy_score(y_test, y_preds)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5897435897435898


In [32]:
conf_matrix = confusion_matrix(y_test, y_preds)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[13  4]
 [12 10]]


In [33]:
best_model

In [34]:
import joblib
joblib.dump(best_model, 'model.pkl')
joblib.dump(encoder, 'encoder.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']