In [47]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

from sklearn.metrics import classification_report

In [48]:
df = pd.read_csv('data/crop.csv',index_col=0)
df.head()

Unnamed: 0_level_0,P,K,temperature,humidity,ph,rainfall,label
N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
90,42,43,20.879744,82.002744,6.502985,202.935536,rice
85,58,41,21.770462,80.319644,7.038096,226.655537,rice
60,55,44,23.004459,82.320763,7.840207,263.964248,rice
74,35,40,26.491096,80.158363,6.980401,242.864034,rice
78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2200 entries, 90 to 104
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   P            2200 non-null   int64  
 1   K            2200 non-null   int64  
 2   temperature  2200 non-null   float64
 3   humidity     2200 non-null   float64
 4   ph           2200 non-null   float64
 5   rainfall     2200 non-null   float64
 6   label        2200 non-null   object 
dtypes: float64(4), int64(2), object(1)
memory usage: 137.5+ KB


In [50]:
from sklearn.preprocessing import LabelEncoder

In [51]:
le = LabelEncoder()

In [52]:
y = le.fit_transform(df['label'])

In [53]:
X = df.drop('label',axis=1)

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train.shape, X_test.shape

((1650, 6), (550, 6))

In [57]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1,oob_score=True)

rf.fit(X_train, y_train)

RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=42)

In [58]:
rf.oob_score_

0.9939393939393939

In [59]:
y_pred = rf.predict(X_test)

In [60]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        31
           4       1.00      1.00      1.00        27
           5       1.00      1.00      1.00        26
           6       1.00      1.00      1.00        21
           7       1.00      1.00      1.00        19
           8       0.84      1.00      0.92        27
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        15
          11       0.96      1.00      0.98        23
          12       1.00      1.00      1.00        27
          13       1.00      0.96      0.98        26
          14       1.00      1.00      1.00        26
          15       1.00      1.00      1.00        22
          16       1.00      1.00      1.00        19
          17       1.00    

### Hyper parameter tuning

In [61]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
params = {
    'max_depth': [4,5,6,7],
    'n_estimators': [50,100,200,300,500,1000]
}
from sklearn.model_selection import GridSearchCV
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 4,
                           n_jobs=-1, verbose=1, scoring="accuracy")

In [62]:
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 24 candidates, totalling 96 fits


GridSearchCV(cv=4, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [4, 5, 6, 7],
                         'n_estimators': [50, 100, 200, 300, 500, 1000]},
             scoring='accuracy', verbose=1)

In [63]:
grid_search.best_score_

0.987883177789793

In [64]:
grid_search.best_params_

{'max_depth': 7, 'n_estimators': 50}

In [65]:
rf_best = grid_search.best_estimator_
rf_best

RandomForestClassifier(max_depth=7, n_estimators=50, n_jobs=-1, random_state=42)

In [66]:
# from sklearn.tree import plot_tree
# plt.figure(figsize=(80,40))
# plot_tree(rf_best.estimators_[5], feature_names = X.columns,class_names=le.classes_,filled=True);

In [67]:
rf_best.feature_importances_

array([0.18211207, 0.20753507, 0.08176756, 0.23252759, 0.05034155,
       0.24571616])

In [68]:
imp_df = pd.DataFrame({
    "Varname": X_train.columns,
    "Imp": rf_best.feature_importances_
})

In [69]:
imp_df.sort_values(by="Imp", ascending=False)

Unnamed: 0,Varname,Imp
5,rainfall,0.245716
3,humidity,0.232528
1,K,0.207535
0,P,0.182112
2,temperature,0.081768
4,ph,0.050342


In [70]:
y_pred = rf_best.predict(X_test)

In [71]:
from sklearn.metrics import accuracy_score,classification_report

In [72]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        31
           4       1.00      1.00      1.00        27
           5       1.00      1.00      1.00        26
           6       0.91      1.00      0.95        21
           7       1.00      1.00      1.00        19
           8       0.76      0.96      0.85        27
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        15
          11       0.91      0.91      0.91        23
          12       1.00      1.00      1.00        27
          13       1.00      0.96      0.98        26
          14       1.00      1.00      1.00        26
          15       1.00      1.00      1.00        22
          16       1.00      1.00      1.00        19
          17       1.00    