In [32]:
import numpy as np
import pandas as pd
from numpy import math

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

In [33]:
df = pd.read_csv('Dataset_Jan.csv')

In [34]:
df.head()

Unnamed: 0,Area,Year,Item,Area Harvested,Yield,Humidity,Wind Speed,Temperature,Production
0,Bangladesh,1961,Areca nuts,82600,7627,,,,62995
1,Bangladesh,1961,Bananas,33600,132738,,,,446000
2,Bangladesh,1961,Barley,29947,5768,,,,17272
3,Bangladesh,1961,"Bastfibres, others",30900,11117,,,,34350
4,Bangladesh,1961,"Beans, dry",68798,7236,,,,49784


In [35]:
# df['Item'].value_counts()
df['Item'].nunique()

54

In [36]:
df.shape

(3185, 9)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3185 entries, 0 to 3184
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Area            3185 non-null   object 
 1   Year            3185 non-null   int64  
 2   Item            3185 non-null   object 
 3   Area Harvested  3185 non-null   int64  
 4   Yield           3185 non-null   int64  
 5   Humidity        2105 non-null   float64
 6   Wind Speed      2105 non-null   float64
 7   Temperature     2105 non-null   float64
 8   Production      3185 non-null   int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 224.1+ KB


In [38]:
mean_temp=df['Temperature'].mean()
mean_hum=df['Humidity'].mean()
mean_wind=df['Wind Speed'].mean()
df['Temperature'].fillna(value=mean_temp, inplace=True)
df['Humidity'].fillna(value=mean_hum, inplace=True)
df['Wind Speed'].fillna(value=mean_wind, inplace=True)

In [39]:
df.head()

Unnamed: 0,Area,Year,Item,Area Harvested,Yield,Humidity,Wind Speed,Temperature,Production
0,Bangladesh,1961,Areca nuts,82600,7627,14.404385,162.94838,26.311311,62995
1,Bangladesh,1961,Bananas,33600,132738,14.404385,162.94838,26.311311,446000
2,Bangladesh,1961,Barley,29947,5768,14.404385,162.94838,26.311311,17272
3,Bangladesh,1961,"Bastfibres, others",30900,11117,14.404385,162.94838,26.311311,34350
4,Bangladesh,1961,"Beans, dry",68798,7236,14.404385,162.94838,26.311311,49784


## OneHotEncoding using sklearn

In [40]:
from sklearn.model_selection import train_test_split
# first 5 cols as train as input, last col as output
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,0:8],df.iloc[:,-1],test_size=0.2,random_state=2)

In [41]:
X_test.head()

Unnamed: 0,Area,Year,Item,Area Harvested,Yield,Humidity,Wind Speed,Temperature
3064,Bangladesh,2017,Spices nes,47332,35494,16.42,170.19,25.36
761,Bangladesh,1975,"Beans, green",6632,39704,14.404385,162.94838,26.311311
2692,Bangladesh,2010,Tea,52236,11486,15.5,172.75,26.02
2940,Bangladesh,2015,Linseed,7009,6941,15.44,188.19,25.92
1901,Bangladesh,1996,Coconuts,29200,30479,13.0,174.75,28.0


In [42]:
from sklearn.preprocessing import OneHotEncoder

In [43]:
# object
# multicollinearity removal
# removing the catagories from the columns with drop first
ohe = OneHotEncoder(drop='first',sparse=False,dtype=np.int32)

In [44]:
X_train_new = ohe.fit_transform(X_train[['Item']])
# we get the no. of cols depending on the categories
X_train_new

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [45]:
X_test_new = ohe.transform(X_test[['Item']])

In [46]:
X_test_new

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [47]:
# now to append everything together
data = np.hstack((X_train[['Year','Area Harvested','Yield','Humidity','Wind Speed','Temperature']].values,X_train_new))
data

array([[2.008000e+03, 7.258100e+04, 9.856000e+03, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       [2.011000e+03, 7.087230e+05, 2.149400e+04, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       [1.985000e+03, 7.702300e+04, 6.218100e+04, ..., 0.000000e+00,
        1.000000e+00, 0.000000e+00],
       ...,
       [1.990000e+03, 4.014000e+03, 4.527900e+04, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       [2.008000e+03, 4.648200e+04, 8.547000e+03, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       [2.008000e+03, 1.127915e+07, 4.144100e+04, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00]])

## Random Forest

In [48]:
data.shape

(2548, 59)

In [49]:
x = data[:,0:8]
y = data[:,-1]

In [50]:
y

array([0., 0., 0., ..., 0., 0., 0.])

In [51]:

X_train_new, X_test_new, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42,stratify=y)

In [52]:
# Instantiate and fit the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier()
forest.fit(X_train_new, y_train)

RandomForestClassifier()

In [53]:
# Make predictions for the test set
y_pred_test = forest.predict(X_test_new)

In [54]:
# View accuracy score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy_score(y_test, y_pred_test)

0.9968602825745683

#### Confusion Matrix

In [55]:
# View confusion matrix for test data and predictions
confusion_matrix(y_test, y_pred_test)

array([[626,   0],
       [  2,   9]], dtype=int64)

#### Classification Report

In [56]:
# View the classification report for test data and predictions
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       626
         1.0       1.00      0.82      0.90        11

    accuracy                           1.00       637
   macro avg       1.00      0.91      0.95       637
weighted avg       1.00      1.00      1.00       637

