In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Dataset_Dec.csv')

In [3]:
df.head()

Unnamed: 0,Area,Year,Item,Area Harvested,Yield,Production
0,Bangladesh,1961,Areca nuts,82600,7627,62995
1,Bangladesh,1961,Bananas,33600,132738,446000
2,Bangladesh,1961,Barley,29947,5768,17272
3,Bangladesh,1961,"Bastfibres, others",30900,11117,34350
4,Bangladesh,1961,"Beans, dry",68798,7236,49784


In [4]:
# df['Item'].value_counts()
df['Item'].nunique()

54

## OneHotEncoding using sklearn

In [5]:
from sklearn.model_selection import train_test_split
# first 5 cols as train as input, last col as output
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,0:5],df.iloc[:,-1],test_size=0.2,random_state=2)

In [6]:
X_test.head()

Unnamed: 0,Area,Year,Item,Area Harvested,Yield
3064,Bangladesh,2017,Spices nes,47332,35494
761,Bangladesh,1975,"Beans, green",6632,39704
2692,Bangladesh,2010,Tea,52236,11486
2940,Bangladesh,2015,Linseed,7009,6941
1901,Bangladesh,1996,Coconuts,29200,30479


In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
# object
# multicollinearity removal
# removing the catagories from the columns with drop first
ohe = OneHotEncoder(drop='first',sparse=False,dtype=np.int32)

In [9]:
X_train_new = ohe.fit_transform(X_train[['Area','Item']])
# we get the no. of cols depending on the categories
X_train_new

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
X_test_new = ohe.transform(X_test[['Area','Item']])

In [11]:
X_test_new

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [12]:
# now to append everything together
data = np.hstack((X_train[['Year','Area Harvested','Yield']].values,X_train_new))
data

array([[    2008,    72581,     9856, ...,        0,        0,        0],
       [    2011,   708723,    21494, ...,        0,        0,        0],
       [    1985,    77023,    62181, ...,        0,        1,        0],
       ...,
       [    1990,     4014,    45279, ...,        0,        0,        0],
       [    2008,    46482,     8547, ...,        0,        0,        0],
       [    2008, 11279150,    41441, ...,        0,        0,        0]],
      dtype=int64)

## SVM

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [14]:
x = data[:,0:5]
y = data[:,-1]
X_train_new, X_test_new, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [15]:
model = SVC()
model.fit(X_train_new,y_train)
predict_production = model.predict(X_test_new)
print("Accuracy: ",accuracy_score(y_test, predict_production))

Accuracy:  0.996078431372549


## Evaluation of the model

#### R-Squared

In [22]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, predict_production)
print(r2)

0.7103095711445612


#### Mean Absolute Error (MAE)

In [26]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, predict_production)

0.00392156862745098

#### Mean squared error

In [27]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, predict_production)

0.00392156862745098

#### AUC-ROC Curve

In [50]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, predict_production)

0.8571428571428572

#### Confusion Matrix and related Metrics

In [48]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report
cm = confusion_matrix(y_test, predict_production)
ac = accuracy_score(y_test, predict_production)
rs = recall_score(y_test, predict_production)
pr = precision_score(y_test, predict_production)
f1 = f1_score(y_test, predict_production, average=None)

In [34]:
# Confusion Matrix
print (cm)

[[503   0]
 [  2   5]]


In [35]:
# Accuracy Score
print (ac)

0.996078431372549


In [36]:
# Recall Score
print (rs)

0.7142857142857143


In [42]:
# Precision Score
print (pr)

1.0


In [43]:
# F1 Score
print (f1)

[0.99801587 0.83333333]


#### Classification Report

In [51]:
# report
print(classification_report(y_test, predict_production, target_names=None))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       503
           1       1.00      0.71      0.83         7

    accuracy                           1.00       510
   macro avg       1.00      0.86      0.92       510
weighted avg       1.00      1.00      1.00       510

