In [4]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score,precision_score
import os
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, make_scorer
from pprint import pprint
from sklearn.inspection import permutation_importance
from imblearn.under_sampling import NearMiss

In [46]:
df=pd.read_excel(r'C:\Users\javad\OneDrive - University of Windsor\Shirin Research\research\ML Codes\Data_VIF.xlsx',index_col=0)

In [47]:
df['Label'].value_counts()

Alert          388
Stable         316
Sustainable    309
Name: Label, dtype: int64

In [48]:
x=df.drop(['Label','Year'],axis=1)
y=df['Label']

In [49]:
scaler = StandardScaler()
x = pd.DataFrame(scaler.fit_transform(x), columns = x.columns)
x.head()

Unnamed: 0,GDP growth,GDP per capita,Population,School enrolment,Control of corruption,Regime types
0,0.390021,-0.700372,-0.145309,-1.626815,-1.263694,-0.750929
1,-0.471464,-0.520049,-0.100416,0.039238,-0.403364,-0.750929
2,1.848659,-0.569359,-0.184441,-2.055227,-1.06333,-1.818356
3,1.023579,-0.383175,-0.065234,0.424186,-0.17521,0.316498
4,2.237309,-0.594117,-0.291514,0.52512,-0.50515,-0.750929


In [50]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state = 1)

## We implement the GridSeach to do the hyperparameter Tuning

In [70]:
params=[{'n_neighbors': [3, 5, 7, 9],
         'weights': ['uniform', 'distance'],
         'leaf_size': [15, 20]}]
         

In [71]:
knn_model = KNeighborsClassifier()

In [85]:
clf = GridSearchCV(knn_model, params, n_jobs=-1,cv=5, scoring='accuracy')

In [86]:
model = clf.fit(X_train, y_train)

In [87]:
pprint(model.best_estimator_.get_params())

{'algorithm': 'auto',
 'leaf_size': 15,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 7,
 'p': 2,
 'weights': 'distance'}


In [88]:
y_pred=model.predict(X_test)

In [89]:
result = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(result)
result1 = classification_report(y_test, y_pred)
print('Classification Report:',)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print('Accuracy:',result2)

Confusion Matrix:
[[151  19  11   0]
 [ 20  62   2   0]
 [  8   0  55   1]
 [  0   0   6  55]]
Classification Report:
              precision    recall  f1-score   support

       Alert       0.77      0.74      0.75        84
      Stable       0.74      0.86      0.80        64
 Sustainable       0.98      0.90      0.94        61

    accuracy                           0.83       390
   macro avg       0.83      0.83      0.83       390
weighted avg       0.83      0.83      0.83       390

Accuracy: 0.8282051282051283


# Over sampling KNN

In [90]:
df_over=df.copy()

In [91]:
x_over=df_over.drop(['Label','Year'],axis=1)
y_over=df_over['Label']

In [92]:
scaler_over = StandardScaler()
x_over= pd.DataFrame(scaler_over.fit_transform(x_over), columns = x_over.columns)
x_over.head()

Unnamed: 0,GDP growth,GDP per capita,Population,School enrolment,Control of corruption,Regime types
0,0.390021,-0.700372,-0.145309,-1.626815,-1.263694,-0.750929
1,-0.471464,-0.520049,-0.100416,0.039238,-0.403364,-0.750929
2,1.848659,-0.569359,-0.184441,-2.055227,-1.06333,-1.818356
3,1.023579,-0.383175,-0.065234,0.424186,-0.17521,0.316498
4,2.237309,-0.594117,-0.291514,0.52512,-0.50515,-0.750929


In [93]:
oversample = SMOTE()
x_over, y_over = oversample.fit_resample(x_over, y_over)

In [94]:
y_over.value_counts()

Alert          933
Stable         933
Sustainable    933
Name: Label, dtype: int64

In [95]:
X_over_train, X_over_test, y_over_train, y_over_test = train_test_split(x_over,y_over,test_size=0.2, random_state = 1)

In [96]:
knn_over = KNeighborsClassifier()

In [113]:
clf_over = GridSearchCV(knn_over, params, n_jobs=-1,cv=5, scoring='accuracy')

In [114]:
model_over = clf_over.fit(X_over_train, y_over_train)

In [115]:
pprint(model_over.best_estimator_.get_params())

{'algorithm': 'auto',
 'leaf_size': 15,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 3,
 'p': 2,
 'weights': 'distance'}


In [116]:
y_pred_over = model_over.predict(X_over_test)

In [117]:
result_over = confusion_matrix(y_over_test,y_pred_over)
print('Confusion Matrix:')
print(result_over)
result1_over = classification_report(y_over_test,y_pred_over)
print('Classification Report:',)
print (result1_over)
result2_over = accuracy_score(y_over_test,y_pred_over)
print('Accuracy:',result2_over)

Confusion Matrix:
[[159  31  11   3]
 [  3 189   1   0]
 [  1   0 171   3]
 [  0   0   5 170]]
Classification Report:
              precision    recall  f1-score   support

       Alert       0.86      0.98      0.92       193
      Stable       0.91      0.98      0.94       175
 Sustainable       0.97      0.97      0.97       175

    accuracy                           0.92       747
   macro avg       0.93      0.93      0.92       747
weighted avg       0.93      0.92      0.92       747

Accuracy: 0.9223560910307899


# Under sampling Random forest

In [127]:
df_under=df.copy()

In [128]:
x_under=df_under.drop(['Label','Year'],axis=1)
y_under=df_under['Label']

In [129]:
scaler_under = StandardScaler()
x_under= pd.DataFrame(scaler_under.fit_transform(x_under), columns = x_under.columns)
x_under.head()

Unnamed: 0,GDP growth,GDP per capita,Population,School enrolment,Control of corruption,Regime types
0,0.390021,-0.700372,-0.145309,-1.626815,-1.263694,-0.750929
1,-0.471464,-0.520049,-0.100416,0.039238,-0.403364,-0.750929
2,1.848659,-0.569359,-0.184441,-2.055227,-1.06333,-1.818356
3,1.023579,-0.383175,-0.065234,0.424186,-0.17521,0.316498
4,2.237309,-0.594117,-0.291514,0.52512,-0.50515,-0.750929


In [130]:
undersample = NearMiss(version=1, n_neighbors=3)

In [131]:
x_under, y_under= undersample.fit_resample(x_under,y_under)

In [133]:
y_under.value_counts()

Alert          309
Stable         309
Sustainable    309
Name: Label, dtype: int64

In [134]:
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(x_under,y_under,test_size=0.2, random_state = 1)

In [135]:
knn_model_under =KNeighborsClassifier()

In [136]:
clf_under = GridSearchCV(knn_model_under, params, n_jobs=-1,cv=5, scoring='accuracy')

In [137]:
model_under = clf_under.fit(X_under_train, y_under_train)

In [138]:
pprint(model_under.best_estimator_.get_params())

{'algorithm': 'auto',
 'leaf_size': 15,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 3,
 'p': 2,
 'weights': 'distance'}


In [139]:
y_pred_under = model_under.predict(X_under_test)

In [140]:
result_under = confusion_matrix(y_under_test,y_pred_under)
print('Confusion Matrix:')
print(result_under)
result1_under = classification_report(y_under_test,y_pred_under)
print('Classification Report:',)
print (result1_under)
result2_under = accuracy_score(y_under_test,y_pred_under)
print('Accuracy:',result2_under)

Confusion Matrix:
[[68  1  5  0]
 [ 3 50  0  0]
 [ 8  0 46  1]
 [ 1  0  8 57]]
Classification Report:
              precision    recall  f1-score   support

       Alert       0.98      0.94      0.96        53
      Stable       0.78      0.84      0.81        55
 Sustainable       0.98      0.86      0.92        66

    accuracy                           0.89       248
   macro avg       0.90      0.89      0.89       248
weighted avg       0.90      0.89      0.89       248

Accuracy: 0.8911290322580645
