In [5]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score,precision_score
import os
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, make_scorer
from pprint import pprint
from sklearn.inspection import permutation_importance
from imblearn.under_sampling import NearMiss

In [6]:
df=pd.read_excel(r'C:\Users\javad\OneDrive - University of Windsor\Shirin Research\research\ML Codes\Data_VIF.xlsx',index_col=0)

In [7]:
df['Label'].value_counts()

Alert          388
Stable         316
Sustainable    309
Name: Label, dtype: int64

In [8]:
x=df.drop(['Label','Year'],axis=1)
y=df['Label']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state = 1)

## We implement the GridSeach to do the hyperparameter Tuning

In [10]:
model_params = {
    'n_estimators': [10, 20, 50, 100, 150,200,250,300],
    'criterion':["gini","entropy", "log_loss"]
    }

In [11]:
rf_model = RandomForestClassifier()

In [17]:
clf = GridSearchCV(rf_model, model_params, n_jobs=-1,cv=5, scoring='accuracy')

In [19]:
model = clf.fit(X_train, y_train)

In [20]:
pprint(model.best_estimator_.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 150,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [21]:
y_pred=model.predict(X_test)

In [22]:
result = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(result)
result1 = classification_report(y_test, y_pred)
print('Classification Report:',)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print('Accuracy:',result2)

Confusion Matrix:
[[176   4   1   0]
 [ 14  70   0   0]
 [  6   0  56   2]
 [  0   0   6  55]]
Classification Report:
              precision    recall  f1-score   support

       Alert       0.95      0.83      0.89        84
      Stable       0.89      0.88      0.88        64
 Sustainable       0.96      0.90      0.93        61

    accuracy                           0.92       390
   macro avg       0.92      0.90      0.91       390
weighted avg       0.92      0.92      0.91       390

Accuracy: 0.9153846153846154


# Over sampling Random Forest

In [23]:
df_over=df.copy()

In [24]:
x_over=df_over.drop(['Label','Year'],axis=1)
y_over=df_over['Label']

In [25]:
oversample = SMOTE()
x_over, y_over = oversample.fit_resample(x_over, y_over)

In [26]:
y_over.value_counts()

Alert          933
Stable         933
Sustainable    933
Name: Label, dtype: int64

In [27]:
X_over_train, X_over_test, y_over_train, y_over_test = train_test_split(x_over,y_over,test_size=0.2, random_state = 1)

In [28]:
rf_model_over = RandomForestClassifier()

In [34]:
clf_over = GridSearchCV(rf_model_over, model_params, n_jobs=-1,cv=5, scoring='accuracy')

In [35]:
model_over = clf_over.fit(X_over_train, y_over_train)

In [36]:
pprint(model_over.best_estimator_.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 150,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [37]:
y_pred_over = model_over.predict(X_over_test)

In [38]:
result_over = confusion_matrix(y_over_test,y_pred_over)
print('Confusion Matrix:')
print(result_over)
result1_over = classification_report(y_over_test,y_pred_over)
print('Classification Report:',)
print (result1_over)
result2_over = accuracy_score(y_over_test,y_pred_over)
print('Accuracy:',result2_over)

Confusion Matrix:
[[184  14   6   0]
 [ 14 179   0   0]
 [  2   0 171   2]
 [  0   0   3 172]]
Classification Report:
              precision    recall  f1-score   support

       Alert       0.93      0.93      0.93       193
      Stable       0.95      0.98      0.96       175
 Sustainable       0.99      0.98      0.99       175

    accuracy                           0.95       747
   macro avg       0.95      0.95      0.95       747
weighted avg       0.95      0.95      0.94       747

Accuracy: 0.9451137884872824


# Under sampling Random forest

In [39]:
df_under=df.copy()

In [40]:
x_under=df_under.drop(['Label','Year'],axis=1)
y_under=df_under['Label']

In [41]:
undersample = NearMiss(version=1, n_neighbors=3)

In [42]:
x_under, y_under= undersample.fit_resample(x_under,y_under)

In [43]:
y_under.value_counts()

Alert          309
Stable         309
Sustainable    309
Name: Label, dtype: int64

In [44]:
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(x_under,y_under,test_size=0.2, random_state = 1)

In [45]:
rf_model_under = RandomForestClassifier()

In [51]:
clf_under = GridSearchCV(rf_model_under, model_params, n_jobs=-1,cv=5, scoring='accuracy')

In [52]:
model_under = clf_under.fit(X_under_train, y_under_train)

In [53]:
pprint(model_under.best_estimator_.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [54]:
y_pred_under = model_under.predict(X_under_test)

In [55]:
result_under = confusion_matrix(y_under_test,y_pred_under)
print('Confusion Matrix:')
print(result_under)
result1_under = classification_report(y_under_test,y_pred_under)
print('Classification Report:',)
print (result1_under)
result2_under = accuracy_score(y_under_test,y_pred_under)
print('Accuracy:',result2_under)

Confusion Matrix:
[[67  4  2  1]
 [ 3 50  0  0]
 [ 0  1 51  3]
 [ 0  0  6 60]]
Classification Report:
              precision    recall  f1-score   support

       Alert       0.91      0.94      0.93        53
      Stable       0.86      0.93      0.89        55
 Sustainable       0.94      0.91      0.92        66

    accuracy                           0.92       248
   macro avg       0.92      0.92      0.92       248
weighted avg       0.92      0.92      0.92       248

Accuracy: 0.9193548387096774


In [56]:
model_over.best_estimator_.feature_importances_


array([0.04709458, 0.30058015, 0.11821947, 0.15784942, 0.29290476,
       0.08335162])

In [46]:
x_over.columns

Index(['GDP growth', 'GDP per capita', 'Population', 'School enrolment ',
       'Corruption', 'Regime types'],
      dtype='object')

In [60]:
data2 = {'Feature': ['GDP growth', 'GDP per capita', 'Population', 'School enrolment ',
       'Corruption', 'Regime types'], 'Feature Importance': [0.04709458, 0.30058015, 0.08335162, 0.15784942, 0.29290476,
       0.11821947]}

In [61]:
result2=pd.DataFrame(data2)
result2

Unnamed: 0,Feature,Feature Importance
0,GDP growth,0.047095
1,GDP per capita,0.30058
2,Population,0.083352
3,School enrolment,0.157849
4,Corruption,0.292905
5,Regime types,0.118219
