<a href="https://colab.research.google.com/github/quadribello/Hamoye/blob/master/hamoye_stage_c.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#import important modules
import pandas as pd
import numpy  as np 

In [3]:
df = pd.read_csv('https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae')
df
df.record.value_counts()

  interactivity=interactivity, compiler=compiler, result=result)


EFConsPerCap    9024
BiocapTotGHA    9024
BiocapPerCap    9023
AreaPerCap      9023
EFProdTotGHA    9023
AreaTotHA       9023
EFProdPerCap    9023
EFConsTotGHA    9023
Name: record, dtype: int64

In [4]:
# check distribution of target variable
df['QScore'].value_counts()
df_na = df.isna().sum()
# for simplicity, we will drop the rows with missing values.
df = df.dropna()
df.isna().sum()
df['QScore'].value_counts()

3A    51473
2A      224
1A       16
Name: QScore, dtype: int64

In [5]:
# An obvious change in our target variable after removing the missing values is that there are only three classes left 
# and from the distribution of the 3 classes, we can see that there is an obvious imbalance between the classes.
# There are methods that can be applied to handle this imbalance such as oversampling and undersampling.
# oversampling involves increasing the number of instances in the class with fewer instances while
# undersampling involves reducing the data points in the class with more instances.
# for now, we will convert this to a binary classification problem by combining class '2A' and '1A'
df['QScore'] = df['QScore'].replace(['1A'],'2A')
df.QScore.value_counts() 

3A    51473
2A      240
Name: QScore, dtype: int64

In [6]:
df_2A = df[df.QScore=='2A']
df_3A = df[df.QScore=='3A'].sample(350)
data_df = df_2A.append(df_3A)
data_df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
1536,Algeria,2016,4,AreaPerCap,0.2072989,0.8112722,0.048357265,0.022585,0.02998367,0.0,1.119497,2A
1537,Algeria,2016,4,AreaTotHA,8417600.0,32942600.0,1963600.0,917100.0,1217520.0,0.0,45458420.0,2A
1538,Algeria,2016,4,BiocapPerCap,0.2021916,0.2636077,0.027166736,0.007948,0.02924496,0.0,0.530159,2A
1539,Algeria,2016,4,BiocapTotGHA,8210214.0,10704080.0,1103135.245,322736.9162,1187524.0,0.0,21527690.0,2A
1540,Algeria,2016,4,EFConsPerCap,0.6280528,0.1810332,0.162800822,0.014729,0.02924496,1.391455,2.407316,2A


In [7]:
import sklearn.utils
data_df = sklearn.utils.shuffle(data_df)
data_df = data_df.reset_index(drop=True)
data_df.shape
data_df.QScore.value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [8]:
#more preprocessing
data_df = data_df.drop(columns=['country_code','country','year'])
x = data_df.drop(columns='QScore')
y = data_df['QScore']

In [9]:
#split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.3,random_state=0)
y_train.value_counts()

3A    236
2A    177
Name: QScore, dtype: int64

In [10]:
# There is still an imbalance in the class distribution. For this, we use SMOTE onlly on the training data to handle this.
# encode categorical variable
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
x_train.record = encoder.fit_transform(x_train.record)
x_test.record = encoder.fit_transform(x_test.record)
x_test.record.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


7    26
3    26
6    23
4    23
1    22
5    21
0    21
2    15
Name: record, dtype: int64

In [11]:
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
x_train_balanced, y_balanced = smote.fit_sample(x_train, y_train)
x_train_balanced = pd.DataFrame(x_train_balanced, columns= x_train.columns)



In [12]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalised_train_df = scaler.fit_transform(x_train_balanced.drop(columns=['record']))
normalised_train_df = pd.DataFrame(normalised_train_df, columns=x_train_balanced.drop(columns=['record']).columns)
normalised_train_df['record'] = x_train_balanced['record']
x_test = x_test.reset_index(drop=True)
normalised_test_df = scaler.transform(x_test.drop(columns=['record']))
normalised_test_df = pd.DataFrame(normalised_test_df, columns = x_test.drop(columns=['record']).columns)
normalised_test_df['record']= x_test['record']
normalised_test_df

Unnamed: 0,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,record
0,6.240376e-03,2.981584e-02,5.714499e-03,4.508964e-02,1.497910e-02,0.000000e+00,6.192967e-03,1
1,1.664530e-10,1.626114e-10,3.015169e-10,7.125312e-11,4.038598e-10,8.295395e-11,1.396803e-10,4
2,1.744429e-03,6.333207e-04,4.628804e-05,9.359938e-05,4.702303e-03,6.939998e-03,4.179281e-03,7
3,6.412415e-10,7.046578e-11,6.565578e-10,2.465371e-10,6.785512e-10,0.000000e+00,2.438156e-10,2
4,4.221126e-02,1.002687e-02,5.696184e-02,9.819510e-03,1.207773e-02,2.790728e-02,3.255585e-02,7
...,...,...,...,...,...,...,...,...
172,1.874726e-04,4.718625e-03,1.239961e-02,9.336109e-03,3.467390e-04,0.000000e+00,2.485087e-03,1
173,3.457384e-11,6.869966e-11,2.294683e-10,2.114188e-10,2.140205e-11,1.256838e-10,1.180628e-10,6
174,2.172260e-10,7.481774e-10,2.646499e-10,2.940015e-09,4.388849e-10,0.000000e+00,2.518825e-10,2
175,6.251869e-05,8.106238e-04,1.452601e-04,2.505150e-05,5.018482e-05,5.200568e-05,1.224699e-04,5


In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(normalised_train_df,y_balanced)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='auto', n_jobs=None, penalty='12', 
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='12',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

##Measuring Classification Performance

In [None]:
#Cross-validation and accuracy
from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_reg, normalised_train_df, y_balanced, cv=5, scoring='f1_macro')
scores

array([0.50086806, 0.49365794, 0.47433628, 0.47340426, 0.56286796])

In [None]:
#k-Fold Cross Validation
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
kf.split(normalised_train_df)
f1_scores = []
#run for every split
for train_index, test_index in kf.split(normalised_train_df):
  x_train, x_test = normalised_train_df.iloc[train_index],normalised_train_df.iloc[test_index]
  y_train, y_test = y_balanced[train_index], y_balanced[test_index]
  model = LogisticRegression().fit(x_train, y_train)
  #save result to list
  f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A')*100)
f1_scores

[47.69230769230769,
 63.38028169014084,
 53.73134328358209,
 66.66666666666666,
 0.0]

In [None]:
# Stratifield K-Fold Cross Validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
f2_scores = []
#run for every split
for train_index, test_index in skf.split(normalised_train_df, y_balanced):
  x_train, x_test = np.array(normalised_train_df)[train_index], np.array(normalised_train_df)[test_index]
  y_train, y_test = y_balanced[train_index], y_balanced[test_index]
  model = LogisticRegression().fit(x_train, y_train)
  #save result to list
  f2_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A')*100)
f2_scores

[58.18181818181818,
 60.8,
 61.403508771929815,
 44.68085106382979,
 48.97959183673469]

In [None]:
# Leave One Out Cross Validation(LOOCV)
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_balanced, cv=loo, scoring='f1_macro')
average_score = scores.mean()*100
average_score

37.096774193548384

In [None]:
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix
new_predictions = log_reg.predict(normalised_test_df)
# Confusion Matrix
cnf_mat = confusion_matrix(y_true=y_test, y_pred=new_predictions, labels=['2A','3A'])
cnf_mat

array([[43, 25],
       [73, 36]])

In [None]:
#accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=new_predictions)
print('Accuracy: {}'.format(round(accuracy*100),2))

Accuracy: 45.0


In [None]:
# precision
precision = precision_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('Precision: {}'.format(round(precision*100), 2))

Precision: 37.0


In [None]:
#Recall
recall = recall_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('Recall: {}'.format(round(recall*100),2))

Recall: 63.0


In [None]:
#F1-Score
f1 = f1_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('F1: {}'.format(round(f1*100),2))

F1: 47.0


##Tree-Based Methods and the Support Vector Machine

In [13]:
from sklearn.tree import DecisionTreeClassifier
dec_tree = DecisionTreeClassifier()
dec_tree.fit(normalised_train_df, y_balanced)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')