In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('/storage/emulated/0/\
Hamoye/StageC/original/\
NFA 2019 public_data.csv', low_memory=False)


In [3]:
data

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,1.402924e-01,1.995463e-01,0.097188051,3.688847e-02,2.931995e-02,0.000000e+00,5.032351e-01,3A
1,Armenia,1992,1,AreaTotHA,4.830000e+05,6.870000e+05,334600,1.270000e+05,1.009430e+05,0.000000e+00,1.732543e+06,3A
2,Armenia,1992,1,BiocapPerCap,1.598044e-01,1.352610e-01,0.084003213,1.374213e-02,3.339780e-02,0.000000e+00,4.262086e-01,3A
3,Armenia,1992,1,BiocapTotGHA,5.501762e+05,4.656780e+05,289207.1078,4.731155e+04,1.149823e+05,0.000000e+00,1.467355e+06,3A
4,Armenia,1992,1,EFConsPerCap,3.875102e-01,1.894622e-01,1.26E-06,4.164833e-03,3.339780e-02,1.114093e+00,1.728629e+00,3A
...,...,...,...,...,...,...,...,...,...,...,...,...
72181,World,2016,5001,BiocapTotGHA,3.984702e+09,1.504757e+09,5111762779,1.095445e+09,4.726163e+08,0.000000e+00,1.216928e+10,3A
72182,World,2016,5001,EFConsPerCap,5.336445e-01,1.402092e-01,0.273495416,8.974253e-02,6.329435e-02,1.646235e+00,2.746619e+00,3A
72183,World,2016,5001,EFConsTotGHA,3.984702e+09,1.046937e+09,2042179333,6.701039e+08,4.726163e+08,1.229237e+10,2.050891e+10,3A
72184,World,2016,5001,EFProdPerCap,5.336445e-01,1.402092e-01,0.273495416,8.974253e-02,6.329435e-02,1.646235e+00,2.746619e+00,3A


Let us look at the distribution of the values in the independent variable `Qscore`.

In [4]:
data.QScore.value_counts()

3A    51481
2A    10576
2B    10096
1A       16
1B       16
Name: QScore, dtype: int64

We can see that we have 5 different classes to precdict. But before we proceed, let's check for null values in the dataframe and drop the rows.

In [5]:
data.isnull().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

In [6]:
# Dropping rows containing null values
data.dropna(inplace=True)

In [7]:
 # Counts of the values in QScore column
data.QScore.value_counts()

3A    51473
2A      224
1A       16
Name: QScore, dtype: int64

Due to the deletion of some rows containing null values, we have reduced the classes in the `QScore` columns to 3.
 
We are going to combine the classes with the two lowest count, **'2A'** and **'1A'**, so that we can have a binary classification problem.

In [8]:
data['QScore'].replace('1A', '2A', inplace=True)
data.QScore.value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

There is a great imbalance in the classes, so we will take a sample of data from the class **'3A'**.

In [9]:
data_2A = data[data['QScore']=='2A']
data_3A = data[data['QScore']=='3A'].sample(350, random_state=1)

In [10]:
data_new = pd.concat([data_2A, data_3A]).reset_index(drop=True)
 
data_new

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Algeria,2016,4,AreaPerCap,2.072989e-01,8.112722e-01,0.048357265,2.258528e-02,2.998367e-02,0.000000,1.119497e+00,2A
1,Algeria,2016,4,AreaTotHA,8.417600e+06,3.294260e+07,1963600,9.171000e+05,1.217520e+06,0.000000,4.545842e+07,2A
2,Algeria,2016,4,BiocapPerCap,2.021916e-01,2.636077e-01,0.027166736,7.947991e-03,2.924496e-02,0.000000,5.301590e-01,2A
3,Algeria,2016,4,BiocapTotGHA,8.210214e+06,1.070408e+07,1103135.245,3.227369e+05,1.187524e+06,0.000000,2.152769e+07,2A
4,Algeria,2016,4,EFConsPerCap,6.280528e-01,1.810332e-01,0.162800822,1.472910e-02,2.924496e-02,1.391455,2.407316e+00,2A
...,...,...,...,...,...,...,...,...,...,...,...,...
585,Sierra Leone,1994,197,AreaPerCap,1.424029e-01,5.135843e-01,0.709586732,6.315686e-01,3.220641e-02,0.000000,2.029349e+00,3A
586,Luxembourg,2007,256,BiocapPerCap,4.411709e-01,8.708716e-02,0.986200674,8.009810e-04,1.042298e-01,0.000000,1.619490e+00,3A
587,Botswana,1983,20,EFConsPerCap,3.082520e-01,7.756213e-01,0.264032732,2.549475e-02,1.373033e-02,0.703266,2.090397e+00,3A
588,Argentina,1968,9,BiocapTotGHA,2.383132e+07,7.567452e+07,34001536.15,7.155958e+07,8.568679e+05,0.000000,2.059238e+08,3A


In [11]:
# Shuffling the data
import sklearn.utils
data_new = sklearn.utils.shuffle(data_new, random_state=1)

In [12]:
data_new

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
509,Ghana,2007,81,BiocapTotGHA,1.242086e+07,6.503076e+06,7302759.914,1.531939e+06,1.253929e+06,0.000000e+00,2.901256e+07,3A
257,Bangladesh,2012,16,AreaPerCap,5.471113e-02,3.852896e-03,0.009226403,5.138544e-02,1.538211e-02,0.000000e+00,1.345580e-01,3A
23,Solomon Islands,2016,25,EFProdTotGHA,3.885366e+05,3.673652e+03,1923548.003,1.574918e+06,1.467565e+05,7.384533e+04,4.111278e+06,2A
458,Madagascar,1976,129,BiocapTotGHA,3.481890e+06,2.652845e+07,19647202.72,4.903261e+06,3.968700e+05,0.000000e+00,5.495768e+07,3A
90,Hungary,2016,97,BiocapPerCap,1.535538e+00,6.321249e-02,0.703360396,4.861915e-03,1.599070e-01,0.000000e+00,2.466880e+00,2A
...,...,...,...,...,...,...,...,...,...,...,...,...
129,Kyrgyzstan,2016,113,AreaTotHA,1.364000e+06,9.177000e+06,629000,8.150000e+05,2.541710e+05,0.000000e+00,1.223917e+07,2A
144,Mauritania,2016,136,AreaPerCap,1.071839e-01,9.125742e+00,0.05138316,6.589367e-01,4.798187e-02,0.000000e+00,9.991228e+00,2A
72,Gabon,2016,74,AreaPerCap,2.500265e-01,2.356310e+00,11.71841435,2.352371e+00,3.898873e-02,0.000000e+00,1.671611e+01,2A
235,Ukraine,2016,230,BiocapTotGHA,8.966961e+07,6.457412e+06,19268322.04,6.920056e+06,3.952345e+06,0.000000e+00,1.262677e+08,2A


We are going to drop `country`, `year` and `country_code` columns.

In [13]:
data_new.drop(['country', 'year', 'country_code'], axis=1, inplace=True)

In [14]:
# Split data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_new.drop('QScore', axis=1), data_new['QScore'], test_size=0.3, random_state=1)

In [15]:
data_new

Unnamed: 0,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
509,BiocapTotGHA,1.242086e+07,6.503076e+06,7302759.914,1.531939e+06,1.253929e+06,0.000000e+00,2.901256e+07,3A
257,AreaPerCap,5.471113e-02,3.852896e-03,0.009226403,5.138544e-02,1.538211e-02,0.000000e+00,1.345580e-01,3A
23,EFProdTotGHA,3.885366e+05,3.673652e+03,1923548.003,1.574918e+06,1.467565e+05,7.384533e+04,4.111278e+06,2A
458,BiocapTotGHA,3.481890e+06,2.652845e+07,19647202.72,4.903261e+06,3.968700e+05,0.000000e+00,5.495768e+07,3A
90,BiocapPerCap,1.535538e+00,6.321249e-02,0.703360396,4.861915e-03,1.599070e-01,0.000000e+00,2.466880e+00,2A
...,...,...,...,...,...,...,...,...,...
129,AreaTotHA,1.364000e+06,9.177000e+06,629000,8.150000e+05,2.541710e+05,0.000000e+00,1.223917e+07,2A
144,AreaPerCap,1.071839e-01,9.125742e+00,0.05138316,6.589367e-01,4.798187e-02,0.000000e+00,9.991228e+00,2A
72,AreaPerCap,2.500265e-01,2.356310e+00,11.71841435,2.352371e+00,3.898873e-02,0.000000e+00,1.671611e+01,2A
235,BiocapTotGHA,8.966961e+07,6.457412e+06,19268322.04,6.920056e+06,3.952345e+06,0.000000e+00,1.262677e+08,2A


We have split the dataset into testing and training sets, however, there is still an imbalance we have to address.
 
We can use the SMOTE method only on the training set to achieve this. But firstly, we are going to encode the `record` column in both the training as testing sets.

In [16]:
# Encode categorical variable
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
 
x_train['record'] = encoder.fit_transform(x_train.record)
x_test['record'] = encoder.transform(x_test.record)

In [17]:
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
x_train_bl, y_train_bl = smote.fit_sample(x_train, y_train)

In [18]:
x_train_bl

Unnamed: 0,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total
0,7,4.123462e+06,1.649261e+07,1.74675e+06,3.547436e+06,6.448009e+05,3.338860e+07,5.994366e+07
1,7,8.210214e+06,5.003670e+06,3.37877e+06,3.097418e+05,1.187524e+06,4.701701e+07,6.510693e+07
2,4,3.392847e-01,1.303845e-02,0.108188,2.529100e-02,1.545818e-02,1.453861e-01,6.466469e-01
3,2,9.140566e-02,6.791892e-02,82.5909,1.435518e+01,5.091948e-03,0.000000e+00,9.711047e+01
4,2,1.050735e+00,1.178528e-01,1.12085,9.772940e-04,1.596657e-01,0.000000e+00,2.450081e+00
...,...,...,...,...,...,...,...,...
491,4,1.018711e+07,1.298333e+06,8.5238e+06,6.974036e+05,1.252204e+06,2.459410e+06,2.441826e+07
492,2,7.680009e+05,2.574039e+06,5.74145e+06,1.374741e+06,1.784972e+05,0.000000e+00,1.063673e+07
493,3,3.005711e+06,2.455156e+06,446309,1.755009e+05,6.121505e+05,8.261153e+05,7.520943e+06
494,5,9.229434e+05,1.564709e+07,457822,8.536439e+03,1.802586e+05,6.014504e+06,2.323116e+07


In [19]:
# Normalize the x_train and x_test set
# Drop the 'record' column before normalizing
# because we don't want to normalize encoded variables
# and append back to the dataframe after normalizing
 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
 
normalized_x_train = scaler.fit_transform(x_train_bl.drop('record', axis=1))
normalized_x_train = pd.DataFrame(normalized_x_train, columns=x_train_bl.drop('record', axis=1).columns)
normalized_x_train['record'] = x_train_bl.record

# Reset the index values of x_test so as to ensure
# parity in later operations 
x_test.reset_index(drop=True, inplace=True)
normalized_x_test = scaler.fit_transform(x_test.drop('record', axis=1))
normalized_x_test = pd.DataFrame(normalized_x_test, columns=x_test.drop('record', axis=1).columns)
normalized_x_test['record'] = x_test.record

### Creating a logistic regression model

In [20]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate into a variable
log_reg = LogisticRegression(random_state=1)

# Fit the model
log_reg.fit(normalized_x_train, y_train_bl)

# Make predictions on test set
y_test_predict = log_reg.predict(normalized_x_test)

### Measuring Performance

In [21]:
# Import essential libraries
from sklearn.metrics import (accuracy_score, precision_score, f1_score,
                             recall_score, confusion_matrix)

In [22]:
metrics = [recall_score, precision_score,
           f1_score, accuracy_score]

for metric in metrics:
    if metric != accuracy_score:
        print(metric.__name__.replace('_', ' ').title()+':', round(metric(y_test, y_test_predict, pos_label='2A'), 2))
    else:
        print(metric.__name__.replace('_', ' ').title()+':', round(metric(y_test, y_test_predict), 2))
        

Recall Score: 0.37
Precision Score: 0.47
F1 Score: 0.41
Accuracy Score: 0.55


In [23]:
from sklearn.metrics import confusion_matrix
cnf_mat = confusion_matrix(y_test, y_test_predict, labels=['3A', '2A'])
print(cnf_mat)

[[70 32]
 [47 28]]


The confusion matrix above is interpreted as follows:

- **true negatives:** 70
- **false positives:** 32
- **false negatives:** 47
- **true positives:** 28

If you substitute these values into the formulae for calculating the metrics (i.e `accuracy scores`, `precision scores` etc), you should get consistent results. I suggest you try it out.

### K-Fold Cross Validation

In [24]:
# Import the library
from sklearn.model_selection import KFold

# Instantiate it in a variable and set 
# number of splits
kf = KFold(n_splits=5, random_state=1)

# Perform the split and save in a variable
splits = kf.split(normalized_x_train)

# Initialized a list to store our f1_scores
f1_scores = []

# Loop through the splits and for each split,
# use the returned indexes to select the x_train
# and x_test set for the current fold
for train_index, test_index in splits:
    x_train = normalized_x_train.iloc[train_index]
    x_test = normalized_x_train.iloc[test_index]
    
    # Use the same returned indexes to select the
    # y_train and y_test for the current fold
    y_train = y_train_bl[train_index]
    y_test = y_train_bl[test_index]
    
    # Instantiate a logistic regression model
    log_reg = LogisticRegression(random_state=1)
    
    # Fit the logistic regression model for the current fold
    log_reg.fit(x_train, y_train)
    
    # Make new predictions for the current fold
    # with the model
    y_test_predict = log_reg.predict(x_test)
    
    # calculate the f1_score for the current fold 
    # and set pos_label as '2A'
    score = f1_score(y_test, y_test_predict, average='binary', pos_label='2A')
    
    # Append f1_score to the list, f1_scores
    # initialized above
    f1_scores.append(round(score, 2))
    
    # The for loop repeats itself until it's
    # done for the last fold
    
# Print f1_scores
print(f1_scores)

[0.59, 0.47, 0.59, 0.52, 0.0]


### Stratified K - Fold Cross Validation

In [25]:
# Import the library
from sklearn.model_selection import StratifiedKFold

# Instantiate it in a variable and set 
# number of splits
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Perform the split and save in a variable
splits = skf.split(normalized_x_train, y_train_bl)

# Initialized a list to store our f1_scores
f1_scores = []

# Loop through the splits and for each split,
# use the returned indexes to select the x_train
# and x_test set for the current fold
for train_index, test_index in splits:
    x_train = normalized_x_train.iloc[train_index]
    x_test = normalized_x_train.iloc[test_index]
    
    # Use the same returned indexes to select the
    # y_train and y_test for the current fold
    y_train = y_train_bl[train_index]
    y_test = y_train_bl[test_index]
    
    # Instantiate a logistic regression model
    log_reg = LogisticRegression(random_state=1)
    
    # Fit the logistic regression model for the current fold
    log_reg.fit(x_train, y_train)
    
    # Make new predictions for the current fold
    # with the model
    y_test_predict = log_reg.predict(x_test)
    
    # calculate the f1_score for the current fold 
    # and set pos_label as '2A'
    score = f1_score(y_test, y_test_predict, average='binary', pos_label='2A')
    
    # Append f1_score to the list, f1_scores
    # initialized above
    f1_scores.append(round(score, 2))
    
    # The for loop repeats itself until it's
    # done for the last fold
    
# Print f1_scores
print(f1_scores)

[0.49, 0.53, 0.51, 0.55, 0.55]


### Cross Validation Score

In [26]:
# Import the model
from sklearn.model_selection import cross_val_score

# Evaluate the score
scores = cross_val_score(log_reg, normalized_x_train,
                        y_train_bl, cv=5, scoring='f1_macro')

# Display the scores
print (scores)


[0.56896552 0.44444444 0.46442789 0.41249349 0.53492647]


### Leave One Out Cross Validation

In [27]:
# Import the model
from sklearn.model_selection import LeaveOneOut

# Instantiate in a variable
loo = LeaveOneOut()

# Calculate scores
scores = cross_val_score(log_reg, normalized_x_train,
                         y_train_bl, cv=loo, scoring='f1_macro')

# Find the average
average_score = scores.mean()

# Display score
print (round(average_score, 2))

0.53
