# Summary of this notebook

In this notebook, we use the non-image data associated with each of the villages in our training data set in order to train candidate baseline models.  We select the model with the highest accuracy score on the test data as our baseline model, and then we evaluate its performance in terms of accuracy and recall scores on the unseen validation data.  This baseline model will be an important point of comparison for our production model in the [final notebook](06_final_modeling.ipynb).

In [1]:
#If using Google Colab and Google Drive, run the following commands

#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
#REPLACE THIS COMMAND WITH THE APPROPRIATE PATH TO THE "code" FOLDER ON YOUR GOOGLE DRIVE
#%cd ./drive/MyDrive/poverty_project/group_project/code

### Imports

In [3]:
import pandas as pd
import numpy as np

#from os import listdir
#from os.path import isfile, join
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

### Load data

In [4]:
# Load data
train = pd.read_csv('../data/train.csv', index_col='index')
test = pd.read_csv('../data/test.csv', index_col='index')
val = pd.read_csv('../data/val.csv', index_col='index')

In [5]:
# Dimension check
print(train.shape)
print(test.shape)
print(val.shape)

(17520, 8)
(974, 8)
(973, 8)


In [6]:
# Normalize check
print(train['is_poorest'].value_counts(normalize=True))
print(test['is_poorest'].value_counts(normalize=True))
print(val['is_poorest'].value_counts(normalize=True))

0    0.665411
1    0.334589
Name: is_poorest, dtype: float64
0    0.663244
1    0.336756
Name: is_poorest, dtype: float64
0    0.692703
1    0.307297
Name: is_poorest, dtype: float64


### Create dummies

In [7]:
train = pd.get_dummies(train, columns=['country'], drop_first=True)
test = pd.get_dummies(test, columns=['country'], drop_first=True)
val = pd.get_dummies(val, columns=['country'], drop_first=True)

In [8]:
# Dummies check: value must return 1.0 for both line
print(np.mean([int(x) for x in (train.columns == test.columns)]))
print(np.mean([int(x) for x in (test.columns == val.columns)]))

1.0
1.0


### Create funtion to run models

In [9]:
def fit_and_report(model, df_train, df_test, df_val, is_print=False):
    
    # Prepare variable
    only_country = ['year', 'wealth_index', 'households', 'is_poorest', 'latitude', 'longitude', 'is_urban']
    country_is_urban = ['year', 'wealth_index', 'households', 'is_poorest', 'latitude', 'longitude']
    country_is_urban_lat_long = ['year', 'wealth_index', 'households', 'is_poorest']
    only_lat_long = ['latitude', 'longitude']
    country_lat_long = ['year', 'wealth_index', 'households', 'is_poorest', 'is_urban']
    y = 'is_poorest'
    
    # Split X, y --- Train
    df_train_f1 = df_train.drop(columns=only_country)
    df_train_f2 = df_train.drop(columns=country_is_urban)
    df_train_f3 = df_train.drop(columns=country_is_urban_lat_long)
    df_train_f4 = df_train[only_lat_long] # easier to just select lat&long columns
    df_train_f5 = df_train.drop(columns=country_lat_long)
    X_train = [df_train_f1, df_train_f2, df_train_f3, df_train_f4, df_train_f5]
    y_train = df_train[y]
    
    # Split X, y --- Test
    df_test_f1 = df_test.drop(columns=only_country)
    df_test_f2 = df_test.drop(columns=country_is_urban)
    df_test_f3 = df_test.drop(columns=country_is_urban_lat_long)
    df_test_f4 = df_test[only_lat_long] # easier to just select lat&long columns
    df_test_f5 = df_test.drop(columns=country_lat_long)
    X_test = [df_test_f1, df_test_f2, df_test_f3, df_test_f4, df_test_f5]
    y_test = df_test[y]
    
    # Split X, y --- Val
    df_val_f1 = df_val.drop(columns=only_country)
    df_val_f2 = df_val.drop(columns=country_is_urban)
    df_val_f3 = df_val.drop(columns=country_is_urban_lat_long)
    df_val_f4 = df_val[only_lat_long] # easier to just select lat&long columns
    df_val_f5 = df_val.drop(columns=country_lat_long)
    X_val = [df_val_f1, df_val_f2, df_val_f3, df_val_f4, df_val_f5]
    y_val = df_val[y]
    
    # Fitting and store results
    scores = []
    cm = []
    cm_val = []
    for i in range(len(model)):
        model[i].fit(X_train[i], y_train) # fit the model here
        scores.append(model[i].score(X_train[i], y_train))
        scores.append(model[i].score(X_test[i], y_test))
        scores.append(model[i].score(X_val[i], y_val))
        cm.append(confusion_matrix(y_test, model[i].predict(X_test[i])))
        cm_val.append(confusion_matrix(y_val, model[i].predict(X_val[i])))
        if is_print:
            print(f'feature {i+1} train = {model[i].score(X_train[i], y_train)}')
            print(f'feature {i+1} test = {model[i].score(X_test[i], y_test)}\n')
    
    return scores, cm, cm_val

### Modeling & Result

In [10]:
# run the model
n_features = 5
df_score = pd.DataFrame(index=['only_country_train', 'only_country_test', 'only_country_val',
                               'country_is_urban_train', 'country_is_urban_test', 'country_is_urban_val',
                               'country_is_urban_lat_long_train', 'country_is_urban_lat_long_test', 'country_is_urban_lat_long_val',
                               'only_lat_long_train', 'only_lat_long_test', 'only_lat_long_val',
                               'country_lat_long_train', 'country_lat_long_test', 'country_lat_long_val',
                              ])

# Iterate through all models for making the baseline model
logr, rf, ada, knn = [], [], [], []
for _ in range(n_features):
    logr.append(LogisticRegression(random_state=42))
    rf.append(RandomForestClassifier(random_state=42))
    ada.append(AdaBoostClassifier(random_state=42))
    knn.append(KNeighborsClassifier())
df_score['LogisticRegression'], lr_cm, lr_cm_val = fit_and_report(logr, train, test, val)
df_score['RandomForestClassifier'], rf_cm, rf_cm_val = fit_and_report(rf, train, test, val)
df_score['AdaBoostClassifier'], ada_cm, ada_cm_val = fit_and_report(ada, train, test, val)
df_score['KNeighborsClassifier'], knn_cm, knn_cm_val = fit_and_report(knn, train, test, val)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mod

In [11]:
df_score

Unnamed: 0,LogisticRegression,RandomForestClassifier,AdaBoostClassifier,KNeighborsClassifier
only_country_train,0.709247,0.709075,0.709247,0.628596
only_country_test,0.709446,0.704312,0.709446,0.605749
only_country_val,0.73073,0.729702,0.73073,0.627955
country_is_urban_train,0.799886,0.799886,0.799886,0.782763
country_is_urban_test,0.783368,0.783368,0.783368,0.764887
country_is_urban_val,0.782117,0.782117,0.782117,0.786228
country_is_urban_lat_long_train,0.790354,0.999829,0.817637,0.880936
country_is_urban_lat_long_test,0.781314,0.820329,0.808008,0.813142
country_is_urban_lat_long_val,0.786228,0.818088,0.806783,0.827338
only_lat_long_train,0.675628,0.999772,0.729909,0.856164


In [12]:
# Print score function
def print_score(cm, is_print=False):
    tn, fp, fn, tp = cm.ravel()
    # calculate accuracy
    conf_accuracy = (float (tp+tn) / float(tp + tn + fp + fn))
    
    # calculate the recall
    conf_recall = (tp / float(tp + fn))

    if is_print:
        decimal = 3
        print(f'Accuracy: \t{round(conf_accuracy,decimal)}') 
        print(f'Recall: \t{round(conf_recall,decimal)}') 
    return conf_accuracy, conf_recall

In [13]:
# View baseline accuracy and recall score for test and val data
print('-'*50)
print('ฺBaseline score: Random Forest + country_lat_long + test')
print_score(rf_cm[4], True)
print('-'*50)
print('Baseline score: Random Forest + country_lat_long + val')
print_score(rf_cm_val[4], True)
print('-'*50)

--------------------------------------------------
ฺBaseline score: Random Forest + country_lat_long + test
Accuracy: 	0.813
Recall: 	0.716
--------------------------------------------------
Baseline score: Random Forest + country_lat_long + val
Accuracy: 	0.804
Recall: 	0.672
--------------------------------------------------


In [14]:
# Confusion matrix: Random Forest + country_lat_long + val
print('Confusion matrix: Random Forest + country_lat_long + val')
cm_df = pd.DataFrame(rf_cm_val[4], 
            columns = ['Predicted not Poorest', 'Predicted Poorest'],
            index = ['Actual not Poorest', 'Actual Poorest'])
cm_df

Confusion matrix: Random Forest + country_lat_long + val


Unnamed: 0,Predicted not Poorest,Predicted Poorest
Actual not Poorest,581,93
Actual Poorest,98,201


In [15]:
df_best_model = df_score.iloc[-3:,1:2]
acc_rec = pd.DataFrame(data=[print_score(rf_cm_val[4])[0], print_score(rf_cm_val[4])[1]], index=['accuracy_val', 'recall_val'], columns=['RandomForestClassifier'])
df_best_model = pd.concat([df_best_model, acc_rec])
df_best_model

Unnamed: 0,RandomForestClassifier
country_lat_long_train,0.999886
country_lat_long_test,0.813142
country_lat_long_val,0.8037
accuracy_val,0.8037
recall_val,0.672241


### Export

In [16]:
df_score.to_csv('../data/all_baseline_model_score.csv')

In [17]:
df_best_model.to_csv('../data/best_baseline_model_score_with_recall.csv')