Importing Modules/Libraries

In [182]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import pointbiserialr, spearmanr
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, roc_auc_score

# for logistic regression. 
# statsmodel is chosen because it outputs descriptive stats for the model
import statsmodels.api as sm

# for SVM
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# for random forest
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

Importing Dataframe

In [183]:
df = pd.read_csv("/content/drive/MyDrive/open_one_time_covid_education_impact.csv")
df.head()

Unnamed: 0,submission_id,submission_date,gender,age,geography,financial_situation,education,employment_status,submission_state,are_there_children_0_to_2_yrs_out_of_educational_system,...,are_children_3_to_17_yrs_dealing_with_irregular_school_activity,are_children_being_teached_by_unqualified_people,did_teachers_leave_the_educational_system,do_school_and_the_teachers_have_internet_connection,do_children_have_internet_connection,do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity,does_home_shows_severe_deficit_of_electricity,does_home_shows_severe_deficit_of_internet,do_children_3_to_17_yrs_miss_class_or_in_lower_grade,are_children_promoted_with_a_modality_different_from_formal_evaluation
0,5647462894403584,2021-04-26,Male,26 to 35 years old,Suburban/Peri-urban,"I can afford food and regular expenses, but no...",Some university or college,"I work part-time, either as an employee or sel...",Miranda,0,...,0,1,0,1,1,0,1,1,0,1
1,5818608063348736,2021-03-14,Male,36 to 45 years old,Suburban/Peri-urban,"I can afford food and regular expenses, but no...",University or college degree completed,"I work part-time, either as an employee or sel...",Miranda,1,...,1,0,1,0,0,0,0,1,0,0
2,6232241062805504,2021-03-13,Male,36 to 45 years old,Rural,"I can afford food, but nothing else",University or college degree completed,I am a student and I work part-time,Miranda,0,...,0,0,1,0,0,1,0,1,0,0
3,6440166989496320,2021-03-29,Male,26 to 35 years old,Suburban/Peri-urban,I cannot afford enough food for my family,University or college degree completed,I am unemployed,Miranda,0,...,1,0,1,0,1,1,0,0,0,0
4,5819398672875520,2021-03-15,Female,36 to 45 years old,Suburban/Peri-urban,"I can afford food, regular expenses, and cloth...",University or college degree completed,"I work full-time, either as an employee or sel...",Miranda,1,...,1,0,1,0,1,1,1,1,0,1


In [184]:
df.shape

(4750, 27)

In [185]:
df['financial_situation'].value_counts()

I can afford food, but nothing else                                                1524
I cannot afford enough food for my family                                          1203
I can afford food and regular expenses, but nothing else                           1164
I can afford food, regular expenses, and clothes, but nothing else                  283
Prefer not to answer                                                                259
I can comfortably afford food, clothes, and furniture, and I have savings           175
I can comfortably afford food, clothes, and furniture, but I don’t have savings     141
Not Available                                                                         1
Name: financial_situation, dtype: int64

Converting String data into Numeric

In [186]:
df.replace(['I can afford food, but nothing else', 'I cannot afford enough food for my family','I can afford food and regular expenses, but nothing else', 'I can afford food, regular expenses, and clothes, but nothing else'],
             [1, 1, 1, 1], inplace = True)
df.replace(['Prefer not to answer', 'I can comfortably afford food, clothes, and furniture, and I have savings', 'I can comfortably afford food, clothes, and furniture, but I don’t have savings', 'Not Available'],
             [0, 0, 0, 0], inplace = True)

In [187]:
df['gender'].value_counts()

Female        2576
Male          2130
0               39
Non-Binary       5
Name: gender, dtype: int64

In [188]:
# Converting Gender column into binary i.e. Male = 1 & Female = 0
df.replace(['Non-Binary', 'Male', 'Female'], [2, 1, 0], inplace = True)

In [189]:
df['age'].value_counts()

26 to 35 years old    1399
18 to 25 years old    1265
36 to 45 years old    1170
Over 45 years old      798
Under 18               103
0                       15
Name: age, dtype: int64

In [190]:
# Converting the String into Numeric Data by picking the Lower Bound
df.replace(['Under 18', '18 to 25 years old', '26 to 35 years old', '36 to 45 years old', 'Over 45 years old', 'Not Available'],
             [15, 18, 25, 35, 45, 90], inplace = True)

In [191]:
df['do_children_3_and_17_yrs_receive_regular_school_meals'].value_counts()

No           4064
Every day     210
2 days        193
1 day         139
3 days        108
4 days         36
Name: do_children_3_and_17_yrs_receive_regular_school_meals, dtype: int64

In [192]:
# Converting the String into Numeric Data by picking the Lower Bound
df.replace(['No', '1 day', '2 days', '3 days', '4 days', 'Every day'],
             [0, 1, 2, 3, 4, 5], inplace = True)

In [193]:
df

Unnamed: 0,submission_id,submission_date,gender,age,geography,financial_situation,education,employment_status,submission_state,are_there_children_0_to_2_yrs_out_of_educational_system,...,are_children_3_to_17_yrs_dealing_with_irregular_school_activity,are_children_being_teached_by_unqualified_people,did_teachers_leave_the_educational_system,do_school_and_the_teachers_have_internet_connection,do_children_have_internet_connection,do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity,does_home_shows_severe_deficit_of_electricity,does_home_shows_severe_deficit_of_internet,do_children_3_to_17_yrs_miss_class_or_in_lower_grade,are_children_promoted_with_a_modality_different_from_formal_evaluation
0,5647462894403584,2021-04-26,1,25,Suburban/Peri-urban,1,Some university or college,"I work part-time, either as an employee or sel...",Miranda,0,...,0,1,0,1,1,0,1,1,0,1
1,5818608063348736,2021-03-14,1,35,Suburban/Peri-urban,1,University or college degree completed,"I work part-time, either as an employee or sel...",Miranda,1,...,1,0,1,0,0,0,0,1,0,0
2,6232241062805504,2021-03-13,1,35,Rural,1,University or college degree completed,I am a student and I work part-time,Miranda,0,...,0,0,1,0,0,1,0,1,0,0
3,6440166989496320,2021-03-29,1,25,Suburban/Peri-urban,1,University or college degree completed,I am unemployed,Miranda,0,...,1,0,1,0,1,1,0,0,0,0
4,5819398672875520,2021-03-15,0,35,Suburban/Peri-urban,1,University or college degree completed,"I work full-time, either as an employee or sel...",Miranda,1,...,1,0,1,0,1,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4745,6070973496557568,2021-03-16,0,25,Suburban/Peri-urban,1,University or college degree completed,"I work full-time, either as an employee or sel...",Anzoátegui,1,...,1,0,1,0,0,1,0,1,0,1
4746,6520660045332480,2021-03-19,0,45,Rural,1,Primary school completed,"I work part-time, either as an employee or sel...",Anzoátegui,0,...,0,0,1,0,0,1,0,1,0,0
4747,6661587720208384,2021-03-16,1,25,Suburban/Peri-urban,1,Some university or college,I am a student and I work part-time,Anzoátegui,1,...,0,0,0,0,0,1,0,0,0,0
4748,4801448004288512,2021-03-13,0,35,City center or metropolitan area,1,University or college degree completed,"I work part-time, either as an employee or sel...",Anzoátegui,0,...,1,0,1,1,1,0,0,1,0,0


In [216]:
predictors = ['gender','age', 'are_children_attending_face_to_face_classes', 
              'can_children_observe_deterioration_of_basic_services_of_school',	'do_children_3_and_17_yrs_receive_regular_school_meals', 
              'do_children_have_internet_connection',	'do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity',
              'does_home_shows_severe_deficit_of_electricity', 'does_home_shows_severe_deficit_of_internet']

target = df['financial_situation']

In [195]:
#for x in predictors:
#  df[x] = pd.to_numeric(df[x], errors='coerce')

SVM

In [217]:
algorithms = [ 
    #linear kernel
    [Pipeline([('scaler',StandardScaler()), 
               ('svc',LinearSVC(random_state=1))]), predictors],
    #rbf kernel
    [Pipeline([('scaler',StandardScaler()),
               ('svc',SVC(kernel="rbf", random_state=1))]), predictors],
    #polynomial kernel
    [Pipeline([('scaler',StandardScaler()),
               ('svc',SVC(kernel='poly', random_state=1))]), predictors],
    #sigmoidf kernel
    [Pipeline([('scaler',StandardScaler()),
               ('svc',SVC(kernel='sigmoid', random_state=1))]), predictors]
]

In [218]:
alg_acc = {}
alg_auc = {}
for alg, predictors in algorithms:
    alg_acc[alg] = 0
    alg_auc[alg] = 0
i=0

pred_data = df[predictors] # X

#stratified sampling
#random_state=1: we get the same splits every time we run this
# sss = StratifiedShuffleSplit(target, 10, test_size=0.1, random_state=1) 
cv = StratifiedShuffleSplit(10, test_size=0.1, random_state=1) 
for train_index, test_index in cv.split(pred_data, target):
    i += 1
    train_data = df.iloc[train_index]
    test_data = df.iloc[test_index]
    train_data = pd.concat([train_data,
                            train_data[train_data["financial_situation"]==1],
                            train_data[train_data["financial_situation"]==1]]) 
    X_train, X_test = train_data[predictors], test_data[predictors] 
    y_train, y_test = train_data["financial_situation"], test_data["financial_situation"]
    
    #Make predictions for each algorithm on each fold for alg, predictors in algorithms:
    for alg, predictors in algorithms:
        alg.fit(X_train, y_train)
        y_pred = alg.predict(X_test)
        acc_score = accuracy_score(y_test, y_pred) 
        #print("y target prediction = ", y_pred)
        print("Accuracy =", acc_score)
        alg_acc[alg] += acc_score
        auc_score = roc_auc_score(y_test, y_pred) 
        #print("y target prediction = ", y_pred)
        print("ROC AUC =", auc_score)
        alg_auc[alg] += auc_score

for alg, predictors in algorithms:
    alg_acc[alg] /= 1
    alg_auc[alg] /= 1
    print ("## %s ACC=%f" % (alg, alg_acc[alg]))
    print ("## %s AUC=%f" % (alg, alg_auc[alg]))

Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8610526315789474
ROC AUC = 0.5126726205242702




Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.848421052631579
ROC AUC = 0.49063507814438106




Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8589473684210527
ROC AUC = 0.5114735797568841
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8652631578947368
ROC AUC = 0.49280575539568344
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8652631578947368
ROC AUC = 0.5002274042834698




Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8589473684210527
ROC AUC = 0.49663028198131154




Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8589473684210527
ROC AUC = 0.4892086330935252




Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8568421052631578
ROC AUC = 0.5028528901017116




Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8652631578947368
ROC AUC = 0.5002274042834698




Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8778947368421053
ROC AUC = 0.5
Accuracy = 0.8652631578947368
ROC AUC = 0.5076490531712561
## Pipeline(steps=[('scaler', StandardScaler()),
                ('svc', LinearSVC(random_state=1))]) ACC=8.778947
## Pipeline(steps=[('scaler', StandardScaler()),
                ('svc', LinearSVC(random_state=1))]) AUC=5.000000
## Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC(random_state=1))]) ACC=8.778947
## Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC(random_state=1))]) AUC=5.000000
## Pipeline(steps=[('scaler', StandardScaler()),
                ('svc', SVC(kernel='poly', random_state=1))]) ACC=8.778947
## Pipeline(steps=[('scaler', StandardScaler()),
                ('svc', SVC(kernel='poly', random_state=1))]) AUC=5.000000
## Pipeline(steps=[('scaler', StandardScaler()),
                ('svc', SVC(kernel='sigmoid', random_state=1))]) ACC=8.604211
## Pipeline(steps=

Random Forest

In [219]:
#Bagging
tree_count = 10 
bag_proportion = 0.6 
predictions = []

cv = StratifiedShuffleSplit(test_size=0.1, random_state=1) 
for train_index, test_index in cv.split(df[predictors], target):
    train_data = df.iloc[train_index] 
    test_data = df.iloc[test_index]
    
    for i in range(tree_count):
        bag = train_data.sample(frac=bag_proportion, replace = True, random_state=i)
        X_train, X_test = bag[predictors], test_data[predictors]
        y_train, y_test = bag["financial_situation"], test_data["financial_situation"]
        clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=75) 
        clf.fit(X_train, y_train) 
        predictions.append(clf.predict_proba(X_test)[:,1])

combined = np.sum(predictions, axis=0)/100 
rounded= np.round(combined)

print(accuracy_score(rounded, y_test))

0.8778947368421053


Logistic Regression

In [220]:
predictors = ['gender','age', 'are_children_attending_face_to_face_classes', 
              'can_children_observe_deterioration_of_basic_services_of_school',	'do_children_3_and_17_yrs_receive_regular_school_meals', 
              'do_school_and_the_teachers_have_internet_connection', 'do_children_have_internet_connection',	'do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity',
              'does_home_shows_severe_deficit_of_electricity', 'does_home_shows_severe_deficit_of_internet']

# stratified sampling
#80% to train set
train = df.sample(frac=0.8, random_state=1)
y_train = train["financial_situation"]
X_train = train[predictors]

#10% to test set
test = df.sample(frac=0.1, random_state=1)
y_test = test["financial_situation"]
X_test = test[predictors]

#10% to CV set
cross = df.sample(frac=0.1, random_state=2)
y_cross = cross["financial_situation"]
X_cross = cross[predictors]

In [215]:
#train
print("train set result\n")
model = sm.Logit(y_train, X_train) 
result_train = model.fit()

y_train_pred = result_train.predict(X_train) 
y_train_pred = (y_train_pred > 0.5).astype(int) 
acc = accuracy_score(y_train, y_train_pred) 
print("ACC=%f" % (acc))
auc = roc_auc_score(y_train, y_train_pred) 
print("AUC=%f" % (auc))

print("\n CV set result\n")
y_cross_pred = result_train.predict(X_cross) 
y_cross_pred = (y_cross_pred > 0.5).astype(int)
acc = accuracy_score(y_cross, y_cross_pred) 
print("ACC=%f" % (acc))
auc = roc_auc_score(y_cross, y_cross_pred) 
print ("AUC=%f" % (auc))

print("\n test set result\n")
y_test_pred = result_train.predict(X_test) 
y_test_pred = (y_test_pred > 0.5).astype(int) 
acc = accuracy_score(y_test, y_test_pred)
print("ACC=%f" % (acc))
auc = roc_auc_score(y_test, y_test_pred)
print("AUC=%f" % (auc))

train set result

Optimization terminated successfully.
         Current function value: 0.382617
         Iterations 6
ACC=0.873947
AUC=0.500000

 CV set result

ACC=0.867368
AUC=0.500000

 test set result

ACC=0.867368
AUC=0.500000


XGBoost

In [235]:
import xgboost as xgb

In [236]:
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)
xg_train.save_binary('train.buffer')
xg_test.save_binary('train.buffer')
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
param['silent'] = 1 # cleans up the output
param['num_class'] = 3 # number of classes in target label
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 30
bst = xgb.train(param, xg_train, num_round, watchlist)

[0]	train-merror:0.124737	test-merror:0.128421
[1]	train-merror:0.125789	test-merror:0.130526
[2]	train-merror:0.125789	test-merror:0.132632
[3]	train-merror:0.125526	test-merror:0.130526
[4]	train-merror:0.125	test-merror:0.130526
[5]	train-merror:0.124737	test-merror:0.130526
[6]	train-merror:0.123947	test-merror:0.130526
[7]	train-merror:0.123684	test-merror:0.130526
[8]	train-merror:0.123947	test-merror:0.130526
[9]	train-merror:0.123947	test-merror:0.130526
[10]	train-merror:0.123947	test-merror:0.130526
[11]	train-merror:0.122105	test-merror:0.130526
[12]	train-merror:0.122105	test-merror:0.130526
[13]	train-merror:0.121579	test-merror:0.130526
[14]	train-merror:0.121053	test-merror:0.130526
[15]	train-merror:0.120789	test-merror:0.130526
[16]	train-merror:0.120263	test-merror:0.130526
[17]	train-merror:0.120263	test-merror:0.128421
[18]	train-merror:0.119474	test-merror:0.130526
[19]	train-merror:0.119211	test-merror:0.126316
[20]	train-merror:0.118421	test-merror:0.126316
[21]	

In [237]:
# get prediction
y_pred1 = bst.predict(xg_train)
y_pred2 = bst.predict(xg_test)
print('Train accuracy score:',accuracy_score(y_train,y_pred1))
print('Test accuracy score:',accuracy_score(y_test,bst.predict(xg_test)))

Train accuracy score: 0.885
Test accuracy score: 0.88
