In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RepeatedStratifiedKFold, StratifiedKFold

from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np

In [2]:
# Reading data and formatting
df_state = pd.read_csv("../kenyaState_modelAll.csv")
df_city = pd.read_csv("../kenyaCity_modelAll.csv")


state_cols = df_state[["clights_KenyaF182013", "cken_pd_2013", "slum"]]
city_cols = df_city[["clights_KenyaF182013", "cken_pd_2013", "slum"]]

CTlight_dummy = pd.get_dummies(df_city["clights_KenyaF182013"], 
                              drop_first = True, 
                              prefix = "light")
CTpop_dummy = pd.get_dummies(df_city["cken_pd_2013"], 
                              drop_first = True, 
                              prefix = "pop")

STlight_dummy = pd.get_dummies(df_state["clights_KenyaF182013"], 
                              drop_first = True, 
                              prefix = "light")
STpop_dummy = pd.get_dummies(df_state["cken_pd_2013"], 
                              drop_first = True, 
                              prefix = "pop")



city_features = np.array(pd.concat([CTlight_dummy, CTpop_dummy], axis = 1))
city_labels = np.array(city_cols["slum"])

state_features = np.array(pd.concat([STlight_dummy, STpop_dummy], axis = 1))
state_labels = np.array(state_cols["slum"])



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
trainST_features, testST_features, trainST_labels, testST_labels = train_test_split(state_features, 
                                                                                    state_labels, 
                                                                                    test_size = 0.25, 
                                                                                    random_state = 42) 

trainCT_features, testCT_features, trainCT_labels, testCT_labels = train_test_split(city_features, 
                                                                                    city_labels, 
                                                                                    test_size = 0.25, 
                                                                                    random_state = 42) 

In [None]:
_, slum_counts = np.unique(state_labels, return_counts = True)
print(slum_counts)

w_state = {0:1, 1:slum_counts[0]/slum_counts[1]}

#lg_state = LogisticRegression(random_state=13, class_weight=w_state)
lg_state = LogisticRegression(random_state=13, class_weight= "balanced")

lg_state.fit(trainST_features, trainST_labels)
# test
y_pred = lg_state.predict(testST_features)

print(f'Accuracy Score: {accuracy_score(testST_labels,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(testST_labels, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(testST_labels, y_pred)}')
print(f'Recall score: {recall_score(testST_labels,y_pred)}')


[0 1] [370935  23020]
Accuracy Score: 0.6632009666054077
Confusion Matrix: 
[[60535 32217]
 [  954  4783]]
Area Under Curve: 0.7431826944908393
Recall score: 0.8337109987798501


In [5]:
_, slum_counts = np.unique(city_labels, return_counts = True)
print( slum_counts)

w_city = {0:1, 1:100}
#w_city = {0:1, 1:slum_counts[0]/slum_counts[1]}

#lg_city = LogisticRegression(random_state=13, class_weight=w_city)
lg_city = LogisticRegression(random_state=13, class_weight= "balanced")

lg_city.fit(trainCT_features, trainCT_labels)
# test
y_pred = lg_city.predict(testCT_features)
print(f"coeffs:{lg_city.coef_}")

print(f'Accuracy Score: {accuracy_score(testCT_labels,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(testCT_labels, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(testCT_labels, y_pred)}')
print(f'Recall score: {recall_score(testCT_labels,y_pred)}')

[30026 22991]
coeffs:[[ 0.33900397  0.27711585  0.84845934  0.71032945  0.57252531  0.77013278
  -0.01596121]]
Accuracy Score: 0.5888344021124104
Confusion Matrix: 
[[4651 2901]
 [2549 3154]]
Area Under Curve: 0.5844528029590432
Recall score: 0.5530422584604594
