In [21]:
import imblearn
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
pd.set_option('display.max_columns', 500)

In [2]:
df = pd.read_csv('T2.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
predictor_vars = ['YearsAsCoach', 'DaysTSubmitNoMissing',
       'FitNoMissing', 'RelationshipBuildingNoMissing',
       'FinalAchievementNoMissing', 'CommAndPresNoMissing',
       'PredModelScoreNoMissing', 'CMProspectRatingNoMissing',
       'AgeAtFDOSNoMissing', 'SchoolSelectivityNoMissing',
       'CumuGPANoMissing', 'PerseveranceNoMissing', 'FRPLNoMissing',
       'YearsSchoolPartnerNoMissing', 'YearsPartnerPartnerNoMissing',
       'FirstYearsInSchoolNoMissing', 'SecondYearsInSchoolNoMissing',
       'CMsInSchoolNoMissing', 'CertProgramCostNoMissing',
       'AvgFirstMonthSalNoMissing', 'RentPropNoMissing',
       'AvgRentNoMissing', 'CorpsSizeNoMissing', 'RegPrefLevNoMissing',
       'CSI6NoMissing', 'CSI5NoMissing', 'CSI3NoMissing',
       'CSI12NoMissing', 'CLI8NoMissing', 'CLI6NoMissing',
       'OtherCMsSameCoachNoMissing', 'OtherCMsPrevCoachNoMissing',
       'K12TeachNo', 'K12TeachYes', 'K12TeachNull', 'HadFamRespNo',
       'HadFamRespYes', 'HadFamRespNull', 'PellGrantNoOrMissing',
       'PellGrantYes', 'GenderFemale', 'GenderMale', 'CalcGradStudent',
       'CalcJunior', 'CalcProfessional', 'CalcUndergrad', 'CalcUnknown',
       'AttendLIC_HS_No', 'AttendLIC_HS_Yes', 'AttendLIC_HS_Unknown',
       'GrewUpLIC_No_Or_Unknown', 'GrewUpLIC_Yes',
       'LIC_Served_No_Or_Unknown', 'LIC_Served_Yes', 'Eth_Black',
       'Eth_Native', 'Eth_AsianAmPacIsl', 'Eth_Latinx',
       'Eth_MultiEthMultiRacial', 'Eth_OtherOrUnknown', 'Eth_White',
       'VarsitySport_No_Or_Unknown', 'VarsitySport_Yes',
       'DeferralRequested', 'DeferralNotRequested', 'Title1No',
       'Title1Yes', 'Grade_ECE', 'Grade_HIGH', 'Grade_LOWELEM',
       'Grade_MIDDLE', 'Grade_UPPERELEM', 'Grade_Unknown',
       'School_Charter', 'School_Public', 'School_Other', 'LGB_NoProtect',
       'LGB_LocalProtect', 'LGB_StateProtect', 'TeacherPP_No',
       'TeacherPP_Yes', 'Region_Large', 'Region_Medium', 'Region_Small',
       'Urbanicity_Both', 'Urbanicity_Rural', 'Urbanicity_Urban',
       'COL_High', 'COL_Low', 'COL_Medium', 'MajorTeaching_Match',
       'MajorTeaching_Mismatch', 'MajorOffered_Match',
       'MajorOffered_Mismatch']

In [4]:
# training with no scaling or oversampling
ind_vars = df[predictor_vars]
label = df['SwellOneExit']

train_features, test_features, train_labels, test_labels = train_test_split(ind_vars, label, test_size = 0.2, stratify=label)
ratio_neg_pos = (len(test_labels) - test_labels.sum()) / test_labels.sum()

model = xgb.XGBClassifier(scale_pos_weight = ratio_neg_pos)
model.fit(train_features, train_labels)
y_pred = model.predict(test_features)
acc_score = accuracy_score(test_labels, y_pred)
rec_score = recall_score(test_labels, y_pred)
r2 = r2_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print('accuracy', acc_score, 'recall', rec_score, 'precision', precision)
print(confusion_matrix(test_labels, y_pred))

accuracy 0.7436399217221135 recall 0.3128491620111732 precision 0.050359712230215826
[[3364 1056]
 [ 123   56]]


In [5]:
# training with scaling but still no oversampling
ind_vars = df[predictor_vars]
label = df['SwellOneExit']

ind_vars = StandardScaler().fit_transform(ind_vars)
train_features, test_features, train_labels, test_labels = train_test_split(ind_vars, label, test_size = 0.2, stratify=label)
ratio_neg_pos = (len(test_labels) - test_labels.sum()) / test_labels.sum()

model = xgb.XGBClassifier(scale_pos_weight = ratio_neg_pos)
model.fit(train_features, train_labels)
y_pred = model.predict(test_features)
acc_score = accuracy_score(test_labels, y_pred)
rec_score = recall_score(test_labels, y_pred)
r2 = r2_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print('accuracy', acc_score, 'recall', rec_score, 'precision', precision)
print(confusion_matrix(test_labels, y_pred))

accuracy 0.7673407262448358 recall 0.4134078212290503 precision 0.0712223291626564
[[3455  965]
 [ 105   74]]


In [6]:
# with SMOTE oversampling
ind_vars = df[predictor_vars]
label = df['SwellOneExit']

ind_vars = StandardScaler().fit_transform(ind_vars)
train_features, test_features, train_labels, test_labels = train_test_split(ind_vars, label, test_size = 0.2, stratify=label)

print('before oversampling label counts:', '\n', train_labels.value_counts())
oversample = SMOTE()
train_features, train_labels = oversample.fit_resample(train_features, train_labels)
print('after label counts', '\n', pd.Series(train_labels).value_counts())

model = xgb.XGBClassifier()
model.fit(train_features, train_labels)
y_pred = model.predict(test_features)
acc_score = accuracy_score(test_labels, y_pred)
rec_score = recall_score(test_labels, y_pred)
r2 = r2_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print('accuracy', acc_score, 'recall', rec_score, 'precision', precision)
print(confusion_matrix(test_labels, y_pred))

before oversampling label counts: 
 0    17676
1      717
Name: SwellOneExit, dtype: int64
after label counts 
 1    17676
0    17676
dtype: int64
accuracy 0.9599913024570559 recall 0.0 precision 0.0
[[4415    5]
 [ 179    0]]


In [7]:
ind_vars = df[predictor_vars]
label = df['SwellOneExit']

ind_vars = StandardScaler().fit_transform(ind_vars)
train_features, test_features, train_labels, test_labels = train_test_split(ind_vars, label, test_size = 0.2, stratify=label)

print('before oversampling label counts:', '\n', train_labels.value_counts())
oversample = SMOTE()
train_features, train_labels = oversample.fit_resample(train_features, train_labels)
print('after label counts', '\n', pd.Series(train_labels).value_counts())

model = xgb.XGBClassifier(scale_pos_weight = 10)
model.fit(train_features, train_labels)
y_pred = model.predict(test_features)
acc_score = accuracy_score(test_labels, y_pred)
rec_score = recall_score(test_labels, y_pred)
r2 = r2_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print('accuracy', acc_score, 'recall', rec_score, 'precision', precision)
print(confusion_matrix(test_labels, y_pred))

before oversampling label counts: 
 0    17676
1      717
Name: SwellOneExit, dtype: int64
after label counts 
 1    17676
0    17676
dtype: int64
accuracy 0.7747336377473364 recall 0.3687150837988827 precision 0.06673407482305359
[[3497  923]
 [ 113   66]]


In [43]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=False)

X = df[predictor_vars]
Y = df['SwellOneExit']

recall_list = []
precision_list = []
f1_list = []
confusion_list = []

for train_index, test_index in skf.split(X, Y):
    train_features, test_features = X.iloc[train_index, :], X.iloc[test_index, :]
    train_labels, test_labels = Y[train_index], Y[test_index]
    
    # oversample
    df_ovsmpl = pd.concat([train_features, train_labels], axis=1)
    df_majority = df_ovsmpl[df_ovsmpl.SwellOneExit==0]
    df_minority = df_ovsmpl[df_ovsmpl.SwellOneExit==1]
    # Upsample minority class
    df_minority_upsampled = resample(df_minority, 
                                     replace=True,     # sample with replacement
                                     n_samples=len(df_majority),    # to match majority class
                                     random_state=123) # reproducible results
    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])
    upsmpl_Y = df_upsampled['SwellOneExit']
    upsmpl_X = df_upsampled.drop('SwellOneExit', axis=1)

    # training test split of the oversampled dataframe

#     upsmpl_train_features, upsmpl_test_features, upsmpl_train_labels, upsmpl_test_labels = \
#     train_test_split(upsmpl_X, upsmpl_Y, test_size = 0.2, random_state=123)
    lr_upsampled = LogisticRegression(solver='liblinear')

    # train model
    lr_upsampled_mdl = lr_upsampled.fit(upsmpl_X, upsmpl_Y)
    lr_upsampled_pred = lr_upsampled_mdl.predict(test_features)
    precision_list.append(precision_score(test_labels, lr_upsampled_pred, average='weighted'))
    recall_list.append(recall_score(test_labels, lr_upsampled_pred))
    f1_list.append(f1_score(test_labels, lr_upsampled_pred))
    confusion_list.append(confusion_matrix(test_labels, lr_upsampled_pred))

KeyboardInterrupt: 

In [40]:
recall_list

[0.49444444444444446,
 0.547486033519553,
 0.4860335195530726,
 0.48044692737430167,
 0.5027932960893855]

In [41]:
precision_list

[0.9325452447027485,
 0.9371819420746383,
 0.9339276724140311,
 0.9330324155465521,
 0.9342676880518695]

In [42]:
f1_list

[0.0937335439705108,
 0.10834715312327252,
 0.09897610921501707,
 0.09518539014941892,
 0.09884678747940692]

In [38]:
for i in confusion_list:
    print(i)

[[1724 2696]
 [  55  125]]
[[3040 1379]
 [  87   92]]
[[2570 1849]
 [  87   92]]
[[3252 1167]
 [ 108   71]]
[[2743 1676]
 [  80   99]]
