In [17]:
import imblearn
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
pd.set_option('display.max_columns', 500)

In [6]:
df = pd.read_csv('T2.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
predictor_vars = ['YearsAsCoach', 'DaysTSubmitNoMissing',
       'FitNoMissing', 'RelationshipBuildingNoMissing',
       'FinalAchievementNoMissing', 'CommAndPresNoMissing',
       'PredModelScoreNoMissing', 'CMProspectRatingNoMissing',
       'AgeAtFDOSNoMissing', 'SchoolSelectivityNoMissing',
       'CumuGPANoMissing', 'PerseveranceNoMissing', 'FRPLNoMissing',
       'YearsSchoolPartnerNoMissing', 'YearsPartnerPartnerNoMissing',
       'FirstYearsInSchoolNoMissing', 'SecondYearsInSchoolNoMissing',
       'CMsInSchoolNoMissing', 'CertProgramCostNoMissing',
       'AvgFirstMonthSalNoMissing', 'RentPropNoMissing',
       'AvgRentNoMissing', 'CorpsSizeNoMissing', 'RegPrefLevNoMissing',
       'CSI6NoMissing', 'CSI5NoMissing', 'CSI3NoMissing',
       'CSI12NoMissing', 'CLI8NoMissing', 'CLI6NoMissing',
       'OtherCMsSameCoachNoMissing', 'OtherCMsPrevCoachNoMissing',
       'K12TeachNo', 'K12TeachYes', 'K12TeachNull', 'HadFamRespNo',
       'HadFamRespYes', 'HadFamRespNull', 'PellGrantNoOrMissing',
       'PellGrantYes', 'GenderFemale', 'GenderMale', 'CalcGradStudent',
       'CalcJunior', 'CalcProfessional', 'CalcUndergrad', 'CalcUnknown',
       'AttendLIC_HS_No', 'AttendLIC_HS_Yes', 'AttendLIC_HS_Unknown',
       'GrewUpLIC_No_Or_Unknown', 'GrewUpLIC_Yes',
       'LIC_Served_No_Or_Unknown', 'LIC_Served_Yes', 'Eth_Black',
       'Eth_Native', 'Eth_AsianAmPacIsl', 'Eth_Latinx',
       'Eth_MultiEthMultiRacial', 'Eth_OtherOrUnknown', 'Eth_White',
       'VarsitySport_No_Or_Unknown', 'VarsitySport_Yes',
       'DeferralRequested', 'DeferralNotRequested', 'Title1No',
       'Title1Yes', 'Grade_ECE', 'Grade_HIGH', 'Grade_LOWELEM',
       'Grade_MIDDLE', 'Grade_UPPERELEM', 'Grade_Unknown',
       'School_Charter', 'School_Public', 'School_Other', 'LGB_NoProtect',
       'LGB_LocalProtect', 'LGB_StateProtect', 'TeacherPP_No',
       'TeacherPP_Yes', 'Region_Large', 'Region_Medium', 'Region_Small',
       'Urbanicity_Both', 'Urbanicity_Rural', 'Urbanicity_Urban',
       'COL_High', 'COL_Low', 'COL_Medium', 'MajorTeaching_Match',
       'MajorTeaching_Mismatch', 'MajorOffered_Match',
       'MajorOffered_Mismatch']

In [11]:
# training with no scaling or oversampling
ind_vars = df[predictor_vars]
label = df['SwellOneExit']

train_features, test_features, train_labels, test_labels = train_test_split(ind_vars, label, test_size = 0.2, stratify=label)
ratio_neg_pos = (len(test_labels) - test_labels.sum()) / test_labels.sum()

model = xgb.XGBClassifier(scale_pos_weight = ratio_neg_pos)
model.fit(train_features, train_labels)
y_pred = model.predict(test_features)
acc_score = accuracy_score(test_labels, y_pred)
rec_score = recall_score(test_labels, y_pred)
r2 = r2_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print('accuracy', acc_score, 'recall', rec_score, 'precision', precision)
print(confusion_matrix(test_labels, y_pred))

accuracy 0.7523374646662317 recall 0.40782122905027934 precision 0.06600361663652803
[[3387 1033]
 [ 106   73]]


In [16]:
# training with scaling but still no oversampling
ind_vars = df[predictor_vars]
label = df['SwellOneExit']

ind_vars = StandardScaler().fit_transform(ind_vars)
train_features, test_features, train_labels, test_labels = train_test_split(ind_vars, label, test_size = 0.2, stratify=label)
ratio_neg_pos = (len(test_labels) - test_labels.sum()) / test_labels.sum()

model = xgb.XGBClassifier(scale_pos_weight = ratio_neg_pos)
model.fit(train_features, train_labels)
y_pred = model.predict(test_features)
acc_score = accuracy_score(test_labels, y_pred)
rec_score = recall_score(test_labels, y_pred)
r2 = r2_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print('accuracy', acc_score, 'recall', rec_score, 'precision', precision)
print(confusion_matrix(test_labels, y_pred))

accuracy 0.7475538160469667 recall 0.3854748603351955 precision 0.06160714285714286
[[3369 1051]
 [ 110   69]]


In [27]:
# with SMOTE oversampling
ind_vars = df[predictor_vars]
label = df['SwellOneExit']

ind_vars = StandardScaler().fit_transform(ind_vars)
train_features, test_features, train_labels, test_labels = train_test_split(ind_vars, label, test_size = 0.2, stratify=label)

print('before oversampling label counts:', '\n', train_labels.value_counts())
oversample = SMOTE()
train_features, train_labels = oversample.fit_resample(train_features, train_labels)
print('after label counts', '\n', pd.Series(train_labels).value_counts())

model = xgb.XGBClassifier()
model.fit(train_features, train_labels)
y_pred = model.predict(test_features)
acc_score = accuracy_score(test_labels, y_pred)
rec_score = recall_score(test_labels, y_pred)
r2 = r2_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print('accuracy', acc_score, 'recall', rec_score, 'precision', precision)
print(confusion_matrix(test_labels, y_pred))

before oversampling label counts: 
 0    17676
1      717
Name: SwellOneExit, dtype: int64
after label counts 
 1    17676
0    17676
dtype: int64
accuracy 0.9606436181778647 recall 0.0 precision 0.0
[[4418    2]
 [ 179    0]]


In [29]:
ind_vars = df[predictor_vars]
label = df['SwellOneExit']

ind_vars = StandardScaler().fit_transform(ind_vars)
train_features, test_features, train_labels, test_labels = train_test_split(ind_vars, label, test_size = 0.2, stratify=label)

print('before oversampling label counts:', '\n', train_labels.value_counts())
oversample = SMOTE()
train_features, train_labels = oversample.fit_resample(train_features, train_labels)
print('after label counts', '\n', pd.Series(train_labels).value_counts())

model = xgb.XGBClassifier(scale_pos_weight = 10)
model.fit(train_features, train_labels)
y_pred = model.predict(test_features)
acc_score = accuracy_score(test_labels, y_pred)
rec_score = recall_score(test_labels, y_pred)
r2 = r2_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
print('accuracy', acc_score, 'recall', rec_score, 'precision', precision)
print(confusion_matrix(test_labels, y_pred))

before oversampling label counts: 
 0    17676
1      717
Name: SwellOneExit, dtype: int64
after label counts 
 1    17676
0    17676
dtype: int64
accuracy 0.7692976734072624 recall 0.39664804469273746 precision 0.0693359375
[[3467  953]
 [ 108   71]]
