In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
# see https://ipython.readthedocs.io/en/stable/interactive/magics.html
%pylab inline
# sets backend to render higher res images
%config InlineBackend.figure_formats = ['retina']
import seaborn as sns
sns.set_style("whitegrid")

Populating the interactive namespace from numpy and matplotlib


# Load X_df and Y_df pickled in Feature Engineering

In [2]:
# Read the two interim pickles in one pass: features first, outcomes second.
# NOTE: unpickling can execute arbitrary code, so only load files this project wrote itself.
X_df, Y_df = map(
    pd.read_pickle,
    ('./data/interim/X_df.pkl', './data/interim/Y_df.pkl'),
)

# X and Y contain only the columns used in modeling

In [3]:
# Columns of X_df that are left out of the modeling frame X.
excluded_columns = [
    'G1_Fall_RIT',
    'G1_Winter_RIT',
    'G1_Spring_RIT',
    'Last_G1_RIT_Season',
    'Treatments',
]
X = X_df.drop(columns=excluded_columns)

In [4]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,StudentID,Last_G1_RIT,nTreatments,Female,HomeLanIsEng,PrimaryLanIsEng,LivingWithBothParents,RacialEthnicGroup__American Indian,RacialEthnicGroup__Asian,RacialEthnicGroup__Black,...,MostAttendedSchool__292,MostAttendedSchool__930,MostAttendedSchool__935,MostAttendedSchool__939,MostAttendedSchool__945,MostAttendedSchool__949,MostAttendedSchool__955,MostAttendedSchool__972,MostAttendedSchool__974,MostAttendedSchool__977
0,3967736,178,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,3381735,173,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3567417,196,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3130417,211,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3670417,192,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Keep an independent copy of the three outcome columns from Y_df.
outcome_columns = ['Score', 'LevelCode', 'MetStandard']
Y = Y_df[outcome_columns].copy()

# Cleaning up X and Y
Including:
* Replacing Last_G1_RIT None values with NaN
* Dropping rows that contain missing values (NaN)

In [6]:
# Put features and outcomes side by side. Y's index is reset so its rows are
# matched to X by position; this assumes X still carries its default 0..n-1
# index (as the X.head() preview suggests) -- TODO confirm.
df = pd.concat([X, Y.reset_index(drop=True)], axis=1)
df = df.set_index('StudentID')

# Assign the replaced column back instead of calling
# df['Last_G1_RIT'].replace(..., inplace=True): an in-place method on a column
# selection is chained assignment, which warns on recent pandas and no longer
# updates df once Copy-on-Write is enabled.
df['Last_G1_RIT'] = df['Last_G1_RIT'].replace({None: np.nan})

# Drop every row that still has a missing value in any column.
df = df.dropna()

print('The next dataframe should have no rows if successful:')
df[df.isnull().any(axis=1)]

The next dataframe should have no rows if successful:


Unnamed: 0_level_0,Last_G1_RIT,nTreatments,Female,HomeLanIsEng,PrimaryLanIsEng,LivingWithBothParents,RacialEthnicGroup__American Indian,RacialEthnicGroup__Asian,RacialEthnicGroup__Black,RacialEthnicGroup__Hispanic,...,MostAttendedSchool__939,MostAttendedSchool__945,MostAttendedSchool__949,MostAttendedSchool__955,MostAttendedSchool__972,MostAttendedSchool__974,MostAttendedSchool__977,Score,LevelCode,MetStandard
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Extracting Cleaned X and Y

In [7]:
# Remove the outcome columns by name rather than by position (df.iloc[:, :-3]),
# so the feature matrix stays correct even if the number or order of outcome
# columns appended to df changes.
# NOTE: X and Y are rebound here; after this cell X no longer has a
# 'StudentID' column, so the cleaning cell above cannot be re-run on its own
# without first re-running the cells that build X and Y.
target_columns = ['Score', 'LevelCode', 'MetStandard']
X = df.drop(columns=target_columns)
# Double brackets keep Y as a one-column DataFrame; later cells read
# y_train.MetStandard / y_cv.MetStandard.
Y = df[['MetStandard']]

# Train-Test split data:
I'll first split off a held-out test set, then split the remaining data again to create a validation (CV) set.

In [8]:
# Both splits share the same settings: 20% held out, fixed seed for repeatability.
split_options = dict(test_size=0.2, random_state=42)

# Stage 1: hold out the test set from the full data.
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, **split_options)

# Stage 2: split the remaining rows into the fitting set and the CV set.
x_train, x_cv, y_train, y_cv = train_test_split(X_tr, Y_tr, **split_options)

# Fit Logistic Regression Model

In [9]:
# C is the inverse regularisation strength in scikit-learn, so C=500 applies
# only a weak penalty.
logR = LogisticRegression(C=500, solver='lbfgs', verbose=True, max_iter=1000)

# Pass the labels as a 1-D array. Series.to_numpy() replaces Series.ravel(),
# which is deprecated in recent pandas.
logR.fit(x_train, y_train['MetStandard'].to_numpy())
y_cv_pred = logR.predict(x_cv)

# f1_score expects (y_true, y_pred); the arguments were previously swapped.
# The binary F1 value is the same either way, but the documented order keeps
# this call consistent with classification_report(y_cv, y_cv_pred).
f1_score(y_cv['MetStandard'].to_numpy(), y_cv_pred, pos_label='Y')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished


0.8764044943820225

In [10]:
# Per-class precision / recall / F1 on the CV split for the current y_cv_pred.
print(
    classification_report(y_true=y_cv, y_pred=y_cv_pred)
)

              precision    recall  f1-score   support

           N       0.70      0.71      0.71       167
           Y       0.88      0.87      0.88       402

    accuracy                           0.83       569
   macro avg       0.79      0.79      0.79       569
weighted avg       0.83      0.83      0.83       569



In [11]:
# One row per feature column of X, paired with the matching entry of the
# first row of logR.coef_.
coef = (
    pd.DataFrame({'feature': X.columns})
    .assign(coefficient=logR.coef_[0])
)
coef.head(n=13)

Unnamed: 0,feature,coefficient
0,Last_G1_RIT,0.126218
1,nTreatments,-1.061937
2,Female,0.242113
3,HomeLanIsEng,-0.154898
4,PrimaryLanIsEng,0.370983
5,LivingWithBothParents,0.585249
6,RacialEthnicGroup__American Indian,-4.33587
7,RacialEthnicGroup__Asian,-1.936229
8,RacialEthnicGroup__Black,-2.729931
9,RacialEthnicGroup__Hispanic,-2.768585


In [12]:
# NOTE(review): an integer Cs asks scikit-learn for that many candidate C
# values (spread on a log scale between 1e-4 and 1e4 per its documentation),
# each fitted on every one of the cv=5 folds. It does not mean C=500.
# Confirm that a 500-point grid is intended.
logRcv = LogisticRegressionCV(Cs=500, cv=5, max_iter=1000)

# Pass the labels as a 1-D array. Series.to_numpy() replaces Series.ravel(),
# which is deprecated in recent pandas.
logRcv.fit(x_train, y_train['MetStandard'].to_numpy())

# This rebinds y_cv_pred, so the classification report that follows
# describes this model rather than the earlier logR fit.
y_cv_pred = logRcv.predict(x_cv)

# f1_score expects (y_true, y_pred); the arguments were previously swapped.
# The binary F1 value is the same either way.
f1_score(y_cv['MetStandard'].to_numpy(), y_cv_pred, pos_label='Y')

0.8759305210918114

In [13]:
# Per-class precision / recall / F1 on the CV split for the current y_cv_pred.
print(
    classification_report(y_true=y_cv, y_pred=y_cv_pred)
)

              precision    recall  f1-score   support

           N       0.70      0.69      0.70       167
           Y       0.87      0.88      0.88       402

    accuracy                           0.82       569
   macro avg       0.79      0.79      0.79       569
weighted avg       0.82      0.82      0.82       569

