In [31]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # shows multiple outputs

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier

# Get Processed Data

In [3]:
# list the current working directory (exploratory: shows which data folders exist)
!ls 

LICENSE                   [34mdata[m[m                      nces_analysis.ipynb
README.md                 [34mdata_docs[m[m                 nces_features.ipynb
Untitled.ipynb            data_features-Copy1.ipynb [34mprocessed[m[m


In [4]:
# list the files inside the `processed` directory to pick the CSV to load
!ls processed

hsls_16_student_engineered.csv hsls_16_student_processed.csv


In [5]:
# relative path to the engineered student-level CSV (resolved against the notebook's cwd)
data_path = 'processed/hsls_16_student_engineered.csv'

# read the whole file into the working DataFrame used by the rest of the notebook
df = pd.read_csv(filepath_or_buffer=data_path)

# 9th Grade Math -> AP Math

In [8]:
# Goal: test whether 9th-grade math self-assessment and current math class
# can predict whether a student later takes AP Math

In [9]:
# Variables describing students' self-assessment of math ability
# and their current math class.
# They sit in one contiguous run of columns, so locate the first and last
# column by name and take everything in between (both ends included).

ind_self_beg, ind_self_end = (
    df.columns.get_loc(col_name) for col_name in ('S1MPERSON1', 'S1MASSEXCL')
)

# +1 because a positional slice excludes its stop index
math_self_list = df.columns[slice(ind_self_beg, ind_self_end + 1)].to_list()

In [10]:
# Build the modelling frame: the math self-assessment predictors plus the
# AP Math outcome column.

feature_cols = math_self_list + ['S3APMATH_ynm']
df_09math_AP = df[feature_cols]

# (rows, columns) of the selected frame
df_09math_AP.shape

(23503, 40)

In [11]:
# distribution of the outcome codes before any filtering
df_09math_AP.loc[:, 'S3APMATH_ynm'].value_counts()

 0    13748
-1     6279
 1     3476
Name: S3APMATH_ynm, dtype: int64

In [13]:
# Drop rows where the AP Math outcome is unknown: only non-negative codes
# are kept, so the negative code is removed.

ap_math_known = df_09math_AP['S3APMATH_ynm'] >= 0
df_09math_AP = df_09math_AP[ap_math_known]

In [59]:
# for col in math_self_list:
#     df_09math_AP[col].value_counts()

In [17]:
# Get rid of unit non-response (code -8).
#
# A row is kept only if none of the self-assessment columns holds -8.
# The mask is built in one vectorised pass instead of multiplying boolean
# Series column by column: `*` is arithmetic rather than logical AND, and the
# loop left `no_unit_nr` undefined (or stale) when the column list was empty.

no_unit_nr = ~df_09math_AP[math_self_list].isin([-8]).any(axis=1)

df_09math_AP = df_09math_AP[no_unit_nr]
df_09math_AP.shape

(15911, 40)

In [18]:
# Get rid of missing values (code -9).
# TODO: impute these later instead of dropping the rows.
#
# A row is kept only if none of the self-assessment columns holds -9.
# The mask is built in one vectorised pass instead of multiplying boolean
# Series column by column: `*` is arithmetic rather than logical AND, and the
# loop left `no_missing` undefined (or stale) when the column list was empty.

no_missing = ~df_09math_AP[math_self_list].isin([-9]).any(axis=1)

df_09math_AP = df_09math_AP[no_missing]
df_09math_AP.shape

(15006, 40)

In [19]:
# values of -7 are legitimate skips. set to 0

def n7to0(x):
    """Recode the skip value -7 to 0; return every other value unchanged.

    Parameters
    ----------
    x : scalar
        A single cell value (compared with ``== -7``).

    Returns
    -------
    scalar
        ``0`` when ``x`` equals -7, otherwise ``x`` itself.
    """
    return 0 if x == -7 else x

In [20]:
# Recode the legitimate-skip value -7 to 0 in every column.
# A vectorised `replace` gives the same result as applying an element-wise
# function, without one Python call per cell and without relying on
# `DataFrame.applymap`, which newer pandas releases deprecate.
df_09math_AP = df_09math_AP.replace(-7, 0)

## Logistic Regression

In [21]:
# Separate the frame into the predictor matrix and the outcome vector.

# predictors: the math self-assessment / math-class columns
X = df_09math_AP.loc[:, math_self_list]
# outcome: AP Math indicator
y = df_09math_AP.loc[:, 'S3APMATH_ynm']

In [22]:
# Split into train/test sets (library-default proportions, fixed seed so the
# split is reproducible).
# NOTE(review): the split is not stratified on y — consider `stratify=y`
# if the outcome classes are imbalanced.

split_parts = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [23]:
# Define the baseline logistic regression and fit it on the training split.
# C is the inverse regularisation strength; max_iter caps solver iterations.

logit_model = LogisticRegression(
    C=1,
    max_iter=200,
    random_state=0,
)
logit_model.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
# Score the baseline model on the test split.
# Column 1 of predict_proba is the probability of the second class, which is
# what roc_auc_score needs.
# Note: despite its name, `loss` holds a ROC AUC (higher is better).

probs = logit_model.predict_proba(X_test)[:, 1]
loss = roc_auc_score(y_test, probs)
loss

0.8104897282269548

In [48]:
%%capture
# tune parameters

d = {}

Cs = [0.01,0.1,1,10,100]
m_is = [50,100,200]

for c in Cs:
    d[str(c)]={}
    for mi in m_is:       
        lr_model = LogisticRegression(random_state=0, max_iter=mi, C=c)
        lr_model.fit(X_train, y_train)
        probs = lr_model.predict_proba(X_test)
        probs = probs[:,1]
        loss = roc_auc_score(y_test, probs)
        d[str(c)][str(mi)]=loss


In [54]:
# Look at AUC scores for the different parameter combinations.
# Outer dict keys (C values) become columns; inner keys (max_iter) become rows.

lr_auc_df = pd.DataFrame(d)

# green colour map: darker cells mark higher values within each column
cm = sns.light_palette("green", as_cmap=True)

lr_auc_df.style.background_gradient(cmap=cm)

Unnamed: 0,0.01,0.1,1,10,100
50,0.803144,0.808654,0.81049,0.810581,0.810537
100,0.803144,0.808654,0.81049,0.810581,0.810537
200,0.803144,0.808654,0.81049,0.810581,0.810537


## Tree Ensemble

In [57]:
%%capture
# fit model/tune parameters

d2 = {}

LR = [0.05,0.08,0.1,0.15,0.2]
NE = [20,30,50,70,100,200]

for lr in LR:
    d2[str(lr)]={}
    for ne in NE:       
        gbc_model = GradientBoostingClassifier(learning_rate=lr, 
                                       n_estimators=ne, max_depth=4,
                                      random_state=0)
        gbc_model.fit(X_train, y_train)
        probs = gbc_model.predict_proba(X_test)
        probs = probs[:,1]
        loss = roc_auc_score(y_test, probs)
        d2[str(lr)][str(ne)]=loss

In [58]:
# Look at AUC scores for the different parameter combinations.
# Outer dict keys (learning rates) become columns; inner keys (n_estimators)
# become rows.

gbc_auc_df = pd.DataFrame(d2)

# green colour map: darker cells mark higher values within each column
cm = sns.light_palette("green", as_cmap=True)

gbc_auc_df.style.background_gradient(cmap=cm)

Unnamed: 0,0.05,0.08,0.1,0.15,0.2
20,0.800201,0.801481,0.80307,0.807152,0.807516
30,0.800823,0.805649,0.80775,0.808573,0.807671
50,0.805239,0.807779,0.809339,0.808231,0.805765
70,0.807397,0.808444,0.808945,0.806872,0.804532
100,0.808391,0.808838,0.809104,0.806585,0.802624
200,0.807651,0.807124,0.805374,0.800495,0.796868
