In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os
import sys
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, auc, f1_score

import warnings
warnings.filterwarnings('ignore')

sys.path.insert(0, '../src/visualization/')
import visualize as vis

In [2]:
# Load the processed dataset, then discard the 'Unnamed: 0' column
# (presumably a row index written by an earlier export — TODO confirm).
df = pd.read_csv(
    '../data/processed/CriticalPath_Data_EM_Confidential_lessNoise.csv'
)
df = df.drop(columns=['Unnamed: 0'])

### Which columns should be dropped? (candidates: columns with fewer than 1,000 non-null values)

In [26]:
# Flag sparsely populated columns: every column with fewer than 1000 non-null
# values is printed together with its non-null count and collected in
# `drop_cols`.
#
# The list is created once, before the loop, so it accumulates every flagged
# column; re-creating it on each iteration would leave at most the last
# column in it once the loop finishes.
drop_cols = []
for col, n_non_null in df.count().items():  # df.count(): non-null values per column
    if n_non_null < 1000:
        print(col, n_non_null)
        drop_cols.append(col)

IntsROTC 757
IntsWVCR 565
TEC 7
Religious_denomination 981
Recruited_athlete 502
Street2_perm_res 59
Admitted_off_waitlist 73
Other_federal_grants 347
ISTFR 730
ROTC_based_inst_aid 4
Tuition_waivers_and_exchanges 69
Outside_aid 464
Work_study 764
MERIT_APPEAL_STATUS 57


## Split into training and test data, and fit a regression model.

In [None]:
# Feature matrix: remove the target ('Enrolled') together with
# 'Admission_status' and 'Unique_student_ID', keep only float/bool/int
# columns, and encode missing values with the sentinel -999.
X = (
    df.drop(columns=['Enrolled', 'Admission_status', 'Unique_student_ID'])
      .select_dtypes([float, bool, int])
      .fillna(-999)
)

# Target vector.
# NOTE(review): missing targets are also replaced by -999, so a regressor
# would be trained to predict -999 for those rows — confirm that 'Enrolled'
# has no nulls, or drop such rows instead.
Y = df['Enrolled'].fillna(-999)

# A fixed random_state makes the shuffle (and therefore every score computed
# from this split) reproducible across kernel restarts; 42 matches the `seed`
# used in the grid search further down.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)

In [None]:
# Baseline: an XGBoost regressor with default hyperparameters, trained on the
# training split and scored with R^2 on the held-out split.
model = xgb.XGBRegressor().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = model.predict(X_test)

print("R2 Score: ", r2_score(y_true=y_test, y_pred=y_pred))

# The three cells below are mostly taken from [Alex Furrier](https://github.com/safurrier/hackathon/blob/master/notebooks/03-model.ipynb)

## Brute-force scan over all parameters — here are the tricks
* `max_depth` is usually 6, 7, or 8
* the learning rate is usually around 0.05, but small changes may make a big difference
* tuning `min_child_weight`, `subsample`, and `colsample_bytree` is where much of the fun of fighting overfitting is
* `n_estimators` is the number of boosting rounds
* finally, ensembling xgboost models trained with multiple seeds may reduce variance

In [None]:
# Estimator whose hyperparameters are searched.
xgb_model = xgb.XGBRegressor()

# Search space: every combination of the listed values is evaluated.
# A single-element list pins that setting to one fixed value.
parameters = {
    'nthread': [4],                        # with hyperthreading, xgboost may become slower
    'learning_rate': [.001, 0.05, .01],    # the so-called `eta` value
    'max_depth': [2, 5, 10, 20],
    'min_child_weight': [11],
    'silent': [1],
    'subsample': [.2, .5, 0.8],
    'colsample_bytree': [.2, .5, 0.8],
    'n_estimators': [5, 50, 500],          # number of trees; change it to 1000 for better results
    'missing': [-999],
    'seed': [42],
}

# Exhaustive search with 3-fold cross-validation; refit=True retrains the
# best combination once the search is done.
# NOTE(review): n_jobs=-1 here combined with nthread=4 per model may
# oversubscribe the CPU — confirm on the target machine.
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters,
    n_jobs=-1,
    cv=3,
    verbose=2,
    refit=True,
)

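## Try to find best parameters
This cell takes ***FOREVER*** to run: the grid above has 3 × 4 × 3 × 3 × 3 = 324 parameter combinations, each fitted on 3 cross-validation folds (972 fits, plus the final refit).  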

In [None]:
%%time
# Run the grid search on the training split; %%time (which must stay on the
# first line of the cell) reports how long the whole search took.
clf.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score
# (the entry of cv_results_['params'] at best_index_).
clf.best_params_