In [None]:
#set up the notebook environment: import the libraries used below
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [None]:
#load the college data; the first CSV column is used as the row index
csv_path = 'college.csv'
df = pd.read_csv(csv_path, index_col=0)
#summary statistics for every column, numeric and non-numeric alike
df.describe(include='all')

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
count,777,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0
unique,2,,,,,,,,,,,,,,,,,
top,Yes,,,,,,,,,,,,,,,,,
freq,565,,,,,,,,,,,,,,,,,
mean,,3001.638353,2018.804376,779.972973,27.558559,55.796654,3699.907336,855.298584,10440.669241,4357.526384,549.380952,1340.642214,72.660232,79.702703,14.089704,22.743887,9660.171171,65.46332
std,,3870.201484,2451.113971,929.17619,17.640364,19.804778,4850.420531,1522.431887,4023.016484,1096.696416,165.10536,677.071454,16.328155,14.722359,3.958349,12.391801,5221.76844,17.17771
min,,81.0,72.0,35.0,1.0,9.0,139.0,1.0,2340.0,1780.0,96.0,250.0,8.0,24.0,2.5,0.0,3186.0,10.0
25%,,776.0,604.0,242.0,15.0,41.0,992.0,95.0,7320.0,3597.0,470.0,850.0,62.0,71.0,11.5,13.0,6751.0,53.0
50%,,1558.0,1110.0,434.0,23.0,54.0,1707.0,353.0,9990.0,4200.0,500.0,1200.0,75.0,82.0,13.6,21.0,8377.0,65.0
75%,,3624.0,2424.0,902.0,35.0,69.0,4005.0,967.0,12925.0,5050.0,600.0,1700.0,85.0,92.0,16.5,31.0,10830.0,78.0


In [None]:
#a graduation rate cannot be higher than 100, so rows with Grad.Rate above 100
#are dropped (the rows are removed, the values are not capped at 100)
has_valid_grad_rate = df['Grad.Rate'] <= 100
df = df[has_valid_grad_rate]

In [None]:
#predictors: every column except the target
X = df.drop(columns=['Private'])
#target: the 'Private' column
y = df['Private']

In [None]:
#hold out 30% of the rows for validation; the fixed seed makes the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
#baseline forest: 50 trees with a fixed seed; class_weight='balanced' weights the
#classes inversely to their frequency in the training labels
rf = RandomForestClassifier(
    n_estimators=50,
    class_weight='balanced',
    random_state=0,
)
rf.fit(X_train, y_train)

In [None]:
#validation accuracy of the baseline forest (printed first)
rf_val_accuracy = rf.score(X_val, y_val)
print(rf_val_accuracy)
#validation ROC AUC (printed second), computed from column 1 of predict_proba,
#i.e. the probability of rf.classes_[1] (presumably 'Yes' - confirm)
rf_val_proba = rf.predict_proba(X_val)[:, 1]
print(roc_auc_score(y_val, rf_val_proba))

0.927038626609442
0.9773109243697479


In [None]:
#search grid for the second forest.
#n_estimators: number of trees in the forest.
#max_features: number of features considered at each split. X has 17 predictors
#(every column except 'Private'), so every value from 1 to 17 is tried: at 17 each
#split sees all features and the forest is plain bagging of trees, while smaller
#values add random feature sub-sampling on top of the bootstrap.
#range(1, 18) stops at 17; the previous list [1, 18] only tried two values and 18
#is more than the number of features.
#2 values of n_estimators x 17 values of max_features = 34 combinations.
param = {
    'n_estimators': [50, 100],
    'max_features': list(range(1, 18)),
}

In [None]:
#second random forest: same seed and class weighting as the first one, but
#n_estimators and max_features are chosen by grid search instead of being fixed.
#GridSearchCV cross-validates one forest for every combination in `param`
#(default cv and default scoring, i.e. the classifier's accuracy) and keeps
#the combination with the best mean score.
#n_jobs=-1 runs the fits in parallel on all available cores.
rf2 = RandomForestClassifier(class_weight='balanced', random_state=0)
grid = GridSearchCV(
    estimator=rf2,
    param_grid=param,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [None]:
#check the best parameters
#they are chosen by mean cross-validated score over the grid (not by out-of-bag samples);
#in the recorded run: 50 trees, 1 feature considered at each split.
grid.best_params_

{'max_features': 1, 'n_estimators': 50}

In [None]:
#validation accuracy of the tuned forest (printed first)
grid_val_accuracy = grid.score(X_val, y_val)
print(grid_val_accuracy)
#validation ROC AUC (printed second), from column 1 of predict_proba
grid_val_proba = grid.predict_proba(X_val)[:, 1]
print(roc_auc_score(y_val, grid_val_proba))

0.9098712446351931
0.9731092436974791


In [None]:
#keep the best forest found by the grid search; with GridSearchCV's default
#refit=True this is the winning configuration refit on all of X_train
final_model=grid.best_estimator_

In [None]:
#one row per predictor with the importance reported by the final forest,
#ordered from most to least important
importance_table = pd.DataFrame(
    {'importance': final_model.feature_importances_},
    index=X_train.columns,
)
feature_importances = importance_table.sort_values(by='importance', ascending=False)
display(feature_importances)

Unnamed: 0,importance
Enroll,0.116129
Outstate,0.109878
F.Undergrad,0.105072
Accept,0.079118
P.Undergrad,0.077881
Room.Board,0.061203
S.F.Ratio,0.059768
Expend,0.055433
Grad.Rate,0.050795
perc.alumni,0.048885
