# Random Forest Model

### About Random Forest

Random forests, or random decision forests, are an ensemble learning method for classification, regression and other tasks. They operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or the mean prediction (regression) of the individual trees. Random decision forests correct for decision trees' habit of overfitting to their training set.

More information can be found in the [scikit-learn `RandomForestClassifier` documentation](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html).

### A Random Forest Model for Predicting College Readiness

In [100]:
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from scipy import stats
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [3]:
# Load the cleaned ELSI public school table from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only open pickles you trust.
school_pickle_path = '../data/processed/0215_all_features_cleaned.pkl'

with open(school_pickle_path, mode='rb') as school_pickle:
    public_school_df = pickle.load(school_pickle)

# Quick sanity check: dimensions, then the first few rows.
print(public_school_df.shape)
public_school_df.head()

(1368, 51)


Unnamed: 0,district,college_readiness,school_name_x,city_x,state_x,graduation_rate,school_id,state_name,agency_id,county_name,...,hawaiian_nat_pacific_isl_students,two_or_more_races_students,male_students,female_students,pct_eligible_for_free_lunch,pct_eligible_for_reduced_lunch,pct_white,pct_black,pct_hispanic,total_students_calc
0,Bay,40.9,A. Crawford Mosley High School,Lynn Haven,FL,82.0,120009000067,Florida,1200090,BAY COUNTY,...,2,64,844,843,0.346177,0.085359,0.808536,0.08773,0.046829,1687
1,Harford County Public Schools,28.6,Aberdeen High,Aberdeen,MD,87.0,240039000679,Maryland,2400390,HARFORD COUNTY,...,6,130,779,665,0.354571,0.086565,0.427978,0.348338,0.081025,1444
2,Abington Heights SD,35.1,Abington Heights High School,Clarks Summit,PA,93.0,420201005091,Pennsylvania,4202010,LACKAWANNA COUNTY,...,0,7,542,492,0.102515,0.020309,0.907157,0.018375,0.020309,1034
3,Abington SD,33.0,Abington High School,Abington,PA,93.0,420204003241,Pennsylvania,4202040,MONTGOMERY COUNTY,...,3,32,920,823,0.154332,0.023523,0.64716,0.232358,0.055077,1743
4,San Jose Unified School District,47.1,Abraham Lincoln High,San Jose,CA,94.0,"=""063459005696""",California,"=""0634590""",SANTA CLARA COUNTY,...,7,32,899,952,0.438142,0.097245,0.154511,0.041059,0.71745,1851


In [8]:
# Build the lower-cased "<county name> (<state abbreviation>)" key that the
# county table is joined on later. The vectorised .str accessor replaces the
# per-row lambda and propagates missing values instead of raising on them.
public_school_df['state_county_name'] = (
    public_school_df['county_name'] + " (" + public_school_df['state_x'] + ")"
).str.lower()

In [14]:
# Share of Asian / Asian-Pacific-Islander students in each school's total enrolment.
asian_students = public_school_df['asian_or_asian_pacif_isl_students']
public_school_df['pct_asian'] = asian_students.div(public_school_df['total_students_calc'])

In [15]:
# Impute missing pct_asian values with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which raises a warning in pandas 2.x
# and silently leaves the frame unchanged under Copy-on-Write.
pctasn = public_school_df['pct_asian'].mean()
public_school_df['pct_asian'] = public_school_df['pct_asian'].fillna(pctasn)

In [23]:
# Collapse the detailed locale labels into four coarse classes.
# Conditions are evaluated in the same priority order as the original
# if/elif chain (Suburb, then City, then Town); anything else, including a
# missing value, falls through to "Rural". A single vectorised np.select
# replaces the row-by-row iterrows loop and its per-cell .loc writes.
locale_labels = public_school_df['urban_centric_locale']
public_school_df['urban_centric_locale'] = np.select(
    [
        locale_labels.str.contains('Suburb', regex=False, na=False),
        locale_labels.str.contains('City', regex=False, na=False),
        locale_labels.str.contains('Town', regex=False, na=False),
    ],
    ['Suburb', 'City', 'Town'],
    default='Rural',
)

In [24]:
# Bin school size into quartiles and store the labels as plain objects.
# pd.qcut returns an ordered categorical, and get_dummies(drop_first=True)
# drops the first *category* ("small"), which would remove the
# total_students_calc_small column that the model feature list relies on.
# With object labels the alphabetically first level ("large") is dropped instead.
# NOTE: this overwrites the numeric column, so run the cell once per session.
public_school_df['total_students_calc'] = pd.qcut(
    public_school_df['total_students_calc'],
    4,
    labels=["small", "small_medium", "medium_large", "large"],
).astype(object)

In [42]:
# Binarise the target: 1 when the college-readiness score is above the
# threshold, 0 otherwise. The classifier and the precision/recall/f1 scoring
# further down require this discrete 0/1 target.
# NOTE(review): 36.1 splits the sample almost evenly (685 vs 683), so it is
# presumably the median score — confirm.
COLLEGE_READINESS_THRESHOLD = 36.1

readiness = public_school_df['college_readiness']
# Guard so that re-running the cell does not re-threshold an already 0/1
# column, which would turn every value into 0.
if not readiness.isin([0, 1]).all():
    public_school_df['college_readiness'] = (readiness > COLLEGE_READINESS_THRESHOLD).astype(int)

public_school_df['college_readiness'].value_counts()

0    685
1    683
Name: college_readiness, dtype: int64

In [5]:
# Import cleaned county data
# NOTE: pickle.load can execute arbitrary code; only open pickles you trust.

with open('../data/processed/0219_county_data_cleaned.pkl', 'rb') as picklefile:
    county_df = pickle.load(picklefile)
# Sanity check on the loaded frame (was `count_df`, an undefined name).
print(county_df.shape)
county_df.head()

(3195, 6)


Unnamed: 0,Year,county_id,state_county_name,pct_all_ages_in_poverty,pct_under_18_in_poverty,median_hh_income
0,2015.0,0.0,united states,14.7,20.7,55775.0
1,2015.0,1000.0,alabama,18.5,26.5,44833.0
2,2015.0,1001.0,autauga county (al),12.7,18.8,56580.0
3,2015.0,1003.0,baldwin county (al),12.9,19.6,52387.0
4,2015.0,1005.0,barbour county (al),32.0,45.2,31433.0


### Join public school and county data

In [44]:
# Left join: keep every school row and attach the county-level columns
# through the shared state_county_name key.
join_df = public_school_df.merge(county_df, how='left', on='state_county_name')

In [67]:
# Summarise the merged frame: column dtypes and non-null counts.
join_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1368 entries, 0 to 1367
Data columns (total 58 columns):
district                                 1368 non-null object
college_readiness                        1368 non-null int64
school_name_x                            1368 non-null object
city_x                                   1368 non-null object
state_x                                  1368 non-null object
graduation_rate                          1365 non-null float64
school_id                                1368 non-null object
state_name                               1368 non-null object
agency_id                                1368 non-null object
county_name                              1368 non-null object
fips_state_code                          1368 non-null object
charter_school                           1308 non-null object
magnet_school                            1265 non-null object
shared_time_school                       1368 non-null object
urban_centric_locale     

In [46]:
# Categorical columns to one-hot encode before modelling.
dummies = ['urban_centric_locale', 'total_students_calc']

In [47]:
# One-hot encode the categorical columns; drop_first removes one level per
# column to avoid perfectly collinear indicators.
# dtype is pinned to uint8 because pandas >= 2.0 emits bool indicators by
# default, and a frame mixing bool and float columns converts to an object
# array that statsmodels' OLS refuses to fit.
join_wdumm_df = pd.get_dummies(join_df, columns=dummies, drop_first=True, dtype=np.uint8)

In [43]:
# join_wdumm_df.info()

In [None]:
# Run first model with college readiness as target

In [96]:
# Columns used for modelling. The target must stay first: later cells treat
# every remaining column as a predictor.
model_cols = [
    # target
    'college_readiness',
    # school- and county-level numeric predictors
    'pupil_teacher_ratio',
    'pct_eligible_for_free_lunch',
    'pct_eligible_for_reduced_lunch',
    'median_hh_income',
    'pct_white',
    'pct_black',
    'pct_hispanic',
    'pct_asian',
    # one-hot locale indicators
    'urban_centric_locale_Rural',
    'urban_centric_locale_Suburb',
    'urban_centric_locale_Town',
    # one-hot school-size indicators
    'total_students_calc_medium_large',
    'total_students_calc_small',
    'total_students_calc_small_medium',
]

In [97]:
# Keep only the modelling columns and discard any row with a missing value.
model_df = join_wdumm_df.loc[:, model_cols].dropna(axis=0, how='any')
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1308 entries, 0 to 1367
Data columns (total 15 columns):
college_readiness                   1308 non-null int64
pupil_teacher_ratio                 1308 non-null float64
pct_eligible_for_free_lunch         1308 non-null float64
pct_eligible_for_reduced_lunch      1308 non-null float64
median_hh_income                    1308 non-null float64
pct_white                           1308 non-null float64
pct_black                           1308 non-null float64
pct_hispanic                        1308 non-null float64
pct_asian                           1308 non-null float64
urban_centric_locale_Rural          1308 non-null uint8
urban_centric_locale_Suburb         1308 non-null uint8
urban_centric_locale_Town           1308 non-null uint8
total_students_calc_medium_large    1308 non-null uint8
total_students_calc_small           1308 non-null uint8
total_students_calc_small_medium    1308 non-null uint8
dtypes: float64(8), int64(1), uint8(6

In [98]:
# Fit regression model
# Predictors are every modelling column except the target.
# DataFrame.ix was removed in pandas 1.0; dropping the target by name also
# keeps this correct if the column order ever changes.
X = model_df.drop(columns=['college_readiness'])
y = model_df['college_readiness']

In [101]:
# Hold out 30% of the rows; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4444
)

# NOTE: sm.OLS does not add an intercept on its own and none is added here,
# so the reported R-squared is the uncentered one.
ols_model = sm.OLS(y_train, X_train)
results = ols_model.fit()
# Inspect the results
results.summary()

0,1,2,3
Dep. Variable:,college_readiness,R-squared:,0.623
Model:,OLS,Adj. R-squared:,0.617
Method:,Least Squares,F-statistic:,106.4
Date:,"Tue, 21 Feb 2017",Prob (F-statistic):,9.139999999999999e-180
Time:,14:44:31,Log-Likelihood:,-546.19
No. Observations:,915,AIC:,1120.0
Df Residuals:,901,BIC:,1188.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
pupil_teacher_ratio,0.0035,0.002,1.720,0.086,-0.000 0.008
pct_eligible_for_free_lunch,-0.5307,0.100,-5.309,0.000,-0.727 -0.334
pct_eligible_for_reduced_lunch,-0.7715,0.291,-2.655,0.008,-1.342 -0.201
median_hh_income,5.226e-06,1e-06,5.228,0.000,3.26e-06 7.19e-06
pct_white,0.2362,0.094,2.504,0.012,0.051 0.421
pct_black,0.2681,0.136,1.970,0.049,0.001 0.535
pct_hispanic,0.3879,0.124,3.117,0.002,0.144 0.632
pct_asian,0.9198,0.155,5.949,0.000,0.616 1.223
urban_centric_locale_Rural,-0.2320,0.052,-4.443,0.000,-0.334 -0.130

0,1,2,3
Omnibus:,597.872,Durbin-Watson:,1.945
Prob(Omnibus):,0.0,Jarque-Bera (JB):,53.722
Skew:,0.014,Prob(JB):,2.16e-12
Kurtosis:,1.813,Cond. No.,1320000.0


### Run Random Forest Model

In [51]:
# Random forest classifier with library-default hyperparameters; only the
# seed is fixed so that repeated runs give the same forest.
rf = RandomForestClassifier(random_state=4444)

In [52]:
# Predictors are every modelling column except the target.
# DataFrame.ix was removed in pandas 1.0, so the target is dropped by name.
X_raw = model_df.drop(columns=['college_readiness'])
# NOTE(review): scale() standardises using statistics from all rows, i.e.
# before the train/test split. Tree splits do not depend on feature scale,
# so this step is presumably unnecessary for the forest — confirm.
X = preprocessing.scale(X_raw)

y = model_df['college_readiness']

# STEP 1: split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [60]:
# Train on the training split, predict the held-out rows and report the
# share of correct predictions.
y_pred = rf.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred))

0.702290076336


In [61]:
# 10-fold cross-validation on the training split: print the mean fold score
# for each metric.
for metric in ('accuracy', 'precision', 'recall', 'f1'):
    fold_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring=metric)
    print(metric, fold_scores.mean())

accuracy 0.685577055794
precision 0.716600713022
recall 0.623381642512
f1 0.664288541834


### Visualizations

In [74]:
# Columns exported for the visualisations: identifiers, the target, and the
# un-encoded predictors.
viz_cols = [
    # identifiers
    'school_name_x',
    'state_x',
    # target
    'college_readiness',
    # numeric predictors
    'pupil_teacher_ratio',
    'pct_eligible_for_free_lunch',
    'median_hh_income',
    'pct_white',
    'pct_black',
    'pct_hispanic',
    'pct_asian',
    # categorical predictors (not one-hot encoded)
    'urban_centric_locale',
    'total_students_calc',
]

In [75]:
# Visualisation frame: the selected columns of the merged data, complete rows only.
viz_df = join_df.loc[:, viz_cols].dropna(axis=0, how='any')
viz_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1308 entries, 0 to 1367
Data columns (total 12 columns):
school_name_x                  1308 non-null object
state_x                        1308 non-null object
college_readiness              1308 non-null int64
pupil_teacher_ratio            1308 non-null float64
pct_eligible_for_free_lunch    1308 non-null float64
median_hh_income               1308 non-null float64
pct_white                      1308 non-null float64
pct_black                      1308 non-null float64
pct_hispanic                   1308 non-null float64
pct_asian                      1308 non-null float64
urban_centric_locale           1308 non-null object
total_students_calc            1308 non-null object
dtypes: float64(7), int64(1), object(4)
memory usage: 132.8+ KB


In [76]:
# Keep only the rows whose state abbreviation is 'MN'.
is_mn = viz_df['state_x'].eq('MN')
mn_viz_df = viz_df.loc[is_mn]

In [79]:
# Check row count, dtypes and non-null counts of the 'MN' subset.
mn_viz_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 59 to 1366
Data columns (total 12 columns):
school_name_x                  35 non-null object
state_x                        35 non-null object
college_readiness              35 non-null int64
pupil_teacher_ratio            35 non-null float64
pct_eligible_for_free_lunch    35 non-null float64
median_hh_income               35 non-null float64
pct_white                      35 non-null float64
pct_black                      35 non-null float64
pct_hispanic                   35 non-null float64
pct_asian                      35 non-null float64
urban_centric_locale           35 non-null object
total_students_calc            35 non-null object
dtypes: float64(7), int64(1), object(4)
memory usage: 3.6+ KB


In [85]:
# Write the four plotted columns of the 'MN' subset to JSON, one record per school.
mn_export_cols = ['median_hh_income', 'college_readiness', 'urban_centric_locale', 'total_students_calc']
mn_viz_df.loc[:, mn_export_cols].to_json('mn_viz.json', orient='records')

In [93]:
# Print every column name as a quoted, comma-terminated line (handy for
# pasting into a Python list).
# The original iterated over `df`, a name no cell in this notebook defines.
# NOTE(review): join_df is used because its leading columns match the
# recorded output — confirm it is the intended frame.
for col in join_df.columns:
    print(f'"{col}",')

"district",
"college_readiness",
"school_name_x",
"city_x",
"state_x",
"graduation_rate",
"school_id",
"state_name",
"agency_id",
"county_name",
"fips_state_code",
"charter_school",
"magnet_school",
"shared_time_school",
"urban_centric_locale",
"start_of_year_status",
"agency_type_dis",
"school_wide_title_I",
"title_I_eligible_school",
"longitude",
"latitude",
"state_school_id",
"congressional_code",
"national_school_lunch_program",
"total_students_all_grades_excl_ae",
"total_students_all_grades_incl_ae",
"free_lunch_eligible",
"reduced_price_lunch_eligible_students",
"pupil_teacher_ratio",
"fte_teachers",
"school_id_x",
"agency_id_x",
"county_name_x",
"ansi_fips_state_code",
"address",
"zip",
"american_indian_students",
"asian_or_asian_pacif_isl_students",
"hispanic_students",
"black_students",
"white_students",
"hawaiian_nat_pacific_isl_students",
"two_or_more_races_students",
"male_students",
"female_students",
"pct_eligible_for_free_lunch",
"pct_eligible_for_reduced_lunch",
"pct_wh

In [95]:
# Show columns 1 through 10 of the merged frame.
# The original referenced `df`, a name no cell in this notebook defines.
# NOTE(review): join_df is used because its leading columns match the
# recorded output — confirm it is the intended frame.
join_df.columns[1:11]

Index(['college_readiness', 'school_name_x', 'city_x', 'state_x',
       'graduation_rate', 'school_id', 'state_name', 'agency_id',
       'county_name', 'fips_state_code'],
      dtype='object')