In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor

# Seaborn appearance: white-grid background and the "poster" plotting context
sns.set_style(style="whitegrid")
sns.set_context(context="poster")

In [None]:
from IPython.core.display import HTML
# Read the two custom stylesheets and inject them into the notebook as one
# <style> element. The context managers close each file handle as soon as it
# has been read instead of leaving that to the garbage collector.
with open('/Users/Scott/Desktop/Data/style-table.css') as css_file:
    style_table = css_file.read()
with open('/Users/Scott/Desktop/Data/style-notebook.css') as css_file:
    style_notebook = css_file.read()
css = style_table + style_notebook
HTML('<style>{}</style>'.format(css))

# Best predictors of GPA success from a private school admissions profile using Random Forest


## Cleaning the Admissions Data
##### The first step was to format the admissions files in Excel so that students who took either the ISEE or the SSAT entrance exam could be compared by national percentile score. In addition, admissions recommendation ratings shifted from a 4-point scale to a 5-point scale over the years, so the ratings needed to be standardized.

In [None]:
# Combines edited admissions files into a single dataframe with both ISEE and SSAT percentiles, retaining that info

# One workbook per school year, named '<yy>_<yy+1>.xlsx' with zero-padded
# two-digit years: '04_05.xlsx' through '17_18.xlsx'.
admissions_paths = ['/Users/Scott/Desktop/Data/Admissions/{:02d}_{:02d}.xlsx'.format(start, start + 1)
                    for start in range(4, 18)]

# Read every workbook, dropping the columns and rows that are entirely empty.
files = [pd.read_excel(path).dropna(axis=1, how="all").dropna(axis=0, how="all")
         for path in admissions_paths]

# df_allstudents is a dataframe of all admissions data.
# A single concat avoids re-copying the accumulated frame once per file;
# sort=True orders the union of columns alphabetically.
df_allstudents = pd.concat(files, sort=True)

In [None]:
# Keep only the columns used downstream: cum. GPA, grades, testing,
# recommendation scores, sex, year of entry and exit, grade applied to,
# financial aid, city, sibling / legacy / faculty-parent flags, and last name.
# An explicit copy avoids SettingWithCopyWarning when the frame is edited later.
kept_columns = [
    'Cum', 'Eng_Rec_Rating', 'English_1', 'Ethnicity', 'FA_Request',
    'FL_1', 'Grade_apply', 'History_1', 'Inquiry_source',
    'Interview', 'Math', 'Math_1', 'Math_rec_rating',
    'Princ_Rec_Rating', 'Quantitative', 'Reading',
    'School_admprevious', 'Science_1', 'Sex', 'Test', 'Verbal',
    'Writing', 'YOE', 'YOX', 'inq_FAM Family 1 [c]::P_city',
    'isFacultyStudent', 'isLegacy', 'isSibling',
    'NameLast',
]

allstudents_cleaner = df_allstudents.loc[:, kept_columns].copy()

In [None]:
# if no data, fill with 0
fillna_list = ['FA_Request','isFacultyStudent','isLegacy','isSibling','Inquiry_source']

for col in fillna_list:
    allstudents_cleaner[col] = allstudents_cleaner[col].fillna(value=0)

# replaces infrequent values with more general values, as (old, new) pairs
# NOTE(review): every pair is applied to every column of the frame, so e.g.
# (44.00, 4.00) also rewrites any test score that happens to equal 44 --
# presumably it was meant for a single rating column; confirm and scope it.
changes = [('3+',2.78), ('A/A-','A'),('A+/A','A+'),('A-/B-','B+'),('C+/B+','B'),('C+/A /A-','A-'),
           ('Multi-ethnic or Other (please describe)','Multi-Ethnic or Other'),
           ('Multi-Ethnic or Other/AsAm','Multi-Ethnic or Other'),
           ('Multi-Ethnic or Other/Asian, cauc','Multi-Ethnic or Other'),
           ('Multi-EthKoreameri','Multi-Ethnic or Other'),
           ('Multi-Eth As/Cauc','Multi-Ethnic or Other'),
           ('Middle Eastern Americasian amer','Multi-Ethnic or Other'),
           ('?','sf'),(44.00,4.00),('4-',3.06),('A-/sf','A-'),('Pass','sf')]
for old_value, new_value in changes:
    allstudents_cleaner = allstudents_cleaner.replace(old_value, new_value)

# keep only the students that have a year of exit
allstudents_cleaner = allstudents_cleaner.dropna(subset=['YOX'])


def _fill_with_most_common(column):
    """Fill the gaps in one column with that column's most frequent value.

    A column with no non-null values has no "most frequent value"; it is
    returned unchanged instead of raising IndexError.
    """
    counts = column.value_counts()
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


# fills any remaining empty values with the most common value of their column
allstudents_cleaner = allstudents_cleaner.apply(_fill_with_most_common)



## Cleaning the GPA Data


In [None]:
# ClassYear ties a graduating class's GPA spreadsheet path to its year-of-exit label
class ClassYear:
    """GPA file path and year-of-exit (YOX) label for one graduating class."""

    def __init__(self, year):
        """Build the file path and YOX label from a graduation year (e.g. 2008)."""
        short_year = str(year)[2:]                      # '08' for 2008
        path_parts = ['/Users/Scott/Desktop/Data/Academics/Class of ',
                      "'", short_year, ' GPA.xls']
        self.filename = ''.join(path_parts)
        # YOX spans the final school year, e.g. '2007-2008' for the class of 2008
        self.YOX = "-".join((str(year - 1), str(year)))

    def get_filename(self):
        """Return the path of this class's GPA spreadsheet."""
        return self.filename

    def get_YOX(self):
        """Return the year-of-exit label, formatted '<year-1>-<year>'."""
        return self.YOX

In [None]:
# Reads the GPA spreadsheet of every graduating class and stacks them into one
# dataframe tagged with year of exit (YOX), ready for merging

class_years = [ClassYear(year) for year in range(2008, 2018)]

# Label each sheet with its own YOX before stacking, rather than concatenating
# first and filling whichever YOX cells are still empty afterwards.
class_frames = []
for class_year in class_years:
    class_gpa = pd.read_excel(class_year.get_filename())
    yox_label = class_year.get_YOX()
    if 'YOX' in class_gpa.columns:
        # keep any YOX value the sheet already carries; only fill the gaps
        class_gpa['YOX'] = class_gpa['YOX'].fillna(value=yox_label)
    else:
        class_gpa['YOX'] = yox_label
    class_frames.append(class_gpa)

# df_allacademics is a dataframe of all academic (GPA) data, including a year
# of exit (graduation) column. One concat avoids re-copying the accumulated
# frame for every class; sort=True orders the union of columns alphabetically.
df_allacademics = pd.concat(class_frames, sort=True)

In [None]:
# creates df of only name, year of exit and GPA, sorted by name.
# '3-8' is renamed to Grad_GPA *before* selecting columns: on a second run of
# this cell the rename is a no-op and the selection still succeeds, whereas
# selecting '3-8' from the already-renamed frame would raise KeyError.
df_allacademics = (
    df_allacademics
    .rename(columns={'3-8': 'Grad_GPA'})
    .loc[:, ['NameLast', 'YOX', 'Grad_GPA']]
    .dropna(axis=0, how="all")
    .sort_values(['NameLast'])
)


## Merge Admissions and GPA data to prepare for RandomForest

In [None]:
# Inner-join admissions profiles to graduation GPAs on last name + year of
# exit, then keep only the rows that have a graduating GPA.
# NOTE(review): NameLast + YOX is presumably not unique (siblings, common
# surnames); duplicate keys would multiply rows in the join -- confirm.
admissions_with_GPA = (
    allstudents_cleaner
    .merge(df_allacademics, on=['NameLast', 'YOX'])
    .dropna(subset=['Grad_GPA'])
)

In [None]:
# fun finding: students who come in 6th graduate with higher GPAs than those who come in 9th

# Grad_GPA is selected before averaging: mean() over the whole frame would also
# try to average the text columns, which raises TypeError on pandas >= 2.0.
entered_in_sixth = admissions_with_GPA.Grade_apply == 6
sixes_or_nines = admissions_with_GPA.groupby(entered_in_sixth)['Grad_GPA'].mean().to_frame()
# The False group is every row whose Grade_apply is not 6; it is labelled '9s'.
sixes_or_nines = sixes_or_nines.rename({False: '9s', True: '6s'})
sixes_or_nines.columns = ['Cumulative GPA']
sixes_or_nines

In [None]:
# create dummy variables

# columns to one-hot encode (categorical profile fields and course grades)
courses = ['Ethnicity','Inquiry_source','Sex','YOE','YOX','inq_FAM Family 1 [c]::P_city',
        'English_1','FL_1','History_1','Math_1','Science_1','Test', 'School_admprevious']

# the name is an identifier and 'Cum' is dropped so only Grad_GPA remains as the target
data_w_dummies = admissions_with_GPA.drop(['NameLast','Cum'], axis=1)

# One call encodes every listed column: the untouched columns stay in front and
# the indicator columns follow in the order of `courses`, each prefixed with
# its source column name.
data_w_dummies = pd.get_dummies(data_w_dummies, columns=courses)

In [None]:
# set up training and testing data

# A fixed seed makes the random 75/25 splits -- and everything fitted on
# them -- repeatable across kernel restarts.
SPLIT_SEED = 42
TRAIN_FRACTION = .75
split_rng = np.random.RandomState(SPLIT_SEED)

# move Grad_GPA to end of the df for easy feature selection slicing
data_w_dummies = data_w_dummies.reset_index(drop=True)
cols = [col for col in data_w_dummies.columns if col != 'Grad_GPA'] + ['Grad_GPA']
df = data_w_dummies[cols].copy()

# cohort subsets are taken before 'is_train' is added, so each cohort gets
# its own independent split below
sixes = df[df.Grade_apply==6].copy()
nines = df[df.Grade_apply==9].copy()

# a row goes to training when its uniform draw is <= TRAIN_FRACTION
for frame in (df, sixes, nines):
    frame['is_train'] = split_rng.uniform(0, 1, len(frame)) <= TRAIN_FRACTION

train, test = df[df['is_train']], df[~df['is_train']]

train_sixes, test_sixes = sixes[sixes['is_train']], sixes[~sixes['is_train']]
train_nines, test_nines = nines[nines['is_train']], nines[~nines['is_train']]

# features are everything but the last two columns (Grad_GPA and is_train)
features = df.columns[:-2]
features_sixes = sixes.columns[:-2]
features_nines = nines.columns[:-2]

In [None]:
# random forests for all, sixes, and nines
# random_state pins each forest's internal sampling so a refit on the same
# training data gives the same model.

clf = RandomForestRegressor(random_state=42)
sixes_rf = RandomForestRegressor(random_state=42)
nines_rf = RandomForestRegressor(random_state=42)

X = train[features]
y = train['Grad_GPA']
clf.fit(X,y)

X_sixes = train_sixes[features_sixes]
y_sixes = train_sixes['Grad_GPA']
sixes_rf.fit(X_sixes,y_sixes)

X_nines = train_nines[features_nines]
y_nines = train_nines['Grad_GPA']
nines_rf.fit(X_nines,y_nines)

# NOTE(review): crosstab counts exact (actual, predicted) value pairs; with
# continuous GPAs the tables are presumably almost all 0/1 cells -- consider an
# error metric or a scatter plot instead.
preds = clf.predict(test[features])
results = pd.crosstab(test['Grad_GPA'], preds, rownames=['actual'], colnames=['preds'])

preds_sixes = sixes_rf.predict(test_sixes[features_sixes])
results_sixes = pd.crosstab(test_sixes['Grad_GPA'], preds_sixes, rownames=['actual'], colnames=['preds'])

preds_nines = nines_rf.predict(test_nines[features_nines])
results_nines = pd.crosstab(test_nines['Grad_GPA'], preds_nines, rownames=['actual'], colnames=['preds'])

In [None]:
def Grid_Search_CV_RFR(X_train, y_train, random_state=None):
    """Grid-search random-forest regressor hyper-parameters with 5-fold CV.

    Parameters
    ----------
    X_train : feature matrix to fit on
    y_train : target values aligned with X_train
    random_state : optional seed passed to the forest so repeated searches on
        the same data are comparable (default None keeps it unseeded)

    Returns
    -------
    (best_score_, best_params_) of the fitted GridSearchCV: the best mean
    cross-validated score and the parameter combination that achieved it.
    """
    estimator = RandomForestRegressor(random_state=random_state)
    param_grid = {
            "n_estimators"      : [10,20,30],
            # 1.0 means "consider every feature", which is what "auto" meant
            # for regressors; the "auto" string was removed from scikit-learn.
            "max_features"      : [1.0, "sqrt", "log2"],
            "min_samples_split" : [2,4,8],
            "bootstrap": [True, False],
            }

    grid = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=5)

    grid.fit(X_train, y_train)

    return grid.best_score_ , grid.best_params_


In [None]:
# Hyper-parameter search on the all-students training split
Grid_Search_CV_RFR(X_train=X, y_train=y)

In [None]:
# Hyper-parameter search on the 6th-grade-entry training split
Grid_Search_CV_RFR(X_train=X_sixes, y_train=y_sixes)

In [None]:
# Hyper-parameter search on the 9th-grade-entry training split
Grid_Search_CV_RFR(X_train=X_nines, y_train=y_nines)

In [None]:
# regression for whole dataset

from sklearn.ensemble import RandomForestRegressor

test_x = test[features]
train_x = train[features]
X = train_x
y = train['Grad_GPA']

# Hyper-parameters go straight to the constructor (presumably taken from an
# earlier grid-search run -- confirm); random_state makes the fit repeatable.
estimator = RandomForestRegressor(n_jobs=-1, bootstrap=False, max_features='sqrt',
                                  min_samples_split=8, n_estimators=30, random_state=42)
estimator.fit(X, y)

GPA_reshape = test['Grad_GPA'].values

# score() on the held-out rows gives the R^2 of the predictions
print("R2 score:", estimator.score(test_x, GPA_reshape))

# One row per feature, largest importance first. The values are the forest's
# feature importances, not correlations; the column name is kept as-is.
df_all = pd.DataFrame(estimator.feature_importances_, test_x.columns.values)
df_all.columns = ['correlations']
df_all = df_all.sort_values('correlations', ascending=False)

In [None]:
# Round the importances to three decimals and show the ten largest
df_all = df_all.round(decimals=3)
df_all.head(n=10)

In [None]:
# Bar chart of the eight largest feature importances with a zero baseline;
# title and axis label let the figure stand on its own.
ax = df_all.iloc[:8].plot(kind='bar', title='Top 8 feature importances: all students')
ax.axhline(0, color='k')
ax.set_ylabel('Feature importance');

In [None]:
# regression for sixes

from sklearn.ensemble import RandomForestRegressor

test_x_sixes = test_sixes[features_sixes]
train_x_sixes = train_sixes[features_sixes]
train_y_sixes = train_sixes['Grad_GPA']

# Fit on dedicated names instead of rebinding the shared X / y, so cells that
# use X and y for the all-students split still see that data if re-run later.
# Hyper-parameters are presumably from an earlier grid-search run -- confirm;
# random_state makes the fit repeatable.
estimator = RandomForestRegressor(n_jobs=-1, bootstrap=True, max_features='log2',
                                  min_samples_split=8, n_estimators=20, random_state=42)
estimator.fit(train_x_sixes, train_y_sixes)

cum_reshape = test_sixes['Grad_GPA'].values

# score() on the held-out rows gives the R^2 of the predictions
print("R2 score:", estimator.score(test_x_sixes, cum_reshape))

# One row per feature, largest importance first. The values are the forest's
# feature importances, not correlations; the column name is kept as-is.
df_6 = pd.DataFrame(estimator.feature_importances_, test_x_sixes.columns.values)
df_6.columns = ['correlations']
df_6 = df_6.sort_values('correlations', ascending=False)
df_6.head(10)

In [None]:
# Bar chart of the eight largest feature importances with a zero baseline;
# title and axis label let the figure stand on its own.
ax = df_6.iloc[:8].plot(kind='bar', title='Top 8 feature importances: 6th-grade entrants')
ax.axhline(0, color='k')
ax.set_ylabel('Feature importance');

In [None]:
# regression for nines

from sklearn.ensemble import RandomForestRegressor

test_x_nines = test_nines[features_nines]
train_x_nines = train_nines[features_nines]
train_y_nines = train_nines['Grad_GPA']

# Fit on dedicated names instead of rebinding the shared X / y, so cells that
# use X and y for the all-students split still see that data if re-run later.
# Hyper-parameters are presumably from an earlier grid-search run -- confirm;
# random_state makes the fit repeatable.
estimator = RandomForestRegressor(n_jobs=-1, bootstrap=False, max_features='sqrt',
                                  min_samples_split=8, n_estimators=20, random_state=42)
estimator.fit(train_x_nines, train_y_nines)

cum = test_nines['Grad_GPA'].values

# score() on the held-out rows gives the R^2 of the predictions
print("R2 score:", estimator.score(test_x_nines, cum))

# One row per feature, largest importance first. The values are the forest's
# feature importances, not correlations; the column name is kept as-is.
df_9 = pd.DataFrame(estimator.feature_importances_, test_x_nines.columns.values)
df_9.columns = ['correlations']
df_9 = df_9.sort_values('correlations', ascending=False)
df_9.head(10)

In [None]:
# Bar chart of the eight largest feature importances with a zero baseline;
# title and axis label let the figure stand on its own.
ax = df_9.iloc[:8].plot(kind='bar', title='Top 8 feature importances: 9th-grade entrants')
ax.axhline(0, color='k')
ax.set_ylabel('Feature importance');

## Takeaways
#### While the $R^2$ scores show that an admissions profile is quite a poor predictor of academic success, it is still interesting to see that, for students entering in 9th grade, Reading standardized test scores are much more predictive of success than, say, math scores.
#### Another interesting finding, shown below, is that although students who enter in 9th grade have slightly higher admissions test scores, students who enter in 6th grade graduate with higher GPAs, suggesting that the strong middle school education gives those students an advantage in high school.

In [None]:
# Per-student average of the four entrance-test columns, for all students
df_tests = ['Reading', 'Quantitative', 'Verbal', 'Math']
tests = df[df_tests].copy()
tests['avg'] = tests.mean(axis=1)

In [None]:
# Per-student average of the four entrance-test columns, 6th-grade entrants,
# followed by summary statistics of each column
df_tests_sixes = ['Reading', 'Quantitative', 'Verbal', 'Math']

tests_sixes = sixes[df_tests_sixes].copy()
tests_sixes['avg6'] = tests_sixes.mean(axis=1)
tests_sixes.describe()

In [None]:
# Per-student average of the four entrance-test columns, 9th-grade entrants
df_tests_nines = ['Reading', 'Quantitative', 'Verbal', 'Math']

tests_nines = nines[df_tests_nines].copy()
tests_nines['avg9'] = tests_nines.mean(axis=1)
# (a bare describe() here was never displayed -- only a cell's last expression
# is rendered -- so it has been dropped)

# Side-by-side columns of the two cohorts' averages; each row belongs to one
# cohort only, so the other column is NaN there.
tests = pd.concat([tests_sixes['avg6'], tests_nines['avg9']], axis=1)
tests = tests.rename(index=str, columns={'avg6':'6th entrance tests','avg9':'9th entrance tests'})

ax = tests.boxplot()
ax.set_ylabel('Mean of Reading, Quantitative, Verbal, Math')

# keep only the quartiles and the maximum of the summary
desc = tests.describe().drop(['count', 'mean', 'std', 'min'])
desc



In [None]:
# Graduation GPAs of the two entry cohorts as side-by-side columns
gpa_by_entry_grade = {'6s GPAs': sixes['Grad_GPA'], '9s GPAs': nines['Grad_GPA']}
sixes_v_nines = pd.DataFrame(gpa_by_entry_grade)
sixes_v_nines.boxplot()

# keep only the quartile rows of the summary, rounded to two decimals
summary_rows_to_drop = ['count', 'mean', 'std', 'min', 'max']
desc = sixes_v_nines.describe().drop(index=summary_rows_to_drop)
desc.round(2)