In [331]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import sys

%matplotlib inline

In [332]:
data = pd.read_csv('MERGED2013_PP.csv')
print "Number of features: {}".format(len(data.columns))
print "Number of rows: {}".format(len(data))

Number of features: 1729
Number of rows: 7804


In [333]:
# potentially interesting features
col_desc = {
    'C150_4_POOLED': 'Completion 4yr pooled',
    'C150_L4_POOLED': 'Completion <4yr pooled',
    'CCSIZSET': 'Carnegie classification-Size & settings',
    'CCUGPROF': 'Carnegie classification-Undergrad profile ',
    'CCBASIC': 'Carnegie classification-basic',
    'CONTROL': 'Control (public/private)',
    'RET_FT4': 'Retention 4yr',
    'RET_FTL4': 'Retention <4yr',
    'ACTCMMID': 'ACT',
    'SAT_AVG': 'SAT',
    'SAT_AVG_ALL': 'SAT all',
    'SATVRMID': 'SAT reading',
    'SATMTMID': 'SAT math',
    'SATWRMID': 'SAT writing',
    'AVGFACSAL': 'Avg faculty salary',
    'PFTFAC': 'Full time faculty rate',
    'ADM_RATE_ALL': 'Admission rate',
    'DISTANCEONLY': 'Distance only',
    'NPT4_PUB': 'Avg net price title IV institut public',
    'NPT4_PRIV': 'Avg net price title IV institut private',
    'NUM4_PUB': 'Num Title IV student, public',
    'NUM4_PRIV': 'Num Title IV student, private',
    'COSTT4_A': 'Avg cost academic year',
    'COSTT4_P': 'Avg cost program year',
    'TUITIONFEE_IN': 'In state tuition',
    'TUITIONFEE_OUT': 'Out of state tuition',
    'TUITIONFEE_PROG': 'Tuition fee program year',
    'TUITFTE': 'Net revenue per FTE student',
    'INEXPFTE': 'Expense per FTE student',
    'PCTPELL': '% Pell Grant receiver',
    'PCTFLOAN': '% Fed student loan',
    'UG25abv': '% undergrad > 25 yr',
    'PFTFTUG1_EF': 'Undergrad 1st-time degree seeking',
    'UGDS': 'Number of Undergrad degree seeking',
    'PAR_ED_PCT_1STGEN': '% 1st gen students',
    'PAR_ED_PCT_MS': '% parent education middle school',
    'PAR_ED_PCT_HS': '% parent education high school',
    'PAR_ED_PCT_PS': '% parent education post secondary',
    'DEP_INC_AVG': 'Avg income dependent stu',
    'IND_INC_AVG': 'Avg income independent stu',
    'DEBT_MDN': 'Median debt',
    'GRAD_DEBT_MDN': 'Median debt complete',
    'WDRAW_DEBT_MDN': 'Median debt non-completer',
}
# print len(col_desc)


In [334]:
data = data[sorted(col_desc.keys())]

In [335]:
# add column that indicates whether it's a less than 4yr college
data['L4_COLLEGE'] = data.C150_4_POOLED.isnull()

# combine completion data for 4 year and <4 year institution
data['C150'] = pd.concat([data.C150_4_POOLED.dropna(), data.C150_L4_POOLED.dropna()]).reindex_like(data)

# combine retention data for 4 year and <4 year institution
data['RET_FT'] = pd.concat([data.RET_FT4.dropna(), data.RET_FTL4.dropna()]).reindex_like(data)

# combine net price title iv for public and private
data['NPT4'] = pd.concat([data.NPT4_PRIV.dropna(), data.NPT4_PUB.dropna()]).reindex_like(data)
data['NUM4'] = pd.concat([data.NUM4_PRIV.dropna(), data.NUM4_PUB.dropna()]).reindex_like(data)


In [336]:
# clean up extra columns after combining
del_columns = ['NPT4_PUB', 'NPT4_PRIV', 'NUM4_PUB', 'NUM4_PRIV', 'C150_4_POOLED', 'C150_L4_POOLED']
for col in del_columns:
    if col in data.keys():
        del data[col]
        del col_desc[col]

col_desc['L4_COLLEGE'] = '<4 years college'
col_desc['C150'] = 'Completion'
col_desc['RET_FT'] = 'Retention'
col_desc['NPT4'] = 'Avg net price Title IV'
col_desc['NUM4'] = 'Num Title IV student'

data = data[~data['C150'].isnull()]
data = data[~data['RET_FT'].isnull()]

# remove data containing 'PrivacySuppressed'
for col in col_desc.keys():
    if data.dtypes[col] == 'object':
        data[col] = data[col].replace(['PrivacySuppressed'], [float('NaN')]).astype(float)


print "Num data after removing missing completion and retention rate: {}".format(len(data))
print "Num features: {}".format(len(data.columns))
print data.shape


Num data after removing missing completion and retention rate: 6007
Num features: 42
(6007, 42)


In [337]:
X = data[['CONTROL', 'DEBT_MDN', 'DEP_INC_AVG', 'GRAD_DEBT_MDN', 'IND_INC_AVG', 'INEXPFTE', 'PAR_ED_PCT_1STGEN', 
          'PAR_ED_PCT_HS', 'PAR_ED_PCT_MS', 'PAR_ED_PCT_PS', 'PCTFLOAN', 'PCTPELL', 'UG25abv', 'UGDS', 
          'WDRAW_DEBT_MDN', 'L4_COLLEGE', 'NPT4', 'NUM4', 
          'PFTFTUG1_EF', 'PFTFAC',
          'SAT_AVG_ALL', 'ACTCMMID', 'ADM_RATE_ALL', 'AVGFACSAL', 'COSTT4_A',
          'C150', 'RET_FT']].dropna()
# y = X[['C150', 'RET_FT']]
y1 = X['C150']
y2 = X['RET_FT']
X = X.drop('C150', 1)
X = X.drop('RET_FT', 1)
print X.shape
print len(y1)
print len(y2)
# print X.PFTFTUG1_EF.describe()

(1210, 25)
1210
1210


In [338]:
from sklearn import preprocessing

tmpX = X[['DEBT_MDN', 'DEP_INC_AVG', 'GRAD_DEBT_MDN', 'IND_INC_AVG', 'INEXPFTE', 'UGDS', 'WDRAW_DEBT_MDN', 'NPT4', 
          'NUM4', 'SAT_AVG_ALL', 'ACTCMMID', 'AVGFACSAL', 'COSTT4_A']]

scaler = preprocessing.MinMaxScaler()
scaledX = scaler.fit_transform(tmpX)

# print np.array(tmpX.ix[0])
# print scaledX[0]

# X_for_PCA = pd.DataFrame(scaledX)
pctX = X[['PAR_ED_PCT_1STGEN', 'PAR_ED_PCT_HS', 'PAR_ED_PCT_MS', 'PAR_ED_PCT_PS', 'PCTFLOAN', 'PCTPELL', 'UG25abv', 
          'PFTFTUG1_EF', 'PFTFAC', 'ADM_RATE_ALL']]
pctX = np.array(pctX)

forPcaX = np.concatenate((scaledX, pctX), axis=1)
# print scaledX.shape
# print pctX.shape
print forPcaX.shape


(1210, 23)


In [339]:
from sklearn.decomposition import PCA

pca = PCA(n_components=forPcaX.shape[1])
pca.fit(forPcaX)

# print pca.components_
print pca.explained_variance_ratio_


[  3.16257628e-01   1.73164702e-01   1.12819213e-01   7.91939545e-02
   7.80674005e-02   6.51524359e-02   3.85663143e-02   2.91864611e-02
   2.18495298e-02   1.80977204e-02   1.31349331e-02   1.03917544e-02
   1.01671138e-02   8.62855457e-03   7.64238311e-03   5.54657921e-03
   3.96454892e-03   3.27242079e-03   2.96323839e-03   1.27343293e-03
   6.59681299e-04   1.85675207e-19   9.47273654e-20]


In [340]:
pca = PCA(n_components=12)
pca.fit(forPcaX)
reducedX = pca.transform(forPcaX)
print reducedX.shape

(1210, 12)


In [341]:
# add the categorical feature

tmp1X = np.array(pd.get_dummies(X['CONTROL']))
tmp2X = X['L4_COLLEGE'].astype(int)
tmp2X = np.reshape(tmp2X, (len(tmp2X), 1)) 
print tmp1X.shape
print tmp2X.shape
print reducedX.shape
finalX = np.concatenate((reducedX, tmp1X, tmp2X), axis=1)
print finalX.shape

(1210, 3)
(1210, 1)
(1210, 12)
(1210, 16)


In [342]:
from sklearn import metrics
from sklearn import cross_validation as cv
from sklearn.tree import DecisionTreeRegressor

y1 = np.array(y1)
y2 = np.array(y2)
X1_train, X1_test, y1_train, y1_test = cv.train_test_split(finalX, y1, train_size=0.8)
X2_train, X2_test, y2_train, y2_test = cv.train_test_split(finalX, y2, train_size=0.8)


In [343]:
def print_r2score(reg, X, y, test=False):
    t = 'test' if test else 'train'
    print "R2 score on {} data: {}".format(t, metrics.r2_score(y, reg.predict(np.array(X))))
    

In [381]:
from sklearn.grid_search import GridSearchCV

parameters = {'max_depth': range(1,10) } # , 'min_samples_leaf': [4,5,6,7]}
scorer = metrics.make_scorer(metrics.r2_score, greater_is_better=True)
reg = DecisionTreeRegressor()
best_reg = GridSearchCV(reg, parameters, scoring=scorer, cv=3)
best_reg.fit(X1_train, y1_train)
print best_reg.best_params_

print_r2score(best_reg, X1_train, y1_train)
print_r2score(best_reg, X1_test, y1_test, test=True)

reg = DecisionTreeRegressor()
best_reg = GridSearchCV(reg, parameters, scoring=scorer, cv=3)
best_reg.fit(X2_train, y2_train)
print best_reg.best_params_

print_r2score(best_reg, X2_train, y2_train)
print_r2score(best_reg, X2_test, y2_test, test=True)


{'max_depth': 4}
R2 score on train data: 0.776016731743
R2 score on test data: 0.725433541581
{'max_depth': 4}
R2 score on train data: 0.639101918398
R2 score on test data: 0.567485445203


In [358]:
from sklearn.svm import SVR

params = {'C': np.logspace(-1, 1, 2), 'gamma': np.logspace(-1, 1, 2), 'epsilon': np.logspace(-1, 1, 2)}
reg = SVR()
best_reg = GridSearchCV(reg, params, scoring=scorer, cv=4)
best_reg.fit(X1_train, y1_train)

print best_reg.best_params_
print_r2score(best_reg, X1_train, y1_train)
print_r2score(best_reg, X1_test, y1_test, test=True)

{'epsilon': 0.10000000000000001, 'C': 10.0, 'gamma': 0.10000000000000001}
R2 score on train data: 0.843993310307
R2 score on test data: 0.811862254817


In [361]:
reg = SVR()
params = {'C': np.logspace(-1, 1, 2), 'gamma': np.logspace(-1, 1, 2), 'epsilon': np.logspace(-1, 1, 2)}
best_reg = GridSearchCV(reg, params, scoring=scorer, cv=4)
best_reg.fit(X2_train, y2_train)

print best_reg.best_params_
print_r2score(best_reg, X2_train, y2_train)
print_r2score(best_reg, X2_test, y2_test, test=True)

{'epsilon': 0.10000000000000001, 'C': 10.0, 'gamma': 0.10000000000000001}
R2 score on train data: 0.72439684816
R2 score on test data: 0.650611878577


In [364]:
from sklearn.neighbors import KNeighborsRegressor

parameters = {'n_neighbors': range(5,20)}
reg = KNeighborsRegressor()
best_reg = GridSearchCV(reg, parameters, scoring=scorer, cv=2)
best_reg.fit(X1_train, y1_train)
print best_reg.best_params_
print_r2score(best_reg, X1_train, y1_train)
print_r2score(best_reg, X1_test, y1_test, test=True)

parameters = {'n_neighbors': range(5, 20)}
reg = KNeighborsRegressor()
best_reg = GridSearchCV(reg, parameters, scoring=scorer, cv=2)
best_reg.fit(X2_train, y2_train)
print best_reg.best_params_
print_r2score(best_reg, X2_train, y2_train)
print_r2score(best_reg, X2_test, y2_test, test=True)



{'n_neighbors': 10}
R2 score on train data: 0.814169201028
R2 score on test data: 0.753269058512
{'n_neighbors': 12}
R2 score on train data: 0.702113916177
R2 score on test data: 0.643763198282


In [380]:
from sklearn.ensemble import RandomForestRegressor
# , max_depth=None, min_samples_split=1, random_state=0
reg1 = RandomForestRegressor(n_estimators=10)
reg1.fit(X1_train, y1_train)

print_r2score(reg1, X1_train, y1_train)
print_r2score(reg1, X1_test, y1_test, test=True)

reg1 = RandomForestRegressor(n_estimators=10)
reg1.fit(X2_train, y2_train)

print_r2score(reg1, X2_train, y2_train)
print_r2score(reg1, X2_test, y2_test, test=True)


R2 score on train data: 0.963511794177
R2 score on test data: 0.76295393497
R2 score on train data: 0.944744413061
R2 score on test data: 0.652043305626
