In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import sys
import p5lib
import pickle
import os.path

%matplotlib inline

In [3]:
pickle_file = 'reduced_data.pickle'
if os.path.isfile(pickle_file):
    data = pickle.load(open(pickle_file, "rb"))
else:
    data = pd.read_csv('MERGED2013_PP.csv')
    # print data.shape
    data = p5lib.preprocess_data(data)
    pickle.dump(data, open(pickle_file, "wb"))

print data.shape

(6007, 45)


In [6]:

X = data[['CONTROL', 'DEBT_MDN', 'DEP_INC_AVG', 'GRAD_DEBT_MDN', 'IND_INC_AVG', 'INEXPFTE', 'PAR_ED_PCT_1STGEN', 
          'PAR_ED_PCT_HS', 'PAR_ED_PCT_MS', 'PAR_ED_PCT_PS', 'PCTFLOAN', 'PCTPELL', 'UG25abv', 'UGDS', 
          'WDRAW_DEBT_MDN', 'L4_COLLEGE', 'NPT4', 'NUM4', 
          'PFTFTUG1_EF', 'PFTFAC',
          'SAT_AVG_ALL', 'ACTCMMID', 'ADM_RATE_ALL', 'AVGFACSAL', 'COSTT4_A',
          'C150', 'RET_FT']]

X = X[~((X.C150 == 1) & (X.RET_FT < 0.8))]
X = X[~((X.RET_FT == 1) & (X.C150 < 0.8))]

# X = X[~(X.C150 == 0)]
# X = X[~(X.RET_FT == 1)]
# X = X[~(X.RET_FT == 0)]

y = X[['C150', 'RET_FT']]
X = X.drop('C150', 1)
X = X.drop('RET_FT', 1)
print X.shape
print y.shape


(5835, 25)
(5835, 2)


In [20]:
fill_cols_with_mean = ['DEBT_MDN', 'DEP_INC_AVG', 'GRAD_DEBT_MDN', 'IND_INC_AVG', 'INEXPFTE', 'WDRAW_DEBT_MDN',
                       'PAR_ED_PCT_1STGEN', 'PAR_ED_PCT_HS', 'PAR_ED_PCT_MS', 'PAR_ED_PCT_PS', 'PCTFLOAN', 'PCTPELL', 
                       'UG25abv', 'NPT4', 'NUM4', 'PFTFTUG1_EF', 'PFTFAC', 'SAT_AVG_ALL', 'ACTCMMID', 'ADM_RATE_ALL', 
                       'AVGFACSAL', 'COSTT4_A']
for col in fill_cols_with_mean:
    X[col] = X[col].fillna(X[col].mean())

In [21]:
p5lib.print_num_data_for_each_features(X)

Number of available data for each feature (not counting the NaN values)
CONTROL             Control (public/private)                      5679
DEBT_MDN            Median debt                                   5679
DEP_INC_AVG         Avg income dependent stu                      5679
GRAD_DEBT_MDN       Median debt complete                          5679
IND_INC_AVG         Avg income independent stu                    5679
INEXPFTE            Expense per FTE student                       5679
PAR_ED_PCT_1STGEN   % 1st gen students                            5679
PAR_ED_PCT_HS       % parent education high school                5679
PAR_ED_PCT_MS       % parent education middle school              5679
PAR_ED_PCT_PS       % parent education post secondary             5679
PCTFLOAN            % Fed student loan                            5679
PCTPELL             % Pell Grant receiver                         5679
UG25abv             % undergrad > 25 yr                           5679
UGDS 

In [22]:
from sklearn import preprocessing

tmpX = X[['DEBT_MDN', 'DEP_INC_AVG', 'GRAD_DEBT_MDN', 'IND_INC_AVG', 'INEXPFTE', 'UGDS', 'WDRAW_DEBT_MDN', 'NPT4', 
          'NUM4', 'SAT_AVG_ALL', 'ACTCMMID', 'AVGFACSAL', 'COSTT4_A']]

scaler = preprocessing.MinMaxScaler()
scaledX = scaler.fit_transform(tmpX)

pctX = X[['PAR_ED_PCT_1STGEN', 'PAR_ED_PCT_HS', 'PAR_ED_PCT_MS', 'PAR_ED_PCT_PS', 'PCTFLOAN', 'PCTPELL', 'UG25abv', 
          'PFTFTUG1_EF', 'PFTFAC', 'ADM_RATE_ALL']]
pctX = np.array(pctX)

forPcaX = np.concatenate((scaledX, pctX), axis=1)
# print scaledX.shape
# print pctX.shape
print forPcaX.shape


(5679, 23)


In [23]:
from sklearn.decomposition import PCA

pca = PCA(n_components=forPcaX.shape[1])
pca.fit(forPcaX)

print pca.explained_variance_ratio_


[  3.17458069e-01   2.18690648e-01   1.14152021e-01   7.20907501e-02
   5.85495814e-02   5.05199896e-02   3.61521430e-02   2.88999116e-02
   2.24281183e-02   1.66695153e-02   1.60975749e-02   1.04620074e-02
   8.22713765e-03   6.58827899e-03   6.09738337e-03   4.53253358e-03
   4.17314314e-03   3.52558818e-03   2.97649547e-03   8.99144645e-04
   3.77359463e-04   3.11746877e-04   1.20859669e-04]


In [24]:
pca = PCA(n_components=18)
pca.fit(forPcaX)
reducedX = pca.transform(forPcaX)
print reducedX.shape

(5679, 18)


In [25]:
# add the categorical feature

tmp1X = np.array(pd.get_dummies(X['CONTROL']))
tmp2X = X['L4_COLLEGE'].astype(int)
tmp2X = np.reshape(tmp2X, (len(tmp2X), 1)) 
print tmp1X.shape
print tmp2X.shape
print reducedX.shape
finalX = np.concatenate((reducedX, tmp1X, tmp2X), axis=1)
print finalX.shape

(5679, 3)
(5679, 1)
(5679, 18)
(5679, 22)


In [40]:
from sklearn import cross_validation as cv

y = np.array(y)
X_train, X_test, y_train, y_test = cv.train_test_split(finalX, y, train_size=0.8)
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape

(4543, 22)
(1136, 22)
(4543, 2)
(1136, 2)


In [41]:
SVR_reg1, SVR_reg2 = p5lib.build_SVR_model(X_train, X_test, y_train, y_test)

{'epsilon': 0.10000000000000001, 'C': 10.0, 'gamma': 0.10000000000000001}
R2 score on train data: 0.721466725969
R2 score on test  data: 0.694863651076
{'epsilon': 0.10000000000000001, 'C': 10.0, 'gamma': 0.10000000000000001}
R2 score on train data: 0.424986372931
R2 score on test  data: 0.437118522014


In [43]:
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV

scorer = p5lib.scorer

y1_train, y1_test, y2_train, y2_test = p5lib.split_y(y_train, y_test)

params = {'C': np.logspace(-1, 1, 2), 'gamma': np.logspace(-1, 1, 2), 'epsilon': np.logspace(-1, 1, 2)}
reg = SVR()
best_reg1 = GridSearchCV(reg, params, scoring=scorer, cv=5)
best_reg1.fit(X_train, y1_train)

print best_reg1.best_params_
p5lib.print_r2score(best_reg1, X_train, y1_train)
p5lib.print_r2score(best_reg1, X_test, y1_test, test=True)

reg = SVR()
params = {'C': np.logspace(-1, 1, 2), 'gamma': np.logspace(-1, 1, 2), 'epsilon': np.logspace(-1, 1, 2)}
best_reg2 = GridSearchCV(reg, params, scoring=scorer, cv=5)

# use the y1 data to train the y2 prediction
reshaped_y1_train = np.reshape(y1_train, (len(y1_train), 1))
X_train_boosted = np.concatenate((X_train, reshaped_y1_train), axis=1)
best_reg2.fit(X_train_boosted, y2_train)

print best_reg2.best_params_

y1_test_prediction = best_reg1.predict(X_test) # prediction of y1 from the first model
y1_test_prediction = np.reshape(y1_test_prediction, (len(y1_test_prediction), 1))
X_test_boosted = np.concatenate((X_test, y1_test_prediction), axis=1)
p5lib.print_r2score(best_reg2, X_train_boosted, y2_train)
p5lib.print_r2score(best_reg2, X_test_boosted, y2_test, test=True)

{'epsilon': 0.10000000000000001, 'C': 10.0, 'gamma': 0.10000000000000001}
R2 score on train data: 0.721466725969
R2 score on test  data: 0.694863651076
{'epsilon': 0.10000000000000001, 'C': 10.0, 'gamma': 0.10000000000000001}
R2 score on train data: 0.48254382261
R2 score on test  data: 0.43500563102
