In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import sys
import p5lib
import pickle
import os.path

%matplotlib inline

In [2]:
pickle_file = 'reduced_data.pickle'
if os.path.isfile(pickle_file):
    data = pickle.load(open(pickle_file, "rb"))
else:
    data = pd.read_csv('MERGED2013_PP.csv')
    # print data.shape
    data = p5lib.preprocess_data(data)
    pickle.dump(data, open(pickle_file, "wb"))

print data.shape

(6007, 42)


In [3]:
X = data[['CONTROL', 'DEBT_MDN', 'DEP_INC_AVG', 'GRAD_DEBT_MDN', 'IND_INC_AVG', 'INEXPFTE', 'PAR_ED_PCT_1STGEN', 
          'PAR_ED_PCT_HS', 'PAR_ED_PCT_MS', 'PAR_ED_PCT_PS', 'PCTFLOAN', 'PCTPELL', 'UG25abv', 'UGDS', 
          'WDRAW_DEBT_MDN', 'L4_COLLEGE', 'NPT4', 'NUM4', 
          'PFTFTUG1_EF', 'PFTFAC',
          'SAT_AVG_ALL', 'ACTCMMID', 'ADM_RATE_ALL', 'AVGFACSAL', 'COSTT4_A',
          'C150', 'RET_FT']]
y = X[['C150', 'RET_FT']]
X = X.drop('C150', 1)
X = X.drop('RET_FT', 1)
print X.shape
print y.shape


(6007, 25)
(6007, 2)


In [4]:
# p5lib.print_num_data_for_each_features(X)


def print_num_data_for_each_features(data):
    print "Number of available data for each feature (not counting the NaN values)"
    tmp = (data.isnull().sum() - len(data)) * -1
    for k, v in tmp.iteritems():
        print "{0:20s}{1:45s}{2:5d}".format(k, p5lib.col_desc[k], v)


print_num_data_for_each_features(X)

Number of available data for each feature (not counting the NaN values)
CONTROL             Control (public/private)                      6007
DEBT_MDN            Median debt                                   5242
DEP_INC_AVG         Avg income dependent stu                      5703
GRAD_DEBT_MDN       Median debt complete                          5186
IND_INC_AVG         Avg income independent stu                    5688
INEXPFTE            Expense per FTE student                       6006
PAR_ED_PCT_1STGEN   % 1st gen students                            5513
PAR_ED_PCT_HS       % parent education high school                5533
PAR_ED_PCT_MS       % parent education middle school              5423
PAR_ED_PCT_PS       % parent education post secondary             5533
PCTFLOAN            % Fed student loan                            6005
PCTPELL             % Pell Grant receiver                         6005
UG25abv             % undergrad > 25 yr                           5964
UGDS 

In [5]:
fill_cols_with_mean = ['DEBT_MDN', 'DEP_INC_AVG', 'GRAD_DEBT_MDN', 'IND_INC_AVG', 'INEXPFTE', 'WDRAW_DEBT_MDN',
                       'PAR_ED_PCT_1STGEN', 'PAR_ED_PCT_HS', 'PAR_ED_PCT_MS', 'PAR_ED_PCT_PS', 'PCTFLOAN', 'PCTPELL', 
                       'UG25abv', 'NPT4', 'NUM4', 'PFTFTUG1_EF', 'PFTFAC', 'SAT_AVG_ALL', 'ACTCMMID', 'ADM_RATE_ALL', 
                       'AVGFACSAL', 'COSTT4_A']
for col in fill_cols_with_mean:
    X[col] = X[col].fillna(X[col].mean())

In [6]:
p5lib.print_num_data_for_each_features(X)

Number of available data for each feature (not counting the NaN values)
CONTROL             Control (public/private)                      6007
DEBT_MDN            Median debt                                   6007
DEP_INC_AVG         Avg income dependent stu                      6007
GRAD_DEBT_MDN       Median debt complete                          6007
IND_INC_AVG         Avg income independent stu                    6007
INEXPFTE            Expense per FTE student                       6007
PAR_ED_PCT_1STGEN   % 1st gen students                            6007
PAR_ED_PCT_HS       % parent education high school                6007
PAR_ED_PCT_MS       % parent education middle school              6007
PAR_ED_PCT_PS       % parent education post secondary             6007
PCTFLOAN            % Fed student loan                            6007
PCTPELL             % Pell Grant receiver                         6007
UG25abv             % undergrad > 25 yr                           6007
UGDS 

In [7]:
from sklearn import preprocessing

tmpX = X[['DEBT_MDN', 'DEP_INC_AVG', 'GRAD_DEBT_MDN', 'IND_INC_AVG', 'INEXPFTE', 'UGDS', 'WDRAW_DEBT_MDN', 'NPT4', 
          'NUM4', 'SAT_AVG_ALL', 'ACTCMMID', 'AVGFACSAL', 'COSTT4_A']]

scaler = preprocessing.MinMaxScaler()
scaledX = scaler.fit_transform(tmpX)

# print np.array(tmpX.ix[0])
# print scaledX[0]

# X_for_PCA = pd.DataFrame(scaledX)
pctX = X[['PAR_ED_PCT_1STGEN', 'PAR_ED_PCT_HS', 'PAR_ED_PCT_MS', 'PAR_ED_PCT_PS', 'PCTFLOAN', 'PCTPELL', 'UG25abv', 
          'PFTFTUG1_EF', 'PFTFAC', 'ADM_RATE_ALL']]
pctX = np.array(pctX)

forPcaX = np.concatenate((scaledX, pctX), axis=1)
# print scaledX.shape
# print pctX.shape
print forPcaX.shape


(6007, 23)


In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=forPcaX.shape[1])
pca.fit(forPcaX)

print pca.explained_variance_ratio_

[  3.09532908e-01   2.22170061e-01   1.14380354e-01   7.34313010e-02
   5.86618453e-02   4.99781459e-02   3.68281480e-02   2.97028318e-02
   2.28399100e-02   1.71245097e-02   1.60240460e-02   1.03454884e-02
   8.53032146e-03   6.51095597e-03   6.16040757e-03   4.86048428e-03
   4.34824502e-03   3.79707074e-03   2.99796594e-03   8.99067459e-04
   4.41834672e-04   3.01841722e-04   1.32255865e-04]


In [9]:
pca = PCA(n_components=18)
pca.fit(forPcaX)
reducedX = pca.transform(forPcaX)
print reducedX.shape

(6007, 18)


In [10]:
# add the categorical feature

tmp1X = np.array(pd.get_dummies(X['CONTROL']))
tmp2X = X['L4_COLLEGE'].astype(int)
tmp2X = np.reshape(tmp2X, (len(tmp2X), 1)) 
print tmp1X.shape
print tmp2X.shape
print reducedX.shape
finalX = np.concatenate((reducedX, tmp1X, tmp2X), axis=1)
print finalX.shape

(6007, 3)
(6007, 1)
(6007, 18)
(6007, 22)


In [16]:
from sklearn import cross_validation as cv

y = np.array(y)
X_train, X_test, y_train, y_test = cv.train_test_split(finalX, y, train_size=0.8)
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape

(4805, 22)
(1202, 22)
(4805, 2)
(1202, 2)


In [17]:
SVR_reg1, SVR_reg2, _, _ = p5lib.build_SVR_model(X_train, X_test, y_train, y_test)

--- Completion ---
{'epsilon': 0.10000000000000001, 'C': 10.0, 'gamma': 0.10000000000000001}
R2 score on train data: 0.70293514498
R2 score on test  data: 0.660491554253
--- Retention ---
{'epsilon': 0.10000000000000001, 'C': 10.0, 'gamma': 0.10000000000000001}
R2 score on train data: 0.380597386109
R2 score on test  data: 0.377663925292


In [13]:
DT_reg1, DT_reg2, _, _ = p5lib.build_DecisionTree_model(X_train, X_test, y_train, y_test)

--- Completion ---
{'max_depth': 6}
R2 score on train data: 0.624980713493
R2 score on test  data: 0.533934179648
--- Retention ---
{'max_depth': 5}
R2 score on train data: 0.294381944722
R2 score on test  data: 0.196899534953


In [14]:
KNN_reg1, KNN_reg2, _, _ = p5lib.build_KNN_model(X_train, X_test, y_train, y_test)

--- Completion ---
{'n_neighbors': 9}
R2 score on train data: 0.728548076762
R2 score on test  data: 0.639855969234
--- Retention ---
{'n_neighbors': 17}
R2 score on train data: 0.406820603812
R2 score on test  data: 0.252219438255


In [15]:
RForest_reg1, RForest_reg2, _, _ = p5lib.build_RandomForest_model(X_train, X_test, y_train, y_test)

--- Completion ---
R2 score on train data: 0.931828306617
R2 score on test  data: 0.626785559267
--- Retention ---
R2 score on train data: 0.873888333692
R2 score on test  data: 0.256130952066


In [34]:
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV

scorer = p5lib.scorer

y1_train, y1_test, y2_train, y2_test = p5lib.split_y(y_train, y_test)

params = {'C': np.logspace(-1, 1, 2), 'gamma': np.logspace(-1, 1, 2), 'epsilon': np.logspace(-1, 1, 2)}
reg = SVR()
best_reg1 = GridSearchCV(reg, params, scoring=scorer, cv=4)
best_reg1.fit(X_train, y1_train)

print best_reg1.best_params_
p5lib.print_r2score(best_reg1, X_train, y1_train)
p5lib.print_r2score(best_reg1, X_test, y1_test, test=True)

reg = SVR()
params = {'C': np.logspace(-1, 1, 2), 'gamma': np.logspace(-1, 1, 2), 'epsilon': np.logspace(-1, 1, 2)}
best_reg2 = GridSearchCV(reg, params, scoring=scorer, cv=4)

# use the y1 data to train the y2 prediction
reshaped_y1_train = np.reshape(y1_train, (len(y1_train), 1))
X_train_boosted = np.concatenate((X_train, reshaped_y1_train), axis=1)
best_reg2.fit(X_train_boosted, y2_train)

print best_reg2.best_params_

y1_test_prediction = best_reg1.predict(X_test) # prediction of y1 from the first model
y1_test_prediction = np.reshape(y1_test_prediction, (len(y1_test_prediction), 1))
X_test_boosted = np.concatenate((X_test, y1_test_prediction), axis=1)
p5lib.print_r2score(best_reg2, X_train_boosted, y2_train)
p5lib.print_r2score(best_reg2, X_test_boosted, y2_test, test=True)

{'epsilon': 0.10000000000000001, 'C': 10.0, 'gamma': 0.10000000000000001}
R2 score on train data: 0.700035266771
R2 score on test  data: 0.666317460989
{'epsilon': 0.10000000000000001, 'C': 10.0, 'gamma': 0.10000000000000001}
R2 score on train data: 0.437978629072
R2 score on test  data: 0.351196798513
