In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import sys
import p5lib
import pickle
import os.path

%matplotlib inline

In [2]:
# Load the preprocessed data set. A pickle cache is preferred because it is
# much faster than re-reading and re-preprocessing the raw CSV.
# NOTE(review): pickle.load can execute arbitrary code -- only load cache
# files that were written by this notebook.

pickle_file = 'reduced_data.pickle'
if os.path.isfile(pickle_file):
    # Context manager closes the handle even if unpickling raises.
    with open(pickle_file, "rb") as cache_fh:
        data = pickle.load(cache_fh)
else:
    data = pd.read_csv('MERGED2013_PP.csv')
    data = p5lib.preprocess_data(data)
    # Save the preprocessed frame so later runs can skip the slow CSV path.
    with open(pickle_file, "wb") as cache_fh:
        pickle.dump(data, cache_fh)

# Single-argument parenthesised print behaves the same on Python 2 and 3.
print(data.shape)

(6007, 45)


In [3]:
# Candidate feature columns plus the two target columns (C150, RET_FT).
X = data[['CONTROL', 'DEBT_MDN', 'DEP_INC_AVG', 'GRAD_DEBT_MDN', 'IND_INC_AVG', 'INEXPFTE', 'PAR_ED_PCT_1STGEN', 
          'PAR_ED_PCT_HS', 'PAR_ED_PCT_MS', 'PAR_ED_PCT_PS', 'PCTFLOAN', 'PCTPELL', 'UG25abv', 'UGDS', 
          'WDRAW_DEBT_MDN', 'L4_COLLEGE', 'NPT4', 'NUM4', 
          'PFTFTUG1_EF', 'PFTFAC',
          'SAT_AVG_ALL', 'ACTCMMID', 'ADM_RATE_ALL', 'AVGFACSAL', 'COSTT4_A',
          'CCSIZSET', 'CCUGPROF', 'CCBASIC', # carnegie classification data (which is not complete)
          'C150', 'RET_FT']]

# Remove noise: some rows have 0 retention but a high completion rate, and
# vice versa, which doesn't make sense and looks like an error in the data.
# One combined mask drops exactly the rows the four sequential filters did.
inconsistent_rows = (
    ((X.RET_FT == 0) & (X.C150 > 0.5)) |
    ((X.C150 == 0) & (X.RET_FT > 0.5)) |
    ((X.C150 == 1) & (X.RET_FT < 0.5)) |
    ((X.RET_FT == 1) & (X.C150 < 0.5))
)
X = X[~inconsistent_rows]

# Split the targets off from the features. `axis` is passed by keyword
# because newer pandas versions no longer accept it positionally.
y = X[['C150', 'RET_FT']]
X = X.drop(['C150', 'RET_FT'], axis=1)
print(X.shape)
print(y.shape)


(5930, 28)
(5930, 2)


In [4]:
# Report, per feature column of X, how many values are available (the output
# header states that NaN values are not counted).
p5lib.print_num_data_for_each_features(X)

Number of available data for each feature (not counting the NaN values)
CONTROL             Control (public/private)                      5930
DEBT_MDN            Median debt                                   5187
DEP_INC_AVG         Avg income dependent stu                      5642
GRAD_DEBT_MDN       Median debt complete                          5129
IND_INC_AVG         Avg income independent stu                    5629
INEXPFTE            Expense per FTE student                       5929
PAR_ED_PCT_1STGEN   % 1st gen students                            5461
PAR_ED_PCT_HS       % parent education high school                5479
PAR_ED_PCT_MS       % parent education middle school              5369
PAR_ED_PCT_PS       % parent education post secondary             5479
PCTFLOAN            % Fed student loan                            5928
PCTPELL             % Pell Grant receiver                         5928
UG25abv             % undergrad > 25 yr                           5892
UGDS 

In [5]:
from sklearn import cross_validation as cv

# Split the data into train & test once and cache the split, so the train and
# test sets don't keep changing while different ways of building the model are
# compared.
# NOTE(review): pickle.load can execute arbitrary code -- only load cache
# files that were written by this notebook.
pickle_file = 'split_data.pickle'
if os.path.isfile(pickle_file):
    print("loading split_data from pickle")
    with open(pickle_file, "rb") as split_fh:
        split_data = pickle.load(split_fh)
    X_train, X_test = split_data['X_train'], split_data['X_test']
    y_train, y_test = split_data['y_train'], split_data['y_test']
else:
    y = np.array(y)
    # Fixed seed: if the cache file is deleted, the same split is regenerated.
    X_train, X_test, y_train, y_test = cv.train_test_split(X, y, train_size=0.8, random_state=42)
    split_data = {
        'X_train': X_train,
        'X_test':  X_test,
        'y_train': y_train,
        'y_test':  y_test,
    }
    with open(pickle_file, "wb") as split_fh:
        pickle.dump(split_data, split_fh)

for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

# Check if the train / test targets have a similar distribution.
# Target column 0 is C150 and column 1 is RET_FT (the order used to build y).
for target_idx, target_label in enumerate(('c150', 'ret')):
    print(pd.DataFrame(data={
        'y_train ' + target_label: pd.Series(y_train[:, target_idx]).describe(),
        'y_test ' + target_label: pd.Series(y_test[:, target_idx]).describe(),
    }))


loading split_data from pickle
(4744, 28)
(1186, 28)
(4744, 2)
(1186, 2)
       y_test c150  y_train c150
count  1186.000000   4744.000000
mean      0.537681      0.526076
std       0.232230      0.237625
min       0.023489      0.000000
25%       0.354547      0.333560
50%       0.561215      0.552041
75%       0.728422      0.717390
max       1.000000      1.000000
        y_test ret  y_train ret
count  1186.000000  4744.000000
mean      0.694168     0.690989
std       0.175161     0.173520
min       0.000000     0.000000
25%       0.593050     0.585400
50%       0.712250     0.704500
75%       0.818200     0.813500
max       1.000000     1.000000


In [6]:
non_categorical_cols = ['DEBT_MDN', 'DEP_INC_AVG', 'GRAD_DEBT_MDN', 'IND_INC_AVG', 'INEXPFTE', 'WDRAW_DEBT_MDN',
                       'PAR_ED_PCT_1STGEN', 'PAR_ED_PCT_HS', 'PAR_ED_PCT_MS', 'PAR_ED_PCT_PS', 'PCTFLOAN', 'PCTPELL', 
                       'UG25abv', 'NPT4', 'NUM4', 'PFTFTUG1_EF', 'PFTFAC', 'SAT_AVG_ALL', 'ACTCMMID', 'ADM_RATE_ALL', 
                       'AVGFACSAL', 'COSTT4_A', 'UGDS']

# Work on private copies: the frames returned by train_test_split are slices
# of X, and assigning columns on a slice triggers pandas'
# SettingWithCopyWarning and may not behave as intended.
X_train = X_train.copy()
X_test = X_test.copy()

# Fill missing values with the column means. The means come from the training
# set only and are reused for the test set, so no test-set statistics leak
# into the imputation.
col_mean = {}
for col in non_categorical_cols:
    if col in X_train:
        col_mean[col] = X_train[col].mean()
        X_train[col] = X_train[col].fillna(col_mean[col])
        X_test[col] = X_test[col].fillna(col_mean[col])


In [7]:
from sklearn import preprocessing

# Standardise the cost/money and percentage (0..1) features; the scaled
# matrices are the input to the PCA step. The scaler is fitted on the training
# rows only and then applied unchanged to the test rows.
numeric_train = X_train[non_categorical_cols]
numeric_test = X_test[non_categorical_cols]

scaler = preprocessing.StandardScaler()
scaler.fit(numeric_train)
scaledX_train = scaler.transform(numeric_train)
scaledX_test = scaler.transform(numeric_test)

for scaled in (scaledX_train, scaledX_test):
    print(scaled.shape)

(4744, 23)
(1186, 23)


In [8]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps as many components as there are scaled features, to
# inspect how much variance each component explains.
n_scaled_features = scaledX_train.shape[1]
pca = PCA(n_components=n_scaled_features).fit(scaledX_train)

print(pca.explained_variance_ratio_)

[  3.24655650e-01   1.24585911e-01   9.71020152e-02   6.92096771e-02
   5.83126789e-02   4.75745623e-02   4.23884493e-02   3.96563859e-02
   3.40878378e-02   2.76905353e-02   2.53336893e-02   2.23706918e-02
   2.17563065e-02   1.64152943e-02   1.13183641e-02   9.20963988e-03
   7.61050814e-03   7.27016931e-03   4.97367489e-03   4.51767546e-03
   3.17124132e-03   5.08458877e-04   2.80583198e-04]


In [9]:
# Refit the PCA keeping 15 components (fitted on the training rows only) and
# project both the train and the test matrices onto them.
pca = PCA(n_components=15).fit(scaledX_train)
reducedX_train, reducedX_test = pca.transform(scaledX_train), pca.transform(scaledX_test)

# reducedX_* hold the chosen top PCA components
for reduced in (reducedX_train, reducedX_test):
    print(reduced.shape)

(4744, 15)
(1186, 15)


In [10]:
# add the categorical features
# only CONTROL and L4_COLLEGE because the carnegie classification data is not complete

# One-hot encode CONTROL. The test dummies are re-indexed to the training
# columns so both matrices have the same columns in the same order, even if a
# category is missing from (or only present in) the test split.
control_train = pd.get_dummies(X_train['CONTROL'])
control_test = pd.get_dummies(X_test['CONTROL']).reindex(columns=control_train.columns, fill_value=0)

# L4_COLLEGE as a single 0/1 integer column. Reshape the underlying ndarray
# rather than the Series, which is fragile under np.reshape.
l4_train = X_train['L4_COLLEGE'].astype(int).values.reshape(-1, 1)
l4_test = X_test['L4_COLLEGE'].astype(int).values.reshape(-1, 1)

finalX_train = np.concatenate((reducedX_train, np.array(control_train), l4_train), axis=1)
print(finalX_train.shape)

finalX_test = np.concatenate((reducedX_test, np.array(control_test), l4_test), axis=1)
print(finalX_test.shape)

print(y_train.shape)
print(y_test.shape)

(4744, 19)
(1186, 19)
(4744, 2)
(1186, 2)


In [11]:
# Build the decision-tree models on the PCA + CONTROL + L4_COLLEGE features.
# Per the output below, the helper prints the selected max_depth and the
# train/test R2 for the completion and the retention target.
# Only the first two return values are kept (presumably the fitted completion
# and retention regressors -- confirm in p5lib); the last two are discarded.
DT_reg1, DT_reg2, _, _ = p5lib.build_DecisionTree_model(finalX_train, finalX_test, y_train, y_test)

--- Completion ---
{'max_depth': 6}
R2 score on train data: 0.628203066121
R2 score on test  data: 0.554932450279
--- Retention ---
{'max_depth': 5}
R2 score on train data: 0.329589485813
R2 score on test  data: 0.225267074998


In [12]:
# Build the SVR models on the PCA + CONTROL + L4_COLLEGE features.
# Per the output below, the helper prints the selected epsilon/C/gamma and the
# train/test R2 for the completion and the retention target.
# Only the first two return values are kept (presumably the fitted completion
# and retention regressors -- confirm in p5lib); the last two are discarded.
SVR_reg1, SVR_reg2, _, _ = p5lib.build_SVR_model(finalX_train, finalX_test, y_train, y_test)

--- Completion ---
{'epsilon': 0.10000000000000001, 'C': 0.10000000000000001, 'gamma': 0.10000000000000001}
R2 score on train data: 0.745802250984
R2 score on test  data: 0.647835401393
--- Retention ---
{'epsilon': 0.10000000000000001, 'C': 0.10000000000000001, 'gamma': 0.10000000000000001}
R2 score on train data: 0.496864683516
R2 score on test  data: 0.313152875409


In [13]:
# Build the k-nearest-neighbours models on the PCA + CONTROL + L4_COLLEGE
# features. Per the output below, the helper prints the selected n_neighbors
# and the train/test R2 for the completion and the retention target.
# Only the first two return values are kept (presumably the fitted completion
# and retention regressors -- confirm in p5lib); the last two are discarded.
KNN_reg1, KNN_reg2, _, _ = p5lib.build_KNN_model(finalX_train, finalX_test, y_train, y_test)

--- Completion ---
{'n_neighbors': 8}
R2 score on train data: 0.754742923815
R2 score on test  data: 0.666227406057
--- Retention ---
{'n_neighbors': 19}
R2 score on train data: 0.441614377296
R2 score on test  data: 0.329813539016


In [14]:
# Build the random-forest models (n_estimators=50) on the PCA + CONTROL +
# L4_COLLEGE features. Per the output below, the helper prints the train/test
# R2 for the completion and the retention target.
# Only the first two return values are kept (presumably the fitted completion
# and retention regressors -- confirm in p5lib); the last two are discarded.
RForest_reg1, RForest_reg2, _, _ = p5lib.build_RandomForest_model(finalX_train, finalX_test, y_train, y_test, n_estimators=50)

--- Completion ---
R2 score on train data: 0.949578007476
R2 score on test  data: 0.659434508598
--- Retention ---
R2 score on train data: 0.908819971635
R2 score on test  data: 0.316207018986


In [15]:
# add more categorical features
# Candidates: CCSIZSET, CCUGPROF, CCBASIC -- but only the CCSIZSET dummies are
# appended to the feature matrices. The unique-value counts of all three are
# still printed for reference.
# NOTE(review): CCUGPROF and CCBASIC were one-hot encoded here before but
# never concatenated; that dead computation has been removed. Presumably they
# were left out because their category counts differ between train and test
# (see the printed counts) -- confirm intent.
carnegie_cols = ['CCSIZSET', 'CCUGPROF', 'CCBASIC']

sizset_train = pd.get_dummies(X_train['CCSIZSET'])
for col in carnegie_cols:
    print(len(X_train[col].unique()))

r2X_train = np.concatenate((finalX_train, np.array(sizset_train)), axis=1)
print(r2X_train.shape)

# Re-index the test dummies to the training columns so both matrices have the
# same columns in the same order, even when the two splits do not contain
# exactly the same set of categories.
sizset_test = pd.get_dummies(X_test['CCSIZSET']).reindex(columns=sizset_train.columns, fill_value=0)
for col in carnegie_cols:
    print(len(X_test[col].unique()))
r2X_test = np.concatenate((finalX_test, np.array(sizset_test)), axis=1)
print(r2X_test.shape)


18
15
34
(4744, 36)
18
14
31
(1186, 36)


In [16]:
# Rebuild the decision-tree models on the feature set extended with the
# CCSIZSET dummies (r2X_*). This rebinds DT_reg1/DT_reg2, replacing the models
# built earlier on finalX_*.
DT_reg1, DT_reg2, _, _ = p5lib.build_DecisionTree_model(r2X_train, r2X_test, y_train, y_test)

--- Completion ---
{'max_depth': 6}
R2 score on train data: 0.629370742909
R2 score on test  data: 0.539633254322
--- Retention ---
{'max_depth': 5}
R2 score on train data: 0.329589485813
R2 score on test  data: 0.225267074998


In [17]:
# Rebuild the SVR models on the feature set extended with the CCSIZSET dummies
# (r2X_*). This rebinds SVR_reg1/SVR_reg2, replacing the models built earlier
# on finalX_*.
SVR_reg1, SVR_reg2, _, _ = p5lib.build_SVR_model(r2X_train, r2X_test, y_train, y_test)

--- Completion ---
{'epsilon': 0.10000000000000001, 'C': 0.10000000000000001, 'gamma': 0.10000000000000001}
R2 score on train data: 0.753314727964
R2 score on test  data: 0.64912983366
--- Retention ---
{'epsilon': 0.10000000000000001, 'C': 0.10000000000000001, 'gamma': 0.10000000000000001}
R2 score on train data: 0.50500078114
R2 score on test  data: 0.312708061307


In [18]:
# Rebuild the k-nearest-neighbours models on the feature set extended with the
# CCSIZSET dummies (r2X_*). This rebinds KNN_reg1/KNN_reg2, replacing the
# models built earlier on finalX_*.
KNN_reg1, KNN_reg2, _, _ = p5lib.build_KNN_model(r2X_train, r2X_test, y_train, y_test)

--- Completion ---
{'n_neighbors': 13}
R2 score on train data: 0.735691613557
R2 score on test  data: 0.671796896921
--- Retention ---
{'n_neighbors': 19}
R2 score on train data: 0.442638528995
R2 score on test  data: 0.337837858495


In [19]:
# Rebuild the random-forest models (n_estimators=50) on the feature set
# extended with the CCSIZSET dummies (r2X_*). This rebinds
# RForest_reg1/RForest_reg2, replacing the models built earlier on finalX_*.
RForest_reg1, RForest_reg2, _, _ = p5lib.build_RandomForest_model(r2X_train, r2X_test, y_train, y_test, n_estimators=50)

--- Completion ---
R2 score on train data: 0.950634832275
R2 score on test  data: 0.666180209301
--- Retention ---
R2 score on train data: 0.909761209959
R2 score on test  data: 0.322645207598


In [20]:
# Same features (r2X_*) with a larger forest (n_estimators=300), to see
# whether more trees improve the test R2. Rebinds RForest_reg1/RForest_reg2 to
# the 300-tree run.
RForest_reg1, RForest_reg2, _, _ = p5lib.build_RandomForest_model(r2X_train, r2X_test, y_train, y_test, n_estimators=300)

--- Completion ---
R2 score on train data: 0.955347378575
R2 score on test  data: 0.672237525277
--- Retention ---
R2 score on train data: 0.915343761297
R2 score on test  data: 0.344088807636


In [21]:
# Same features (r2X_*) with n_estimators=500. The return value is not
# assigned, so RForest_reg1/RForest_reg2 still refer to the 300-tree run from
# the previous cell.
p5lib.build_RandomForest_model(r2X_train, r2X_test, y_train, y_test, n_estimators=500)

--- Completion ---
R2 score on train data: 0.955244375152
R2 score on test  data: 0.672881830707
--- Retention ---
R2 score on train data: 0.916929458706
R2 score on test  data: 0.339975827444
