In [2]:
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import Imputer
from IPython.display import Image  
from sklearn.naive_bayes import GaussianNB
import random
import numpy as np
import pydotplus
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
# Seed both Python's and NumPy's global generators. scikit-learn estimators
# draw from NumPy's global RNG when they are given no random_state, so
# seeding `random` alone does not make the models below reproducible.
SEED = 123435454
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Read the survey export. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, so a column cannot end
# up holding a mix of parsed numbers and raw strings.
# NOTE(review): the stray warning text left under this cell looks like pandas'
# mixed-type DtypeWarning, which is what this option addresses - confirm.
dat = pd.read_csv('edu_chile_survey_output.csv.gz', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Features: the columns from q1 through q6_lang_orig plus the columns from
# female through rbdRating (label-based .loc slices include both end points).
x = pd.concat(
    [dat.loc[:, 'q1':'q6_lang_orig'], dat.loc[:, 'female':'rbdRating']],
    axis=1,
)

# Target: 1 where search1_clicked_button is 'yes', 0 for everything else
# (including missing values). Building it with eq/astype yields an integer
# Series directly instead of relying on replace() to downcast an object
# column after the substitutions.
y = dat['search1_clicked_button'].eq('yes').astype(int)

In [5]:
# Turn the free-text self-estimation answers into numbers.
# NOTE(review): only the FIRST run of digits in each answer is kept, so an
# answer typed with separators (e.g. "1.200.000") is read as 1. That would
# explain the minimum of 1.0 in every column of the summary below - confirm
# against the raw answers before changing the parsing rule.
estimated_values = [
    'q4_cost_1_orig', 'q4_cost_2_orig', 'q4_cost_3_orig',
    'q5_mi_ing_1_orig', 'q5_mi_ing_2_orig', 'q5_mi_ing_3_orig',
    'q5_tip_ing_1_orig', 'q5_tip_ing_2_orig', 'q5_tip_ing_3_orig',
]

for col in estimated_values:
    x[col] = (
        # Exactly one capture group: with expand=False, str.extract then
        # returns a Series. The previous pattern had two groups, which makes
        # it return a two-column DataFrame that cannot be assigned to a
        # single column in current pandas.
        x[col].str.extract(r'([0-9]+)', expand=False)
        .astype(float)
        # Zeros are turned into missing values.
        .replace(0, np.nan)
    )

x[estimated_values].describe()

Unnamed: 0,q4_cost_1_orig,q4_cost_2_orig,q4_cost_3_orig,q5_mi_ing_1_orig,q5_mi_ing_2_orig,q5_mi_ing_3_orig,q5_tip_ing_1_orig,q5_tip_ing_2_orig,q5_tip_ing_3_orig
count,19639.0,15256.0,12844.0,18507.0,15420.0,13619.0,15197.0,12605.0,11195.0
mean,565767700000000.0,3564151.0,3.114295e+85,1071742.0,988177.8,1005502.0,2351188.0,1072769.0,998731.6
std,7.928624e+16,61902000.0,3.529473e+87,14647480.0,10843580.0,11932830.0,162434500.0,11254970.0,9488580.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1200000.0,1330000.0,1330000.0,390000.0,400000.0,380000.0,440000.0,430000.0,420000.0
50%,2440000.0,2500000.0,2500000.0,670000.0,660000.0,620000.0,720000.0,700000.0,700000.0
75%,3500000.0,3520000.0,3500000.0,1000000.0,1000000.0,990000.0,1100000.0,1050000.0,1010000.0
max,1.111111e+19,5000000000.0,4e+89,1400000000.0,1200000000.0,1200000000.0,20000000000.0,1000000000.0,1000000000.0


In [6]:
# 10^9 looks like a plausible upper bound for every self-estimated amount.
# Anything above it is treated as an outlier and, for simplicity, replaced
# with NaN. Values equal to 10^9 and existing NaNs are left untouched.
outlier_cap = np.power(10, 9)
x[estimated_values] = x[estimated_values].mask(x[estimated_values] > outlier_cap)

x[estimated_values].describe()

Unnamed: 0,q4_cost_1_orig,q4_cost_2_orig,q4_cost_3_orig,q5_mi_ing_1_orig,q5_mi_ing_2_orig,q5_mi_ing_3_orig,q5_tip_ing_1_orig,q5_tip_ing_2_orig,q5_tip_ing_3_orig
count,19631.0,15253.0,12840.0,18506.0,15419.0,13618.0,15196.0,12605.0,11195.0
mean,2611960.0,2712561.0,2640066.0,996148.7,910415.8,917456.9,1035206.0,1072769.0,998731.6
std,5156370.0,10037130.0,5606391.0,10430700.0,4933676.0,6068116.0,8168338.0,11254970.0,9488580.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1200000.0,1330000.0,1330000.0,390000.0,400000.0,380000.0,440000.0,430000.0,420000.0
50%,2440000.0,2500000.0,2500000.0,670000.0,660000.0,620000.0,720000.0,700000.0,700000.0
75%,3500000.0,3520000.0,3500000.0,1000000.0,1000000.0,990000.0,1100000.0,1050000.0,1010000.0
max,500000000.0,1000000000.0,460000000.0,1000000000.0,600000000.0,600000000.0,1000000000.0,1000000000.0,1000000000.0


In [7]:
# Most variables in the data set are categorical. The numerical ones are the
# self-estimated amounts cleaned above plus the math/language score columns
# appended here.
numerical_variables = list(estimated_values)
numerical_variables.extend([
    'q6_math_orig',
    'q6_lang_orig',
    'PSU_leng_2013',
    'PSU_mate_2013',
    'PSU_2013',
    'SIMCEMath10',
    'SIMCELang10',
])

In [8]:
# Temporary fix for missing values so the basic predictive models can run:
# every NaN in a numerical column is replaced by that column's mean.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22
# (SimpleImputer in sklearn.impute replaces it); this cell and its import need
# updating before the notebook can run on a current scikit-learn.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(x[numerical_variables])
x[numerical_variables] = imp.transform(x[numerical_variables])

In [9]:
# Decision tree on the numerical variables only.
def mytree(input_data, random_state=0):
    """Fit a decision tree on an 80/20 split and print its classification report.

    Parameters
    ----------
    input_data : feature matrix, row-aligned with the global target ``y``.
    random_state : seed used for both the train/test split and the tree.
        The tree needs its own seed: scikit-learn permutes the candidate
        features at every split, so an unseeded tree can give a different
        report on every run even when the split itself is fixed.

    Returns
    -------
    None. The report is printed.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(
        input_data, y, test_size=0.2, random_state=random_state)
    clf = tree.DecisionTreeClassifier(random_state=random_state)
    clf.fit(xtrain, ytrain)
    y_pred = clf.predict(xtest)
    # Class 0 is reported as 'non-click' and class 1 as 'click'.
    target_names = ['non-click', 'click']
    print(classification_report(ytest, y_pred, target_names=target_names))


mytree(x[numerical_variables])

             precision    recall  f1-score   support

  non-click       0.75      0.75      0.75      5233
      click       0.44      0.45      0.44      2368

avg / total       0.65      0.65      0.65      7601



In [10]:
# I tried to graph the tree, but the result is not clear or pleasant to look at.
# NOTE: `clf` is local to mytree(), so these lines need the fitted tree to be
# made available first.
# dot_data = tree.export_graphviz(clf, out_file=None, filled=True, rounded=True, special_characters=True)
# graph = pydotplus.graph_from_dot_data(dot_data)
# graph.write_pdf("educ.pdf")
# The graph is too large to give any visual intuition.

In [11]:
# Gaussian Naive Bayes on the numerical variables.
def myNB(input_data):
    """Fit Gaussian Naive Bayes on an 80/20 split and print its classification report.

    ``input_data`` is the feature matrix; the target is the global ``y``.
    The split uses random_state=0. Nothing is returned.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(
        input_data, y, random_state=0, test_size=0.2)

    model = GaussianNB()
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)

    print(classification_report(ytest, predictions,
                                target_names=['non-click', 'click']))


myNB(x[numerical_variables])

             precision    recall  f1-score   support

  non-click       0.70      0.96      0.81      5233
      click       0.46      0.07      0.13      2368

avg / total       0.62      0.68      0.60      7601



The decision tree achieves a much higher F1-score for the click (search) class than Naive Bayes (0.44 vs. 0.13) when using the numerical features.

In [12]:
# Print, for every non-numerical (categorical) feature, how many distinct
# values it takes. NaN counts as a value here because unique() keeps it.
for column in x.columns.values:
    if column in numerical_variables:
        continue
    print(x[column].unique().size,"\t",column)

7 	 q1
4 	 q2_tipo_1_orig
3 	 q2_nivel_1
142 	 q2_inst_1
185 	 q2_carerra_1
4 	 q2_tipo_2_orig
3 	 q2_nivel_2
146 	 q2_inst_2
184 	 q2_carerra_2
4 	 q2_tipo_3_orig
3 	 q2_nivel_3
155 	 q2_inst_3
185 	 q2_carerra_3
6 	 q3
2 	 q4_nose_1_orig
2 	 q4_nose_2_orig
2 	 q4_nose_3_orig
2 	 q5_mi_nose_1_orig
2 	 q5_tip_nose_1_orig
2 	 q5_mi_nose_2_orig
2 	 q5_tip_nose_2_orig
2 	 q5_mi_nose_3_orig
2 	 q5_tip_nose_3_orig
3 	 female
6 	 mom_educ_simce
6 	 dad_educ_simce
4 	 schl_type
6 	 rbdRating


I want to use the decision tree algorithm for this task. Notice that some variables, such as the institution choice and the major choice, take a large number of values. For now I will not create (n-1) binary variables for all of these choices; I will ignore them and only consider the variables with at most 32 categorical values (which, as far as I remember, is the capacity currently offered by sklearn).

In [13]:
# Design matrix for the second analysis: the numerical features plus dummy
# (one-hot) columns for every categorical feature with at most 32 distinct
# values. Higher-cardinality categoricals (the institution and career
# choices) are left out.
#
# Each dummy is named "<feature>_<value>". Without the prefix, dummies from
# different features share the same value-based names, which gives duplicate
# (and non-string) column labels in the model inputs and regression summary.
# get_dummies creates one column per observed non-missing value and none for
# NaN, so a missing answer is the all-zeros row; no further level is dropped.
numeric_part = x[[col for col in x.columns.values if col in numerical_variables]]
dummy_parts = [
    # Cast to float so the frame stays purely numeric whichever dtype
    # get_dummies returns.
    pd.get_dummies(x[col], prefix=col).astype(float)
    for col in x.columns.values
    if col not in numerical_variables and x[col].unique().size <= 32
]
# A single concat instead of one per feature. The column order matches the
# previous loop: numerical columns first, then the dummies in feature order.
new_x = pd.concat([numeric_part] + dummy_parts, axis=1)

In [14]:
# Refit the decision tree on new_x: the numerical features plus the
# dummy-encoded low-cardinality categorical features.
mytree(new_x)

             precision    recall  f1-score   support

  non-click       0.75      0.74      0.75      5233
      click       0.45      0.46      0.46      2368

avg / total       0.66      0.65      0.66      7601



There is not much improvement in the F1-score from including the categorical variables in the analysis (after transforming each of them into a set of binary variables). This does not guarantee that female, school type, mother's education, etc. have no impact on the prediction. Yet for now, including all variables does not give a noticeably better result. For me, the simpler, the better. Next, I want to run a simple linear regression to test the statistical significance of each variable for predicting whether a student searches the database.

In [15]:
# Least-squares fit of the click indicator on the numerical features.
# The resulting table has one row per feature, holding its fitted coefficient.
LD = LinearRegression().fit(x[numerical_variables], y)
pd.DataFrame(data=LD.coef_, index=numerical_variables)

Unnamed: 0,0
q4_cost_1_orig,-9.461723e-10
q4_cost_2_orig,-1.866362e-10
q4_cost_3_orig,-5.240931e-11
q5_mi_ing_1_orig,-6.925668e-12
q5_mi_ing_2_orig,-4.150525e-10
q5_mi_ing_3_orig,1.211839e-09
q5_tip_ing_1_orig,1.279936e-08
q5_tip_ing_2_orig,-7.821905e-10
q5_tip_ing_3_orig,-1.152505e-08
q6_math_orig,-0.0001153664


In [16]:
# The same regression through statsmodels, to get the full summary table.
# add_constant adds the constant column that serves as the intercept term.
x_OLS = sm.add_constant(x[numerical_variables])
results = sm.OLS(endog=y, exog=x_OLS).fit()
print(results.summary())

                              OLS Regression Results                              
Dep. Variable:     search1_clicked_button   R-squared:                       0.013
Model:                                OLS   Adj. R-squared:                  0.012
Method:                     Least Squares   F-statistic:                     30.86
Date:                    Tue, 13 Jun 2017   Prob (F-statistic):           3.18e-94
Time:                            17:01:23   Log-Likelihood:                -24383.
No. Observations:                   38004   AIC:                         4.880e+04
Df Residuals:                       37987   BIC:                         4.894e+04
Df Model:                              16                                         
Covariance Type:                nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

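Even without testing statistical significance, it is clear that the (coefficient) potential impacts of these variables on the prediction are very small so far. Keep in mind, though, that each coefficient is per unit of its variable and the monetary estimates take values in the millions (see the summary tables above), so a tiny coefficient does not by itself mean a negligible effect. Further testing (F-test, etc.) for model selection and specification should be implemented. Then OLS could be used to assess the statistical significance of these variables.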

The simple OLS regression above shows that most of the numerical variables are statistically significant, except for _q6_lang_orig_. Yet it seems that those who perceive high self-estimated tuitions and salaries for an institution or major choice are less likely to search the database system.

In [17]:
# FYI: the same statsmodels OLS, this time on new_x (numerical features plus
# the dummy-encoded categorical ones), with a constant column added.
x_OLS2 = sm.add_constant(new_x)
results2 = sm.OLS(endog=y, exog=x_OLS2).fit()
print(results2.summary())

                              OLS Regression Results                              
Dep. Variable:     search1_clicked_button   R-squared:                       0.172
Model:                                OLS   Adj. R-squared:                  0.171
Method:                     Least Squares   F-statistic:                     111.3
Date:                    Tue, 13 Jun 2017   Prob (F-statistic):               0.00
Time:                            17:01:24   Log-Likelihood:                -21033.
No. Observations:                   38004   AIC:                         4.221e+04
Df Residuals:                       37932   BIC:                         4.283e+04
Df Model:                              71                                         
Covariance Type:                nonrobust                                         
                                                                                                                                                                      coef