In [935]:
import pandas as pd
import glob
import time
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime, timedelta
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected=True)
from plotly.graph_objs import *


In [936]:
# READ IN AND MERGE CAMP AND STUDENT FILES
stud = pd.read_csv('StudentCA.csv', parse_dates = ['enroll_date', 'end_date', 'date_of_birth'])
camps = pd.read_csv('MathCampAll.csv', parse_dates = ['open_date'])

students = pd.merge(stud, camps, on='camp_id')
students = students.rename(columns={'city_x': 'city_student', 'country': 'country_student', 'zip_code_x': 'zip_student', 'state_code_x': 'state_student', 'city_y':'city_camp', 'zip_code_y':'zip_camp', 'state_code_y':'state_camp,', 'country_code':'country_camp'})

students.head()

Unnamed: 0,camp_id,student_id,gender,date_of_birth,enroll_date,end_date,grade_level,city_student,country_student,zip_student,state_student,city_camp,zip_camp,"state_camp,",country_camp,open_date,center_days
0,2824,61913,M,2002-01-22,2015-09-01,2015-11-30,12,Santa Rosa,USA,95409,CA,Windsor,95492,CA,US,2015-01-01,"[2, 5]"
1,2824,61922,F,2002-04-28,2015-01-01,2016-09-18,11,Santa Rosa,USA,95403,CA,Windsor,95492,CA,US,2015-01-01,"[2, 5]"
2,2824,61938,M,2000-12-13,2015-01-01,2015-08-30,12,Santa Rosa,USA,95404,CA,Windsor,95492,CA,US,2015-01-01,"[2, 5]"
3,2824,61958,M,2008-07-21,2016-01-05,2016-09-18,6,Windsor,USA,95492,CA,Windsor,95492,CA,US,2015-01-01,"[2, 5]"
4,2824,61981,F,2008-01-21,2016-03-04,2016-05-28,7,Santa Rosa,USA,95409,CA,Windsor,95492,CA,US,2015-01-01,"[2, 5]"


In [937]:
# REMOVE NULLS AND DUPLICATE DATA
print(str(len(students)) + ' Total Students Enrolled')

#print(students.isnull().sum())
# 215 students who did not submit worksheets - dont know if they quit on the first day 
# or are enrolled and did not submit worksheets.. leave them out for now

print(str(len(students.dropna(how='any'))) + ' remaining students after nulls dropped ~5%')
students = students.dropna(how='any')

print(len(students))
print(str(len(students[students.duplicated()]))+' absolute duplicates in the data')
#print(str(len(students[students.duplicated(subset=['student_id'])])))
# no duplicates in data, very clean

4428 Total Students Enrolled
4204 remaining students after nulls dropped ~5%
4204
0 absolute duplicates in the data


In [938]:
# TIME ENROLLED FOR EACH STUDENT
td = (students['end_date'] - students['enroll_date'])
students["time_enrolled"] = (td / np.timedelta64(1, 'D')).astype(int)
#students.head()

In [939]:
#dealing with end dates of those who are still enrolled?
#to make it easy maybe eliminate students who are still enrolled? 
# and do long-term vs short-term enrollment


#last day they pulled data will depend on the center??
#probably need to get last day they pulled data from each camp? 
#- for now ill just look at 1 week bfore

dy = 7
t = students[students['end_date'] <= students.end_date.max()-timedelta(days=dy)]['time_enrolled']
iplot([Histogram(x=t)])

students = students[students['end_date'] <= students.end_date.max()-timedelta(days=dy)]
len(students)

2679

In [940]:
# PICK A BOUNDARY - LESS THAN THAT IS SHORT-TERM, MORE IS LONG-TERM

# WHat is the right boundary to define?


ndays = 120
students['short_term'] = 0
students.ix[students['time_enrolled'] <= ndays, 'short_term'] = 1


In [941]:
# First Feature: Gender - classify by gender - male or female, deal with unclean data
# there re entries in the gender column called: {u'@xml:space': u'preserve'} 
#- i have no idea if these are male or female, i think they were not filled out on the form 
# so they are left as blank - I will eliminate these

grouped = students.groupby('gender')
students = students.drop(grouped.get_group("{u'@xml:space': u'preserve'}").index) 
len(students)         

2672

In [942]:
# AGE

# there are students who are 50 plus years old.. is this correct data?
# are there too many features and should i group the features? - 28 features including the oldies
#  i can eliminate these students, to have fewer features, 
# but actually for now maybe include all then later eliminate to improve algorithm

ag = students.end_date.max() - students['date_of_birth']
students['age'] = (((ag / np.timedelta64(1, 'D')).astype(int))/365).astype(int)
print(students.groupby('age').size())
#print(len(students.groupby('age').size()))
#students[students['age'] == 54]

age
2       2
3       7
4      17
5      42
6      74
7     127
8     165
9     225
10    250
11    286
12    301
13    293
14    206
15    203
16    150
17    102
18     88
19     54
20     31
21     24
22     10
23      4
24      5
29      1
48      1
50      2
51      1
54      1
dtype: int64


In [943]:
# GRADE LEVEL

# do i group students with these 15 features - 1 through 15, 
# or do i split them up into elementary, middle and high school
# what do grade levels 1 through 21 mean
# these are probably way too many features - leave it be for now

students.groupby('grade_level').size()
students["time_enrolled"] = (td / np.timedelta64(1, 'D')).astype(int)


In [944]:
# NUMBER OF DAYS CAMP IS OPEN
# students['center_days'].count
# divide by 3 not subtract 4
students['days_open'] = (students['center_days'].str.len()/3).astype(int)
students.groupby('days_open').size()

days_open
2    1923
3     749
dtype: int64

In [945]:
# CITY CAMP IS IN
# looks good and clean


students.groupby('city_camp').size()

city_camp
Concord          121
Fairfield        107
Fresno            46
Los Alamitos     626
Mission Viejo    234
San Clemente      27
San Francisco    103
San Jose         221
San Rafael       354
Santa Clara       53
Stockton         410
Torrance          71
Victorville      113
Whittier         100
Windsor           86
dtype: int64

In [946]:
# CITY STUDENT IS IN - 
# maybe leave this out for now cuz there are 194 features..

len(students.groupby('city_student').size())
#students.head()

194

In [958]:
students.head()
trace = Scatter(x=students['city_camp'],y=students['time_enrolled'], mode='markers')
data = [trace]
iplot(data, filename='basic-scatter')
trace = Scatter(x=students['grade_level'],y=students['time_enrolled'], mode='markers')
trace = Scatter(x=students['age'],y=students['time_enrolled'], mode='markers')
trace = Scatter(x=students['gender'],y=students['time_enrolled'], mode='markers')


In [928]:
# LOGISTIC REGRESSION

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
students['grade_level'] = 'g' + students['grade_level'].astype(str)
students['age'] = 'a' + students['age'].astype(str)


#students['grade_level'] = students['grade_level'] - 23
#students['age'] = students['age'] + 50

feature_cols = ['city_camp', 'gender', 'days_open', 'grade_level', 'age']
x = students[feature_cols].reset_index()[feature_cols]
y = students['short_term'].reset_index()['short_term']

perm_dict = {}
perm_dict['city_camp'] = dict(zip( np.unique(x['city_camp']) ,range(len(np.unique(x['city_camp'])))))
perm_dict['gender'] = dict(zip( np.unique(x['gender']) ,range(len(np.unique(x['gender'])))))
perm_dict['days_open'] = dict(zip( np.unique(x['days_open']) ,range(len(np.unique(x['days_open'])))))
perm_dict['grade_level'] = dict(zip( np.unique(x['grade_level']) ,range(len(np.unique(x['grade_level'])))))
perm_dict['age'] = dict(zip( np.unique(x['age']) ,range(len(np.unique(x['age'])))))

x = x.replace(to_replace = perm_dict)
enc = OneHotEncoder()
x2= enc.fit_transform(x)
xnew = x2.todense()
print(x)
print(xnew)

      city_camp  gender  days_open  grade_level  age
0            14       1          0            3    4
1            14       1          0            3    5
2            14       0          0           18   26
3            14       0          0           18    1
4            14       1          0            8    6
5            14       1          0           19    1
6            14       1          0            1    5
7            14       0          0           18   26
8            14       1          0           20    1
9            14       0          0            1    1
10           14       0          0            6    3
11           14       0          0            3    8
12           14       0          0           17   26
13           14       0          0           19    0
14           14       0          0            5    4
15           14       1          0           19   27
16           14       0          0            2    2
17           14       0          0           2

In [929]:
logreg = linear_model.LogisticRegression()
X_train,X_test,y_train,y_test = train_test_split(xnew,y,test_size=0.2)
L = logreg.fit(X_train,y_train)
y_pred = logreg.predict(X_test)

print 'Accuracy of model on test set is ' + str(metrics.accuracy_score(y_test,y_pred)*100) +' %'
print('F1 scores are long-term/short-term' + str(f1_score(y_test, y_pred, average=None)))
print('Weighted F1 score is ' + str(f1_score(y_test, y_pred, average='weighted')))
print('Precision scores are lt-st' + str(precision_score(y_test, y_pred, average=None)))
print('Weighted Precision score is ' + str(precision_score(y_test, y_pred, average='weighted')))
print('Recall scores are lt-st' + str(recall_score(y_test, y_pred, average=None)))
print('Weighted Recall score is ' + str(recall_score(y_test, y_pred, average='weighted')))
print metrics.confusion_matrix(y_test, y_pred)
print ("Predicting everything as falling as long term will give accuracy of "+ str((len([x for x in y_test if x == 0])/float(len(y_test)))*100))

fimplr = logreg.coef_
len(fimplr.ravel())
len(names)

data = [Bar(x=names,y=abs(fimplr.ravel()))]
iplot(data, filename='basic-bar')

Accuracy of model on test set is 76.261682243 %
F1 scores are long-term/short-term[ 0.84025157  0.53818182]
Weighted F1 score is 0.749348337902
Precision scores are lt-st[ 0.79334917  0.64912281]
Weighted Precision score is 0.749946469165
Recall scores are lt-st[ 0.89304813  0.45962733]
Weighted Recall score is 0.76261682243
[[334  40]
 [ 87  74]]
Predicting everything as falling as long term will give accuracy of 69.9065420561


In [959]:
# LINEAR REGRESSION, VERY POOR - PROBABLY NEED MORE DATA

y = students['time_enrolled'].reset_index()['time_enrolled']


from sklearn import datasets, metrics
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
lm = LinearRegression()
x_train,x_test,y_train,y_test = train_test_split(xnew, y,test_size=0.2)

lm.fit(x_train, y_train)

print("Sum of squares error: %.2f"
      % np.mean((lm.predict(x_test) - y_test) ** 2))
print('R squared value: %.2f' % lm.score(x_test, y_test))
print("Median of test data: %.2f" % np.mean(y_test))
print("Median of prediction: %.2f" % np.median(lm.predict(x_test)))
print("Std dev of test data: %.2f" % np.std(y_test))
print("Std dev of prediction: %.2f" % np.std(lm.predict(x_test)))

Sum of squares error: 1865523674415529369534464.00
R squared value: -10857903163413223424.00
Median of test data: 383.77
Median of prediction: 337.00
Std dev of test data: 414.50
Std dev of prediction: 1363298318086.51


In [930]:
# RANDOM FORESTS

from sklearn.ensemble import RandomForestClassifier

names = []
names = perm_dict['city_camp'].keys()
names.extend(perm_dict['gender'].keys())
names.extend(perm_dict['days_open'].keys())
names.extend(perm_dict['grade_level'].keys())
names.extend(perm_dict['age'].keys())



rf = RandomForestClassifier(n_estimators=10)
X_train,X_test,y_train,y_test = train_test_split(xnew,y,test_size=0.2)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print 'Accuracy of model on test set is ' + str(metrics.accuracy_score(y_test,y_pred)*100) +' %' + '\n'
print metrics.confusion_matrix(y_test, y_pred)
print ("Predicting everything as falling as long term will still give accuracy of "+ str((len([x for x in y_test if x == 0])/float(len(y_test)))*100))


print(precision_score(y_test, y_pred, average=None))
#print(recall_score(y_test, y_pred, average=None))  
#print('F1 scores are' + str(f1_score(y_test, y_pred, average=None)))
print('Weighted F1 score is ' + str(f1_score(y_test, y_pred, average='weighted')))
#print('Precision scores are' + str(precision_score(y_test, y_pred, average=None)))
print('Weighted Precision score is ' + str(precision_score(y_test, y_pred, average='weighted')))
#print('Recall scores are' + str(recall_score(y_test, y_pred, average=None)))
print('Weighted Recall score is ' + str(recall_score(y_test, y_pred, average='weighted'))) 





Accuracy of model on test set is 69.1588785047 %

[[303  62]
 [103  67]]
Predicting everything as falling as long term will still give accuracy of 68.2242990654
[ 0.74630542  0.51937984]
Weighted F1 score is 0.678643832716
Weighted Precision score is 0.674198227058
Weighted Recall score is 0.691588785047


In [931]:
fimp = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_),names),reverse=True)
print(fimp[:10])
trace0 = Bar(x=[x[1] for x in fimp[:10]],y=[x[0] for x in fimp[:10]],marker=dict(color='rgb(26, 118, 255)',line=dict(color='rgb(8,48,107)',width=1,)),opacity=0.7)
data = [trace0]
layout = Layout(xaxis=dict(title = 'Features', tickangle=-45, tickfont=dict(size=12,color='rgb(107, 107, 107)')),
                 yaxis=dict(title='Mean Decrease Impurity Score',titlefont=dict(size=16,color='rgb(107, 107, 107)'), tickfont=dict(size=14,color='rgb(107, 107, 107)')),
                title='Feature Importance of Top 20 Features',
                barmode='group',bargap=0.4,bargroupgap=0.1)
fig16 = Figure(data=data, layout=layout)
iplot(fig16, filename='feature-importance-mean-impurity-bar')



[(0.04, 'F'), (0.0379, 'M'), (0.0363, 'g13'), (0.0338, 'a21'), (0.0326, 'San Francisco'), (0.0321, 'a23'), (0.0282, 'g12'), (0.0282, 'a48'), (0.0275, 'g11'), (0.0274, 'a20')]



unorderable dtypes; returning scalar but in the future this will be an error



In [932]:
# plot feature vs response

# plot feature vs feature

# try linear regression?

# understand what coefficients mean?

In [933]:
studentsold.head()

Unnamed: 0,camp_id,student_id,gender,date_of_birth,enroll_date,end_date,grade_level,city_student,country_student,zip_student,...,city_camp,zip_camp,"state_camp,",country_camp,open_date,center_days,time_enrolled,short_term,age,days_open
0,2824,61913,M,2002-01-22,2015-09-01,2015-11-30,g12,Santa Rosa,USA,95409,...,Windsor,95492,CA,US,2015-01-01,"[2, 5]",90,1,a14,2
2,2824,61938,M,2000-12-13,2015-01-01,2015-08-30,g12,Santa Rosa,USA,95404,...,Windsor,95492,CA,US,2015-01-01,"[2, 5]",241,0,a15,2
4,2824,61981,F,2008-01-21,2016-03-04,2016-05-28,g7,Santa Rosa,USA,95409,...,Windsor,95492,CA,US,2015-01-01,"[2, 5]",85,1,a8,2
5,2824,61991,F,2004-09-25,2016-03-04,2016-05-26,g7,Santa Rosa,USA,95409,...,Windsor,95492,CA,US,2015-01-01,"[2, 5]",83,1,a11,2
9,2824,62074,M,2000-06-28,2015-05-01,2015-09-27,g17,Rohnert Park,USA,94928,...,Windsor,95492,CA,US,2015-01-01,"[2, 5]",149,0,a16,2


In [934]:
students.groupby('camp_id').index.get_level_values('end_date')
#['end_date']#.size()
students.end_date.max()

#assume last day of worksheet submission is last day



#t = students[students['end_date'] != students.end_date.max()]['time_enrolled']

trace1 = Histogram(x=t,opacity=0.75)
trace2 = Histogram(x=students['time_enrolled'],opacity=0.75)
data1 = [trace1, trace2]
layout1 = Layout(barmode='overlay')
fig1 = Figure(data=data1, layout=layout1)
iplot(fig1)

# 2 distributions look somewhat similar, should be good for now but not sure..

students = students[students['end_date'] <= students.end_date.max()-timedelta(days=7)]

#students['city'] = students['city'].str.upper()
#students.groupby('city').size()

#clean cities






#students.head()


students[students['camp_city'] == 'San Francisco']

# students who quit within 3-4 months vs those who stayed longer.. ~1000 points

# or students who quit within a year

#are students bound to stay if their grade increases

# are they bound to stay based on their neighborhood

# should i do by state or restrict to CA

Improvements:
    
    Add scores
    #scores = pd.read_csv('ScoresCA.csv')

SyntaxError: invalid syntax (<ipython-input-934-e1d1f2b5602c>, line 47)

In [None]:
# USER SEGMENT AND 2 lr - better models and telling features

# clean up city they are from - 150 cities - cluster around neighborhoods

# RF for feature importance

# decision trees

# toddler / teen / 
# importance of features as a whole