In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.tree as tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import clean
import classifiers
from sklearn.preprocessing import LabelEncoder
%reload_ext autoreload
%autoreload 2

### Reading data

In [8]:
data = pd.read_csv('data/data_set.csv')

### Selecting columns to keep in the data

In [9]:
# Columns of the raw data set that are kept for the analysis.
keep_features = [
    'school_zip',
    'teacher_teach_for_america',
    'primary_focus_subject',
    'resource_type',
    'poverty_level',
    'grade_level',
    'total_price_including_optional_support',
    'students_reached',
    'date_posted',
    'fully_funded',
]

In [10]:
# Keep only the columns named in keep_features (rebinding `data`), then
# preview the first rows of the result.
data = clean.select_features(data, keep_features)
data.head()

Unnamed: 0,school_zip,teacher_teach_for_america,primary_focus_subject,resource_type,poverty_level,grade_level,total_price_including_optional_support,students_reached,date_posted,fully_funded
0,53216,f,Literacy,Supplies,highest poverty,Grades PreK-2,160.66,22.0,12/31/2012,t
1,43211,t,Mathematics,Technology,highest poverty,Grades 6-8,614.62,90.0,12/31/2012,t
2,19720,f,Health & Life Science,Technology,highest poverty,Grades 9-12,626.38,100.0,12/31/2012,f
3,60645,f,Literacy,Supplies,high poverty,Grades PreK-2,882.06,32.0,12/31/2012,t
4,98108,f,Special Needs,Supplies,highest poverty,Grades 3-5,217.94,40.0,12/31/2012,t


In [11]:
# Presumably drops rows containing missing values ('any' is forwarded to the helper).
# NOTE(review): the return value is discarded, so this only has an effect if
# clean.drop_na mutates `data` in place — confirm. Otherwise assign the result,
# as is done for clean.select_features and clean.dummirize.
clean.drop_na(data, 'any')

### Transformations
'poverty_level' and 'grade_level' are encoded as numeric category codes; 'primary_focus_subject' and 'resource_type' are expanded into dummy variables; and 'teacher_teach_for_america' and 'fully_funded' are converted to binary values (0 for f, 1 for t).

In [12]:
clean.labels_to_numeric(data, 'poverty_level')

In [13]:
clean.labels_to_numeric(data, 'grade_level')

In [14]:
data = clean.dummirize(data, ['primary_focus_subject', 'resource_type'])

In [15]:
clean.binarize(data, 'teacher_teach_for_america', 't' )

In [16]:
clean.binarize(data, 'fully_funded', 't' )

In [17]:
data.head()

Unnamed: 0,school_zip,teacher_teach_for_america,poverty_level,grade_level,total_price_including_optional_support,students_reached,date_posted,fully_funded,primary_focus_subject_Applied Sciences,primary_focus_subject_Character Education,...,primary_focus_subject_Social Sciences,primary_focus_subject_Special Needs,primary_focus_subject_Sports,primary_focus_subject_Visual Arts,resource_type_Books,resource_type_Other,resource_type_Supplies,resource_type_Technology,resource_type_Trips,resource_type_Visitors
0,53216,0,1,3,160.66,22.0,12/31/2012,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,43211,1,1,1,614.62,90.0,12/31/2012,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,19720,0,1,2,626.38,100.0,12/31/2012,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,60645,0,0,3,882.06,32.0,12/31/2012,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,98108,0,1,0,217.94,40.0,12/31/2012,1,0,0,...,0,1,0,0,0,0,1,0,0,0


### Temporal validation split

In [18]:
# Build the temporal train/test splits from the cut-off dates below.
# 'fully_funded' and 'date_posted' are presumably the label and date columns
# (inferred from their names — confirm in classifiers).
t = classifiers.temporal_validation_split(
    data,
    ['2011-01-01', '2011-07-01', '2012-01-01', '2012-07-01', '2013-01-01'],
    'fully_funded',
    'date_posted',
)

In [19]:
len(t)

12

In [20]:
# Split 1: bind entries 0-3 of t to the train/test targets and features.
(y_train1, x_train1, y_test1, x_test1) = (t[0], t[1], t[2], t[3])

In [21]:
# Split 2: bind entries 4-7 of t to the train/test targets and features.
(y_train2, x_train2, y_test2, x_test2) = (t[4], t[5], t[6], t[7])

In [22]:
# Split 3: bind entries 8-11 of t to the train/test targets and features.
(y_train3, x_train3, y_test3, x_test3) = (t[8], t[9], t[10], t[11])

### Trying different methods and parameters for:
* Testing period: second semester 2011
* Training period: first semester 2011

In [23]:
results_dict1 = {}

knn parameters:

In [24]:
# k-NN grid: neighbourhood sizes 5, 10, ..., 25 and the two weighting schemes.
k_list = list(range(5, 30, 5))
weights_list = ['uniform', 'distance']

Running models with different parameters:

In [25]:
classifiers.knn_dif_models(k_list, weights_list, results_dict1, x_train1, y_train1, x_test1)

Decision tree parameters:

In [26]:
# Decision-tree grid: split criterion, splitter strategy and maximum depth.
criterion_list = [
    'gini',
    'entropy',
]
splitter_list = [
    'best',
    'random',
]
max_depth_list = [
    2,
    5,
    10,
    15,
]

Running models with different parameters:

In [27]:
classifiers.dtree_dif_models(results_dict1, criterion_list, splitter_list, max_depth_list, x_train1, y_train1, x_test1)

Logistic regression parameters:

In [28]:
# Logistic-regression grid: candidate C values (presumably the inverse
# regularisation strength) and penalty types.
# NOTE(review): since scikit-learn 0.22 the default solver ('lbfgs') rejects
# penalty='l1'. Confirm classifiers.logistic_dif_models selects a compatible
# solver (e.g. 'liblinear' or 'saga') for the 'l1' runs.
c_list = [1, 3, 5, 7, 9]
penalty_list = ['l1', 'l2']

Running models with different parameters:

In [29]:
classifiers.logistic_dif_models(results_dict1, c_list, penalty_list, x_train1, y_train1, x_test1)

Random Forest parameters:

In [30]:
# Random-forest grid: number of trees, split criterion and the number of
# features considered at each split.
# 'sqrt' replaces 'auto': for a random-forest classifier both mean
# sqrt(n_features), but 'auto' was deprecated in scikit-learn 1.1 and removed
# in 1.3. Any result label built from this value will now read 'sqrt'.
n_stimators_list = [25, 50, 75, 100]
criterion_list = ['gini', 'entropy']
max_features_list = ['sqrt', 'log2']

Running models with different parameters:

In [31]:
classifiers.rf_dif_models(results_dict1, n_stimators_list, criterion_list, max_features_list, x_train1, y_train1, x_test1)

Bagging parameters:

In [32]:
# Bagging grid: ensemble sizes and max_features values.
# NOTE(review): this rebinds max_features_list, which holds strings in the
# random-forest parameter cell and integers here. Re-running the random-forest
# model cell after this one would silently pick up [1, 2, 3]; a dedicated name
# would avoid that.
n_estimators_list = [25, 50, 75, 100]
max_features_list = [1,2,3]

Running models with different parameters:

In [33]:
classifiers.bag_dif_models(results_dict1, n_estimators_list, max_features_list, x_train1, y_train1, x_test1)

Boosting parameters:

In [34]:
# Boosting grid: ensemble sizes 25, 50, 75 and 100.
n_estimators_list = list(range(25, 101, 25))

Running models with different parameters:

In [35]:
classifiers.boost_dif_models(results_dict1, n_estimators_list, x_train1, y_train1, x_test1)

### Results for:
* Testing period: second semester 2011
* Training period: first semester 2011

In [69]:
# Build the results table for split 1 from y_test1 and the contents of
# results_dict1, then display it.
# NOTE(review): the saved output lists only the knn rows and has a duplicated
# 'Precision top 20%' header where 'top 30%' would be expected (compare the
# Recall columns). Re-run after all model cells and check the column labels
# in classifiers.present_results_simp.
final_results1 = classifiers.present_results_simp(y_test1, results_dict1)
final_results1

Unnamed: 0,Model,Accuracy,Precision,Precision top 1%,Precision top 2%,Precision top 5%,Precision top 10%,Precision top 20%,Precision top 20%.1,Precision top 50%,Recall,Recall top 1%,Recall top 2%,Recall top 5%,Recall top 10%,Recall top 20%,Recall top 30%,Recall top 50%,F 1
0,"knn: 5, uniform",0.627275,0.669890,0.726244,0.734375,0.710164,0.705585,0.694788,0.684752,0.669455,0.794437,0.806533,0.840358,0.840778,0.850742,0.860543,0.857299,0.845091,0.726867
1,"knn: 5, distance",0.621693,0.661991,0.694570,0.716518,0.690635,0.687115,0.681542,0.672355,0.658331,0.792834,0.807895,0.839216,0.839266,0.850572,0.859274,0.854337,0.843125,0.721529
2,"knn: 10, uniform",0.611025,0.622044,0.669683,0.690848,0.664447,0.658971,0.647883,0.636695,0.622024,0.808392,0.829132,0.851444,0.848639,0.858247,0.868685,0.864541,0.855255,0.703080
3,"knn: 10, distance",0.638102,0.683940,0.712670,0.742188,0.718154,0.715919,0.705972,0.695617,0.682759,0.798340,0.805627,0.837531,0.838777,0.851910,0.862334,0.857592,0.846387,0.736725
4,"knn: 15, uniform",0.652367,0.706486,0.757919,0.771205,0.756325,0.753958,0.739739,0.728360,0.710968,0.800526,0.813107,0.839611,0.843564,0.852985,0.864155,0.859997,0.849992,0.750572
5,"knn: 15, distance",0.647529,0.698540,0.719457,0.748884,0.737239,0.736148,0.726384,0.716255,0.700690,0.800005,0.798995,0.831475,0.839313,0.852125,0.864229,0.859769,0.849453,0.745838
6,"knn: 20, uniform",0.640884,0.676257,0.721719,0.738839,0.727474,0.729332,0.713138,0.702326,0.680178,0.807396,0.822165,0.844388,0.845281,0.857993,0.869934,0.867189,0.857616,0.736030
7,"knn: 20, distance",0.652650,0.705912,0.742081,0.766741,0.750555,0.753078,0.737459,0.724641,0.709143,0.801266,0.805897,0.835766,0.841294,0.855822,0.866548,0.861465,0.851389,0.750573
8,"knn: 25, uniform",0.660979,0.719746,0.755656,0.779018,0.766977,0.771768,0.755917,0.746810,0.725295,0.802022,0.814634,0.840964,0.840058,0.854015,0.866675,0.863054,0.853054,0.758660
9,"knn: 25, distance",0.656088,0.711130,0.746606,0.773438,0.751886,0.757256,0.741694,0.731642,0.713637,0.801916,0.812808,0.838983,0.841112,0.855440,0.866548,0.861498,0.851282,0.753799


In [None]:
# Optional export of the split-1 results table (left disabled). Prefer a path
# relative to the project over a user-specific absolute one, e.g.:
# final_results1.to_csv('results/results1.csv')

### Trying different methods and parameters for:
* Testing period: first semester 2012
* Training period: 2011

In [37]:
results_dict2 = {}

knn parameters:

In [38]:
# k-NN grid for the second split: k = 5, 10, ..., 25 and both weighting schemes.
k_list = list(range(5, 30, 5))
weights_list = ['uniform', 'distance']

Running models with different parameters:

In [39]:
classifiers.knn_dif_models(k_list, weights_list, results_dict2, x_train2, y_train2, x_test2)

Decision tree parameters:

In [40]:
# Decision-tree grid for the second split: criterion, splitter and depth.
criterion_list = [
    'gini',
    'entropy',
]
splitter_list = [
    'best',
    'random',
]
max_depth_list = [
    2,
    5,
    10,
    15,
]

Running models with different parameters:

In [41]:
classifiers.dtree_dif_models(results_dict2, criterion_list, splitter_list, max_depth_list, x_train2, y_train2, x_test2)

Logistic regression parameters:

In [42]:
# Logistic-regression grid for the second split: candidate C values and penalty types.
# NOTE(review): penalty='l1' needs a solver that supports it ('liblinear' or
# 'saga'); the scikit-learn >= 0.22 default ('lbfgs') does not. Confirm the
# solver used inside classifiers.logistic_dif_models.
c_list = [1, 3, 5, 7, 9]
penalty_list = ['l1', 'l2']

Running models with different parameters:

In [43]:
classifiers.logistic_dif_models(results_dict2, c_list, penalty_list, x_train2, y_train2, x_test2)

Random Forest parameters:

In [44]:
# Random-forest grid for the second split: number of trees, split criterion
# and features considered per split.
# 'sqrt' is used instead of 'auto': the two are equivalent for a random-forest
# classifier, but 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
n_stimators_list = [25, 50, 75, 100]
criterion_list = ['gini', 'entropy']
max_features_list = ['sqrt', 'log2']

Running models with different parameters:

In [46]:
classifiers.rf_dif_models(results_dict2, n_stimators_list, criterion_list, max_features_list, x_train2, y_train2, x_test2)

Bagging parameters:

In [47]:
# Bagging grid for the second split: ensemble sizes and max_features values.
# NOTE(review): max_features_list is rebound from the string values set in the
# random-forest parameter cell to integers. Cell execution order therefore
# matters for any later use of that name.
n_estimators_list = [25, 50, 75, 100]
max_features_list = [1,2,3]

Running models with different parameters:

In [48]:
classifiers.bag_dif_models(results_dict2, n_estimators_list, max_features_list, x_train2, y_train2, x_test2)

Boosting parameters:

In [49]:
# Boosting grid for the second split: ensemble sizes 25, 50, 75 and 100.
n_estimators_list = list(range(25, 101, 25))

Running models with different parameters:

In [50]:
classifiers.boost_dif_models(results_dict2, n_estimators_list, x_train2, y_train2, x_test2)

### Results for:
* Testing period: first semester 2012
* Training period: 2011

In [68]:
# Build the results table for split 2 from y_test2 and the contents of
# results_dict2, then display it.
# NOTE(review): the saved output shows only knn rows and repeats the
# 'Precision top 20%' header. Re-run after all model cells and verify the
# column labels produced by classifiers.present_results_simp.
final_results2 = classifiers.present_results_simp(y_test2, results_dict2)
final_results2

Unnamed: 0,Model,Accuracy,Precision,Precision top 1%,Precision top 2%,Precision top 5%,Precision top 10%,Precision top 20%,Precision top 20%.1,Precision top 50%,Recall,Recall top 1%,Recall top 2%,Recall top 5%,Recall top 10%,Recall top 20%,Recall top 30%,Recall top 50%,F 1
0,"knn: 5, uniform",0.645821,0.807896,0.760753,0.763441,0.765494,0.757390,0.755927,0.754809,0.765431,0.715425,0.735065,0.763441,0.736305,0.733657,0.749366,0.751730,0.749830,0.758854
1,"knn: 5, distance",0.642084,0.797922,0.741935,0.751344,0.755444,0.743726,0.743939,0.743587,0.756575,0.715803,0.724409,0.762619,0.736527,0.731888,0.749695,0.750270,0.749775,0.754635
2,"knn: 10, uniform",0.646032,0.794365,0.755376,0.747312,0.745394,0.741495,0.737338,0.737264,0.745799,0.720914,0.735602,0.762689,0.740022,0.739433,0.754098,0.759380,0.756698,0.755859
3,"knn: 10, distance",0.654311,0.826901,0.801075,0.802419,0.791178,0.777468,0.774111,0.775205,0.784529,0.715959,0.726829,0.760510,0.731921,0.731759,0.748210,0.752290,0.750383,0.767442
4,"knn: 15, uniform",0.665331,0.856186,0.819892,0.817204,0.820771,0.815393,0.811827,0.809227,0.815844,0.714949,0.736715,0.756219,0.731707,0.730452,0.748975,0.753275,0.751647,0.779219
5,"knn: 15, distance",0.661115,0.845156,0.798387,0.797043,0.802345,0.795315,0.795393,0.793908,0.803521,0.715263,0.729730,0.759283,0.731298,0.729972,0.748131,0.752427,0.750922,0.774803
6,"knn: 20, uniform",0.659582,0.839099,0.782258,0.791667,0.797878,0.786949,0.786773,0.784735,0.792531,0.716132,0.725686,0.749364,0.732070,0.732606,0.750289,0.753721,0.751898,0.772754
7,"knn: 20, distance",0.664258,0.853686,0.809140,0.815860,0.811837,0.800892,0.804014,0.802458,0.810723,0.714917,0.725301,0.760652,0.728822,0.730417,0.748183,0.751084,0.749951,0.778163
8,"knn: 25, uniform",0.666731,0.866633,0.836022,0.833333,0.833054,0.820970,0.823545,0.820805,0.826620,0.712449,0.731765,0.751515,0.726034,0.725481,0.744974,0.749329,0.748551,0.782014
9,"knn: 25, distance",0.667133,0.861326,0.836022,0.838710,0.833613,0.816509,0.815329,0.812344,0.819792,0.714664,0.735225,0.760049,0.729717,0.729447,0.748670,0.751628,0.750195,0.781171


### Trying different methods and parameters for:
* Testing period: second semester 2012
* Training period: 2011 and first semester 2012

In [51]:
results_dict3 = {}

knn parameters:

In [52]:
# k-NN grid for the third split: k = 5, 10, ..., 25 and both weighting schemes.
k_list = list(range(5, 30, 5))
weights_list = ['uniform', 'distance']

Running models with different parameters:

In [53]:
classifiers.knn_dif_models(k_list, weights_list, results_dict3, x_train3, y_train3, x_test3)

Decision tree parameters:

In [54]:
# Decision-tree grid for the third split: criterion, splitter and depth.
criterion_list = [
    'gini',
    'entropy',
]
splitter_list = [
    'best',
    'random',
]
max_depth_list = [
    2,
    5,
    10,
    15,
]

Running models with different parameters:

In [55]:
classifiers.dtree_dif_models(results_dict3, criterion_list, splitter_list, max_depth_list, x_train3, y_train3, x_test3)

Logistic regression parameters:

In [56]:
# Logistic-regression grid for the third split: candidate C values and penalty types.
# NOTE(review): 'l1' is only accepted by some solvers ('liblinear', 'saga').
# Confirm classifiers.logistic_dif_models does not rely on the 'lbfgs'
# default of scikit-learn >= 0.22.
c_list = [1, 3, 5, 7, 9]
penalty_list = ['l1', 'l2']

Running models with different parameters:

In [57]:
classifiers.logistic_dif_models(results_dict3, c_list, penalty_list, x_train3, y_train3, x_test3)

Random Forest parameters:

In [58]:
# Random-forest grid for the third split: number of trees, split criterion
# and features considered per split.
# 'sqrt' is used instead of 'auto': the two are equivalent for a random-forest
# classifier, but 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
n_stimators_list = [25, 50, 75, 100]
criterion_list = ['gini', 'entropy']
max_features_list = ['sqrt', 'log2']

Running models with different parameters:

In [60]:
classifiers.rf_dif_models(results_dict3, n_stimators_list, criterion_list, max_features_list, x_train3, y_train3, x_test3)

Bagging parameters:

In [61]:
# Bagging grid for the third split: ensemble sizes and max_features values.
# NOTE(review): max_features_list is rebound from the string values set in the
# random-forest parameter cell to integers. Cell execution order therefore
# matters for any later use of that name.
n_estimators_list = [25, 50, 75, 100]
max_features_list = [1,2,3]

Running models with different parameters:

In [62]:
classifiers.bag_dif_models(results_dict3, n_estimators_list, max_features_list, x_train3, y_train3, x_test3)

Boosting parameters:

In [63]:
# Boosting grid for the third split: ensemble sizes 25, 50, 75 and 100.
n_estimators_list = list(range(25, 101, 25))

Running models with different parameters:

In [64]:
classifiers.boost_dif_models(results_dict3, n_estimators_list, x_train3, y_train3, x_test3)

### Results for:
* Testing period: second semester 2012
* Training period: 2011 and first semester 2012

In [67]:
# Build the results table for split 3 from y_test3 and the contents of
# results_dict3, then display it.
# NOTE(review): the saved output shows only knn rows and repeats the
# 'Precision top 20%' header. Re-run after all model cells and verify the
# column labels produced by classifiers.present_results_simp.
final_results3 = classifiers.present_results_simp(y_test3, results_dict3)
final_results3

Unnamed: 0,Model,Accuracy,Precision,Precision top 1%,Precision top 2%,Precision top 5%,Precision top 10%,Precision top 20%,Precision top 20%.1,Precision top 50%,Recall,Recall top 1%,Recall top 2%,Recall top 5%,Recall top 10%,Recall top 20%,Recall top 30%,Recall top 50%,F 1
0,"knn: 5, uniform",0.670390,0.779050,0.824131,0.824365,0.792229,0.758954,0.751772,0.751092,0.759709,0.777225,0.802789,0.800789,0.788360,0.783634,0.773370,0.763576,0.776603,0.778137
1,"knn: 5, distance",0.663407,0.766672,0.803681,0.805076,0.773006,0.743104,0.741976,0.740451,0.749162,0.776767,0.802041,0.801820,0.784884,0.781216,0.773157,0.762703,0.775627,0.771687
2,"knn: 10, uniform",0.666906,0.758702,0.807771,0.807107,0.770143,0.731577,0.727074,0.725300,0.734853,0.785124,0.807771,0.803030,0.797206,0.793127,0.782087,0.774650,0.785673,0.771687
3,"knn: 10, distance",0.683226,0.802632,0.844581,0.844670,0.817178,0.777275,0.777407,0.777167,0.785103,0.777585,0.794231,0.796935,0.788166,0.783240,0.773297,0.765036,0.777578,0.789910
4,"knn: 15, uniform",0.697315,0.834782,0.873211,0.871066,0.847035,0.811857,0.809296,0.808104,0.815377,0.774724,0.798131,0.792976,0.787752,0.783006,0.770819,0.762789,0.774808,0.803632
5,"knn: 15, distance",0.692288,0.822301,0.856851,0.859898,0.839673,0.802388,0.797832,0.796054,0.803094,0.776242,0.795066,0.792329,0.788100,0.784464,0.772865,0.764742,0.776192,0.798608
6,"knn: 20, uniform",0.691738,0.817770,0.873211,0.860914,0.836401,0.797448,0.790329,0.787033,0.796352,0.778076,0.795158,0.789572,0.791103,0.785323,0.773325,0.766033,0.777540,0.797429
7,"knn: 20, distance",0.699042,0.835626,0.858896,0.865990,0.847444,0.811651,0.808670,0.808598,0.815997,0.775967,0.787992,0.789815,0.787234,0.783585,0.771986,0.764220,0.776130,0.804693
8,"knn: 25, uniform",0.705872,0.853524,0.899796,0.902538,0.873620,0.832030,0.827949,0.825159,0.833492,0.773489,0.785714,0.788820,0.785872,0.780158,0.768450,0.761017,0.772708,0.811538
9,"knn: 25, distance",0.702495,0.842897,0.881391,0.880203,0.860123,0.819885,0.816278,0.813601,0.822367,0.775598,0.792279,0.792505,0.788526,0.784364,0.771420,0.762801,0.774842,0.807848
