In [1]:
import pandas as pd
import numpy as np
import copy
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GMM
from matplotlib.colors import LogNorm
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.pyplot as plt


# for Kaggle

In [None]:
#lr_click_prob = lr.predict_proba(mini_dev[mini_dev.columns[11:]].fillna(0))[:,1]

## Create Temporary Data Subset

In [2]:
click_train_data= pd.read_csv("clicks_train_features.csv")
del click_train_data["Unnamed: 0"]

In [7]:
#make a smaller random sample so that notebook runs easily and grid searches etc don't take so long
mini_click =click_train_data.sample(7000, random_state =0)
mini_train = mini_click[:5000]
mini_dev = mini_click[5000:]
mini_train_labels = mini_train["clicked"]
mini_dev_labels = mini_dev["clicked"]
del mini_train["clicked"]
del mini_dev["clicked"]

In [28]:
#select only binary and continuous features
#eliminate missing data
mini_train = mini_train[mini_train.columns[11:]].fillna(0)
mini_dev = mini_dev[mini_dev.columns[11:]].fillna(0)

# Regression

**Rationale:** 

**Challenges:** In order to make the data appropriate for regression we had to create binary (one-hot) variables to represent man

**Approach:** We first fit a logistic regression model without regularization, using all the binarized features (representing document category, topic etc) and continuous features reprenting the percent of ads from that ad campaign that had been clicked, the percent of ads from that advertiser that had been clicked, the frequency of document views, and the percentage of that ad that had been clicked.

Then we use grid search to identify the optimal level of regularization and whether l2 or l1 regularization would perform better. We expected l1 regularization to be superior because it more harshly penalizes very low weighted features. 

Finally, we tried exercising our domain knowledge/judgement and running a regression with only the four predictors we expected to be most predictive, which were the percent of ads from that ad campaign that had been clicked, the percent of ads from that advertiser that had been clicked, the frequency of document views, and the percentage of that ad that had been clicked. 

In [56]:
#with all regressable features
lr = LogisticRegression()
lr.fit(mini_train, mini_train_labels)
lr_score_1 = lr.score(mini_dev, mini_dev_labels)
print("The accuracy is", lr_score_1, "using all possible features")
print("\n")
lr = LogisticRegression()
C = {"C": [.0001, .01, .05, 0.1, .15, .2, .3, .5, 1 ,2, 3, 4, 5, 7, 10, 15, 20, 100, 1000], 'penalty':["l1", "l2"]}
search = GridSearchCV(lr, param_grid = C)
lr_params = search.fit(mini_train, mini_train_labels)
lr_best = lr_params.best_params_
lr = LogisticRegression(C= lr_best["C"], penalty = lr_best["penalty"])
lr.fit(mini_train, mini_train_labels)
lr_score_2 = lr.score(mini_dev, mini_dev_labels)
print("The accuracy is", lr_score_2, "using regularization with the best parameters idenfitied by grid search.")
print ("These parameters are:", lr_best["penalty"], 'penalty and',lr_best["C"], 'C.')
print ("This leaves", np.count_nonzero(lr.coef_), "non-zero parameters.")
features = []
for i in range(len(mini_train.columns)):
    if lr.coef_[0][i] !=0:
        features.append(mini_train.columns[i])
print ("These features are", features)
print("\n")
lr = LogisticRegression()
lr.fit(mini_train[mini_train.columns[-4:]], mini_train_labels)
lr_score_3 = lr.score(mini_dev[mini_train.columns[-4:]], mini_dev_labels)
print("The accuracy is", lr_score_3, "using the top 'logically chosen' features.")
print("\n")

The accuracy is 0.8455 using all possible features


The accuracy is 0.8465 using regularization with the best parameters idenfitied by grid search.
These parameters are: l1 penalty and 0.1 C.
This leaves 3 non-zero parameters.
These features are ['campaign_perc', 'docx_view_freq', 'click_perc']


The accuracy is 0.85 using the top 'logically chosen' features.




So, using the four features chosen for their strongest logical predictive power yields the highest accuracy, although the difference is very small. Regularization choses 3 out of the four chosen "logically". 

# Decision Trees and Random Forest

**Rationale:** A random forest algorithm seemed like the perfect technique to approach this since it is a classification problem (ads are either clicked or not clicked) and we had a very large number of features, many of which are likely irrelevant. Random Forest models address decision trees issues with over fitting by averaging the results of many different decision trees trained on different sub samples of the data. This increases variance but decreases bias, leading to more accurate prediction on test data. 

**Challenges:**

**Approach:** We trained first a standard decision tree, and then trained a random forest model using grid search to optimize the maximum number of features the trees would use, the number of trees, and the information gain criterion (either gini or entropy). We then scored the model using the best features. 
We also then fit an extra trees model, which tends towards a larger number of leaves, using the same grid search parameters as with the random forest. 

In [None]:
#this may be overfit so not good in the longterm
dec_tree = tree.DecisionTreeClassifier()
dec_tree = dec_tree.fit(mini_train, mini_train_labels)
print("A single decision tree got an accuracy of", dec_tree.score(mini_dev, mini_dev_labels))
parameters = {'n_estimators':[50, 100, 400, 500], 'criterion': ["gini", "entropy"]}
forest = RandomForestClassifier(random_state = 1)
search = GridSearchCV(forest, param_grid = parameters, scoring = "f1_micro")
forest_params = search.fit(mini_train, mini_train_labels)
forest_best = forest_params.best_params_
forest = RandomForestClassifier(random_state = 1, max_features = auto, n_estimators = forest_best['n_estimators'], criterion = forest_best['criterion'])
forest.fit(mini_train, mini_train_labels)
forest.score(mini_dev, mini_dev_labels)
print("A random forest with optimal parameters got an accuracy of", forest.score(mini_dev, mini_dev_labels))
print("With", forest_best['n_estimators'], 'trees and', forest_best['criterion'], "as the criterion")
parameters = {'n_estimators':[50, 100, 400, 500], 'criterion': ["gini", "entropy"]}
extrees = ExtraTreesClassifier(random_state = 1)
search = GridSearchCV(extrees, param_grid = parameters, scoring = "f1_micro")
extrees_params = search.fit(mini_train, mini_train_labels)
extrees_best = extrees_params.best_params_
trees = ExtraTreesClassifier(random_state = 1, max_features = 'auto', n_estimators = forest_best['n_estimators'], criterion = forest_best['criterion'])
extrees.fit(mini_train, mini_train_labels)
extrees.score(mini_dev, mini_dev_labels)
print("A random forest with optimal parameters got an accuracy of", extrees.score(mini_dev, mini_dev_labels))
print("With", extrees_best['n_estimators'], 'trees, ', extrees_best['criterion'], "as the criterion, and", extrees_best['max_features'], "as the maximum number of features.")

A single decision tree got an accuracy of 0.81


# Sarah's scratch working, don't delete yet but don't run

In [99]:
dec_tree = tree.DecisionTreeClassifier()
#Baseline model with just the "fabulous four" predictors
dec_tree = dec_tree.fit(mini_train[mini_train.columns[-4:]].fillna(0), mini_train_labels)
dec_tree.score(mini_dev[mini_dev.columns[-4:].fillna(0)], mini_dev_labels)

0.81000000000000005

In [39]:
#RandomForest works well
parameters = {'n_estimators':[10, 30, 40, 100, 400, 500, 700, 1000], 'criterion': ["gini", "entropy"]}
forest = RandomForestClassifier(random_state = 1)
search = GridSearchCV(forest, param_grid = parameters, scoring = "f1_micro" )
forest_params = search.fit(mini_train[mini_train.columns[-4:]].fillna(0), mini_train_labels)
forest_best = forest_params.best_params_
print(forest_best)

{'n_estimators': 400, 'criterion': 'entropy'}


In [42]:
forest = RandomForestClassifier(random_state = 0, n_estimators = forest_best['n_estimators'], criterion = forest_best['criterion'])
forest.fit(mini_train, mini_train_labels)
forest.score(mini_dev, mini_dev_labels)

0.82099999999999995

In [108]:
#RandomForest works well
parameters = {'n_estimators':[5, 10, 15, 20, 25, 30, 40], 'criterion': ["gini", "entropy"], 'max_features': [1, 3, 5, 10, 20, 40, 100, 200, 300]}
forest = RandomForestClassifier(random_state = 1)
search = GridSearchCV(forest, param_grid = parameters, scoring = "f1_micro" )
forest_params = search.fit(mini_train[mini_train.columns[11:]].fillna(0), mini_train_labels)
forest_best = forest_params.best_params_
print(forest_best)

{'n_estimators': 40, 'max_features': 200, 'criterion': 'entropy'}


In [41]:
forest = RandomForestClassifier(random_state = 1, n_estimators = 400, criterion = 'gini')
forest.fit(mini_train[mini_train.columns[11:]].fillna(0), mini_train_labels)
forest.score(mini_dev[mini_dev.columns[11:]].fillna(0), mini_dev_labels)

0.82250000000000001

In [84]:
parameters = {'n_estimators':[5, 10, 15, 20, 25, 30, 40], 'criterion': ["gini", "entropy"]}
trees = ExtraTreesClassifier(random_state = 1)
search = GridSearchCV(trees, param_grid = parameters, scoring = "f1_micro" )
trees_params = search.fit(mini_train[mini_train.columns[-4:]].fillna(0), mini_train_labels)
trees_best = trees_params.best_params_
print(trees_best)

{'n_estimators': 20, 'criterion': 'entropy'}


In [112]:
xtratrees = ExtraTreesClassifier(n_estimators =20, criterion = 'entropy')
xtratrees = xtratrees.fit(mini_train[mini_train.columns[-4:]].fillna(0), mini_train_labels)
xtratrees.score(mini_dev[mini_dev.columns[-4:].fillna(0)], mini_dev_labels)

0.82399999999999995

In [None]:
pca = PCA(n_components=2)
pca.fit(mini_train)
proj = pca.transform(mini_train)
cmap = LinearSegmentedColormap.from_list('mycmap', [(0, "red"), (1, "blue")])
plt.scatter(proj[:,0], proj[:,1], c = mini_train_labels, cmap=cmap)
plt.title("Clicks Data Projected Two Dimensions")
plt.show()

'''pca = PCA(n_components = i)
pca.fit(mini_train[mini_train.columns[11:]].fillna(0))
pca
for i in range(50):
    print("With", i, "components, the explained variance is", sum(pca.explained_variance_ratio_[:i+1]))
    '''

In [51]:
mini_train.head()

Unnamed: 0.1,Unnamed: 0,display_id,ad_id,uuid,document_id_x,timestamp,platform,geo_location,document_id_y,campaign_id,...,cat_2002,cat_2003,cat_2004,cat_2005,cat_2006,cat_2100,campaign_perc,advertiser_perc,docx_view_freq,click_perc
210701,210701,12970337,174546,86559a64294b03,1449981,854783252,2,US>IL>609,1439845,21331,...,0,0,0,0,0,0,0.029557,0.018605,6680,0.033708
255507,255507,15329563,159253,7752ddb03b7fa5,1331987,1015941667,3,NZ>E7,1393979,20101,...,0,0,0,0,0,0,0.315789,0.120755,10,0.25
138897,138897,9087365,51402,f813ccbbac2938,471778,611374801,2,US>ND>687,973567,6345,...,0,0,0,0,0,0,0.200876,0.206905,1537,0.282
206435,206435,12781117,497811,81a107d04e9107,1362397,843154067,1,US>CA>807,1349628,32853,...,0,0,0,0,0,0,0.0,0.088207,5423,0.090909
121461,121461,7856231,143662,244cabf7ce72a2,1291818,546456480,2,US>NJ>501,1085917,18390,...,0,0,0,0,0,0,0.331646,0.316629,10751,0.325926


In [42]:
mini_train.columns[-4:]

Index(['campaign_perc', 'advertiser_perc', 'docx_view_freq', 'click_perc'], dtype='object')

In [49]:
'''
import pydotplus 
dot_data = tree.export_graphviz(dec_tree, out_file=None) 
graph = pydotplus.graph_from_dot_data(dot_data) 
graph.write_pdf("dec_tree.pdf") '''

'\nimport pydotplus \ndot_data = tree.export_graphviz(dec_tree, out_file=None) \ngraph = pydotplus.graph_from_dot_data(dot_data) \ngraph.write_pdf("dec_tree.pdf") '

In [50]:
'''from IPython.display import Image  
dot_data = tree.export_graphviz(dec_tree, out_file=None, 
                     feature_names=["% of Campaign Clicked", "% of Advertizer Clicked", "Document View Frequency", "% of Ad Clicked"],  
                     class_names=["Not Clicked", "Clicked"],  
                     filled=True, rounded=True,  
                     special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())  
'''

'from IPython.display import Image  \ndot_data = tree.export_graphviz(dec_tree, out_file=None, \n                     feature_names=["% of Campaign Clicked", "% of Advertizer Clicked", "Document View Frequency", "% of Ad Clicked"],  \n                     class_names=["Not Clicked", "Clicked"],  \n                     filled=True, rounded=True,  \n                     special_characters=True)  \ngraph = pydotplus.graph_from_dot_data(dot_data)  \nImage(graph.create_png())  \n'

In [41]:
#odd problem when trying to use all the data[data[11:]]
#ValueError: Input contains NaN, infinity or a value too large for dtype('float32')
#not nan or infinite
test = mini_train[mini_train.columns[11:]].fillna(0)
test.isnull().values.any()
np.any(np.isnan(test))
np.all(np.isfinite(test))

True

0.80449999999999999

In [None]:
nb = MultinomialNB()
nb = nb.fit(mini_train[mini_train.columns[-4:]], mini_train_labels)
nb_preds = nb.predict(mini_dev[mini_dev.columns[-4:]])
nb_score = nb.score(mini_dev[mini_dev.columns[-4:]], mini_dev_labels)
#metrics.f1_score(mini_dev_labels, preds , average = "micro")
nb_score

knn = KNN()
knn = knn.fit(mini_train[mini_train.columns[-4:]].fillna(0), mini_train_labels)
knn_preds = knn.predict(mini_dev[mini_dev.columns[-4:]].fillna(0))
knn_score = knn.score(mini_dev[mini_dev.columns[-4:]].fillna(0), mini_dev_labels)
#metrics.f1_score(mini_dev_labels, preds , average = "micro")
knn_score