In [2]:
import pandas as pd
import numpy as np
import pickle

# Directory and base name of the click log; the CSV and its pickle cache
# live side by side as <location><clickfilename>.csv / .pkl.
location='./data/'
clickfilename = 'train'


def _hex_to_int(x):
    """Parse a hexadecimal string field from the CSV into a Python int."""
    return int(x, 16)


# Columns that are run through the hex parser while reading the CSV.
# NOTE(review): device_type is parsed as base-16 like the hashed id columns;
# confirm that this is intended for that column.
converters = {column: _hex_to_int for column in (
    "site_id",
    "site_domain",
    "site_category",
    "app_id",
    "app_domain",
    "app_category",
    "device_id",
    "device_model",
    "device_type",
    "device_ip",
)}

clickcsvpath = location+clickfilename+'.csv'
clickpicklepath = location+clickfilename+'.pkl'

# Prefer the pickle cache; rebuild it from the CSV only when the cache is
# missing or unreadable.
# SECURITY NOTE: pickle.load executes code embedded in the file. Only load a
# .pkl that this notebook wrote itself, never one from an untrusted source.
try:
    print('reading original pickled data...')
    with open(clickpicklepath, 'rb') as handle:
        data = pickle.load(handle)

# Only cache-related failures trigger the fallback. A bare `except:` would
# also swallow KeyboardInterrupt, turning "stop this load" into a full CSV
# re-parse. AttributeError/ImportError cover pickles written by an
# incompatible library version.
except (OSError, EOFError, pickle.UnpicklingError,
        AttributeError, ImportError) as err:
    print('error: reading original csv file')
    print('  (pickle cache unavailable: %r)' % (err,))
    #Import csv file
    data=pd.read_csv(clickcsvpath, converters=converters)
    # save data so the next run can take the fast path
    with open(clickpicklepath, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)


reading original pickled data...


### Equalize number of click vs non-click samples for training

In [9]:
# Build a class-balanced training set from `data`.
# y is the 'click' label; X is every column from position 2 onward.
y = data['click'].copy()
X = data[data.columns.values[2:]].copy()

# split the full dataset into no-click (0) and click (1) records;
# compute each boolean mask once and reuse it for both X and y
is_nonclick = (y == 0)
is_click = (y == 1)
y0 = y[is_nonclick]
X0 = X[is_nonclick]
y1 = y[is_click]
X1 = X[is_click]

# nsamps = y1.shape[0] # use as many samples as possible
nsamps = 500000

print("original data = %d rows: %d clicks, %d nonclicks %1.1f%% clicks"%(
    y.shape[0], y1.shape[0], y0.shape[0], 100*y1.shape[0]/y.shape[0]))

# Never take more rows per class than the smaller class can supply;
# otherwise the two halves differ in size and the result is not balanced.
n_per_class = min(nsamps, y1.shape[0], y0.shape[0])

# NOTE(review): this takes the FIRST n_per_class rows of each class in file
# order, not a random sample; confirm that is acceptable for training.
# pd.concat replaces Series.append / DataFrame.append, which were removed in
# pandas 2.0; ignore_index=True renumbers the rows 0..N-1 as before.
y_eq = pd.concat([y1[:n_per_class], y0[:n_per_class]], ignore_index=True)
X_eq = pd.concat([X1[:n_per_class], X0[:n_per_class]], ignore_index=True)

print("training data = %d rows, equal# clicks/nonclicks "%(y_eq.shape[0]))


original data = 40428967 rows: 6865066 clicks, 33563901 nonclicks 17.0% clicks
training data = 1000000 rows, equal# clicks/nonclicks 


original data = 500000 rows: 82037 clicks, 417963 nonclicks 16.4% clicks
training data = 164074 rows, equal# clicks/nonclicks 

In [10]:
import inspect
from time import time
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Seeded random generator handed to the AdaBoost classifier below.
rng = np.random.RandomState(1)

# scikit-learn 1.2 renamed AdaBoostClassifier's `base_estimator` argument to
# `estimator`, and 1.4 removed the old name. Pick whichever keyword the
# installed version accepts so this cell runs on both old and new releases.
_ada_signature = inspect.signature(AdaBoostClassifier).parameters
_weak_learner_kw = 'estimator' if 'estimator' in _ada_signature else 'base_estimator'

# Define AdaBoostClassifier defaults:
# doing it this way allows us to define our own default params
clf_defaults = {
    _weak_learner_kw : DecisionTreeClassifier(max_depth=4),
    'n_estimators' : 100,
    'learning_rate' : 1,
    }

# Create a pipeline, allowing to tune a transformer and the AdaBoost classifier.
# (transformer not implemented yet)
pipeline = Pipeline([
    ('clf', AdaBoostClassifier(**clf_defaults, random_state=rng))
])

# Define the parameters and values we want to test.
# Uncommenting more parameters will give better exploring power but will
#   increase processing time in a combinatorial way. I suggest tuning <= 3
#   parameters at a time.
# Note the naming format: pipelineobjectname__paramname
parameters = {
    'clf__n_estimators': (50,100,1000),
    'clf__learning_rate': (.5, 1, 2),
}

# Create the grid search object.
# Note that "n_jobs=-1" means that the search will use all of the
#  computer's available processing cores to speed things up.
# cv=3 is pinned explicitly: the recorded run used 3 folds, but the library
#  default changed to 5 in scikit-learn 0.22.
grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)

print("Performing grid search...")
# print("parameters:")
# print(parameters)
t0 = time()

# Run the grid search to find the best parameters for the classifier.
# 3 x 3 parameter values x 3 folds = 27 fits.
grid_search.fit(X_eq, y_eq)

print("done in %0.3fs" % (time() - t0))
print()

# Report the best cross-validated score and the winning value of each
# parameter that was part of the search grid.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed: 90.5min finished


done in 5615.877s

Best score: 0.716
Best parameters set:
	clf__learning_rate: 0.5
	clf__n_estimators: 50
