In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split,GridSearchCV)
from sklearn.metrics import (accuracy_score,roc_auc_score)
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
# Load the sampled click log.
sample = pd.read_csv("MinorityTransformed500.csv")

# Label column, kept under its own name so it can be re-attached after encoding.
clicked = sample['clicked']

# Expand the selected identifier columns into indicator columns.
# pd.get_dummies only expands object/categorical columns (numeric ones pass
# through unchanged) and returns a new frame, so `sample` is left untouched.
data = pd.get_dummies(sample[['landing_page', 'campaign_id', 'advertiser_id']])

# Re-attach the label as the last column; the split cell separates it again.
data['clicked'] = clicked

# Only the last expression of a cell is rendered, so the former mid-cell
# `sample[:10]` and `data.shape` statements displayed nothing and were removed.
data.head(10)

Unnamed: 0,landing_page_1004113,landing_page_1009076,landing_page_1009078,landing_page_1012172,landing_page_1019158,landing_page_1019847,landing_page_1027503,landing_page_1031141,landing_page_1035304,landing_page_1043039,...,advertiser_id_913,advertiser_id_915,advertiser_id_916,advertiser_id_92,advertiser_id_94,advertiser_id_95,advertiser_id_979,advertiser_id_992,advertiser_id_MinorityClass,clicked
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [4]:
# Split data into train and test by row position (no shuffling):
# rows [0, TRAIN_ROWS) train the model, every later row is held out.
# NOTE(review): the MAP cell further down slices `sample` at the same literal
# row number; keep the two in sync if this value changes.
TRAIN_ROWS = 37003

# `X` is the training slice *including* the label column.
X = data.iloc[:TRAIN_ROWS]
# `.ix` is deprecated (removed in pandas 1.0); `.loc` accepts the same boolean
# column mask.
X_train = X.loc[:, X.columns != 'clicked']
y_train = X['clicked']

# NOTE: despite its name, `y` is the held-out slice (features + label),
# not a label vector.
y = data.iloc[TRAIN_ROWS:]
X_test = y.loc[:, y.columns != 'clicked']
y_test = y['clicked']

In [5]:
# Use GBDT to predict clicked or not.
# Candidate grid: 3 values per hyper-parameter, 27 combinations in total.
tuned_parameters = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [2, 3, 4],
}

# Base estimator with a fixed seed.
gbdt = GradientBoostingClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation. No `scoring` is passed, so
# candidates are compared with the estimator's default scorer.
gbdt_cv = GridSearchCV(estimator=gbdt, param_grid=tuned_parameters, cv=5)

In [7]:
# Run the grid search on the training partition. fit() hands back the search
# object itself; rebinding the name keeps the cell from echoing its repr.
gbdt_cv = gbdt_cv.fit(X=X_train, y=y_train)

In [8]:
# Winning hyper-parameter combination selected by the grid search.
best_params = gbdt_cv.best_params_
# Call form of print with a single argument: identical output on Python 2,
# and valid syntax on Python 3 (the bare `print x` statement is not).
print(best_params)

{'n_estimators': 200, 'learning_rate': 0.2, 'max_depth': 3}


In [10]:
# Rebuild the classifier with the winning settings and train it on the whole
# training partition. Unpacking `best_params` forwards every tuned key, so a
# hyper-parameter added to the grid later cannot be silently left out here
# (the previous version copied the three keys by hand).
# The seed matches the one used for the base estimator in the search cell.
gbdt_best = GradientBoostingClassifier(random_state=42, **best_params)
gbdt_best = gbdt_best.fit(X_train, y_train)

In [11]:
# Calculate the accuracy score on the held-out rows.
y_test_pred = gbdt_best.predict(X_test)
accuracy_score_gbdt = accuracy_score(y_test, y_test_pred)
# A single pre-formatted string keeps the printed text the same on Python 2
# and makes the line valid on Python 3. (Simply parenthesising the old
# two-item print statement would print a tuple on Python 2.)
print("Gradient Boosting Decision Tree Classifier testing accuracy score: {}".format(accuracy_score_gbdt))

Gradient Boosting Decision Tree Classifier testing accuracy score: 0.808562857922


In [12]:
# Per-class probabilities for every held-out row (one column per class).
prob = gbdt_best.predict_proba(X_test)

# Column 1 is the second entry of gbdt_best.classes_ - presumably the
# clicked == 1 class; confirm against the label encoding.
CLICK_COLUMN = 1
click_prob = prob[:, CLICK_COLUMN]

# Peek at the first 500 scores.
click_prob[:500]

array([ 0.21013748,  0.18636666,  0.06516295,  0.21348491,  0.35393932,
        0.21013748,  0.14546137,  0.20076504,  0.20076504,  0.20076504,
        0.20076504,  0.20076504,  0.47512864,  0.20076504,  0.20076504,
        0.20076504,  0.18448352,  0.06516295,  0.11443841,  0.07076464,
        0.11194477,  0.20076504,  0.20076504,  0.23800954,  0.20076504,
        0.20076504,  0.20076504,  0.25946749,  0.26631979,  0.20076504,
        0.21013748,  0.12078647,  0.20076504,  0.32991194,  0.20076504,
        0.14856576,  0.27591929,  0.20076504,  0.37912751,  0.21013748,
        0.20076504,  0.20076504,  0.20076504,  0.21013748,  0.45373419,
        0.21013748,  0.20076504,  0.1925789 ,  0.20076504,  0.20076504,
        0.19411025,  0.20076504,  0.21013748,  0.44495325,  0.06223048,
        0.20076504,  0.08662296,  0.20076504,  0.26742902,  0.11194477,
        0.20076504,  0.08662296,  0.0655387 ,  0.20076504,  0.21013748,
        0.43810831,  0.06617625,  0.35570693,  0.20076504,  0.21

In [13]:
def _reciprocal_ranks(scored, group_col='display_id', label_col='clicked', score_col='prob'):
    """Return one reciprocal rank per group of `scored`.

    Within each `group_col` group the rows are ordered by `score_col`, highest
    first. The group's value is 1 / (1-based position of the first row whose
    `label_col` equals 1). A group without such a row contributes 0.0 instead
    of raising IndexError.

    Rows are gathered by key, so the rows of one group do not have to be
    adjacent in `scored`. Groups are visited in sorted key order.

    NOTE(review): ties in `score_col` are ordered by sort_values' default
    algorithm, which is not stable, so the position of a clicked row inside a
    run of equal scores is arbitrary.
    """
    ranks = []
    for _, ads in scored.groupby(group_col):
        ranked = ads.sort_values(score_col, ascending=False)
        # Positions (0-based) of the clicked rows in the ranked list.
        hits = np.flatnonzero(ranked[label_col].values == 1)
        ranks.append(1.0 / (hits[0] + 1) if len(hits) else 0.0)
    return ranks


# First held-out row; must match the row used to build X_test in the split cell.
TEST_START = 37003

df_test = sample[TEST_START:].reset_index()

# click_prob is purely positional, so it has to line up one-to-one with the
# held-out rows before it is attached as a column.
assert len(click_prob) == len(df_test), "click_prob and the test rows differ in length"

scored = df_test[['display_id', 'ad_id', 'clicked']].copy()
scored['prob'] = click_prob

reciprocal_ranks = _reciprocal_ranks(scored)

# `map12` keeps its earlier meaning: the sum of the per-display reciprocal ranks.
map12 = sum(reciprocal_ranks)
# Mean reciprocal rank of the first clicked ad. This equals MAP@12 only if each
# display has a single clicked ad and at most 12 ads - presumably true for this
# data set; confirm.
map12_score = map12 / len(reciprocal_ranks) if reciprocal_ranks else float('nan')
print(map12_score)

0.571009447884
