##### Importing libraries


In [1]:
import math
import numpy as np
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from collections import Counter
from sklearn.metrics import roc_auc_score, mean_squared_error

from sklearn.feature_extraction.text import CountVectorizer

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search



#### load training validation and test set

In [2]:
train = pd.read_csv('train.csv')
dev = pd.read_csv('validation.csv')
test = pd.read_csv('test.csv')
train=train[train["bidprice"]>train["payprice"]]

#### creating a balanced training set and then concatanating with  validation and test sets

In [4]:
def make_train(zeroMaxImpressions):
 
    
    
    df1_click1=train[train['click']==1]
    df1_click0=train[train['click']==0]
    
    train1 = pd.concat((df1_click1.loc[:,],
                      df1_click0.loc[:zeroMaxImpressions,]))

    train1= train1.sample(frac=1)
    train1=train1.reset_index(drop=True)
    
    

    
    return train1

#concatenate train and val

train_sample=make_train(10000)

train_val = pd.concat((train_sample.loc[:,],
                      dev.loc[:,],test.loc[:,]))
train_val=train_val.reset_index()



### one hot encoding

In [5]:
#One hot encode tags
def one_hot_encode_tags(df):
    df.usertag=df.usertag.str.replace(","," ")
    vect = CountVectorizer()
    X = vect.fit_transform(df.usertag)
    df=df.join(pd.DataFrame(X.toarray(), columns=vect.get_feature_names()))
    #df.drop("usertag",axis=1,inplace=True)
    return df

#One hot variables
def pipeline(data):
    one_hot_features = ['useragent','adexchange','slotvisibility','slotformat','creative']
    new_df = pd.get_dummies(data[one_hot_features])
    
    weekday = pd.get_dummies(data["weekday"], prefix='weekday_')
    hour = pd.get_dummies(data["hour"], prefix='hour_')
    region = pd.get_dummies(data["region"], prefix='region_')
    city = pd.get_dummies(data["city"], prefix='city_')
    slotwidth = pd.get_dummies(data["slotwidth"], prefix='slotwidth_')
    slotheight = pd.get_dummies(data["slotheight"], prefix='slotheight_')
    advertiser = pd.get_dummies(data['advertiser'], prefix='advertiser_')

    new_df = pd.concat([new_df,data['slotprice'], weekday, hour, region, \
                        city, slotwidth, slotheight, advertiser],axis=1)
    
    domain = LabelEncoder()
    new_df['encode_domain'] = domain.fit_transform(data['domain'])
    
    new_df = pd.concat([new_df,one_hot_encode_tags(data)],axis=1)
    
    return new_df

df_train_val=pipeline(train_val)

### pCTR model training

In [6]:
features = list(df_train_val)
remove_feat = ['bidprice', 'payprice','click','weekday','hour','bidid',\
               'logtype','userid','useragent','IP','region','city','adexchange','domain','url','urlid','slotid',\
               'slotwidth','slotheight','slotvisibility','slotformat','slotprice','creative','keypage','advertiser',\
               'usertag']
feat = [feature for feature in features if (feature not in remove_feat)]

In [7]:
X_train = df_train_val[:train_sample.shape[0]][feat]
y_train = train_sample['click']
X_val = df_train_val[train_sample.shape[0]:train_sample.shape[0]+len(dev)][feat]
y_val = df_train_val[train_sample.shape[0]:train_sample.shape[0]+len(dev)]['click']
X_test=df_train_val[train_sample.shape[0]+len(dev):][feat]

In [8]:
from xgboost import XGBClassifier
model = xgb.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=200,min_child_weight=4,reg_alpha=6, seed=0)
model.fit(X_train, y_train)
preds = model.predict(X_val)


### confusion matrix

In [9]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_val, preds)
print(cm)

auc = roc_auc_score(y_val, preds)
rmse = math.sqrt(mean_squared_error(y_val, preds))
print('AUC:', auc, 'RMSE:',rmse)

[[292816   6707]
 [   109    117]]
AUC: 0.747653422334 RMSE: 0.1507946450781221


In [10]:
prob = []
for a, b in model.predict_proba(X_val):
    prob.append(b)

val_pred_xgb_prob = prob
auc = roc_auc_score(y_val, val_pred_xgb_prob)
rmse = math.sqrt(mean_squared_error(y_val, val_pred_xgb_prob))
print('AUC:', auc, 'RMSE:',rmse)

AUC: 0.866970695796 RMSE: 0.14622007683625476


In [17]:

df_with_pred = pd.DataFrame(val_pred_xgb_prob)
df_with_pred = df_with_pred.rename(columns={0:'pCTR'})

df_with_pred['click'] = train_val[train_sample.shape[0]:]['click'].reset_index(drop=True)
df_with_pred['bidprice'] = train_val[train_sample.shape[0]:]['bidprice'].reset_index(drop=True)
df_with_pred['payprice'] = train_val[train_sample.shape[0]:]['payprice'].reset_index(drop=True)
#avg_predCTR=df_with_pred['pCTR'].sum()*1/len(df_with_pred.index)
avg_predCTR=df_with_pred['pCTR'].mean()
df_with_pred['lin_bidding']=77*df_with_pred['pCTR']*1/(avg_predCTR*1)
df_with_pred['spend'] = np.where((df_with_pred['lin_bidding'] >= df_with_pred['payprice']), df_with_pred['payprice'], 0)

### evaluation code

In [15]:
### import pandas as pd



def click_count_val(basebid,BUDGET):
    import operator
    import numpy as np
    
    
    df_with_pred['result'] = np.where((df_with_pred['lin_bidding'] >= df_with_pred['payprice']), 1, 0)
    
    
    # rule to filter zero clicks
    
    booleans = []
    for start in df_with_pred.pCTR:
        if start > 0.018 and start < 0.99 :
                booleans.append(True)
        else:
            booleans.append(False)
        
    Specifichour = pd.Series(booleans)

    df_with_pred1=df_with_pred[Specifichour]
    df_with_pred1=df_with_pred1.reset_index(drop=True)
    
    
    
    booleans = []
    for start in df_with_pred1.result:
        if start == 1 :
                booleans.append(True)
        else:
            booleans.append(False)
        
    Specifichour = pd.Series(booleans)

    df4=df_with_pred1[Specifichour]
    df4=df4.reset_index()
    
    df4['cum_spend'] = df4.spend.cumsum()
    
    
    booleans = []
    for start in df4.cum_spend:
        if start <= BUDGET :
                booleans.append(True)
        else:
            booleans.append(False)
        
    Specifichour = pd.Series(booleans)

    df5=df4[Specifichour]
    
    df6=df5.reset_index()
    
    booleans = []
    for start in df4.cum_spend:
        if start > BUDGET :
                booleans.append(True)
        else:
            booleans.append(False)
        
    Specifichour = pd.Series(booleans)

    df7=df4[Specifichour]
    
    df7['lin_bidding']=0
    
    
    df8= pd.concat((df6.loc[:,:],
                      df7.loc[:,:]))    
    
   
    
    try:
        clicks=df5.click.sum()
    except : clicks=0
        
    CTR=clicks*1/len(df6)
    Total_spend=df6.loc[len(df6)-1,'cum_spend']
    CPM=df6.loc[len(df6)-1,'cum_spend']/(len(df6)/1)
    CPC=df6.loc[len(df6)-1,'cum_spend']/(clicks*1000)
    clickRatio=clicks*100/226
    
    print("CTR :",clicks*1/len(df6))
    print("No of clicks :",clicks)
    print("Total money paid :",df6.loc[len(df6)-1,'cum_spend'])
    print("Average CPM :",df6.loc[len(df6)-1,'cum_spend']/(len(df6)/1))
    print("CPC :",df6.loc[len(df6)-1,'cum_spend']/(clicks*1000))
    print("click ratio :",clickRatio)
    
    return CTR,CPM,CPC,clicks,clickRatio



 #### code of finding the best base bid and plot graphs

In [None]:
import matplotlib.pyplot as plt
import math
import matplotlib.mlab as mlab
import numpy as np
    

list1=[]
list2=[]
list3=[]
list4=[]
list5=[]
for i in range(50,100):
    print("..................... i :" ,i)
   
    CTR,CPM,CPC,clicks,clickRatio=click_count_val(i,6250000)
    list1.append(CTR)
    list2.append(CPM)
    list3.append(CPC)
    list4.append(clicks)
    list5.append(clickRatio)
    
ctr_array=np.array(list1)
cpm_array=np.array(list2)
cpc_array=np.array(list3)
click_array=np.array(list4)
clickRatio_array=np.array(list5)


fig, ax = plt.subplots(1, 1)
ax.plot(ctr_array)

plt.ylabel('CTR')
plt.title('CTR vs basebid')
plt.xlabel('basebid') 

fig, ax = plt.subplots(1, 1)
ax.plot(cpm_array)

plt.ylabel('CPM')
plt.title('CPM vs basebid')
plt.xlabel('basebid') 

fig, ax = plt.subplots(1, 1)
ax.plot(cpc_array)

plt.ylabel('CPC')
plt.title('CPC vs basebid')
plt.xlabel('basebid') 
fig, ax = plt.subplots(1, 1)
ax.plot(click_array)





plt.ylabel('Clicks')
plt.title('Clicks vs basebid')
plt.xlabel('basebid') 
plt.show()

fig, ax = plt.subplots(1, 1)
ax.plot(clickRatio_array)
plt.ylabel('Click Ratio')
plt.title('Click Ratio vs budget size')
plt.xlabel('Budget size') 
plt.show()



#### test set bidding price prediction

In [19]:
preds_test = model.predict(X_test)

In [20]:
BASEBID=80
prob = []
for a, b in model.predict_proba(X_test):
    prob.append(b)
test_pred_xgb_prob = prob
df_with_pred_test = pd.DataFrame(test_pred_xgb_prob)
df_with_pred_test = df_with_pred_test.rename(columns={0:'pCTR'})


df_with_pred_test['bidprice'] = df_with_pred_test['pCTR']*BASEBID/(df_with_pred_test['pCTR'].mean())
del df_with_pred_test['pCTR']

In [22]:
df_with_pred_test

Unnamed: 0,bidprice
0,191.004630
1,25.128940
2,15.811079
3,52.947606
4,85.185600
5,104.487765
6,19.258431
7,39.946378
8,31.510226
9,28.610209


In [23]:
df_with_pred_test.to_csv('184val.csv')