In [3]:
import numpy as np
import pandas as pd
import os
import operator
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.pipeline import Pipeline
from sklearn.pipeline import _name_estimators
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import log_loss

from sklearn.linear_model import LogisticRegression, SGDClassifier

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

import xgboost

import time
import sys

%matplotlib inline
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # To ignore SettingWithCopyWarning warning

import math
import random


# Set a random seed for repeatability
rand_seed = 100
from sklearn.externals import joblib

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.externals import joblib

# Data Setup

In [4]:
# Original dataset
train = pd.read_csv("train.csv")
validation = pd.read_csv("validation.csv")
test = pd.read_csv("test.csv")

# The validation Y refers to the click column of the original validation set
val_Y = validation.click


In [5]:
# dummy data
train_X = joblib.load("pCTR_X_train_resampled.pkl")
train_Y = joblib.load("pCTR_y_train_resampled.pkl")
val_X = joblib.load("pCTR_X_validation.pkl")
test_X = joblib.load("pCTR_X_test.pkl")

In [6]:
test_X.shape

(303375, 829)

In [8]:
#check the dataset shape
train_X.shape, train_Y.shape, val_Y.shape, val_X.shape, test_X.shape

((40000, 829), (40000,), (303925,), (303925, 829), (303375, 829))

# Non-linear Bidding Strategy

## Logistic regression

In [None]:
# Models inherited from previous grid searching. 
lr_model = LogisticRegression(penalty = 'l1', C = 1)
lr_model.fit(train_X, train_Y)

In [None]:
val_X = val_X.as_matrix()
test_X = test_X.as_matrix()

In [None]:
y_valid_lr_pre = lr_model.predict_proba(val_X)

y_test_lr_pre = lr_model.predict_proba(test_X)

w = (40000-1786)/train.shape[0]
avgCTR = sum(train.click)/train.shape[0]

test_score_lr = y_test_lr_pre[:,1]/(y_test_lr_pre[:,1]+(1-y_test_lr_pre[:,1])/w)
valid_score_lr = y_valid_lr_pre[:,1]/(y_valid_lr_pre[:,1]+(1-y_valid_lr_pre[:,1])/w)

validation = validation.reset_index()

## XGBoost Best

In [10]:
xgb_model = xgboost.XGBClassifier(max_depth = 5, n_estimators = 600, learning_rate = 0.1, objective = 'binary:logistic')
xgb_model.fit(train_X, train_Y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=600,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [16]:
y_valid_xgb_pre = xgb_model.predict_proba(X_val)

X_test = X_test.as_matrix()
y_test_xgb_pre = xgb_model.predict_proba(X_test)

w = (40000-1786)/train.shape[0]
avgCTR = sum(train.click)/train.shape[0]

test_score_xgb = y_test_xgb_pre[:,1]/(y_test_xgb_pre[:,1]+(1-y_test_xgb_pre[:,1])/w)
valid_score_xgb = y_valid_xgb_pre[:,1]/(y_valid_xgb_pre[:,1]+(1-y_valid_xgb_pre[:,1])/w)

validation = validation.reset_index()

# Validation DataFrame

In [9]:
def validationDataFrame(model, lambda_range = None, c_range=None, bid_range=None, strategy):
    df = pd.DataFrame(columns=['parameter_c','parameter_lambda','impression','cost','clicks'])
    iteration = 0
    max_num = 0
    max_bid = 0
    cost = 0
    best_lambda = 0
    best_c = 0
    
    if model == "xgb":
        valid_score == valid_score_xgb
    else: 
        valid_score == vlaid_score_lr
        
        
    if strategy == "ortb_1":
        for c in c_range:
            for l in lambda_range:
                num_click = 0
                utils = True
                impression = 0
                cost = 0
                iteration += 1
                size = validation.shape[0]
                
                for i in range(validation.shape[0]):
                    b = np.sqrt(c / l * valid_score[i] + c ** 2) - c
                    
                    if b >= validation.payprice[i] and utils:
                        cost = cost + validation.payprice[i]
                        if cost > 6250000:
                            cost = cost - validation.payprice[i]
                            utils = False
                            break
                        num_click = num_click + validation.click[i]
                        impression = impression + 1
            
                df.loc[iteration,'clicks'] = num_click
                df.loc[iteration,'cost'] = cost/1000
                df.loc[iteration,'impression'] = impression
                
                if num_click > max_num:
                    max_num = num_click
                    best_lambda = m
                    best_c = c
        
                df.loc[iteration,'Parameter_c'] = best_c
                df.loc[iteration,'Parameter_lambda'] = best_lambda
            
    elif strategy == "ortb_2":
        for c in c_range:
            for l in lambda_range:
                num_click = 0
                utils = True
                impression = 0
                cost = 0
                iteration += 1
                size = validation.shape[0]
                
                for i in range(validation.shape[0]):
                    expr=(valid_score[i]+np.sqrt((c**2) * (m**2)+valid_score[i]**2))/(c*m)
                    b=(expr**(1/3)-expr**(-1/3))*c
                    
                    if b >= validation.payprice[i] and utils:
                        cost = cost + validation.payprice[i]
                        if cost > 6250000:
                            cost = cost - validation.payprice[i]
                            utils = False
                            break
                        num_click = num_click + validation.click[i]
                        impression = impression + 1
            
                df.loc[iteration,'clicks'] = num_click
                df.loc[iteration,'cost'] = cost/1000
                df.loc[iteration,'impression'] = impression
                
                if num_click > max_num:
                    max_num = num_click
                    best_lambda = m
                    best_c = c
        
                df.loc[iteration,'Parameter_c'] = best_c
                df.loc[iteration,'Parameter_lambda'] = best_lambda         
                     
    elif strategy == "quadratic":
        for bid_base in np.arange(3,300, 3):
            num_click = 0
            utils = True
            impression = 0
            cost = 0
            iteration += 1
            size = validation.shape[0]
            
            for i in range(validation.shape[0]):
                bid = bid_base*(valid_score[i]/avgCTR)**2
                
                if b >= validation.payprice[i] and utils:
                    cost = cost + validation.payprice[i]
                    if cost > 6250000:
                        cost = cost - validation.payprice[i]
                        utils = False
                        break
                    num_click = num_click + validation.click[i]
                    impression = impression + 1
                
                df.loc[iteration,'bid_base'] = bid_base
                df.loc[iteration,'clicks'] = num_click
                df.loc[iteration,'cost'] = cost/1000
                df.loc[iteration,'impression'] = impression
                
                if num_click > max_num:
                #print('increase')
                    max_num = num_click
                    max_bid = bid_base
                    
                    
    else:
        for bid_base in np.arange(3,300, 3):
            num_click = 0
            utils = True
            impression = 0
            cost = 0
            iteration += 1
            size = validation.shape[0]
            
            for i in range(validation.shape[0]):
                bid = bid_base*np.exp(valid_score[i]/avgCTR)
                
                if b >= validation.payprice[i] and utils:
                    cost = cost + validation.payprice[i]
                    if cost > 6250000:
                        cost = cost - validation.payprice[i]
                        utils = False
                        break
                    num_click = num_click + validation.click[i]
                    impression = impression + 1
                
                df.loc[iteration,'bid_base'] = bid_base
                df.loc[iteration,'clicks'] = num_click
                df.loc[iteration,'cost'] = cost/1000
                df.loc[iteration,'impression'] = impression
                
                if num_click > max_num:
                #print('increase')
                    max_num = num_click
                    max_bid = bid_base
        
                
    
    #print(num_click)
    
        
    df['CTR'] = df['clicks']/df['Imps']
    df['eCPC'] = df['spend']/df['clicks']
    df['CPM'] = df['spend']*1000/df['Imps']
    
    return df
    

# ORTB Strategy

## ORTB1+LR

In [17]:
# funtion
# b = sqrt(c/lambda * pctr + c^2) - c
lambda_range = [1e-8,5e-8,1e-7,5e-7,1e-6,5e-6,1e-5,5e-5,1e-4,5e-4]
c_range= np.linspace(0.1, 100, 200)
eval_ORTB_1_lr = validationDataFrame(model = "lr", lambda_range=lambda_range, c_range=c_range, strategy="ortb_1")

In [None]:
eval_ORTB_1_lr.iloc[np.where(eval_ORTB_1_lr.clicks == eval_ORTB_1_lr.clicks.max())[0]]

Unnamed: 0,best_c,best_lambda,Imps,spend,clicks,CTR,eCPC,CPM
936,93.9455,5e-06,125939,6247.15,163,0.00129428,38.3261,49.6046
946,93.9455,5e-06,125840,6249.85,163,0.0012953,38.3426,49.665


## ORTB2+LR

In [None]:
lambda_range = [1e-8,5e-8,1e-7,5e-7,1e-6,5e-6,1e-5,5e-5,1e-4,5e-4]
c_range= np.linspace(0.1, 100, 200)
eval_ORTB_2_lr = validationDataFrame(model = "lr", lambda_range=lambda_range, c_range=c_range, strategy="ortb_2")

In [None]:
eval_ORTB_2_lr.iloc[np.where(eval_ORTB_2_lr.clicks == eval_ORTB_2_lr.clicks.max())[0]]

## ORTB1+XGB

In [None]:
lambda_range = [1e-8,5e-8,1e-7,5e-7,1e-6,5e-6,1e-5,5e-5,1e-4,5e-4]
c_range= np.linspace(0.1, 100, 200)
eval_ORTB_1_xgb = validationDataFrame(model = "xgb", lambda_range=lambda_range, c_range=c_range, strategy="ortb_1")

In [None]:
eval_ORTB_1_xgb.iloc[np.where(eval_ORTB_1_xgb.clicks == eval_ORTB_1_xgb.clicks.max())[0]]

## ORTB2+XGB

In [None]:
lambda_range = [1e-8,5e-8,1e-7,5e-7,1e-6,5e-6,1e-5,5e-5,1e-4,5e-4]
c_range= np.linspace(0.1, 100, 200)
eval_ORTB_2_xgb = validationDataFrame(model = "xgb", lambda_range=lambda_range, c_range=c_range, strategy="ortb_2")

In [None]:
eval_ORTB_2_xgb.iloc[np.where(eval_ORTB_2_xgb.clicks == eval_ORTB_2_xgb.clicks.max())[0]]


# Other bidding strategies

## Quadratic bidding

## LR Quadratic

In [None]:
# function
#bid = base_bid * (pCTR / avgCTR)^2
bidbase = np.arange(3,150,3)
eval_lr_quad = validationDataFrame(model = "lr", bid_base = bidbase, strategy="quadratic")

In [None]:
eval_lr_quad.iloc[np.where(eval_lr_quad.clicks == eval_lr_quad.clicks.max())[0]]

## XGB Quadratic

In [None]:
bidbase = np.arange(3,150,3)
eval_xgb_quad = validationDataFrame(model = "xgb", bid_base = bidbase, strategy="quadratic")

In [None]:
eval_xgb_quad.iloc[np.where(eval_xgb_quad.clicks == eval_xgb_quad.clicks.max())[0]]

## Exponential bidding

## LR EXP

In [None]:
# function
#bid = base_bid * exp(pCTR / avgCTR)
bidbase = np.arange(3,30,3)
eval_lr_exp = validationDataFrame(model = "lr", bid_base = bidbase, strategy="exp")

In [None]:
eval_lr_exp.iloc[np.where(eval_lr_exp.clicks == eval_lr_exp.clicks.max())[0]]

## XGB EXP

In [None]:
# function
#bid = base_bid * exp(pCTR / avgCTR)
bidbase = np.arange(3,30,3)
eval_xgb_exp = validationDataFrame(model = "xgb", bid_base = bidbase, strategy="exp")

In [None]:
eval_xgb_exp.iloc[np.where(eval_xgb_exp.clicks == eval_xgb_exp.clicks.max())[0]]

# Ensembled Model

In [12]:
class MajorityVoteClassifier(BaseEstimator,
                            ClassifierMixin):
    def __init__(self, classifiers,
                 vote='classlabel', weights=None):
        self.classifiers = classifiers
        self.named_classifiers = {key: value for
                                  key, value in
                                  _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights
        
        
    def fit(self, X, y):     
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X,
                              self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self
    
    def predict(self, X):
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X),
                                 axis=1)
        else: # 'classlabel' vote
             # Collect results from clf.predict calls
            predictions = np.asarray([clf.predict(X)
                                      for clf in
                                      self.classifiers_]).T
            maj_vote = np.apply_along_axis(
                           lambda x:
                           np.argmax(np.bincount(x,
                                        weights=self.weights)),
                           axis=1,
                           arr=predictions)

        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote
    
    def predict_proba(self, X):
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        avg_proba = np.average(probas,
                               axis=0, weights=self.weights)
        return avg_proba
            
    def get_params(self, deep=True):
        """ Get classifier parameter names for GridSearch"""
        if not deep:
            return super(MajorityVoteClassifier,
                         self).get_params(deep=False)
        else:
            out = self.named_classifiers.copy()
            for name, step in\
                    six.iteritems(self.named_classifiers):
                for key, value in six.iteritems(
                        step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out

In [None]:
mv_clf = MajorityVoteClassifier(classifiers = [lr_model,xgb_model])
clf_labels = ["Logistic Regression", "XGBoost"]
clf_labels += ['Majority Voting']
all_clf = [lr_model, xgb_model, mv_clf]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator = clf,
                             X = X_train,
                             y = Y_train,
                             cv = 3,
                             scoring = 'roc_auc')
    print("ROC AUC: %0.4f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [None]:
mv_clf.fit(X_train, Y_train)

In [None]:
y_valid_pre = mv_clf.predict_proba(X_val)

y_test_pre = mv_clf.predict_proba(X_test)

w = (40000-1786)/train.shape[0]
avgCTR = sum(train.click)/train.shape[0]

test_score_ensembled = y_test_pre[:,1]/(y_test_pre[:,1]+(1-y_test_pre[:,1])/w)
valid_score_ensembled = y_valid_pre[:,1]/(y_valid_pre[:,1]+(1-y_valid_pre[:,1])/w)


In [None]:
# funtion
# bid = sqrt(c/lambda pctr + c^2) - c
lambda_range = [1e-8,5e-8,1e-7,5e-7,1e-6,5e-6,1e-5,5e-5,1e-4,5e-4]
c_range= np.linspace(0.1, 100, 200)
eval_ORTB_1_ensemble = validationDataFrame(model = "xgb", lambda_range=lambda_range, c_range=c_range, strategy="ortb_1")

In [None]:
eval_ORTB_1_ensemble.iloc[np.where(eval_ORTB_1_ensemble.clicks == eval_ORTB_1_ensemble.clicks.max())[0]]