# Linear Model

## Environment

In [4]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import random
import math

import sys
sys.path.append('../')

from Preprocessing import preprocessing
from Preprocessing.single_set import SingleSet

In [5]:
%matplotlib inline

## Data Preprocessing

Features: data.data_features
Targets: data.data_targets (click, bidprice, payprice)

In [6]:
# Locations of the three data splits, handed to SingleSet as `relative_path`.
train_data_path = '/Data/train.csv'
val_data_path = '/Data/validation.csv'
test_data_path = '/Data/test.csv'

# Load the splits in the order train -> validation -> test, each with
# numerical labels enabled.
train_data = SingleSet(relative_path=train_data_path, use_numerical_labels=True)
val_data = SingleSet(relative_path=val_data_path, use_numerical_labels=True)
test_data = SingleSet(relative_path=test_data_path, use_numerical_labels=True)

-- data loaded --
-- data loaded --
-- data loaded --


In [7]:
def pandas_to_numpy(data):
    """Convert a data set's feature and target tables into NumPy arrays.

    Args:
        data: Object exposing ``data_features`` and, optionally,
            ``data_targets``; both must provide a ``.values`` attribute.

    Returns:
        tuple: ``(features, labels)``. ``labels`` is ``None`` when ``data``
        has no ``data_targets`` attribute.
    """
    ## features
    features = np.asarray(data.data_features.values)

    ## targets
    # Default to None so a data set without targets no longer fails with an
    # UnboundLocalError on the return statement.
    labels = None
    if hasattr(data, "data_targets"):
        labels = np.asarray(data.data_targets.values)

    return features, labels


## drop unnecessary features
def drop_features(data):
    """Remove the columns listed below from ``data.data_features`` in place.

    Columns that are not present are skipped silently. Nothing is returned;
    the feature table on ``data`` is modified directly.
    """
    # NOTE(review): 'slotvisibiliy' looks like a misspelling of
    # 'slotvisibility'; as written that column is never dropped. Kept as-is to
    # preserve the current feature set -- confirm the intent.
    unwanted_columns = (
        'userid',
        'urlid',
        'slotvisibiliy',
        'weekday',
        'region',
        'url',
        'ip',
        'city',
    )

    for column in unwanted_columns:
        if column in data.data_features:
            data.data_features.drop(column, axis=1, inplace=True)
        

# Strip the unused columns from every split; drop_features modifies each
# data set's feature table in place.
drop_features(train_data)
drop_features(val_data)
drop_features(test_data)

# Convert each split into a (features, targets) pair of NumPy arrays.
x_train, y_train = pandas_to_numpy(train_data)
x_val, y_val = pandas_to_numpy(val_data)
x_test, y_test = pandas_to_numpy(test_data)


### Data Input Shapes

In [8]:
# Number of feature columns in the training matrix.
input_shape = x_train.shape[1]
print("input_shape", input_shape)
output_shape = 1

# Split the target matrix into (n_samples, 1) column vectors.

# clicks: first target column
y_train_clicks = y_train[:, 0].reshape(-1, 1)
y_val_clicks = y_val[:, 0].reshape(-1, 1)

# payprice: third target column
y_train_payprice = y_train[:, 2].reshape(-1, 1)
y_val_payprice = y_val[:, 2].reshape(-1, 1)

input_shape 16


### Scale Data

In [9]:
def scale_data(x_train, x_val, x_test):
    """Min-max scale the three feature matrices with one shared scaler.

    A single ``MinMaxScaler`` with ``feature_range=(0, 1)`` is fitted on the
    row-wise concatenation of all three matrices and then applied to each of
    them. Targets are not scaled here.

    Args:
        x_train: Training feature matrix.
        x_val: Validation feature matrix.
        x_test: Test feature matrix.

    Returns:
        tuple: ``(x_train, x_val, x_test)`` after scaling.
    """
    # Only MinMaxScaler is needed. The former `from sklearn import
    # preprocessing` was unused and shadowed the project's own `preprocessing`
    # module inside this function.
    from sklearn.preprocessing import MinMaxScaler

    ## features
    feature_scaler = MinMaxScaler(feature_range=(0, 1))
    # NOTE(review): the scaler sees validation and test rows while fitting,
    # which leaks their value ranges into training -- confirm this is intended.
    feature_scaler.fit(np.concatenate((x_train, x_val, x_test), axis=0))

    x_train = feature_scaler.transform(x_train)
    x_val = feature_scaler.transform(x_val)
    x_test = feature_scaler.transform(x_test)

    return x_train, x_val, x_test

# Replace the raw feature matrices with their min-max scaled versions.
x_train, x_val, x_test = scale_data(x_train, x_val, x_test)

# Model - Linear Models

## "Click" - Binary Classification

Train baseline accuracy for "Clicks" (always predicting 0): 0.9992618932746251 (about 99.93%)
Number of 0s: 2429188, number of 1s: 1793


Validation baseline accuracy for "Clicks" (always predicting 0): 0.9993349203056733 (about 99.93%)
Number of 0s: 303723, number of 1s: 202


## Upsampling

Upsample the positive class (the "1"s) so that both classes are equally frequent.

--> the majority-class baseline accuracy then drops to 50%

In [10]:
def upsampling(x, y):
    """Balance a binary data set by repeating its positive ("1") samples.

    The positive rows are cycled through until there are as many of them as
    there are negative ("0") rows, so every positive sample is kept and the
    copies are spread evenly. The previous repeat-then-truncate approach could
    drop the last positive samples entirely. The result is shuffled.

    Args:
        x: 2-D feature array of shape (n_samples, n_features).
        y: 2-D label column of shape (n_samples, 1) holding 0/1 values.

    Returns:
        tuple: ``(x_upsampled, y_upsampled)``; ``y_upsampled`` is a 2-D column.

    Raises:
        ValueError: If ``y`` contains no positive sample.
    """
    xy = np.concatenate((x, y), axis=1)

    zeros = xy[xy[:, -1] == 0]
    ones = xy[xy[:, -1] == 1]

    if len(ones) == 0:
        # Previously this surfaced as an opaque ZeroDivisionError.
        raise ValueError("cannot upsample: y contains no positive (1) samples")

    # Row indices 0, 1, ..., k-1, 0, 1, ... -- one per negative row.
    cyclic_rows = np.arange(len(zeros)) % len(ones)
    ones_upsampled = ones[cyclic_rows]

    xy_upsampled = np.concatenate((ones_upsampled, zeros), axis=0)  # combine
    np.random.shuffle(xy_upsampled)                                 # shuffle

    x_upsampled = xy_upsampled[:, :-1]   # features
    y_upsampled = xy_upsampled[:, -1:]   # targets

    return x_upsampled, y_upsampled


#x_train_up, y_train_clicks_up = upsampling(x_train, y_train_clicks)

### Integer / Categorical One-Hot Encoding

In [11]:
#y_train_clicks_up = keras.utils.to_categorical(y_train_clicks_up, 2)
#y_val_clicks = keras.utils.to_categorical(y_val_clicks, 2)

#y_train_clicks_up = y_train_clicks_up.astype(int)
#y_val_clicks = y_val_clicks.astype(int)

# Logistic Regression - "Clicks"

In [12]:
from sklearn.linear_model import LogisticRegression

# scikit-learn expects a 1-D target vector; passing the (n, 1) column
# triggers a DataConversionWarning, so flatten it first.
y_train_clicks_flat = y_train_clicks.ravel()

# Down-weight the frequent non-click class by the click/non-click ratio. The
# counts are derived from the labels instead of being hard-coded
# (1793 clicks vs. 2429188 non-clicks for the original training split).
n_clicks = int((y_train_clicks_flat == 1).sum())
n_no_clicks = int((y_train_clicks_flat == 0).sum())
click_ratio = n_clicks / n_no_clicks
class_weight = {0: click_ratio, 1: 1 - click_ratio}  # alternative: {0: 0.5, 1: 0.5}

log_regression = LogisticRegression(
    class_weight=class_weight, C=1.0, penalty='l2', verbose=10
).fit(x_train, y_train_clicks_flat)

## model coefficients
print(train_data.data_features.columns)
print(log_regression.coef_)

print("\n-- logistic regression completed --")

  y = column_or_1d(y, warn=True)


[LibLinear]Index(['hour', 'bidid', 'useragent', 'IP', 'adexchange', 'domain', 'slotid',
       'slotwidth', 'slotheight', 'slotvisibility', 'slotformat', 'slotprice',
       'creative', 'keypage', 'advertiser', 'usertag'],
      dtype='object')
[[ 0.54760256  0.09121162 -3.26912613  0.05623659  0.21171319  0.1202381
  -0.12860596  0.01611399 -0.12425519 -0.46684149  1.98194502  1.36402433
  -0.17429778  0.12491277 -0.13256102  0.50086608]]

-- logistic regression completed --


### Prediction

In [13]:
# USE PREDICTION DIRECTLY________________________________________

def click_prediction(log_regression, features, labels = None):
    """Predict hard click labels with the model's own decision rule.

    Prints how often each label was predicted. When ``labels`` is given, also
    prints the model's ``score``, the weighted F1 score and the accuracy.

    Args:
        log_regression: Fitted model providing ``predict`` and ``score``.
        features: Feature matrix to predict on.
        labels: Optional true labels used only for the printed metrics.

    Returns:
        The array returned by ``log_regression.predict(features)``.
    """
    click_predictions = log_regression.predict(features)

    # Distribution of the predicted labels.
    predicted_counts = pd.DataFrame(click_predictions, columns=['click'])['click'].value_counts()
    print(predicted_counts)

    if labels is None:
        return click_predictions

    mean_accuracy = log_regression.score(features, labels)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(mean_accuracy))

    from sklearn import metrics
    weighted_f1 = metrics.f1_score(labels, click_predictions, average='weighted')
    print("\nF1_score:", weighted_f1)
    print("Accuracy:", metrics.accuracy_score(labels, click_predictions), "\n")

    return click_predictions



# USE PREDICTION PROBABILITY________________________________________

def click_prob_prediction(log_regression, features, labels = None, click_thres_prob = 0.85):
    """Predict clicks by thresholding the predicted click probability.

    A sample is labelled 1 when the probability in the second column of
    ``predict_proba`` is strictly greater than ``click_thres_prob``, else 0.
    Prints how often each label was chosen and, when ``labels`` is given, the
    weighted F1 score and the accuracy.

    Args:
        log_regression: Fitted model providing ``predict_proba``.
        features: Feature matrix to predict on.
        labels: Optional true labels used only for the printed metrics.
        click_thres_prob: Probability threshold; defaults to the previously
            hard-coded 0.85.

    Returns:
        list[int]: One 0/1 decision per row of ``features``.
    """
    click_predictions_prob = np.asarray(log_regression.predict_proba(features))

    # Vectorised threshold on the second probability column; tolist() yields
    # plain Python ints, as the former per-row loop did.
    click_prob_decision = (click_predictions_prob[:, 1] > click_thres_prob).astype(int).tolist()

    click_predictions_prob_df = pd.DataFrame(click_prob_decision, columns=['click'])
    print(click_predictions_prob_df['click'].value_counts())

    if labels is not None:

        from sklearn import metrics
        print("\nF1_score:", metrics.f1_score(labels, click_prob_decision, average='weighted'))
        print("Accuracy:", metrics.accuracy_score(labels, click_prob_decision), "\n")

    return click_prob_decision


# Evaluate both decision rules on the validation split.
# NOTE: the second call overwrites `click_predictions`, so the thresholded
# probability decisions are what later cells see.
click_predictions = click_prediction(log_regression, x_val, y_val_clicks)
click_predictions = click_prob_prediction(log_regression, x_val, y_val_clicks)

0    216603
1     87322
Name: click, dtype: int64
Accuracy of logistic regression classifier on test set: 0.71

F1_score: 0.8316035868799905
Accuracy: 0.7126462120588961 

0    301211
1      2714
Name: click, dtype: int64

F1_score: 0.9946046810627399
Accuracy: 0.9905568808094102 



# Linear Regression - "Payprice"

In [401]:
from sklearn.linear_model import LinearRegression

# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so it is no longer passed. The features are already min-max scaled by
# scale_data; for ordinary least squares the extra normalisation is not
# expected to change the predictions (NOTE(review): confirm on your data).
lin_regression = LinearRegression().fit(x_train, y_train_payprice)

## model coefficients
print(train_data.data_features.columns)
print(lin_regression.coef_)

print("\n-- linear regression completed --")

Index(['weekday', 'hour', 'useragent', 'IP', 'adexchange', 'slotid',
       'slotwidth', 'slotheight', 'slotvisibility', 'slotformat', 'slotprice',
       'creative', 'advertiser', 'usertag'],
      dtype='object')
[[-5.39503494e+00 -3.36176870e+00  5.18555811e+01  7.54098460e-02
  -6.23748817e+01  9.26153426e+00 -1.64822152e+01 -7.83998964e+00
   5.28672343e+00  6.08391278e+01  1.78837372e+02 -1.75130975e+01
   6.38067130e+00 -1.86742771e+00]]

-- linear regression completed --


### Prediction

In [394]:
# USE PREDICTION DIRECTLY________________________________________

def payprice_prediction(lin_regression, features, labels = None):
    """Predict pay prices for ``features`` with the fitted regression model.

    When ``labels`` is given, prints the model's ``score`` (R^2 for a
    scikit-learn regressor) and the mean squared error.

    Args:
        lin_regression: Fitted model providing ``predict`` and ``score``.
        features: Feature matrix to predict on.
        labels: Optional true pay prices used only for the printed metrics.

    Returns:
        The array returned by ``lin_regression.predict(features)``.
    """
    # Predict on the argument. The previous version always used the global
    # `x_val`, so predictions for any other split were wrong.
    payprice_predictions = lin_regression.predict(features)

    if labels is not None:

        # The old message was copied from the classifier cell and called this
        # value the accuracy of a logistic regression.
        print('R^2 of linear regression on evaluation set: {:.2f}'.format(lin_regression.score(features, labels)))

        from sklearn import metrics
        print("\nMean Squared Error:", metrics.mean_squared_error(labels, payprice_predictions), "\n")

    return payprice_predictions

# Predict pay prices for the validation split and print the evaluation metrics.
payprice_predictions = payprice_prediction(lin_regression, x_val, y_val_payprice)

Accuracy of logistic regression classifier on test set: 0.11

Mean Squared Error: 3211.5799590621386 



# Make Bidding

## Predict Clicks and Payprice

In [14]:
# Which split to produce bids for: "val" (labels available, metrics are
# printed) or "test" (no labels are passed).
pick_data = "val"



# VAL______________

if pick_data == "val":

    predict_data = x_val

    ## CLICKS___________

    #y_val_clicks
    
    # Thresholded probability decisions are used; the direct-prediction
    # variant is kept below for reference.
    print("click prediction")
    #click_predictions = click_prediction(log_regression, x_val, y_val_clicks)
    click_predictions = click_prob_prediction(log_regression, x_val, y_val_clicks)

    ## PAYPRICE___________

    # NOTE(review): the pay-price prediction is disabled here, so
    # `payprice_predictions` keeps whatever value an earlier cell assigned --
    # confirm it matches `predict_data` before bidding.
    #y_val_payprice
    #print("payprice prediction")
    #payprice_predictions = payprice_prediction(lin_regression, x_val, y_val_payprice)


    
# TEST______________

if pick_data == "test": 
    
    predict_data = x_test

    ## CLICKS___________

    #y_val_clicks

    # No labels are passed for the test split, so only the predicted label
    # counts are printed.
    print("click prediction")
    #click_predictions = click_prediction(log_regression, x_test)
    click_predictions = click_prob_prediction(log_regression, x_test)

    ## PAYPRICE___________

    #y_val_payprice
    
    # NOTE(review): only the banner is printed; the prediction call itself is
    # commented out, so `payprice_predictions` is not refreshed for the test
    # split.
    print("payprice prediction")
    #payprice_predictions = payprice_prediction(lin_regression, x_test)



click prediction
0    301211
1      2714
Name: click, dtype: int64

F1_score: 0.9946046810627399
Accuracy: 0.9905568808094102 



## Make Bidding Decision

In [336]:
def set_bids(bids, click_predictions, payprice_predictions):
    """Place a bid on every impression that is predicted to be clicked.

    For each position ``p`` with ``click_predictions[p] == 1`` the bid is set
    to ``floor(payprice_predictions[p]) + 2``; all other entries of ``bids``
    are left untouched. Summary statistics are printed.

    Args:
        bids: Mutable sequence of bid prices (e.g. a NumPy array); it is
            modified in place.
        click_predictions: Per-impression 0/1 click decisions.
        payprice_predictions: Per-impression predicted pay prices; each entry
            may be a scalar or a one-element row.

    Returns:
        The same ``bids`` object, after modification.
    """
    ## 1.) Only bid for expected clicks!

    predicted_clicks = 0
    spend_on_clicks = 0

    for p in range(len(bids)):

        if click_predictions[p] != 1:
            continue

        predicted_clicks += 1

        # Regression output may come as one-element rows; take the scalar
        # explicitly instead of relying on implicit array-to-float conversion.
        predicted_price = float(np.ravel(payprice_predictions[p])[0])
        bid_price = math.floor(predicted_price) + 2
        spend_on_clicks += bid_price

        bids[p] = bid_price

    # Avoid a ZeroDivisionError when no click was predicted at all.
    average_spent = spend_on_clicks / predicted_clicks if predicted_clicks else 0.0

    print("\npredicted_clicks:", predicted_clicks)
    print("spent_on_clicks:", spend_on_clicks)
    print("average_spent:", average_spent, "\n")

    return bids




def adjust_bidprices(bids, budget, payprice_predictions,
                     exceed_budget=100000000000000, fill_bid_price=70):
    """Move the planned total spend towards ``budget``.

    * Planned spend above the budget: the currently highest bids are lowered
      by 1 repeatedly while the overspend is larger than ``exceed_budget``.
    * Otherwise: batches of up to 100 randomly chosen zero bids are set to
      ``fill_bid_price`` while the overspend is below ``exceed_budget`` and
      zero bids remain.

    With the default ``exceed_budget`` (kept from the original code) an
    over-budget plan is left unchanged and an under-budget plan is topped up
    until no zero bid is left.

    Args:
        bids: NumPy array of bid prices; it is modified in place.
        budget: Total budget to plan against. The argument is now honoured;
            it used to be overwritten by a constant 6250000.
        payprice_predictions: Unused; kept for interface compatibility.
        exceed_budget: Overspend of the planned total that is tolerated.
        fill_bid_price: Price assigned to zero bids when topping up.

    Returns:
        The same ``bids`` array, after modification.

    Raises:
        ValueError: If topping up is required and ``fill_bid_price`` is not
            positive (the loop could never make progress).
    """
    ## 2.) Prefer cheap payprice predictions

    fill_batch_size = 100
    planned_bid_amount = bids.sum()

    ## (1) spend too much_______________________

    if planned_bid_amount - budget > 0:

        print("-- spend too much:", planned_bid_amount - budget)

        # Stop as well once every bid has reached zero, so the loop always ends.
        while planned_bid_amount - budget > exceed_budget and bids.size and bids.max() > 0:

            highest = bids.max()                  # find expensive bids
            bids[bids == highest] = highest - 1   # set expensive bids lower
            planned_bid_amount = bids.sum()       # check new bidding amount

    ## (2) spend too little______________________

    else:

        print("-- spend too little:", budget - planned_bid_amount)

        if fill_bid_price <= 0:
            raise ValueError("fill_bid_price must be positive")

        while budget - planned_bid_amount > (-exceed_budget):

            unused, = np.where(bids == 0)
            if unused.size == 0:
                # Nothing left to fill. The original called random.sample with
                # a fixed size of 100 here and died with a ValueError.
                break

            chosen = random.sample(list(unused), min(fill_batch_size, unused.size))
            bids[chosen] = fill_bid_price
            planned_bid_amount = bids.sum()

    positive_bids = bids[bids > 0]
    n_bids = len(positive_bids)
    # Guard the mean of an empty selection (no positive bid at all).
    average_bidprice = round(np.mean(positive_bids)) if n_bids else 0

    print("\n\nplanned_bid_amount:", bids.sum(), ", difference to budget:", (budget - bids.sum()),
              ", number of bids:", n_bids, ", average bidprice:", average_bidprice)

    return bids

    

In [340]:
# Total budget handed to the bid-adjustment step.
budget = 6250000
# One bid per row of the selected split; 0 means "do not bid".
bid_array = np.zeros((len(predict_data)))

# NOTE(review): `payprice_predictions` must already exist from the
# pay-price prediction cell; the cell that selects `predict_data` does not
# recompute it -- confirm both refer to the same split.
bid_decisions = set_bids(bid_array, click_predictions, payprice_predictions)
bids = adjust_bidprices(bid_decisions, budget, payprice_predictions)


predicted_clicks: 268079
spent_on_clicks: 19224972
average_spent: 71.7138306245547 

-- spend too much: 12974972.0


planned_bid_amount: 19224972.0 , difference to budget: -12974972.0 , number of bids: 267990 , average bidprice: 72.0


## Test Decision in Auction

In [341]:
#data_path = os.path.abspath(os.pardir + '/Data/validation.csv')
#df = pd.read_csv(data_path, na_values=['Na', 'null']).fillna(0)

In [342]:
# Replay the planned bids against the logged auctions and report the outcome.
# NOTE(review): `df` is not defined in the visible code -- the cell that would
# load the validation CSV is commented out; it must be loaded before this runs.
# NOTE(review): `bids[index]` uses the DataFrame index label as a position,
# which presumably relies on a default 0..n-1 index -- confirm.
if pick_data == "val": 
    
    budget = 6250000

    ## Evaluation Stats_____________

    bids_won = 0
    earned_clicks = 0
    ctr = 0                  # click-through rate: earned_clicks / bids_won
    total_paid = 0
    cpc = 0                  # cost per click: total_paid / earned_clicks


    for index, row in df.iterrows():

        if bids[index] > budget: # cap the bid at the remaining budget
            bids[index] = budget
            #print("constant bid reduced to:", constant_bid, ", total_paid:", total_paid, ", bids_won:", bids_won, ", earned clicks:", earned_clicks, "\n")

        # Stop once the budget is used up.
        if budget <= 0:
            print("-- break after auction #", index)
            break

        # WON BID ______________________________________________

        # The auction counts as won when the bid is at least the logged pay
        # price; the pay price (not the bid) is charged.
        if bids[index] >= row['payprice']:     

            bids_won += 1                        # won the bid
            total_paid += row['payprice']        # add amount to total_paid   
            budget = budget - row['payprice']    # subtract money from budget

            #if constant_bid == row['bidprice']:      
                #budget = budget - row['payprice']    # subtract money from budget

            #elif constant_bid > row['bidprice']:
            #    budget = budget - row['bidprice']    # subtract money from budget

            # CLICK = 1 ______________________________________________

            if row['click'] == 1:    # count the click; the budget was already charged above for every won bid

                    earned_clicks += 1                   # earn the click
                    #print("current budget:", budget, ", earned clicks:", earned_clicks, "\n")

        # Progress report every 100000 auctions.
        if index%100000 == 0:
            print("bid#", index, ", budget:", budget, ", payprice:", row['payprice'], ", bids_won:", bids_won, ", earned_clicks:", earned_clicks, "\n")



    print("__________________________________\n")

    # Both ratios keep their initial 0 when the denominator is 0.
    if earned_clicks > 0:
        cpc = total_paid / earned_clicks
    if bids_won > 0:
        ctr = earned_clicks / bids_won

    print("left budget:", budget)
    print("bids_won:", bids_won)
    print("earned clicks:", earned_clicks)
    print("CTR:", ctr)
    print("cost per click:", cpc)



bid# 0 , budget: 6249977 , payprice: 23 , bids_won: 1 , earned_clicks: 0 

bid# 100000 , budget: 4209996 , payprice: 63 , bids_won: 48639 , earned_clicks: 24 

bid# 200000 , budget: 2169485 , payprice: 196 , bids_won: 97231 , earned_clicks: 45 

bid# 300000 , budget: 148429 , payprice: 60 , bids_won: 145382 , earned_clicks: 69 

__________________________________

left budget: 69701
bids_won: 147239
earned clicks: 71
CTR: 0.0004822091972914785
cost per click: 87046.4647887324


## CSV Submission

In [None]:
# Read the raw test CSV from the parent directory's Data folder; the strings
# 'Na' and 'null' are parsed as missing values, and missing values are then
# replaced by 0.
data_path_test = os.path.abspath(os.pardir + '/Data/test.csv')
df_test = pd.read_csv(data_path_test, na_values=['Na', 'null']).fillna(0)


In [262]:
# Pair every test bid id with its planned bid price.
# NOTE(review): `bids` is sized by `predict_data`, i.e. by the split chosen via
# `pick_data`; it only lines up with `df_test` when the bids were produced
# with pick_data == "test" -- confirm before submitting.
bidprice_series = pd.Series(data = bids, name='bidprice')
submission_df = pd.DataFrame({'bidid': df_test['bidid'],'bidprice':bidprice_series})

# Group Token: QQri5ISZz4Kn
# Write the submission file without the DataFrame index column.
submission_df.to_csv('testing_bidding_price.csv', index = False)
