# Linear Model v2

## Environment

In [44]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import random
import math
import copy

import sys
sys.path.append('../')

from Preprocessing import preprocessing
from Preprocessing.single_set import SingleSet

In [45]:
%matplotlib inline

## Data Preprocessing

Features: data.data_features
Targets: data.data_targets (click, bidprice, payprice)

In [46]:
# Load the train / validation / test splits through the project's SingleSet wrapper.
# Each successful load prints "-- data loaded --".
# NOTE(review): the paths start with '/' but are passed as `relative_path`;
# presumably SingleSet resolves them against the project root — confirm.
train_data_path = '/Data/train.csv'
train_data = SingleSet(relative_path=train_data_path,use_numerical_labels=True)

val_data_path = '/Data/validation.csv'
val_data = SingleSet(relative_path=val_data_path,use_numerical_labels=True)

test_data_path = '/Data/test.csv'
test_data = SingleSet(relative_path=test_data_path,use_numerical_labels=True)

-- data loaded --
-- data loaded --
-- data loaded --


In [47]:
def pandas_to_numpy(data):
    """Convert a data set's feature (and, if present, target) frames to NumPy arrays.

    Args:
        data: object exposing a ``data_features`` DataFrame and, optionally,
            a ``data_targets`` DataFrame.

    Returns:
        Tuple ``(features, labels)``. ``labels`` is ``None`` when ``data`` has
        no ``data_targets`` attribute.
    """
    ## features
    features = np.asarray(data.data_features.values)

    ## targets
    # Default to None so an unlabelled set does not hit an unbound local on return.
    labels = None
    if hasattr(data, "data_targets"):
        labels = np.asarray(data.data_targets.values)

    return features, labels


## drop unnecessary features
def drop_features(data, keep_features=None):
    """Remove every feature column that is not whitelisted, in place.

    Args:
        data: object exposing a ``data_features`` DataFrame; the frame is
            modified in place.
        keep_features: iterable of column names to keep. Defaults to the
            click-model feature set used by this notebook.

    Returns:
        None. ``data.data_features`` keeps the surviving columns in their
        original order.
    """
    if keep_features is None:
        keep_features = ["hour", "useragent", "adexchange", "url", "slotformat", "slotid"]

    wanted = set(keep_features)

    # Collect the unwanted columns first and drop them in one call, instead of
    # dropping column by column while iterating over the frame being mutated.
    unwanted = [col for col in data.data_features.columns if col not in wanted]
    data.data_features.drop(unwanted, axis=1, inplace=True)
    
    
# Work on deep copies so the originally loaded sets keep all of their columns
# (they are copied again later for the payprice model).
train_data_click = copy.deepcopy(train_data)
val_data_click = copy.deepcopy(val_data)
test_data_click = copy.deepcopy(test_data)

# Reduce each copy to the click-model feature whitelist (in place).
drop_features(train_data_click)
drop_features(val_data_click)
drop_features(test_data_click)

# Feature matrices plus the full target arrays; the click column is sliced out
# of y_train / y_val in the next cell.
x_train_clicks, y_train = pandas_to_numpy(train_data_click)
x_val_clicks, y_val = pandas_to_numpy(val_data_click)
x_test_clicks, y_test = pandas_to_numpy(test_data_click)


### Data Input Shapes

In [48]:
# Number of feature columns fed to the click model.
input_shape = x_train_clicks.shape[1]
print("input_shape", input_shape)
output_shape = 1

# targets_________________________________________________

# clicks
# Column 0 of the target array is reshaped to an (n_samples, 1) column vector.
y_train_clicks = np.reshape(y_train[:,0], (y_train.shape[0], 1))  # get first column (clicks)
y_val_clicks = np.reshape(y_val[:,0], (y_val.shape[0], 1))  # get first column (clicks)

input_shape 6


### Scale Data

In [49]:
def scale_data(x_train, x_val, x_test):
    """Min-max scale three feature matrices to [0, 1] with one shared scaler.

    Args:
        x_train: training feature matrix.
        x_val: validation feature matrix.
        x_test: test feature matrix.

    Returns:
        Tuple ``(x_train, x_val, x_test)`` of the transformed matrices.
    """
    # Only MinMaxScaler is needed. The previous extra `from sklearn import
    # preprocessing` was unused and shadowed the project-level `preprocessing`
    # import inside this function.
    from sklearn.preprocessing import MinMaxScaler

    ## features
    feature_scaler = MinMaxScaler(feature_range=(0, 1))

    # NOTE(review): the scaler is fitted on train + val + test together, so
    # validation/test value ranges influence the training features. Fitting on
    # x_train alone would avoid that — confirm whether this is intended.
    feature_scaler.fit(np.concatenate((x_train, x_val, x_test), axis = 0))

    x_train = feature_scaler.transform(x_train)
    x_val = feature_scaler.transform(x_val)
    x_test = feature_scaler.transform(x_test)

    return x_train, x_val, x_test

    ## Targets________________________________________

    # payprice
    #payprice_scaler = MinMaxScaler(feature_range=(0, 1))
    #payprice_scaler.fit(np.concatenate((y_train_payprice, y_val_payprice), axis = 0))   

    #y_train_payprice = payprice_scaler.transform(y_train_payprice)
    #y_val_payprice = payprice_scaler.transform(y_val_payprice)

#x_train, x_val, x_test = scale_data(x_train, x_val, x_test)

# Model - Neural Networks

## "Click" - Binary Classification

Train Baseline Accuracy "Clicks": 0.9992618932746251 (≈ 99.93%)
#of 0:    2429188     # of 1:       1793


Val Baseline Accuracy "Clicks": 0.9993349203056733 (≈ 99.93%)
#of 0:    303723     # of 1:       202

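Average CTR (approximated here as clicks / non-clicks; the exact CTR, clicks / impressions, would be 1793 / 2430981):

1793 / 2429188 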


## Upsampling

sample up "1"s for more balanced classification

--> default accuracy: 50%

In [50]:
def upsampling(x, y):
    """Balance a binary data set by repeating the positive (label 1) rows.

    Positive rows are repeated until there are as many of them as negative
    rows; the combined set is then shuffled with NumPy's global RNG.

    Args:
        x: 2-D feature array of shape (n_samples, n_features).
        y: labels with values 0 / 1, either as an (n_samples, 1) column or
            as a 1-D array of length n_samples.

    Returns:
        Tuple ``(x_upsampled, y_upsampled)``; ``y_upsampled`` is always an
        (n, 1) column.

    Raises:
        ValueError: if there is no positive row to repeat.
    """
    features = np.asarray(x)
    targets = np.asarray(y)
    if targets.ndim == 1:
        # Accept flat label vectors as well as column vectors.
        targets = targets[:, np.newaxis]

    xy = np.concatenate((features, targets), axis = 1)

    zeros = xy[xy[:,-1] == 0]
    ones = xy[xy[:,-1] == 1]

    if len(ones) == 0:
        # Previously this surfaced as a bare ZeroDivisionError below.
        raise ValueError("upsampling needs at least one sample with label 1")

    repeats = math.ceil(len(zeros) / len(ones))

    # Repeat each positive row, then cut to exactly the number of negatives.
    ones_upsampled = np.repeat(ones, repeats, axis=0)[:zeros.shape[0]]

    xy_upsampled = np.concatenate((ones_upsampled, zeros), axis = 0) # combine
    np.random.shuffle(xy_upsampled)                                  # shuffle

    x_upsampled = xy_upsampled[:,:-1]   # features
    y_upsampled = xy_upsampled[:,-1:]   # targets

    return x_upsampled, y_upsampled


#x_train_up, y_train_clicks_up = upsampling(x_train, y_train_clicks)

### Integer / Categorical One-Hot Encoding

In [51]:
#y_train_clicks_up = keras.utils.to_categorical(y_train_clicks_up, 2)
#y_val_clicks = keras.utils.to_categorical(y_val_clicks, 2)

#y_train_clicks_up = y_train_clicks_up.astype(int)
#y_val_clicks = y_val_clicks.astype(int)

# Logistic Regression - "Clicks"

In [53]:
from sklearn.linear_model import LogisticRegression

# Equal weight for both classes.
# An inverse-frequency alternative tried earlier was
# {0: (1793 / 2429188), 1: (1-(1793 / 2429188))}.
class_weight = {0: 0.5, 1: 0.5}

# scikit-learn expects a 1-D target vector; y_train_clicks is an (n, 1) column,
# so flatten it explicitly instead of relying on the implicit conversion.
log_regression = LogisticRegression(class_weight=class_weight, C=1.0, penalty='l2', verbose=10).fit(x_train_clicks, y_train_clicks.ravel())
## model coefficients
print(train_data_click.data_features.columns)
print(log_regression.coef_)

print("\n-- logistic regression completed --")

[LibLinear]Index(['hour', 'useragent', 'adexchange', 'url', 'slotid', 'slotformat'], dtype='object')
[[-1.59960239e-01 -1.44829606e-01 -5.76878715e-03 -1.29796495e-06
  -2.85439513e-05  1.03332108e-03]]

-- logistic regression completed --


### Prediction

In [54]:
# USE PREDICTION DIRECTLY________________________________________

def click_prediction(log_regression, features, labels = None):
    """Predict hard click labels and optionally report quality metrics.

    Args:
        log_regression: fitted classifier exposing ``predict``.
        features: feature matrix to predict on.
        labels: optional true labels; when given, accuracy and weighted F1
            are printed.

    Returns:
        The array returned by ``log_regression.predict(features)``.
    """
    click_predictions = log_regression.predict(features)
    click_predictions_df = pd.DataFrame(click_predictions, columns= ['click'])
    print(click_predictions_df['click'].value_counts())

    if labels is not None:

        from sklearn import metrics

        # Mean accuracy is computed once from the predictions already made.
        # `log_regression.score(...)` returns the same number but would run
        # the prediction over the whole set a second time.
        accuracy = metrics.accuracy_score(labels, click_predictions)

        print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(accuracy))

        print("\nF1_score:", metrics.f1_score(labels, click_predictions, average='weighted'))
        print("Accuracy:", accuracy, "\n")

    return click_predictions



# USE PREDICTION PROBABILITY________________________________________

def click_prob_prediction(log_regression, click_thres_prob, return_binary, features, labels = None):
    """Predict click probabilities, optionally thresholded to 0/1 decisions.

    Args:
        log_regression: fitted classifier exposing ``predict_proba``.
        click_thres_prob: probability threshold; a row is a predicted click
            when its class-1 probability is strictly greater than this value.
            Only used when ``return_binary`` is truthy.
        return_binary: when falsy, return the raw probability matrix; when
            truthy, return thresholded decisions.
        features: feature matrix to predict on.
        labels: optional true labels; when given (binary mode only), weighted
            F1 and accuracy are printed.

    Returns:
        The ``predict_proba`` output when ``return_binary`` is falsy,
        otherwise a list of ints (0 or 1), one per row.
    """
    click_predictions_prob = log_regression.predict_proba(features)

    # Plain truthiness instead of `== False` / `== True`, so a non-bool flag
    # can no longer fall through both branches and return None.
    if not return_binary:
        return click_predictions_prob

    # Vectorised threshold on the class-1 column; same strict `>` as before.
    positive_prob = np.asarray(click_predictions_prob)[:, 1]
    click_prob_decision = (positive_prob > click_thres_prob).astype(int).tolist()

    click_predictions_prob_df = pd.DataFrame(click_prob_decision, columns= ['click'])
    print(click_predictions_prob_df['click'].value_counts())

    if labels is not None:

        from sklearn import metrics
        print("\nF1_score:", metrics.f1_score(labels, click_prob_decision, average='weighted'))
        print("Accuracy:", metrics.accuracy_score(labels, click_prob_decision), "\n")

    return click_prob_decision



# Notebook-level settings; both are read again by later cells.
click_thres_prob = 0.5
return_binary = False

# Evaluate the hard-label predictions on the validation split.
click_predictions = click_prediction(log_regression, x_val_clicks, y_val_clicks)
#click_predictions = click_prob_prediction(log_regression, click_thres_prob, return_binary, x_val, y_val_clicks)

0    303925
Name: click, dtype: int64
Accuracy of logistic regression classifier on test set: 1.00

F1_score: 0.9990031539865409
Accuracy: 0.9993353623426833 



  'precision', 'predicted', average, warn_for)


# Linear Regression - "Payprice"

## Data Loader

In [41]:
# Fresh deep copies of the loaded sets for the payprice model, so a different
# feature subset can be dropped without touching the originals.
train_data_payprice = copy.deepcopy(train_data)
val_data_payprice = copy.deepcopy(val_data)
test_data_payprice = copy.deepcopy(test_data)


def pandas_to_numpy(data):
    """Convert a data set's feature (and, if present, target) frames to NumPy arrays.

    Args:
        data: object exposing a ``data_features`` DataFrame and, optionally,
            a ``data_targets`` DataFrame.

    Returns:
        Tuple ``(features, labels)``. ``labels`` is ``None`` when ``data`` has
        no ``data_targets`` attribute.
    """
    ## features
    features = np.asarray(data.data_features.values)

    ## targets
    # Default to None so an unlabelled set does not hit an unbound local on return.
    labels = None
    if hasattr(data, "data_targets"):
        labels = np.asarray(data.data_targets.values)

    return features, labels


## drop unnecessary features
def drop_features(data, keep_features=None):
    """Remove every feature column that is not whitelisted, in place.

    Args:
        data: object exposing a ``data_features`` DataFrame; the frame is
            modified in place.
        keep_features: iterable of column names to keep. Defaults to the
            payprice-model feature set used by this notebook.

    Returns:
        None. ``data.data_features`` keeps the surviving columns in their
        original order.
    """
    if keep_features is None:
        keep_features = ["adexchange", "domain", "slotwidth", "slotheight", "slotformat", "slotprice"]

    wanted = set(keep_features)

    # Collect the unwanted columns first and drop them in one call, instead of
    # dropping column by column while iterating over the frame being mutated.
    unwanted = [col for col in data.data_features.columns if col not in wanted]
    data.data_features.drop(unwanted, axis=1, inplace=True)
    

# Reduce each copy to the payprice-model feature whitelist (in place).
drop_features(train_data_payprice)
drop_features(val_data_payprice)
drop_features(test_data_payprice)

# NOTE: this rebinds y_train / y_val / y_test (and input_shape below), which
# were first assigned in the click-model cells.
x_train_payprice, y_train = pandas_to_numpy(train_data_payprice)
x_val_payprice, y_val = pandas_to_numpy(val_data_payprice)
x_test_payprice, y_test = pandas_to_numpy(test_data_payprice)


input_shape = x_train_payprice.shape[1]
print("input_shape", input_shape)
output_shape = 1

# payprice
# Column 2 of the target array is reshaped to an (n_samples, 1) column vector.
y_train_payprice = np.reshape(y_train[:,2], (y_train.shape[0], 1))  # get third column (payprice)
y_val_payprice = np.reshape(y_val[:,2], (y_val.shape[0], 1))  # get third column (payprice)


input_shape 6


In [43]:
from sklearn.linear_model import LinearRegression

# The former `normalize=True` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2. For unpenalised least squares the fitted predictions and the
# reported coefficients (in original feature units) are the same without it,
# up to floating-point error, so it is dropped.
lin_regression = LinearRegression().fit(x_train_payprice, y_train_payprice)

## model coefficients
print(train_data_payprice.data_features.columns)
print(lin_regression.coef_)

print("\n-- linear regression completed --")

Index(['adexchange', 'domain', 'slotwidth', 'slotheight', 'slotformat',
       'slotprice'],
      dtype='object')
[[-9.49636406e+00  9.49156433e-04 -1.99829419e-02 -1.34565218e-02
   1.29929560e+01  5.63191752e-01]]

-- linear regression completed --


### Prediction

In [44]:
# USE PREDICTION DIRECTLY________________________________________

def payprice_prediction(lin_regression, features, labels = None):
    """Predict pay prices and optionally report regression metrics.

    Args:
        lin_regression: fitted regressor exposing ``predict`` and ``score``.
        features: feature matrix to predict on.
        labels: optional true pay prices; when given, the model's ``score``
            and the mean squared error are printed.

    Returns:
        The array returned by ``lin_regression.predict(features)``.
    """
    # Predict on the `features` argument. The previous version always used the
    # global validation matrix, so any other input was silently ignored.
    payprice_predictions = lin_regression.predict(features)

    if labels is not None:

        # For a regressor, `score` is the coefficient of determination (R^2),
        # not a classification accuracy; the message is labelled accordingly.
        print('R^2 of linear regression: {:.2f}'.format(lin_regression.score(features, labels)))

        from sklearn import metrics
        print("\nMean Squared Error:", metrics.mean_squared_error(labels, payprice_predictions), "\n")

    return payprice_predictions

# Evaluate the payprice model on the validation split.
payprice_predictions = payprice_prediction(lin_regression, x_val_payprice, y_val_payprice)

Accuracy of logistic regression classifier on test set: 0.11

Mean Squared Error: 3215.034565779497 



# Make Bidding

## Predict Clicks and Payprice

In [45]:
# Selects which split the functions below operate on: "val" or "test".
pick_data = "val"


def predict_clicks_payprice(click_thres_prob, return_binary):
    """Run the click and payprice models on the split selected by ``pick_data``.

    Reads the module-level ``pick_data`` switch ("val" or "test"), the fitted
    ``log_regression`` / ``lin_regression`` models and the matching feature
    matrices. Validation labels are passed along so metrics get printed; the
    test split is predicted without labels.

    Args:
        click_thres_prob: probability threshold forwarded to
            ``click_prob_prediction``.
        return_binary: forwarded to ``click_prob_prediction``; selects raw
            probabilities or thresholded decisions.

    Returns:
        Tuple ``(click_predictions, payprice_predictions)``.

    Raises:
        ValueError: if ``pick_data`` is neither "val" nor "test".
    """
    if pick_data == "val":
        click_features, click_labels = x_val_clicks, y_val_clicks
        payprice_features, payprice_labels = x_val_payprice, y_val_payprice

    elif pick_data == "test":
        # Use the per-model test matrices. The previous code referenced a bare
        # `x_test`, which is never assigned in this notebook and also could not
        # match both models' feature sets.
        click_features, click_labels = x_test_clicks, None
        payprice_features, payprice_labels = x_test_payprice, None

    else:
        # Previously this fell through to an UnboundLocalError at the return.
        raise ValueError('pick_data must be "val" or "test", got {!r}'.format(pick_data))

    ## CLICKS___________
    print("click prediction")
    click_predictions = click_prob_prediction(log_regression, click_thres_prob, return_binary, click_features, click_labels)

    ## PAYPRICE___________
    print("payprice prediction")
    payprice_predictions = payprice_prediction(lin_regression, payprice_features, payprice_labels)

    return click_predictions, payprice_predictions


# return_binary=False keeps the raw probability matrix, which set_bids indexes as [:, 1].
click_predictions, payprice_predictions = predict_clicks_payprice(click_thres_prob = 0.5, return_binary = False)

#print(payprice_predictions[0:20])

click prediction
payprice prediction
Accuracy of logistic regression classifier on test set: 0.11

Mean Squared Error: 3215.034565779497 



## Make Bidding Decision

In [53]:
def set_bids(click_predictions, payprice_predictions, base_bid, averageCTR, click_predict_weight = None):
    """Compute one bid per impression, proportional to its predicted click probability.

    Each bid is ``base_bid * p_click / mean(p_click)``, so the average bid
    equals ``base_bid``.

    Args:
        click_predictions: 2-D array of class probabilities; column 1 is used
            as the click probability.
        payprice_predictions: unused; kept for interface compatibility.
        base_bid: bid placed for an impression with average click probability.
        averageCTR: unused; kept for interface compatibility.
        click_predict_weight: unused; kept for interface compatibility.

    Returns:
        1-D float array of bids, one per row of ``click_predictions``.
    """
    click_prob = np.asarray(click_predictions)[:, 1]
    predicted_clicks = len(click_prob)

    if predicted_clicks:
        # The mean is computed once. The previous loop recomputed it for every
        # row, which made the function quadratic in the number of impressions.
        bids = base_bid * click_prob / np.mean(click_prob)
    else:
        bids = np.zeros(0)

    spend_on_clicks = bids.sum()
    # Guard the average so an empty input no longer raises ZeroDivisionError.
    average_spent = spend_on_clicks / predicted_clicks if predicted_clicks else 0.0

    print("\npredicted_clicks:", predicted_clicks)
    print("spent_on_clicks:", spend_on_clicks)
    print("average_spent:", average_spent, "\n")

    return bids




def adjust_bidprices(bids, payprice_predictions, budget = 6250000, exceed_budget = 1000,
                     fill_bid_price = 70, fill_batch_size = 100):
    """Nudge the planned bids towards the total budget, modifying ``bids`` in place.

    * Planned total above budget: repeatedly lower the current highest bid(s)
      by 1 until the overshoot is at most ``exceed_budget``.
    * Otherwise: repeatedly set up to ``fill_batch_size`` randomly chosen zero
      bids to ``fill_bid_price`` while the planned total is below
      ``budget + exceed_budget``, or until no zero bid is left.

    Args:
        bids: 1-D NumPy array of bids; modified in place and returned.
        payprice_predictions: unused; kept for interface compatibility.
        budget: total budget to aim for.
        exceed_budget: tolerated overshoot of the budget.
        fill_bid_price: bid assigned to zero bids in the fill branch.
        fill_batch_size: maximum number of zero bids filled per iteration.

    Returns:
        The same ``bids`` array after adjustment.
    """
    import heapq

    planned_bid_amount = float(np.sum(bids))

    ## (1) spend too much_______________________

    if planned_bid_amount - budget > 0:

        print("-- spend too much:", planned_bid_amount - budget)

        # A max-heap (negated values) gives the current highest bid in
        # O(log n). The previous loop rescanned the whole array with builtin
        # max()/sum() for every single-unit decrement and did not finish on
        # the full data set.
        heap = [(-value, position) for position, value in enumerate(bids.tolist())]
        heapq.heapify(heap)

        while heap and planned_bid_amount - budget > exceed_budget:

            neg_top, position = heapq.heappop(heap)
            group = [position]

            # As before, all bids tied at the maximum are lowered together.
            while heap and heap[0][0] == neg_top:
                group.append(heapq.heappop(heap)[1])

            lowered = -neg_top - 1
            for position in group:
                bids[position] = lowered
                heapq.heappush(heap, (-lowered, position))

            planned_bid_amount -= len(group)      # each bid in the group dropped by 1

    ## (2) spend too little______________________

    else:

        print("-- spend too little:", budget - planned_bid_amount)

        # NOTE(review): this condition keeps filling until the plan is at least
        # `exceed_budget` ABOVE the budget (unchanged from the original);
        # confirm that overshooting is intended.
        while (budget - planned_bid_amount > (-exceed_budget)):

            empty_slots = np.flatnonzero(bids == 0)
            if empty_slots.size == 0:
                break                             # nothing left to fill

            # Never request more samples than there are zero bids;
            # random.sample would raise ValueError otherwise.
            batch = min(fill_batch_size, int(empty_slots.size))
            chosen = random.sample(list(empty_slots), batch)
            bids[chosen] = fill_bid_price
            planned_bid_amount = float(np.sum(bids))

    total_planned = np.sum(bids)
    placed = bids[bids > 0]
    n_bids = len(placed)
    # Avoid rounding NaN when no positive bid exists.
    average_bid = round(np.mean(placed)) if n_bids else 0

    print("\nplanned_bid_amount:", total_planned, ", difference to budget:", (budget - total_planned),
              ", number of bids:", n_bids, ", average bidprice:", average_bid)

    return bids

    

In [54]:
base_bid = 70
# Clicks divided by non-clicks in the training data (figures from the baseline notes above).
averageCTR = 1793 / 2429188 

bid_decisions = set_bids(click_predictions, payprice_predictions, base_bid, averageCTR)
# NOTE: the recorded run of this cell was interrupted inside adjust_bidprices
# (KeyboardInterrupt below), so `bids` was never assigned by it.
bids = adjust_bidprices(bid_decisions, payprice_predictions)


predicted_clicks: 303925
spent_on_clicks: 21274750.00000019
average_spent: 70.00000000000063 

-- spend too much: 15024750.00000019


KeyboardInterrupt: 

## Test Decision in Auction

In [26]:
# Raw validation log used by simulate_auction (reads the payprice and click columns).
# 'Na' and 'null' are parsed as missing values and then replaced by 0.
data_path = os.path.abspath(os.pardir + '/Data/validation.csv')
df = pd.read_csv(data_path, na_values=['Na', 'null']).fillna(0)

In [56]:

def simulate_auction(bids):
    """Replay the validation auctions with the given bids and print summary stats.

    Reads the module-level ``pick_data`` switch and the validation frame
    ``df``; does nothing unless ``pick_data == "val"``. An auction is won when
    the bid is at least the row's ``payprice``; the pay price is then deducted
    from the budget whether or not the impression was clicked.

    Args:
        bids: sequence of bids indexable by the labels of ``df.index``. A bid
            larger than the remaining budget is capped in place.

    Returns:
        None. Results are printed.
    """
    if pick_data != "val":
        return

    budget = 6250000

    ## Evaluation Stats_____________

    bids_won = 0
    earned_clicks = 0
    ctr = 0                  # earned_clicks / bids_won
    total_paid = 0
    cpc = 0                  # cost per click

    # Iterate only over the two columns that are needed. `iterrows` would
    # build a full Series object for every row.
    for index, payprice, click in zip(df.index, df['payprice'], df['click']):

        if bids[index] > budget: # never bid more than the remaining budget
            bids[index] = budget

        if budget <= 0:
            print("-- break after auction #", index)
            break

        # WON BID ______________________________________________

        if bids[index] >= payprice:

            bids_won += 1                    # won the bid
            total_paid += payprice           # add amount to total_paid
            budget = budget - payprice       # pay price is charged for every won auction

            # CLICK = 1 ______________________________________________

            if click == 1:
                earned_clicks += 1           # earn the click

        if index%100000 == 0:
            print("bid#", index, ", budget:", budget, ", payprice:", payprice, ", bids_won:", bids_won, ", earned_clicks:", earned_clicks, "\n")

    print("__________________________________\n")

    if earned_clicks > 0:
        cpc = total_paid / earned_clicks
    if bids_won > 0:
        ctr = earned_clicks / bids_won

    print("left budget:", budget)
    print("bids_won:", bids_won)
    print("earned clicks:", earned_clicks)
    print("CTR:", ctr)
    print("cost per click:", cpc)
        
# Requires `bids` from the bidding cell above; the recorded run failed with
# NameError because that cell had not completed.
simulate_auction(bids)

NameError: name 'bids' is not defined

# Simulate Auctions

In [62]:
# Parameter sweep: for each base bid, predict, set bids and replay the auctions.
# The budget-adjustment step is commented out here, so the raw bids from
# set_bids are simulated directly.
averageCTR = 1793 / 2429188 

#for click_thres_prob in [0.8]:
for base_bid in [70]:
    #for click_predict_weight in [120.5]:

    print("\n\n\n_________________________________________\n")
    print("PARAMETER: base_bid", base_bid)#, "click_predict_weight", click_predict_weight, "click_thres_prob", click_thres_prob,"\n")

    # click_thres_prob is the notebook-level value set in the prediction cell.
    click_predictions, payprice_predictions = predict_clicks_payprice(click_thres_prob, return_binary = False)
    bid_decisions = set_bids(click_predictions, payprice_predictions, base_bid, averageCTR)
    #bids = adjust_bidprices(bid_decisions, budget, payprice_predictions)
    simulate_auction(bid_decisions)

    print("_________________________________________\n\n\n")






_________________________________________

PARAMETER: base_bid 70
click prediction
payprice prediction
Accuracy of logistic regression classifier on test set: 0.11

Mean Squared Error: 3215.034565779497 


predicted_clicks: 303925
spent_on_clicks: 21274750.00000019
average_spent: 70.00000000000063 

bid# 0 , budget: 6249977 , payprice: 23 , bids_won: 1 , earned_clicks: 0 

bid# 100000 , budget: 4141027 , payprice: 63 , bids_won: 54977 , earned_clicks: 26 

bid# 200000 , budget: 2024727 , payprice: 196 , bids_won: 110128 , earned_clicks: 52 

-- break after auction # 297159
__________________________________

left budget: 0
bids_won: 162883
earned clicks: 76
CTR: 0.0004665925848615264
cost per click: 82236.84210526316
_________________________________________





## CSV Submission

In [34]:
# Raw test log; its 'bidid' column keys the submission file.
# 'Na' and 'null' are parsed as missing values and then replaced by 0.
data_path_test = os.path.abspath(os.pardir + '/Data/test.csv')
df_test = pd.read_csv(data_path_test, na_values=['Na', 'null']).fillna(0)


In [None]:
# Pair each test bidid with a bid price and write the submission CSV.
# NOTE(review): `bids` looks like it was produced from the validation-split
# predictions (pick_data = "val"); verify that it lines up row-for-row with
# df_test before submitting.
bidprice_series = pd.Series(data = bids, name='bidprice')
submission_df = pd.DataFrame({'bidid': df_test['bidid'],'bidprice':bidprice_series})

# NOTE(review): the group token below is a credential-like value; consider
# keeping it out of version control.
# Group Token: QQri5ISZz4Kn
submission_df.to_csv('testing_bidding_price.csv', index = False)


## Load old Submission File

In [129]:
# Write the adjusted bid prices as the submission CSV; this overwrites the
# file written by the previous submission cell.
# NOTE: df_test_adjusted is created in the cells further down; judging by the
# execution counters, this cell was run out of document order.
bidprice_series = pd.Series(data = df_test_adjusted, name='bidprice')
submission_df = pd.DataFrame({'bidid': df_test['bidid'],'bidprice':bidprice_series})

# NOTE(review): the group token below is a credential-like value; consider
# keeping it out of version control.
# Group Token: QQri5ISZz4Kn
submission_df.to_csv('testing_bidding_price.csv', index = False)

In [135]:
# Load a previously generated submission.
# NOTE(review): absolute, user-specific path — not reproducible on other machines.
df_test_final = pd.read_csv("/Users/niklasstoehr/Desktop/criterion_1_test_results.csv")
print(df_test_final.columns)
# From here on df_test_final is the integer bidprice Series, not the full frame.
df_test_final = df_test_final.bidprice.astype(int)

# The result of this sum is discarded: it is not the last expression of the
# cell, so the notebook does not display it.
sum(df_test_final)
# Independent copy for the adjustments below.
df_test_adjusted = df_test_final.copy(deep=True)

Index(['bidid', 'bidprice'], dtype='object')


In [136]:
# Cap every bid to the range [0, 305] and print the planned total.
df_test_adjusted = np.clip(df_test_adjusted, 0, 305)
print(sum(df_test_adjusted))
# Manually override the first bid.
df_test_adjusted[0] = 100
print(df_test_adjusted[0])

22775651
100


In [138]:
# Click probabilities for the test split; return_binary is False at notebook
# level, so the raw probability matrix is returned.
test_probs = click_prob_prediction(log_regression, click_thres_prob, return_binary, x_test_clicks)
one_prob = test_probs[:,1]
print(np.mean(one_prob))
# Shift the probabilities so that their mean is 1, then raise to the 4th power.
# The result is used as a per-impression bid multiplier centred near 1.
one_prob = (one_prob + (1 - np.mean(one_prob)))**4

print(np.mean(one_prob))
print(one_prob)

0.004853689726435591
1.001091978400976
[0.99212988 1.13713592 0.98328965 ... 1.03972609 1.07281731 0.99377936]


In [139]:
# Inspect the clipped bids and their mean before applying the multiplier.
print(df_test_adjusted )
np.mean(df_test_adjusted)



0         100
1         305
2          61
3          43
4          71
5          63
6          56
7          87
8          56
9          26
10         52
11        305
12         72
13         77
14         57
15        305
16         29
17         86
18         28
19         43
20         28
21         87
22         35
23         39
24         48
25        196
26         90
27         43
28         61
29         72
         ... 
303345     44
303346     33
303347     59
303348    122
303349     71
303350     55
303351     81
303352    103
303353     74
303354     78
303355     84
303356     67
303357    305
303358     60
303359     50
303360     75
303361    125
303362     63
303363     65
303364     56
303365     59
303366    109
303367     72
303368    305
303369     11
303370     17
303371     63
303372     92
303373     49
303374     44
Name: bidprice, Length: 303375, dtype: int64


75.07424474660074

In [140]:
# Scale each bid by its click-probability multiplier. Results are floats and
# can exceed the earlier 305 cap, as the output below shows.
print(df_test_adjusted * one_prob)

df_test_adjusted = df_test_adjusted * one_prob



0          99.212988
1         346.826456
2          59.980669
3          42.284912
4          69.798472
5          61.957732
6          55.275543
7          90.151684
8          55.810122
9          25.917824
10         51.339292
11        322.318494
12         74.444515
13         75.646271
14         56.001182
15        300.337153
16         28.775104
17         85.208866
18         27.577602
19         43.258762
20         27.706380
21         85.536434
22         34.808632
23         38.518641
24         47.170851
25        194.701078
26         89.279180
27         42.337457
28         60.006444
29         71.436100
             ...    
303345     43.251106
303346     32.463304
303347     58.045265
303348    120.093527
303349     73.267431
303350     54.025795
303351     79.666543
303352    101.686457
303353     73.502528
303354     76.639009
303355     83.147292
303356     66.627956
303357    300.501087
303358     59.035093
303359     50.112451
303360     73.713779
303361    122

In [141]:
# Convert the scaled bids back to integers; astype(int) truncates towards zero
# rather than rounding.
print(df_test_adjusted.astype(int))
df_test_adjusted = df_test_adjusted.astype(int)
np.mean(df_test_adjusted)
#df_test_adjusted = (df_test_adjusted * one_prob)

0          99
1         346
2          59
3          42
4          69
5          61
6          55
7          90
8          55
9          25
10         51
11        322
12         74
13         75
14         56
15        300
16         28
17         85
18         27
19         43
20         27
21         85
22         34
23         38
24         47
25        194
26         89
27         42
28         60
29         71
         ... 
303345     43
303346     32
303347     58
303348    120
303349     73
303350     54
303351     79
303352    101
303353     73
303354     76
303355     83
303356     66
303357    300
303358     59
303359     50
303360     73
303361    122
303362     62
303363     64
303364     55
303365     58
303366    107
303367     71
303368    299
303369     10
303370     16
303371     62
303372     95
303373     52
303374     43
Name: bidprice, Length: 303375, dtype: int64


75.89555500618047