In [1]:
import boto3
import random
import os
import json
import pandas as pd
import datetime
import gzip
import seaborn as sns
import gc
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score
from sklearn import preprocessing

In [2]:
def get_hours(startHour=None, numberOfHours=6):
    numberOfHours = numberOfHours % 25
    if startHour is None:
        startHour = datetime.datetime.now().hour
    hours = ["{:02d}".format( (hour + 24) % 24 ) for hour in range(startHour, startHour - numberOfHours, -1)]
    
    return hours

# print(get_hours(23, 24))

def get_files(bucket, prefix, days=[], hours=[], ext='.csv'):
    all_files = []
    s3_resource = boto3.resource('s3')
    for day in days:
        for hour in hours:
            _prefix = '{}/d={}/h={}/'.format(prefix, day, hour)
            new_files = [ obj.key for obj in s3_resource.Bucket(bucket).objects.filter(Prefix=_prefix).all() if obj.key.endswith(ext) ]
            all_files = all_files + new_files
            
    return all_files

In [3]:
def get_types_of_attributes():
    return {
        'deliveryid' : np.string_,          # 307219911482
        'dayofweek' : np.uint8,              # 3
        'hour' : np.uint8,                   # 01
        'pub_sspid' : np.string_,           # 2
        'pub_accountid' : np.string_,       # 12440
        'pub_as_siteid' : np.string_,       # 8084
        'pub_as_adspaceid' : np.string_,    # 23678
        'pub_as_domain' : np.string_,       # null
        'pub_as_pageurl' : np.string_,      # null
        'pub_as_dimensions' : np.string_,   # 300x250
        'pub_as_viewrate' : np.float16,      # 0.707278907
        'pub_as_position' : np.string_,     # null
        'pub_as_caps' : np.string_,         # 15166603496
        'req_buymodel' : np.string_,        # 4
        'req_auctiontype' : np.string_,     # 2
        'device_os' : np.string_,           # 19930
        'device_model' : np.string_,        # 100000
        'rtb_ctr' : np.float16,              # null
        'rtb_viewrate' : np.float16,         # null
        'rtb_bidfloor' : np.float16,         # 0.315631807
        'rtb_battr' : np.string_,           # null
        'rtb_tagid' : np.string_,           # 2152980677
        'user_ip' : np.string_,             # 84.241.195.0
        'user_market' : np.uint8,         # 75
        'user_city' : np.string_,           # Amsterdam
        'ad_imptype' : np.uint8,          # 4
        'req_bid' : np.float16,              # 0.856686056
        'price' : np.float16,                # null
        'won' : np.uint8,                  # 0
        'targetbid' : np.float16,            # 0.8566860556602478
        'click' : np.uint8,                  # 0
        'imp' : np.uint8,                    # 0
        'imp_0' : np.uint8,                  # 0
        'imp_1' : np.uint8,                  # 0
        'imp_2' : np.uint8,                  # 0
        'imp_3' : np.uint8,                  # 0
        'imp_4' : np.uint8,                  # 0
        'imp_5' : np.uint8,                  # 0
        'imp_6' : np.uint8,                  # 0
        'imp_7' : np.uint8,                  # 0
        'imp_8' : np.uint8,                  # 0
        'imp_9' : np.uint8,                  # 0
        'imp_10' : np.uint8,                 # 0
        'imp_11' : np.uint8,                 # 0
        'imp_12' : np.uint8,                 # 0
        'imp_13' : np.uint8,                 # 0
        'imp_14' : np.uint8,                 # 0
        'weight' : np.float16                 # 0
    }


In [4]:
imp_label = 'targetbid'
SRC_BUCKET = 'wsbidder'
SRC_PREFIX = 'tsv/etl/imp-pred-service-v1/imppredservice_training_data'

filter_days = [ str( datetime.date.today() - datetime.timedelta(i+1) ) for i in range(7) ]
filter_hours = get_hours(23, 24)

dtypes = get_types_of_attributes()

all_files = get_files(SRC_BUCKET, SRC_PREFIX, filter_days, filter_hours, ext='.gz')

all_table = pd.concat((pd.read_csv('s3://{}/{}'.format(SRC_BUCKET, f), sep="\t", compression='gzip', na_values=["null", "\\N"], dtype=dtypes) for f in all_files))

In [5]:
all_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10031842 entries, 0 to 592
Data columns (total 47 columns):
deliveryid           object
dayofweek            uint8
hour                 uint8
pub_sspid            object
pub_accountid        object
pub_as_siteid        object
pub_as_adspaceid     object
pub_as_domain        object
pub_as_pageurl       object
pub_as_dimensions    object
pub_as_viewrate      float16
pub_as_position      object
pub_as_caps          object
req_buymodel         object
req_auctiontype      object
device_os            object
device_model         object
rtb_ctr              float16
rtb_viewrate         float16
rtb_bidfloor         float16
rtb_battr            object
rtb_tagid            object
user_ip              object
user_market          uint8
user_city            object
ad_imptype           uint8
req_bid              float16
price                float16
won                  uint8
targetbid            float16
click                uint8
imp                  

In [6]:
all_table = all_table[['deliveryid',
 'dayofweek',
 'hour',
 'pub_sspid',
 'pub_accountid',
 'pub_as_siteid',
 'pub_as_adspaceid',
 'pub_as_domain',
 'pub_as_pageurl',
 'pub_as_dimensions',
 'pub_as_viewrate',
 'pub_as_position',
 'pub_as_caps',
 'device_os',
 'device_model',
 'user_ip',
 'user_market',
 'user_city',
 'ad_imptype',
 'click',
 imp_label]]

all_table.head()

Unnamed: 0,deliveryid,dayofweek,hour,pub_sspid,pub_accountid,pub_as_siteid,pub_as_adspaceid,pub_as_domain,pub_as_pageurl,pub_as_dimensions,...,pub_as_position,pub_as_caps,device_os,device_model,user_ip,user_market,user_city,ad_imptype,click,imp_6
0,384796864574,3,23,0,11804,7407,22664,vk.se,https://www.vk.se/?vk-app=true&mobil=true,"300x250,320x160,320x320,320x480",...,,0,26150,119401,217.208.13.81,1,Ume?,0,0,0
1,384795413596,3,23,3,10752,6412,21335,dailymail.co.uk,https://www.dailymail.co.uk/tvshowbiz/article-...,300x250,...,0.0,0,26130,119925,77.172.158.87,75,Rotterdam,0,0,0
2,384794094565,3,23,3,13287,8870,24913,vi.nl,https://www.vi.nl/nieuws/denswil-kiest-voor-av...,"300x250,320x500",...,0.0,0,26140,117370,87.214.35.149,75,Amsterdam,0,0,0
3,384797671414,3,23,3,12021,7620,22998,startsiden.no,https://www.startsiden.no/,"300x250,300x300",...,0.0,0,26140,117501,88.88.105.12,3,Troms?,6,0,0
4,384796837092,3,23,2,11265,6889,21910,hastnet.se,https://hastnet.se/sv/sok/utrustning/till-salu...,"300x250,320x320",...,1.0,0,19936,100000,85.164.78.0,3,Raufoss,6,0,0


In [10]:
all_table = all_table.apply(lambda x: x.mask(x.map(x.value_counts()) < 1000, '0') if x.name in ['pub_sspid', 'pub_accountid', 'pub_as_siteid', 'pub_as_adspaceid', 'pub_as_domain', 'pub_as_pageurl', 'pub_as_dimensions', 'pub_as_position', 'device_os', 'device_model', 'user_ip', 'user_market', 'user_city'] else x)
all_table.head()

Unnamed: 0,deliveryid,dayofweek,hour,pub_sspid,pub_accountid,pub_as_siteid,pub_as_adspaceid,pub_as_domain,pub_as_pageurl,pub_as_dimensions,...,pub_as_position,pub_as_caps,device_os,device_model,user_ip,user_market,user_city,ad_imptype,click,imp_6
0,384796864574,3,23,0,11804,7407,22664,vk.se,https://www.vk.se/?vk-app=true&mobil=true,"300x250,320x160,320x320,320x480",...,,0,26150,119401,0,1,Ume?,0,0,0
1,384795413596,3,23,3,10752,6412,21335,dailymail.co.uk,0,300x250,...,0.0,0,26130,119925,0,75,Rotterdam,0,0,0
2,384794094565,3,23,3,13287,8870,24913,vi.nl,0,"300x250,320x500",...,0.0,0,26140,117370,0,75,Amsterdam,0,0,0
3,384797671414,3,23,3,12021,7620,22998,startsiden.no,https://www.startsiden.no/,"300x250,300x300",...,0.0,0,26140,117501,0,3,Troms?,6,0,0
4,384796837092,3,23,2,11265,6889,21910,hastnet.se,0,"300x250,320x320",...,1.0,0,19936,100000,0,3,0,6,0,0


In [None]:
all_table.groupby(imp_label).size().plot(kind='bar')

total_rows = all_table.shape[0]

imp_rate = all_table[imp_label].value_counts() / total_rows

imp_rate

In [None]:
data_imp = all_table[all_table[imp_label] == 1]

In [None]:
def get_data_by_feature(feature, data_true, all_data, min_render_count=1000):
    feature_data = pd.DataFrame()
    columns = []
    columns = [feature, imp_label]
    feature_values_with_imp = data_true[feature].unique()
    feature_data[feature] = data_true[columns].groupby(feature).count().reset_index()[columns[0]]
    feature_data['imp_count'] = (data_true[columns].groupby(feature).count().reset_index()[columns[-1]]).astype(np.uint64)
    feature_data['render_count'] = (all_data[all_data[feature].isin(feature_values_with_imp)][columns].groupby(feature).count().reset_index()[columns[-1]]).astype(np.uint64)
    feature_data = feature_data[feature_data['render_count'] >= min_render_count]
    feature_data['imp_rate'] = 100 * feature_data['imp_count'] / feature_data['render_count']
    feature_data = feature_data.sort_values(ascending = False, by = 'imp_rate')

    return feature_data

# Hour

In [None]:
hour_data = get_data_by_feature('hour', data_imp, all_table)
print(hour_data)
sns.barplot(y='imp_rate',x='hour', data=hour_data)

# Day of week

In [None]:
day_data = get_data_by_feature('dayofweek', data_imp, all_table)
print(day_data)

sns.barplot(y='imp_rate',x='dayofweek', data=day_data)

# Day of week + hour

In [None]:
all_table['dayofweek_hour'] = 1000*all_table['dayofweek'] + all_table['hour']
data_imp['dayofweek_hour'] = 1000*data_imp['dayofweek'] + data_imp['hour']

dayofweek_hour_data = get_data_by_feature('dayofweek_hour', data_imp, all_table)
print(dayofweek_hour_data)
sns.barplot(y='imp_rate',x='dayofweek_hour', data=dayofweek_hour_data)

# SSP ID

In [None]:
sspid_data = get_data_by_feature('pub_sspid', data_imp, all_table)
print(sspid_data)

sns.barplot(y='imp_rate',x='pub_sspid', data=sspid_data)

# Account ID

In [None]:
accountid_data = get_data_by_feature('pub_accountid', data_imp, all_table)
print(accountid_data)

sns.barplot(y='imp_rate',x='pub_accountid', data=accountid_data)

# Position

In [None]:
pub_as_position = get_data_by_feature('pub_as_position', data_imp, all_table)
print(pub_as_position)

sns.barplot(y='imp_rate',x='pub_as_position', data=pub_as_position)

# Dimension

In [None]:
pub_as_dimensions = get_data_by_feature('pub_as_dimensions', data_imp, all_table)
print(pub_as_dimensions)

sns.barplot(y='imp_rate',x='pub_as_dimensions', data=pub_as_dimensions)

# Position + Dimension

In [None]:
all_table['position_dimension'] = all_table['pub_as_position'] + '_' + all_table['pub_as_dimensions']
data_imp['position_dimension'] = data_imp['pub_as_position'] + '_' + data_imp['pub_as_dimensions']

position_dimension_data = get_data_by_feature('position_dimension', data_imp, all_table)
print(position_dimension_data)
sns.barplot(y='imp_rate',x='position_dimension', data=position_dimension_data)

# Domain + Position

In [None]:
all_table['domain_position'] = all_table['pub_as_position'] + '_' + all_table['pub_as_domain']
data_imp['domain_position'] = data_imp['pub_as_position'] + '_' + data_imp['pub_as_domain']

domain_position = get_data_by_feature('domain_position', data_imp, all_table)
print(domain_position)
sns.barplot(y='imp_rate',x='domain_position', data=domain_position)

# Device OS

In [None]:
device_os = get_data_by_feature('device_os', data_imp, all_table)
print(device_os)

sns.barplot(y='imp_rate',x='device_os', data=device_os)

# Device Model

In [None]:
device_model = get_data_by_feature('device_model', data_imp, all_table)
print(device_model)

sns.barplot(y='imp_rate',x='device_model', data=device_model)

# User Market

In [None]:
user_market = get_data_by_feature('user_market', data_imp, all_table)
print(user_market)

sns.barplot(y='imp_rate',x='user_market', data=user_market)

# User city

In [None]:
user_city = get_data_by_feature('user_city', data_imp, all_table)
print(user_city)

sns.barplot(y='imp_rate',x='user_city', data=user_city)

# User ip

In [None]:
user_ip = get_data_by_feature('user_ip', data_imp, all_table)
print(user_ip)

sns.barplot(y='imp_rate',x='user_ip', data=user_ip)

# Site ID

In [None]:
pub_as_siteid = get_data_by_feature('pub_as_siteid', data_imp, all_table)
print(pub_as_siteid)

sns.barplot(y='imp_rate',x='pub_as_siteid', data=pub_as_siteid)

# Adspace ID

In [None]:
pub_as_adspaceid = get_data_by_feature('pub_as_adspaceid', data_imp, all_table)
print(pub_as_adspaceid)

sns.barplot(y='imp_rate',x='pub_as_adspaceid', data=pub_as_adspaceid)

# Domain

In [None]:
pub_as_domain = get_data_by_feature('pub_as_domain', data_imp, all_table)
print(pub_as_domain)

sns.barplot(y='imp_rate',x='pub_as_domain', data=pub_as_domain)

# Page URL

In [None]:
pub_as_pageurl = get_data_by_feature('pub_as_pageurl', data_imp, all_table)
print(pub_as_pageurl)

sns.barplot(y='imp_rate',x='pub_as_pageurl', data=pub_as_pageurl)

# Ad Imp Type

In [None]:
ad_imptype = get_data_by_feature('ad_imptype', data_imp, all_table)
print(ad_imptype)

sns.barplot(y='imp_rate',x='ad_imptype', data=ad_imptype)

In [None]:
def one_hot_features(data_frame, feature_set):
    new_data_frame = pd.get_dummies(data_frame,
                                     columns = feature_set,
                                    sparse = True)

    return new_data_frame

def label_encoder(df):
#     print(df.unique())
#     df.fillna('0')
    le = preprocessing.LabelEncoder()
    return le.fit_transform(df.astype(str))

In [None]:
model_features = ['dayofweek_hour', 'pub_sspid',
                  'pub_as_dimensions', 'domain_position',
                  'device_model', 'user_market',
                  'user_city', 'pub_as_adspaceid', 'pub_as_pageurl']
model_target = imp_label

In [None]:
train_model = all_table[model_features+[model_target]].sample(frac=0.1,random_state=42)

In [None]:
train_model = one_hot_features(train_model,
                                ['pub_sspid',
                                 'pub_as_dimensions',
                                 'domain_position',
                                 'device_model',
                                 'user_market'
                                ])

In [None]:
for column in ['user_city', 'pub_as_adspaceid', 'pub_as_pageurl']:
    train_model[column] = label_encoder(train_model[column])

In [None]:
model_features = np.array(train_model.columns[train_model.columns!=model_target].tolist())

In [None]:
train_model.head()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    train_model[model_features].values,
    train_model[model_target].values,
    test_size=0.3,
    random_state=42
)

In [None]:
num_splits = 3
c_values = np.logspace(-3,0,7)

In [None]:
stratified_k_fold = StratifiedKFold(n_splits=num_splits)

scores = np.zeros(7)
nr_params = np.zeros(7)

In [None]:
for train_data, valid_data in stratified_k_fold.split(x_train,
                                                      y_train):
    for i, c in enumerate(np.logspace(-3, 0, 7)):
        lr_classify = LogisticRegression(penalty='l1',
                                         class_weight='balanced',
                                         C = c)
        lr_classify.fit(x_train[train_data],
                        y_train[train_data])

        #validation_Set evaluation

        y_prediction = lr_classify.predict(x_train[valid_data])
        score_f1 = f1_score(y_train[valid_data],
                            y_prediction, average='weighted' )

        scores[i] += score_f1 / num_splits

        ### spot the selected parameters ##

        model_selected = SelectFromModel(lr_classify, prefit=True)
        nr_params[i] += np.sum(model_selected.get_support()) / num_splits

In [None]:
plt.figure(figsize=(20, 10))
plt.plot(nr_params, scores)

for i, c in enumerate(c_values):
    plt.annotate(c, (nr_params[i], scores[i]))
plt.xlabel("Nr of parameters")
plt.ylabel("Avg F1 score")