In [1]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
import math

#### Повторим результат для первой модели

Буду делать все ровно так же. Единственное, выкинем не сыгравшие баннеры и campaign clicks.

In [2]:
data = pd.read_csv('../data/data.csv')
#let's remove columns we can't use
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


In [3]:
print(len(data))

15821472


Выкинем не сыгравшие баннеры и оставим только нужные колонки

In [4]:
data = data[data['banner_id0']==data['banner_id']]
data=data[['date_time', 'zone_id', 'banner_id',
       'os_id', 'country_id', 'impressions', 'clicks', 'coeff_sum0', 'g0', 'coeff_sum1', 'g1', 'banner_id1']]

In [5]:
print(len(data))

13947160


Практически все осталось

In [6]:
data.head()

Unnamed: 0,date_time,zone_id,banner_id,os_id,country_id,impressions,clicks,coeff_sum0,g0,coeff_sum1,g1,banner_id1
1,2021-09-26 22:54:49.000000,1,1,0,1,1,1,-2.657477,0.054298,-4.44922,0.031942,269
2,2021-09-26 23:57:20.000000,2,2,0,0,1,1,-3.824875,0.014096,-3.939309,0.014906,21
3,2021-09-27 00:04:30.000000,3,3,1,1,1,1,-3.461357,0.015232,-3.418403,0.050671,99
4,2021-09-27 00:06:21.000000,4,4,1,0,1,1,-4.009026,0.051265,-2.828797,0.032005,11464230
5,2021-09-27 00:06:50.000000,5,5,2,2,1,1,-3.222757,0.337634,-3.221755,0.338195,37


In [7]:
def feature_engineering(data):
    
    #let's leave only known banners - that had >100 shows on the train set
    train_data = data.loc[(pd.to_datetime(data['date_time']).dt.month==9) & (pd.to_datetime(data['date_time']).dt.day!=1)]
    known_banner_ids = set(train_data['banner_id'].value_counts().loc[lambda x: x > 100].index)
    data.loc[~data.banner_id.isin(known_banner_ids), 'banner_id']=-1
    data = pd.get_dummies(data, columns = ['banner_id'], drop_first=True, prefix=['banner'])
    print("Banner columns generated.")
    
    data.loc[(data.zone_id>4) & (data.zone_id<10), "zone_id"]=-1
    data.loc[(data.zone_id>9) & (data.zone_id<20), "zone_id"]=-2
    data.loc[(data.zone_id>19) & (data.zone_id<50), "zone_id"]=-3
    data.loc[(data.zone_id>49), "zone_id"]=-4
    data = pd.get_dummies(data, columns = ['zone_id'], drop_first=True, prefix=['zone'])
    print("Zone columns generated.")
    
#     #normalize campaign clicks
#     data.loc[(data.campaign_clicks>50), 'campaign_clicks']=50
#     data.loc[:,'campaign_clicks']=data['campaign_clicks']/50

    
    data.loc[data.os_id>6, "os_id"] = 7
    data = pd.get_dummies(data, columns = ['os_id'], drop_first=True, prefix=['os'])
    print("OS columns generated.")
    
    data = pd.get_dummies(data, columns = ['country_id'], drop_first=True, prefix=['country'])
    print("Country columns generated.")
    
#     let's do interactions between country and banner id
#     even banner dummies worked for a long time so I decided not to do that
#     ban_cols = (ban_col for ban_col in data.columns if 'banner_' in ban_col)
#     country_cols = (country_col for country_col in data.columns if 'country_' in country_col)
#     for ban_col in ban_cols:
#         for country_col in country_cols:
#             data[ban_col+country_col]=data[ban_col]*data[country_col]
    
    data = data.drop(columns=['impressions'])
    
    train_data = data.loc[(pd.to_datetime(data['date_time']).dt.month==9) & (pd.to_datetime(data['date_time']).dt.day!=1)]
    val_data = data.loc[(pd.to_datetime(data['date_time']).dt.month==10) & (pd.to_datetime(data['date_time']).dt.day==1)]
    test_data = data.loc[(pd.to_datetime(data['date_time']).dt.month==10) & (pd.to_datetime(data['date_time']).dt.day==2)]
    train_data = train_data.drop(columns=['date_time'])  
    val_data = val_data.drop(columns=['date_time']) 
    test_data = test_data.drop(columns=['date_time']) 
    
    
    return train_data, val_data, test_data


In [8]:
def cv(train_data, val_data, test_data, alphas_list=[0.2, 0.5, 1.0, 2, 5]):
    
    
    #make X and y
    train_columns = [column for column in train_data.columns if column!='clicks']
    train_X = train_data[train_columns]
    val_X = val_data[train_columns]
    test_X = test_data[train_columns]
    train_y = train_data['clicks']
    val_y = val_data['clicks']
    test_y = test_data['clicks']
    
    
    models_list=[]
    val_loss_list=[]
    best_model=None
    best_loss=100000
    best_alpha=-1
    
    #kinda grid search for the best regression coefficient
    for alpha in alphas_list:
        #as it was discussed during the lecture, the most correct from bayesian point of view is a model with L2 regularization
        curr_model = Ridge(alpha=alpha)
        curr_model.fit(train_X, train_y)
        val_y =curr_model.predict(val_X)
        logloss = log_loss(val_data['clicks'], val_y)
        models_list.append(curr_model)
        val_loss_list.append(logloss)
        
        print(f"For alpha {alpha} log loss on validation is {logloss}")
        if logloss<best_loss:
            best_loss=logloss
            best_alpha=alpha
            best_model=curr_model
            
    #print data for the best model     
    test_y = best_model.predict(test_X)
    test_logloss = log_loss(test_data['clicks'], test_y)
    print(f"Best validation log loss is {best_loss} for alpha {best_alpha}. Log loss on test data for this model is {test_logloss}")
    return best_model, models_list, val_loss_list

In [22]:
%%time
train_data, val_data, test_data = feature_engineering(data)

Banner columns generated.
Zone columns generated.
OS columns generated.
Country columns generated.
CPU times: user 7min 10s, sys: 4min 1s, total: 11min 12s
Wall time: 12min 1s


In [23]:
columns_for_prediction=[column for column in train_data.columns if (not 'coeff_sum' in column)and(not 'g0'==column) and (not 'g1'==column) and (not 'banner_id1'==column)]

In [25]:
alphas_list= [0.2, 0.5, 1.0, 2, 5]
best_model, models_list, val_lost_list = cv(train_data[columns_for_prediction], val_data[columns_for_prediction], test_data[columns_for_prediction], alphas_list = alphas_list)

For alpha 0.2 log loss on validation is 0.16239503519611978
For alpha 0.5 log loss on validation is 0.16238854681434864
For alpha 1.0 log loss on validation is 0.16237944573364246
For alpha 2 log loss on validation is 0.1623652208281624
For alpha 5 log loss on validation is 0.1623388519787048
Best validation log loss is 0.1623388519787048 for alpha 5. Log loss on test data for this model is 0.14497883957185315


Тут запустила генерацию данных заново после нескольких попыток их изменения, чтобы быть точно уверенной, что все правильно работает при последовательном исполнении. Модели не трогала, так что их перезапускать не стала

In [280]:
%%time
train_data, val_data, test_data = feature_engineering(data)

Banner columns generated.
Zone columns generated.
OS columns generated.
Country columns generated.
CPU times: user 7min 4s, sys: 4min 31s, total: 11min 36s
Wall time: 12min 53s


In [281]:
test_data1 = test_data.copy()
test_data1 = test_data1[[column for column in test_data1.columns if 'banner_' not in column]]

In [282]:
test_data1['banner_id']=test_data['banner_id1']

С one-hot для второго баннера придется пошаманить

In [283]:
train_data = data.loc[(pd.to_datetime(data['date_time']).dt.month==9) & (pd.to_datetime(data['date_time']).dt.day!=1)]
known_banner_ids = set(train_data['banner_id'].value_counts().loc[lambda x: x > 100].index)
test_data1.loc[~test_data1.banner_id.isin(known_banner_ids), 'banner_id']=-1
test_data1 = pd.get_dummies(test_data1, columns = ['banner_id'], drop_first=True, prefix=['banner'])

In [284]:
print(len(test_data.columns))
print(len(test_data1.columns))

1003
584


In [285]:
test_data1['banner_1178'].value_counts()

0    1890488
1         74
Name: banner_1178, dtype: int64

In [286]:
empty_columns = [column for column in test_data.columns if (('banner_' in column)and('banner_id' not in column) and (column not in test_data1.columns))]
print(len(empty_columns))

418


In [287]:
for col in empty_columns:
    test_data1[col]=0

In [288]:
print(len(test_data.columns))
print(len(test_data1.columns))

1003
1002


In [289]:
test_data1['banner_1178'].value_counts()

0    1890488
1         74
Name: banner_1178, dtype: int64

In [290]:
val_data1 = val_data.copy()
val_data1 = val_data1[[column for column in val_data1.columns if 'banner_' not in column]]
val_data1['banner_id']=val_data['banner_id1']
val_data1.loc[~val_data1.banner_id.isin(known_banner_ids), 'banner_id']=-1
val_data1 = pd.get_dummies(val_data1, columns = ['banner_id'], drop_first=True, prefix=['banner'])
empty_columns = [column for column in val_data.columns if (('banner_' in column)and('banner_id' not in column) and (column not in val_data1.columns))]
for col in empty_columns:
    val_data1[col]=0


Недостающая колонка - banner_id1, которую мы не копировали

In [291]:
print(len(val_data.columns))
print(len(val_data1.columns))

1003
1002


Наны в колонках, которые используются для подсчета, портили мне CIPS, и я долго не могла понять, почему я исправила несколько багов, а в ней все равно получаются наны (остальные колонки я смотрела в первом задании, а эти - нет) :) Выкинем их!

In [292]:
val_data1[['g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'clicks']].isna().sum()

g0              0
g1            869
coeff_sum0      0
coeff_sum1    869
clicks          0
dtype: int64

In [293]:
len(val_data1)

1442740

In [294]:
val_data1=val_data1.dropna(subset = ['g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'clicks'])

In [295]:
val_data1[['g0', 'g1', 'coeff_sum0', 'coeff_sum1']].isna().sum()

g0            0
g1            0
coeff_sum0    0
coeff_sum1    0
dtype: int64

In [296]:
len(val_data1)

1441871

In [297]:
val_data=val_data.dropna(subset = ['g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'clicks'])
test_data1=test_data1.dropna(subset = ['g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'clicks'])
test_data=test_data.dropna(subset = ['g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'clicks'])

In [298]:
for column in columns_for_prediction:
    if column not in val_data1.columns:
        print(column)

In [299]:
columns_for_prediction2 = [column for column in columns_for_prediction if (column!='clicks')]

In [300]:
def get_prob_first_sampling_is_greater_than_second(mu1, sigma1, mu2, sigma2):
    mu_diff = mu1-mu2 #mu for dist1-dist2
    sigma_diff = np.sqrt(sigma1**2 + sigma2**2)#sigma for dist1 - dist2
    #we don't want nans in result
    sigma_diff[sigma_diff==0]=1e-7
    #that's still the normal distribution
    return 1 - stats.norm.cdf(0, mu_diff, sigma_diff) #prob((dist1 - dist2)>0)
    
    

In [301]:
from scipy.special import logit

def get_coeffs_sum(df, model):
    #columns_for_prediction are helping us with a correct columns order
    preds = model.predict(df[columns_for_prediction2])
    #we don't want logit to be nan
    preds[preds>=1]=1 - 1e-7
    preds[preds<=0]=1e-7
    return logit(preds)
    

In [302]:
def CIPS(pi_0, pi_1, reward, lambdaa=10):
    return np.sum(reward*np.minimum(pi_0/pi_1,lambdaa))/(len(reward))

In [303]:
def get_CIPS_by_model_on_val(model):
    pi_0 = get_prob_first_sampling_is_greater_than_second(val_data['coeff_sum0'].to_numpy(), val_data['g0'].to_numpy(), val_data['coeff_sum1'].to_numpy(), val_data['g1'].to_numpy())
    pi_1 = get_prob_first_sampling_is_greater_than_second(get_coeffs_sum(val_data, model), val_data['g0'].to_numpy(), get_coeffs_sum(val_data1, model), val_data['g1'].to_numpy())
    pi_1[pi_1==0]=1e-7
    return CIPS(pi_0, pi_1, val_data['clicks'].to_numpy())

In [310]:
def get_CIPS_by_model_on_test(model):
    
    pi_0 = get_prob_first_sampling_is_greater_than_second(test_data['coeff_sum0'], test_data['g0'], test_data['coeff_sum1'], test_data['g1'])
    pi_1 = get_prob_first_sampling_is_greater_than_second(get_coeffs_sum(test_data, model), test_data['g0'], get_coeffs_sum(test_data1, model), test_data['g1'])
    pi_1[pi_1==0]=1e-7
    return CIPS(pi_0, pi_1, test_data['clicks'].to_numpy())
    

In [305]:
import numpy as np
print(np.mean(val_data['clicks']))
print(np.mean(test_data['clicks']))

0.042277013685690326
0.036045013178339795


Посмотрим на CIPS при разных коэффициентах регуляризации

CIPS - оценка value (то есть реварда, который мы получим), поэтому ее надо максимизировать

In [307]:
max_CIPS=-100500
best_i=-1

for i, model in enumerate(models_list):
    curr_CIPS=get_CIPS_by_model_on_val(model)
    print(f"CIPS on validation for model with alpha={alphas_list[i]} is {curr_CIPS}")
    if(curr_CIPS>max_CIPS):
        max_CIPS=curr_CIPS
        best_i = i
    

CIPS on validation for model with alpha=0.2 is 0.09525745784915227
CIPS on validation for model with alpha=0.5 is 0.09519166475509483
CIPS on validation for model with alpha=1.0 is 0.09509585505963003
CIPS on validation for model with alpha=2 is 0.094874779655249
CIPS on validation for model with alpha=5 is 0.09433513158578342


CIPS относительно похоже по значению на CTR, но отличается в несколько раз в большую сторону - похоже, все норм

Единственное, что меня смущает - что для log loss получается лучший (минимальный) результат при больших alpha, а тут - наоборот. Видимо, издержки того, что результаты для второго баннера мы все-таки не знаем...

In [311]:
CIPS_on_test = get_CIPS_by_model_on_test(models_list[best_i])
print(f"Best model (by CIPS on validation) on test has alpha={alphas_list[best_i]}. Its CIPS on test is {CIPS_on_test}")

Best model (by CIPS on validation) on test has alpha=0.2. Its CIPS on test is 0.08385523080019004


Попробуем еще уменьшить коэффициент регуляризации. 

В этом месте пришлось перезапустить кернел, он умер

In [9]:
%%time
train_data, val_data, test_data = feature_engineering(data)

Banner columns generated.
Zone columns generated.
OS columns generated.
Country columns generated.
CPU times: user 6min 45s, sys: 4min 7s, total: 10min 53s
Wall time: 11min 47s


In [11]:
columns_for_prediction=[column for column in train_data.columns if (not 'coeff_sum' in column)and(not 'g0'==column) and (not 'g1'==column) and (not 'banner_id1'==column)]

In [12]:
alphas_list= [0.2, 0.1, 0.01, 0.001, 1e-4, 1e-5]
best_model2, models_list2, val_lost_list2 = cv(train_data[columns_for_prediction], val_data[columns_for_prediction], test_data[columns_for_prediction], alphas_list = alphas_list)

For alpha 0.2 log loss on validation is 0.16239503519611978
For alpha 0.1 log loss on validation is 0.16239744422219654
For alpha 0.01 log loss on validation is 0.16239975020899494
For alpha 0.001 log loss on validation is 0.16239998901414354
For alpha 0.0001 log loss on validation is 0.16240001298289916
For alpha 1e-05 log loss on validation is 0.16240001538066393
Best validation log loss is 0.16239503519611978 for alpha 0.2. Log loss on test data for this model is 0.1452986518084258


Здесь опять генерим данные по баннерам и выкидываем наны для тех колонок, которые нам нужны

In [13]:
train_data2 = data.loc[(pd.to_datetime(data['date_time']).dt.month==9) & (pd.to_datetime(data['date_time']).dt.day!=1)]
known_banner_ids = set(train_data2['banner_id'].value_counts().loc[lambda x: x > 100].index)

In [14]:
test_data1 = test_data.copy()
test_data1 = test_data1[[column for column in test_data1.columns if 'banner_' not in column]]
test_data1['banner_id']=test_data['banner_id1']
test_data1.loc[~test_data1.banner_id.isin(known_banner_ids), 'banner_id']=-1
test_data1 = pd.get_dummies(test_data1, columns = ['banner_id'], drop_first=True, prefix=['banner'])
empty_columns = [column for column in test_data.columns if (('banner_' in column)and('banner_id' not in column) and (column not in test_data1.columns))]
for col in empty_columns:
    test_data1[col]=0

In [15]:
val_data1 = val_data.copy()
val_data1 = val_data1[[column for column in val_data1.columns if 'banner_' not in column]]
val_data1['banner_id']=val_data['banner_id1']
val_data1.loc[~val_data1.banner_id.isin(known_banner_ids), 'banner_id']=-1
val_data1 = pd.get_dummies(val_data1, columns = ['banner_id'], drop_first=True, prefix=['banner'])
empty_columns = [column for column in val_data.columns if (('banner_' in column)and('banner_id' not in column) and (column not in val_data1.columns))]
for col in empty_columns:
    val_data1[col]=0

In [16]:
columns_for_prediction2 = [column for column in columns_for_prediction if (column!='clicks')]

In [21]:
val_data1=val_data1.dropna(subset = ['g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'clicks'])
val_data=val_data.dropna(subset = ['g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'clicks'])
test_data1=test_data1.dropna(subset = ['g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'clicks'])
test_data=test_data.dropna(subset = ['g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'clicks'])

Скопировала все функции вместе, чтобы отдельно не перезапускать

In [22]:
from scipy.special import logit
import numpy as np

def get_prob_first_sampling_is_greater_than_second(mu1, sigma1, mu2, sigma2):
    mu_diff = mu1-mu2 #mu for dist1-dist2
    sigma_diff = np.sqrt(sigma1**2 + sigma2**2)#sigma for dist1 - dist2
    #we don't want nans in result
    sigma_diff[sigma_diff==0]=1e-7
    #that's still the normal distribution
    return 1 - stats.norm.cdf(0, mu_diff, sigma_diff) #prob((dist1 - dist2)>0)

def get_coeffs_sum(df, model):
    #columns_for_prediction are helping us with a correct columns order
    preds = model.predict(df[columns_for_prediction2])
    #we don't want logit to be nan
    preds[preds>=1]=1 - 1e-7
    preds[preds<=0]=1e-7
    return logit(preds)

def CIPS(pi_0, pi_1, reward, lambdaa=10):
    return np.sum(reward*np.minimum(pi_0/pi_1,lambdaa))/(len(reward))

def get_CIPS_by_model_on_val(model):
    pi_0 = get_prob_first_sampling_is_greater_than_second(val_data['coeff_sum0'].to_numpy(), val_data['g0'].to_numpy(), val_data['coeff_sum1'].to_numpy(), val_data['g1'].to_numpy())
    pi_1 = get_prob_first_sampling_is_greater_than_second(get_coeffs_sum(val_data, model), val_data['g0'].to_numpy(), get_coeffs_sum(val_data1, model), val_data['g1'].to_numpy())
    pi_1[pi_1==0]=1e-7
    return CIPS(pi_0, pi_1, val_data['clicks'].to_numpy())

def get_CIPS_by_model_on_test(model): 
    pi_0 = get_prob_first_sampling_is_greater_than_second(test_data['coeff_sum0'], test_data['g0'], test_data['coeff_sum1'], test_data['g1'])
    pi_1 = get_prob_first_sampling_is_greater_than_second(get_coeffs_sum(test_data, model), test_data['g0'], get_coeffs_sum(test_data1, model), test_data['g1'])
    pi_1[pi_1==0]=1e-7
    return CIPS(pi_0, pi_1, test_data['clicks'].to_numpy())

In [23]:
max_CIPS=-100500
best_i=-1

for i, model in enumerate(models_list2):
    curr_CIPS=get_CIPS_by_model_on_val(model)
    print(f"CIPS on validation for model with alpha={alphas_list[i]} is {curr_CIPS}")
    if(curr_CIPS>max_CIPS):
        max_CIPS=curr_CIPS
        best_i = i

CIPS on validation for model with alpha=0.2 is 0.09525745784915227
CIPS on validation for model with alpha=0.1 is 0.09528947126236129
CIPS on validation for model with alpha=0.01 is 0.09531045635652971
CIPS on validation for model with alpha=0.001 is 0.09531322432167641
CIPS on validation for model with alpha=0.0001 is 0.09531349737172144
CIPS on validation for model with alpha=1e-05 is 0.09531352698238812


Тут кажется, что чем меньше регуляризация, тем лучше. Но, вообще, совсем чуть-чуть отличается...

In [24]:
CIPS_on_test = get_CIPS_by_model_on_test(models_list2[best_i])
print(f"Best model (by CIPS on validation) on test has alpha={alphas_list[best_i]}. Its CIPS on test is {CIPS_on_test}")

Best model (by CIPS on validation) on test has alpha=1e-05. Its CIPS on test is 0.08390874146745823
