# Для начала загрузим данные и подготовим все для линейной модели из первого задания

In [43]:
import pandas as pd
from pandas_profiling import ProfileReport
import datetime
from matplotlib import pyplot as plt
import seaborn as sns
from patsy import dmatrices, dmatrix
import numpy as np
from copy import deepcopy
from sklearn.preprocessing import normalize, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import f1_score, log_loss
from scipy import sparse
%matplotlib inline

In [2]:
df = pd.read_csv(
    '~/Downloads/data.csv'
)

In [3]:
df.head(3)

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1


In [15]:
# Немного уменьшим размер 
df[['clicks']] = df[['clicks']].apply(pd.to_numeric, downcast="unsigned")
df[['zone_id', 'os_id','impressions','country_id','banner_id', 'banner_id0', 'banner_id1']] = df[['zone_id', 'os_id','impressions','country_id','banner_id', 'banner_id0', 'banner_id1']].apply(lambda x: x.astype('category'))
df.drop(columns=['campaign_clicks'], inplace=True)

In [22]:
def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    data["date_time"] =  pd.to_datetime(data["date_time"])
    
    # Чтобы удобнее было делить выборку по времени, когда мы будем работать с разреженной матрицей 
    # и удобно индексироваться по pd.DataFrame уже не получится, отсортируем все
    
    data.sort_values("date_time",inplace=True)  
    data.reset_index(inplace=True, drop=True)  
    data['time'] = data['date_time'].dt.hour.astype(int)*60 + data['date_time'].dt.minute.astype(int)
    data['hour'] = data['date_time'].dt.hour
    data['day'] = data['date_time'].dt.month.astype(int)*30 + data['date_time'].dt.day.astype(int)
    data['month'] = data['date_time'].dt.month
    data['weekday'] = data['date_time'].dt.weekday
    data = data.drop(columns=['date_time'])   # Исходная колонка с временем теперь избыточна
    data[['time','day', 'hour', 'weekday']] = data[['time','day', 'hour', 'weekday']].apply(pd.to_numeric, downcast="unsigned")
    data = data.drop(data.loc[data['day']==271].index)
    data['day'] = data['day'] - (data['day'].min() - 1) # для простоты будем вести счет дней с первого
    data['time_absolute'] = data['day'] *24 *60 + data['time']
    return data 

In [None]:
columns_to_train = ['date_time','zone_id', 'os_id','country_id','banner_id']
train = feature_engineering(df[columns_to_train])

In [24]:
train.head(3)

Unnamed: 0,zone_id,os_id,country_id,banner_id,time,hour,day,month,weekday,time_absolute
1,41,3,0,29,0,0,1,9,6,1440
2,1,2,15,188,0,0,1,9,6,1440
3,17,2,5,52,0,0,1,9,6,1440


In [61]:
# Получим индекс для тестовой выборки

train.loc[(train['month'] == 10) & (train['day']==7)]

Unnamed: 0,zone_id,os_id,country_id,banner_id,hour,day,month,weekday
13692494,14,1,0,1239,0,7,10,5
13692495,99999,3,0,174,0,7,10,5
13692496,14,2,9,175,0,7,10,5
13692497,0,1,3,76,0,7,10,5
13692498,24,1,10,428,0,7,10,5
...,...,...,...,...,...,...,...,...
15821467,24,2,6,180,23,7,10,5
15821468,99999,1,0,92,23,7,10,5
15821469,17,4,0,1235,23,7,10,5
15821470,1,0,0,2,23,7,10,5


In [27]:
test_index = 13692494

In [None]:
model_OHE = ColumnTransformer(
                  [('OHE', OneHotEncoder(drop='first', sparse=True, dtype=np.uint8),data.columns)],
                   remainder = 'passthrough'
                )

In [32]:
def deal_with_categorical(data: pd.DataFrame, model_ohe) -> sparse.csc_matrix:
    
    # Оставим только частые зоны
    freq_zone_ids = set(data['zone_id'].value_counts().loc[lambda x: x > 60000].index)
    data['zone_id'] = data['zone_id'].astype(int)
    data.loc[~data['zone_id'].isin(freq_zone_ids), 'zone_id'] = 99999
    
    # И баннеры
    freq_banners = set(data['banner_id'].value_counts().loc[lambda x: x > 100].index)
    data['banner_id'] = data['banner_id'].astype(int)
    data.loc[~data['banner_id'].isin(freq_banners), 'banner_id'] = 99999
    
    # Нормализуем то, что не в [0,1]
    to_normalize = data[["time", "time_absolute"]].values.astype(float)
    normalized_X = normalize(to_normalize)
   
    
    # Сделаем one-hot для всего остального
    data.drop(columns=["time", "time_absolute"], inplace=True)
    model_OHE = ColumnTransformer(
                  [('OHE', OneHotEncoder(drop='first', sparse=True, dtype=np.uint8),data.columns)],
                   remainder = 'passthrough'
                )
    dummified = model_OHE.fit_transform(data)
    X = sparse.csc_matrix(normalized_X)
    X = sparse.hstack([X,dummified])
    X = X.tocsr()
    
    return X

In [33]:
X = deal_with_categorical(train)
y = df['clicks']

In [34]:
X_train, X_test = X[: test_index], X[test_index:]
y_train, y_test = y[: test_index], y[test_index:]

In [37]:
model = LogisticRegression(penalty='l2',C=1.0,solver='liblinear')

In [38]:
model.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [None]:
test_predict_proba = model.predict_proba(X_test)
test_logloss = log_loss(y_test, test_predict_proba)
print(f"Best test log loss is {test_logloss}")

# Подготовим данные для подсчета IPS

In [44]:
last_day_0 = df[test_index:][df['banner_id'].astype('int') == df['banner_id0'].astype('int')]


  last_day_0 = df[test_index:][df['banner_id'].astype('int') == df['banner_id0'].astype('int')]


Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
13692494,2021-09-26 17:15:24.000000,117,599,5176437783170910447,2,9,599,0.00776,0.091844,-3.696148,80,0.008,0.071517,-3.868994,1,0
13692495,2021-09-27 17:05:05.000000,405,673,3768131969816496466,4,6,673,0.05,0.087554,-3.122432,660,0.05,0.096358,-3.189199,1,0
13692496,2021-10-02 23:42:00.000000,169,352,6598460192541315118,4,1,352,0.01,0.070265,-3.802742,476,0.01,0.065444,-3.791173,1,0
13692497,2021-09-27 03:02:22.000000,12,3,9166017617081547449,1,1,3,0.012,0.012159,-4.334572,14200245,1.5,0.041578,-3.359119,1,0
13692498,2021-09-27 14:00:35.000000,50,613,6740341189147500356,4,5,613,0.05,0.093642,-3.519574,14200246,0.6,0.00711,-1.993851,1,0


In [54]:
# После некоторых проб и ошибок выяснилось, что нужно также выкинуть наны

last_day_0 = last_day_0.dropna(subset = ['g1', 'coeff_sum1'])

In [55]:
last_day_1 = deepcopy(last_day_0)
last_day_1.banner_id = last_day_1.banner_id1

# Определим нужные функции

In [65]:
from scipy import stats

def calculate_policy(dif_sigma, dif_mean):
    return stats.norm.sf((0 - dif_sigma) / dif_sigma, dif_mean, dif_sigma)


def calculate_cips(policy_0, policy_1, df):
    lam = 10
    eps = 1e-7
    return (np.minimum(p1/(p0 + eps), lam) * df['clicks'].values).sum() / len(df)

# Посчитаем полиси и CIPS

In [66]:
dif_sigma = np.sqrt(last_day_0['g0'] ** 2 + last_day_0['g1'] ** 2)
dif_mean = last_day_0['coeff_sum0'] - last_day_0['coeff_sum1']

In [67]:
policy_0 = calculate_policy(dif_sigma, dif_mean)
