Импортируем библиотеки

In [1]:
import numpy as np
import pandas as pd
import xlearn as xl
import scipy.stats as ss
from tqdm.notebook import tqdm
from scipy.special import logit
from sklearn.metrics import log_loss

Читаем и предобрабатываем данные, затем оставляем только последний день

In [2]:
def preprocess_data(data):
    '''
    Preprocess the full dataset.

    Drops the ``impressions`` column, derives ``hour`` and ``weekday``
    (0 = Monday) from ``date_time`` and replaces ``oaid_hash`` with integer
    category codes. Returns a new frame; the input frame is not modified.
    '''
    timestamps = pd.to_datetime(data['date_time'])

    # oaid_hash is turned into category codes, otherwise xlearn dies
    # from the too-large number of features
    hash_codes = data['oaid_hash'].astype('category').cat.codes

    return data.drop(columns=['impressions']).assign(
        hour=timestamps.dt.hour,
        weekday=timestamps.dt.dayofweek,
        oaid_hash=hash_codes,
    )

# load the raw data
data = pd.read_csv('../../data/data.csv')

# order the rows by date_time, then preprocess the full dataset
data = preprocess_data(data.sort_values(by='date_time'))

# per-column maxima, kept to make generating the model input files easier
max_values = {col: data[col].max() for col in data.columns}

# keep only the last day, and only rows where banner_id equals banner_id0
# and g1 is not NaN
last_day = '2021-10-02'
splitting_mask = (
    data['date_time'].str.startswith(last_day)
    & (data['banner_id'] == data['banner_id0'])
    & data['g1'].notna()
)
last_day_data = data[splitting_mask]

# the full dataset is no longer needed
del data

In [3]:
# preview the first rows of the filtered last-day data
last_day_data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,clicks,hour,weekday
14196412,2021-10-02 00:00:00.000000,14,1239,4676385,0,1,0,1239,0.046,0.011367,-6.356145,1234,0.046,0.011368,-6.355855,0,0,5
8706638,2021-10-02 00:00:00.000000,525,174,1718633,0,3,0,174,0.001,0.063288,-3.107591,104,0.001,0.041912,-3.0916,0,0,5
13000378,2021-10-02 00:00:00.000000,14,175,4672584,0,2,9,175,0.007,0.042959,-3.023266,232,0.007,0.042071,-3.060019,0,0,5
9767447,2021-10-02 00:00:00.000000,0,76,2550468,0,1,3,76,0.008,0.051014,-4.665202,34,0.00873,0.014532,-4.988883,0,0,5
9054327,2021-10-02 00:00:00.000000,24,428,483280,0,1,10,428,0.001,0.157651,-3.672826,719,0.001,0.167411,-3.67658,0,0,5


Генерируем новые тестовые файлы для `banner_id = banner_id0` и `banner_id = banner_id1`

In [4]:
def create_field_string(data_row: pd.Series, max_values: dict, features: list, field_no: int) -> str:
    '''
    Encode the listed features of one row as ``field_no:index:1`` tokens.

    data_row   -- a single row (anything indexable by feature name, e.g. a
                  pandas Series or a dict), not a whole DataFrame
    max_values -- maximum value of every feature, used to size index ranges
    features   -- feature names to encode, in order
    field_no   -- field number written in front of every token

    Returns the tokens joined by single spaces ('' when `features` is empty).
    '''
    tokens = []
    # Index offset of the current feature inside this field. It restarts at 0
    # on every call, i.e. for every field.
    offset = 0
    for feature in features:
        tokens.append(f"{field_no}:{offset + data_row[feature]}:1")
        # reserve max_values[feature] + 1 indices for this feature so that
        # the next feature's values cannot collide with it
        offset += max_values[feature] + 1
    return ' '.join(tokens)
        

def generate_data_for_model(data: pd.DataFrame, max_values: dict, filename: str, banner_id='banner_id'):
    '''
    Write `data` to `filename` as model input, using the requested banner id.

    Every row becomes one line: the ``clicks`` label followed by three fields
    built with ``create_field_string`` -- user (field 0), ad (field 1) and
    time (field 2). The ``banner_id`` feature is read from the column named
    by `banner_id`, so passing e.g. ``'banner_id1'`` generates the file for
    the alternative banner.

    The input frame is not modified. Lines are joined with '\\n' and no
    trailing newline is written.
    '''
    user_features = ['os_id', 'country_id']
    ad_features = ['banner_id', 'zone_id', 'campaign_clicks']
    time_features = ['hour', 'weekday']

    # Pull out only the columns that are written. Iterating them column-wise
    # avoids the per-row ``data.loc[i]`` lookup, which is slow, breaks on a
    # non-unique index and makes the value types depend on unrelated columns.
    columns = {
        name: data[name]
        for name in ['clicks'] + user_features + ad_features + time_features
    }
    # substitute the requested banner column here instead of overwriting
    # data['banner_id'], which would silently change the caller's frame
    columns['banner_id'] = data[banner_id]
    names = list(columns)

    lines = []
    for values in tqdm(zip(*columns.values()), total=len(data)):
        row = dict(zip(names, values))
        user_field = create_field_string(row, max_values, user_features, 0)
        ad_field = create_field_string(row, max_values, ad_features, 1)
        time_field = create_field_string(row, max_values, time_features, 2)
        lines.append(f"{row['clicks']} {user_field} {ad_field} {time_field}")
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

In [5]:
# model input with the banner that was actually shown (banner_id == banner_id0 on these rows)
generate_data_for_model(last_day_data, max_values, filename='test0.txt', banner_id='banner_id')
# model input with the alternative banner
generate_data_for_model(last_day_data, max_values, filename='test1.txt', banner_id='banner_id1')

HBox(children=(FloatProgress(value=0.0, max=1885670.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1885670.0), HTML(value='')))




Загружаем FFM модель из предыдущей лабораторной, генерируем ее предсказания и считаем новые суммы коэффициентов

In [6]:
def compute_coeff_sum(test_file: str, model_file: str, predictions_file: str = './predictions.txt'):
    '''
    Score `test_file` with a saved FFM model and return (coeff_sum, log loss).

    test_file        -- model input file; the first whitespace-separated token
                        of every line is the integer label
    model_file       -- path of the saved xlearn model
    predictions_file -- where xlearn writes its predictions (overwritten)

    Returns a tuple ``(coeff_sum, loss)``: ``coeff_sum`` is the logit of the
    sigmoid predictions as a numpy array, ``loss`` is the log loss of the
    predictions against the labels read from `test_file`.
    '''
    model = xl.create_ffm()
    model.setTest(test_file)
    # sigmoid output, so that the predictions can be fed to logit / log_loss
    model.setSigmoid()
    model.predict(model_file, predictions_file)

    # Both files are parsed the same way: blank lines (such as one produced
    # by a trailing newline) are skipped, and the label is the whole first
    # token rather than just the first character.
    with open(test_file) as file:
        labels = [int(line.split(None, 1)[0]) for line in file if line.strip()]
    with open(predictions_file) as file:
        pred = np.array([float(line) for line in file if line.strip()])

    coeff_sum = logit(pred)
    return coeff_sum, log_loss(labels, pred)

In [7]:
# score both generated files with the same saved model and print each log loss
coeff_sum0, loss0 = compute_coeff_sum(test_file='test0.txt', model_file='model.out')
print(loss0)
coeff_sum1, loss1 = compute_coeff_sum(test_file='test1.txt', model_file='model.out')
print(loss1)

0.1486442615785983
0.15162719879855321


Считаем $\pi_0$ и $\pi_1$

In [8]:
def compute_pi0(data, seed=1337, n_samples=10 ** 3):
    '''
    Monte Carlo estimate, per row, of P(score0 > score1) from the logged values.

    For every row, score0 is drawn from Normal(coeff_sum0, |g0|) and score1
    from Normal(coeff_sum1, |g1|); the returned value is the share of the
    `n_samples` draws in which score0 is strictly greater.

    data      -- frame with columns coeff_sum0, g0, coeff_sum1, g1
    seed      -- seed passed to np.random.seed (resets the global numpy RNG)
    n_samples -- number of draws per row; must be positive

    Returns a 1-D numpy array of length len(data).

    Note: two arrays of shape (n_samples, len(data)) are held in memory at
    the same time.
    '''
    np.random.seed(seed)
    shape = (n_samples, len(data))
    # g may be negative in the data, while a scale must be non-negative
    sample0 = np.random.normal(data['coeff_sum0'], np.abs(data['g0']), shape)
    sample1 = np.random.normal(data['coeff_sum1'], np.abs(data['g1']), shape)
    return (sample0 > sample1).mean(axis=0)


def compute_pi1(data, coeff_sum0, coeff_sum1, seed=1337, n_samples=10 ** 3):
    '''
    Monte Carlo estimate, per row, of P(score0 > score1) for supplied means.

    Same procedure as for the logged values, but the means of the two normal
    distributions are the given `coeff_sum0` / `coeff_sum1`; the scales are
    still |g0| and |g1| from `data`.

    data        -- frame with columns g0 and g1
    coeff_sum0  -- means for score0, one per row of `data`
    coeff_sum1  -- means for score1, one per row of `data`
    seed        -- seed passed to np.random.seed (resets the global numpy RNG)
    n_samples   -- number of draws per row; must be positive

    Returns a 1-D numpy array of length len(data).

    Note: two arrays of shape (n_samples, len(data)) are held in memory at
    the same time.
    '''
    np.random.seed(seed)
    shape = (n_samples, len(data))
    # g may be negative in the data, while a scale must be non-negative
    sample0 = np.random.normal(coeff_sum0, np.abs(data['g0']), shape)
    sample1 = np.random.normal(coeff_sum1, np.abs(data['g1']), shape)
    return (sample0 > sample1).mean(axis=0)

In [9]:
# probabilities from the logged coeff_sum0 / coeff_sum1 columns
pi0 = compute_pi0(data=last_day_data)
# probabilities from the coefficient sums recomputed with the FFM model
pi1 = compute_pi1(data=last_day_data, coeff_sum0=coeff_sum0, coeff_sum1=coeff_sum1)

Считаем CIPS для $\lambda=10$, а также CTR, чтобы было с чем сравнить полученное значение

In [12]:
# clipping threshold for the importance weights
lmbda = 10.0
# observed click-through rate on the same rows, used as the reference value
ctr = last_day_data['clicks'].mean()
# clipped IPS: clicks weighted by min(pi1 / pi0, lambda); 1e-6 guards against pi0 == 0
cips = last_day_data['clicks'].mul(np.minimum(pi1 / (pi0 + 1e-6), lmbda)).mean()

In [13]:
# show the observed CTR next to the CIPS estimate
ctr, cips

(0.036045013178339795, 0.07378550157132419)

Оценка CIPS получилась примерно в 2 раза больше CTR ($\approx 0.074$ против $\approx 0.036$), что выглядит правдоподобно