In [1]:
import numpy as np
import pandas as pd

import os

import datetime

from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import TimeSeriesSplit

import xlearn as xl

In [4]:
import warnings
warnings.filterwarnings("ignore")

# Препроцессинг датасета

In [23]:
df_ = pd.read_csv('/Users/evgenia/Desktop/data.csv')

In [24]:
# убираем ненужные колонки, оставляем 'oaid_hash'
df_ = df_.drop(columns=['banner_id0', 'rate0', 'g0', 'coeff_sum0',
       'banner_id1', 'rate1', 'g1', 'coeff_sum1'])

In [25]:
enc = OrdinalEncoder()
transformed_user_hash = enc.fit_transform(df_[['oaid_hash']])
df_[['oaid_hash']] = transformed_user_hash.astype(int)
oaid_hash_max = max(df_['oaid_hash'])

In [27]:
df_.sort_values(by='date_time', inplace=True)

In [28]:
train_df = df_[df_['date_time'] < '2021-10-02 00:00:01.000000']
test_df = df_[df_['date_time'] >= '2021-10-02 00:00:01.000000']

In [None]:
def prepare_features(df_, zone_id_dict=None):
    # убираем impressions - везде 1
    df_.drop(columns=['impressions'], inplace=True) 
    
    # date_time -> день недели, час, выходной ли, месяц
    df_['day_of_week']= pd.to_datetime(df_['date_time']).dt.dayofweek
    df_['is_weekend'] = (df_['day_of_week'] > 4).astype(int)
    df_['hour']= pd.to_datetime(df_['date_time']).dt.hour 
    
    if zone_id_dict is None:
        zone_id_dict = ((df_.groupby('zone_id')['clicks'].mean())).to_dict()
    # zone_id 
    df_['zone_id'] = df_['zone_id'].map(zone_id_dict)
    df_['zone_id'] = df_['zone_id'].fillna(np.mean(list(zone_id_dict.values())))
    
    # и для 9, и для 10 os_id - только негативные примеры, можно объединить в один класс
    df_.loc[df_['os_id'] == 10]['os_id'] = 9
    # убираем время
    df_.drop(columns=['date_time'], inplace=True)
    
    return df_, zone_id_dict

In [23]:
# подготовка фич для датасета
train_df_prepared, zone_id_dict = prepare_features(train_df)
test_df_prepared, _ = prepare_features(test_df, zone_id_dict)

In [24]:
train_df_prepared.head(5)

Unnamed: 0,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,clicks,day_of_week,is_weekend,hour
1390198,0.021063,596,3693539,0,0,7,0,2,0,0
5041415,0.007601,29,1293475,1,3,0,0,6,1,0
1442602,0.038422,188,5233288,2,2,15,0,6,1,0
7232498,0.027595,52,1292247,2,2,5,0,6,1,0
14938691,0.018115,73,2947801,1,4,13,0,6,1,0


**User field:** oaid_hash, os_id, country_id

**Context field:** zone_id, day_of_week, is_weekend, hour

**banner_field:** banner_id,  campaign_clicks

In [25]:
# подготовка данных в нужном для ffm формате
def get_features_for_ffm(df, filepath):
    with open(filepath, "w") as f:
        lines = ''
        for idx, i in enumerate(df.index):
            row = df.loc[i, :]
            target = row['clicks']
            user_features = '1:' + str(int(row['oaid_hash'])) + ':1,1:' + str(int(oaid_hash_max + row['os_id'])) + ':1,1:' + str(oaid_hash_max + 11 + int(row['country_id']))
            context_features = '2:0:' + str(row['zone_id']) + ',2:' + str(int(1 + row['day_of_week'])) + ':1,2:8:' + str(int(row['is_weekend'])) + ',2:' + str(int(9+row['hour']))+ ':1'
            banner_features = '3:' + str(int(row['banner_id'])) + ':1,3:2000:' + str(int(row['campaign_clicks']))
            lines += str(int(target)) + ',' + user_features + ',' + context_features + ',' + banner_features + '\n'
            if idx > 0 and idx % 200000 == 0:
                print(idx)
        f.write(lines)

In [26]:
# для кросс-валидации используем часть данных
train_sampled = train_df_prepared.sample(n=2000000, random_state=1)
test_sampled = test_df_prepared.sample(n=500000, random_state=1)

In [27]:
# сохраняем подготовленные данные

get_features_for_ffm(train_df_prepared, './full_train_data_ffm.txt')
get_features_for_ffm(train_sampled, './train_data_ffm.txt')
get_features_for_ffm(test_df_prepared, './full_test_data_ffm.txt')
get_features_for_ffm(test_sampled, './test_data_ffm.txt')

200000
400000
600000
800000
1000000
1200000
1400000
1600000
1800000
2000000
2200000
2400000
2600000
2800000
3000000
3200000
3400000
3600000
3800000
4000000
4200000
4400000
4600000
4800000
5000000
5200000
5400000
5600000
5800000
6000000
6200000
6400000
6600000
6800000
7000000
7200000
7400000
7600000
7800000
8000000
8200000
8400000
8600000
8800000
9000000
9200000
9400000
9600000
9800000
10000000
10200000
10400000
10600000
10800000
11000000
11200000
11400000
11600000
11800000
12000000
12200000
12400000
12600000
12800000
13000000
13200000
13400000
13600000
200000
400000
600000
800000
1000000
1200000
1400000
1600000
1800000
2000000


In [17]:
def create_model(ffm_lines, k, lambda_=0.001):
    with open('./features.txt', 'w') as f:
        f.writelines(ffm_lines)
    
    ffm_model = xl.create_ffm()
    ffm_model.setTrain("./features.txt")
    
    param = {'task':'binary', 'lr':0.01, 'lambda': lambda_, 'k': k, 'metric': 'acc'}
    ffm_model.fit(param, model_path='./model.out')
    return ffm_model

In [11]:
def evaluate_model(ffm_lines, model, real_y):
    with open('./val_features.txt', 'w') as f:
        f.writelines(ffm_lines)
    model.setTest("./val_features.txt")
    model.setSign()
    model.predict("./model.out", "./output.txt")
    with open('./output.txt', 'r') as f:
        y_pred = list(map(int, f.read().split('\n')[:-1]))
    return log_loss(real_y, y_pred)

In [30]:
def cv_k(df):
    split = TimeSeriesSplit(n_splits=3)
    res_score, res_k = np.inf, None
    for k in [4, 6, 8]:
        scores = []
        print(f'Evaluating  model with latent factor size {k}')
        for train_index, test_index in split.split(df):
            real_y = list(map(int, df.loc[test_index]['clicks'].to_list()))
            model = create_model(df.loc[train_index]['data'].to_list(), k)
            scores.append(evaluate_model(df.loc[test_index]['data'].to_list(), model, real_y))
        score = np.mean(scores)
        if score < res_score:
            res_score, res_k = score, k
    print(f'Best score {res_score} with size: {res_k}')

# Валидация по времени для подбора размерности 

In [None]:
with open('./train_data_ffm.txt', 'r') as f:
    train_ffm_list = f.readlines()
df = pd.DataFrame()
df['data'] = train_ffm_list
df['clicks'] = df['data'].apply(lambda x: x[0])
cv_k(df)

Evaluating  model with latent factor size 4
predicting
predicted
predicting
predicted


In [None]:
# кёрнел умер, когда пересчитывала :( 
# до этого лучшее значение log_loss получилось для размерности 6

# Тест на 2021-10-02

In [9]:
with open('./full_test_data_ffm.txt', 'r') as f:
    full_test_ffm_list = f.readlines()

In [10]:
y_real = list(map(lambda x: int(x[0]), full_test_ffm_list))

In [7]:
model = xl.create_ffm()
model.setTrain('./full_train_data_ffm.txt')

param = {'task':'binary', 'lr':0.01, 'lambda': 0.001, 'k': 6, 'metric': 'acc'}
model.fit(param, model_path='./model.out')

model.setTest('./full_test_data_ffm.txt')
model.setSign()
model.predict('./model.out', './output.txt')

with open('./output.txt', 'r') as f:
    y_pred = list(map(int, f.read().split('\n')[:-1]))
        

In [11]:
log_loss(y_real, y_pred)

1.2226216036444428

In [4]:
# ffm_model = xl.create_ffm()
# ffm_model.setTXTModel(model_path='./model.out')