In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
not_to_use = 'banner_id0, banner_id1, rate0, rate1, g0, g1, coeff_sum0, coeff_sum1'.split(', ')
df = pd.read_csv('../../data/data.csv').drop(not_to_use, axis=1)
df['date_time'] = pd.to_datetime(df['date_time'], format='%Y-%m-%d %H:%M:%S.%f')

df.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,impressions,clicks
0,2021-09-27 00:01:30,0,0,5664530014561852622,0,0,0,1,1
1,2021-09-26 22:54:49,1,1,5186611064559013950,0,0,1,1,1
2,2021-09-26 23:57:20,2,2,2215519569292448030,3,0,0,1,1
3,2021-09-27 00:04:30,3,3,6262169206735077204,0,1,1,1,1
4,2021-09-27 00:06:21,4,4,4778985830203613115,0,1,0,1,1


# повторим подготовку фичей из дз1

In [3]:
df['date'] = df['date_time'].dt.date
df = df[df['date'] > datetime(2021, 9, 10).date()]

df['hour'] = df['date_time'].dt.hour
df['last_date'] = (df['date_time'] - timedelta(days=1)).dt.date

df['last_hour'] = (df['date_time'] - timedelta(hours=1)).dt.hour

In [4]:
def add_feature_by_time(df, feature):
    feature_date = (
        df
        .groupby(['date', feature], as_index=False)
        .agg(
            feature_last_date_actions=('impressions', 'count'), 
            feature_last_date_conversion=('clicks', 'mean')
        )
        .rename(columns={
            'date': 'last_date',
            'feature_last_date_actions': f'{feature}_last_date_actions',
            'feature_last_date_conversion': f'{feature}_last_date_conversion'
        })
    )
    
    feature_date_hour = (
        df
        .groupby(['date', 'hour', feature], as_index=False)
        .agg(
            feature_last_hour_actions=('impressions', 'count'), 
            feature_last_hour_conversion=('clicks', 'mean')
        )
        .rename(columns={
            'date': 'last_date',
            'hour': 'last_hour',
            'feature_last_hour_actions': f'{feature}_last_hour_actions',
            'feature_last_hour_conversion': f'{feature}_last_hour_conversion'
        })
    )
    
    return (
        df
        .merge(
            feature_date,
            on=['last_date', feature],
            how='left'
        )
        .merge(
            feature_date_hour,
            on=['last_date', 'last_hour', feature],
            how='left'
        )
    )

df_merged = add_feature_by_time(df, 'banner_id')
df_merged = add_feature_by_time(df_merged, 'zone_id')
df_merged = add_feature_by_time(df_merged, 'os_id')
df_merged = add_feature_by_time(df_merged, 'country_id')
df_merged = df_merged.fillna(0)

In [5]:
user_actions = df.groupby('oaid_hash').agg(actions=('impressions', 'count'))

(
    user_actions.shape[0],
    user_actions.query('actions >= 5').shape[0],
    user_actions.query('actions >= 10').shape[0],
    user_actions.query('actions >= 50').shape[0],
    user_actions.query('actions >= 100').shape[0],
    user_actions.query('actions >= 500').shape[0],
)

(6510315, 613020, 217988, 12040, 2849, 154)

In [6]:
banner_actions = df.groupby('banner_id').agg(actions=('impressions', 'count'))

(
    banner_actions.shape[0],
    banner_actions.query('actions >= 5').shape[0],
    banner_actions.query('actions >= 10').shape[0],
    banner_actions.query('actions >= 50').shape[0],
    banner_actions.query('actions >= 100').shape[0],
    banner_actions.query('actions >= 500').shape[0],
)

(1633, 1372, 1302, 1178, 1109, 975)

Возьмем только юзеров, у которых >=5 показов и баннеры, у которых >=50 показов.

In [7]:
df_merged = (
    df_merged
    .merge(
        user_actions,
        on='oaid_hash',
        how='left'
    )
)

df_merged.loc[df_merged['actions'] < 5, 'oaid_hash'] = -1

df_merged = (
    df_merged
    .drop('actions', axis=1)
    .merge(
        banner_actions,
        on='banner_id',
        how='left'
    )
)

df_merged.loc[df_merged['actions'] < 50, 'banner_id'] = -1
df_merged = df_merged.drop('actions', axis=1)

Возьмем такие филды:

 - **user**: `os_id`, `country_id`, `oaid_hash`
 - **campaign**: `banner_id`, `campaign_clicks`, `banner_id_last_hour_actions`, `banner_id_last_hour_conversion`, `banner_id_last_date_actions`, `banner_id_last_date_conversion`
 - **context**: `zone_id`, `hour`, `zone_id_last_hour_actions`, `zone_id_last_hour_conversion`, `os_id_last_hour_actions`, `os_id_last_hour_conversion`, `country_id_last_hour_actions`, `country_id_last_hour_conversion`, `zone_id_last_date_actions`, `zone_id_last_date_conversion`, `os_id_last_date_actions`, `os_id_last_date_conversion`, `country_id_last_date_actions`, `country_id_last_date_conversion`

In [8]:
fields = [
    ['os_id', 'country_id', 'oaid_hash'],
    ['banner_id', 'campaign_clicks', 
     *[f'banner_id_last_{i}_{j}' for i in ['date', 'hour'] for j in ['actions', 'conversion']]],
    ['zone_id', 'hour', 
     *[
         f'{i}_id_last_{j}_{k}' 
         for i in ['zone', 'os', 'country']
         for j in ['date', 'hour']
         for k in ['actions', 'conversion']
     ]]
]

field_dict = {
    feature: i for i, field in enumerate(fields) for feature in field
}

In [9]:
numeric = [
    'campaign_clicks', 
    *[
         f'{i}_id_last_{j}_{k}' 
         for i in ['banner', 'zone', 'os', 'country']
         for j in ['date', 'hour']
         for k in ['actions', 'conversion']
     ]
]

categorical = [
    'os_id', 'country_id', 'oaid_hash',
    'banner_id', 'zone_id', 'hour'
]

In [10]:
def encode_categories(df, categorical):
    encoder = {
        'cur_value': 0,
        'codes': dict()
    }
    
    for col in df.columns:
        if col in categorical:
            unique = df[col].unique()
            encoder['codes'][col] = dict()
            
            for i in unique:
                encoder['codes'][col][i] = encoder['cur_value']
                encoder['cur_value'] += 1
                
    return encoder

In [11]:
encoder = encode_categories(df_merged, categorical)

In [20]:
def make_dataset(df, encoder, field_dict, categorical, numeric):
    i = 0
    for col in numeric:
        field = field_dict[col]
        df[col + '_ffm'] = df[col].map(lambda x: f'{field}:{i}:{x}')
        i += 1
        
    for col in categorical:
        field = field_dict[col]
        codes_fol_feat = encoder['codes'][col]
        df[col + '_ffm'] = df[col].map(lambda x: f'{field}:{codes_fol_feat[x]}:1')
        
    return df

In [21]:
df_final = make_dataset(df_merged, encoder, field_dict, categorical, numeric)
columns_to_select = ['clicks', *[col for col in df_final.columns if col.endswith('_ffm')]]

In [23]:
(
    df_final
    [df_final['date'] < datetime(2021, 10, 1).date()]
    [columns_to_select]
    .to_csv('./data/train_ffm.txt', sep=' ', header=False, index=False)
)

(
    df_final
    [df_final['date'] == datetime(2021, 10, 1).date()]
    [columns_to_select]
    .to_csv('./data/val_ffm.txt', sep=' ', header=False, index=False)
)

(
    df_final
    [df_final['date'] == datetime(2021, 10, 2).date()]
    [columns_to_select]
    .to_csv('./data/test_ffm.txt', sep=' ', header=False, index=False)
)

# тренируем ffm

In [6]:
import xlearn as xl
from sklearn.metrics import log_loss
import pandas as pd
import numpy as np
import os

In [2]:
def train_and_eval(k=4, lr=0.2, l=0.002):
    ffm_model = xl.create_ffm()
    
    if f'model_{k}_{lr}_{l}.out' not in os.listdir('./models'):
        ffm_model.setTrain('./data/train_ffm.txt')

        param = {
            'task': 'binary', 
            'lr': lr, 
            'lambda': l,
            'k': k,
            'metric': 'acc'
        }

        ffm_model.fit(param, f'./models/model_{k}_{lr}_{l}.out')
    
    
    ffm_model.setTest('./data/val_ffm.txt')
    ffm_model.setSigmoid()
    
    ffm_model.predict(f'models/model_{k}_{lr}_{l}.out', f'data/pred_{k}_{lr}_{l}.txt')
    
    y_pred = pd.read_csv(f'data/pred_{k}_{lr}_{l}.txt', sep=' ', header=None).values[:, 0].reshape(-1)
    y_true = pd.read_csv(f'./data/val_ffm.txt', sep=' ', header=None).values[:, 0].reshape(-1)
    return log_loss(y_true.astype(int), y_pred)

In [3]:
for k in [2, 4, 6]:
    for lr in [0.5, 0.2, 0.05]:
        for l in [0.2, 0.02, 0.002]:
            loss = train_and_eval(k=k, lr=lr, l=l)
            print(f'{k} {lr} {l} {loss:.3f}')

2 0.5 0.2 0.180
2 0.5 0.02 0.179
2 0.5 0.002 0.178
2 0.2 0.2 0.180
2 0.2 0.02 0.180
2 0.2 0.002 0.178
2 0.05 0.2 0.179
2 0.05 0.02 0.179
2 0.05 0.002 0.178
4 0.5 0.2 0.180
4 0.5 0.02 0.180
4 0.5 0.002 0.178
4 0.2 0.2 0.180
4 0.2 0.02 0.180
4 0.2 0.002 0.178
4 0.05 0.2 0.178
4 0.05 0.02 0.179
4 0.05 0.002 0.178
6 0.5 0.2 0.180
6 0.5 0.02 0.180
6 0.5 0.002 0.178
6 0.2 0.2 0.180
6 0.2 0.02 0.179
6 0.2 0.002 0.178
6 0.05 0.2 0.178
6 0.05 0.02 0.179
6 0.05 0.002 0.178


# оценка на тесте

In [7]:
# baseline
y_true = pd.read_csv(f'./data/test_ffm.txt', sep=' ', header=None).values[:, 0].reshape(-1)

with open('./data/train_ffm.txt', 'r') as f:
    lines = f.readlines()
    
s = 0
for l in lines:
    s += int(l[0])
    
dummy = s / len(lines)
dummy_predict = np.ones(len(y_true)) * dummy

print(f'{log_loss(y_true.astype(int), dummy_predict):.4f}')

0.1559


In [4]:
ffm_model = xl.create_ffm()

k = 6
lr = 0.05
l = 0.002

ffm_model.setTest('./data/test_ffm.txt')
ffm_model.setSigmoid()

ffm_model.predict(f'models/model_{k}_{lr}_{l}.out', f'data/pred_{k}_{lr}_{l}.txt')

y_pred = pd.read_csv(f'data/pred_{k}_{lr}_{l}.txt', sep=' ', header=None).values[:, 0].reshape(-1)
print(f'{log_loss(y_true.astype(int), y_pred):.4f}')

0.1545


FFM оказался чуть хуже линейной модели на этих же фичах (у линейной лосс был `0.1479`), однако все равно удалось побить бейзлайн.