In [1]:
# !pip install pandas xlearn numpy scikit-learn tqdm  # in case you don't have some libraries required to run the code

In [2]:
import warnings

warnings.filterwarnings('ignore')

from utils import *

import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas()

## Загрузка данных

Загрузим данные – только необходимые столбцы для решения задачи.

Проанализировал данные с [pandas-profiling](https://github.com/pandas-profiling/pandas-profiling) (но репорт в репозиторий загружать не стал).


In [17]:
types = {
    'os_id': int,
    'country_id': int,
    'campaign_clicks': int,
    'clicks': int,
    'oaid_hash': str,
    'zone_id': int,
    'banner_id': int
}
columns = ['date_time', *types.keys()]
df = pd.read_csv(
    '../data/data.csv',
    dtype=types,
    usecols=columns,
    parse_dates=[columns[0]]
)
df = df.set_index(columns[0]).sort_index()
df.head()

Unnamed: 0_level_0,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,clicks
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-09-01 00:02:49,30,596,5236744527665721365,0,0,7,0
2021-09-26 00:00:00,41,29,1834033519797437404,1,3,0,0
2021-09-26 00:00:00,1,188,7416450538971744701,2,2,15,0
2021-09-26 00:00:00,17,52,1832228443297591417,2,2,5,0
2021-09-26 00:00:00,47,73,4180077124914749282,1,4,13,0


## Предобработка данных

Подготовим данные к тренировке и тестированию (сразу все для упрощения).

In [18]:
data = feature_engineering(df, zone_id=(df.groupby("zone_id").clicks.mean()).to_dict())
oaid_max = max(data.oaid_hash)
data.head()

  0%|          | 0/15821472 [00:00<?, ?it/s]

Unnamed: 0_level_0,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,clicks,hour,weekday,weekend,daytime_Early Morning,daytime_Eve,daytime_Late Night,daytime_Morning,daytime_Night,daytime_Noon
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2021-09-01 00:02:49,0.02164,596,3320628,0,0,7,0,0,2,0,0,0,1,0,0,0
2021-09-26 00:00:00,0.007788,29,653046,1,3,0,0,0,6,1,0,0,1,0,0,0
2021-09-26 00:00:00,0.040499,188,5030974,2,2,15,0,0,6,1,0,0,1,0,0,0
2021-09-26 00:00:00,0.027959,52,651670,2,2,5,0,0,6,1,0,0,1,0,0,0
2021-09-26 00:00:00,0.017774,73,2491428,1,4,13,0,0,6,1,0,0,1,0,0,0


In [None]:
df_train = data.loc[:'2021-09-30']
df_test = data.loc['2021-10-01']
train, test = (
    data.loc[:'2021-10-01'].copy(),
    data.loc['2021-10-02'].copy()
)



In [None]:
GOAL = "clicks"
NUMERICAL_FEATURES = ["campaign_clicks"]
CATEGORICAL_FEATURES = [
    feature for feature in data.columns if feature != "campaign_clicks"
]
ALL_FEATURES = data.columns
NUM_THREADS = 16

encoder = {
    "currentcode": len(
        NUMERICAL_FEATURES
    ),  # Unique index for each numerical field or categorical variables
    "catdict": {},  # Dictionary that stores numerical and categorical variables
    "catcodes": {},
}  # Dictionary that stores index for each categorical variables per categorical field

encoder = convert_to_ffm(
    "../data/",
    df_train,
    "train",
    GOAL,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    ALL_FEATURES,
    encoder,
)

encoder = convert_to_ffm(
    "../data/",
    df_test,
    "val",
    GOAL,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    ALL_FEATURES,
    encoder,
)

encoder = convert_to_ffm(
    "../data/",
    test,
    "test",
    GOAL,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    ALL_FEATURES,
    encoder,
)

## Подбор гиперпараметров

Подберем параметр размерности для модели – и валидироваться будем на предпоследнем дне (а последний день нужен для финального теста по условию).

XLearn "из коробки" умеет считать скор на валидации – посмотрим в его логи.

In [10]:
for k in tqdm((2, 4, 6, 8)):
    create_model(k=k)

  0%|          | 0/4 [00:00<?, ?it/s]

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 16 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/val_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5662980
[32m[-

## Тренировка и тестирование

Данные готовы, параметры подобраны – теперь тренируем новую модель на всех доступных для тренировки данных и тестируем ее.

In [11]:
model = create_model(k=8)
model.setTest("../data/test_ffm.txt")
model.setSign()
model.predict("model.out", "output.txt")

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 16 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/train_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (../data/val_ffm.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 5662980
[32m[-

Достигнутый результат (по логам) – `The test loss is: 0.007778`. Результат получился значительно лучше результатов прошлой работы [прошлой работы](https://github.com/tiulpin/Recsys-course-homework/blob/tiulpin/hw1/tiulpin.v/hw1/tiulpin_v.ipynb):
- `0.1549` – baseline
- `0.1481` – LogReg