In [1]:
import pandas as pd
import os
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [2]:
from scipy import sparse

In [3]:
df = pd.read_csv('../../../../data.csv')
df = df.sample(int(len(df) * 0.5), random_state=42).reset_index(False)

In [4]:
df.oaid_hash = LabelEncoder().fit_transform(df.oaid_hash.values)

In [5]:
df['rate0'].fillna(np.mean(df[df.date_time < '2021-10-02']['rate0']), inplace=True)
df['g0'].fillna(np.mean(df[df.date_time < '2021-10-02']['g0']), inplace=True)
df['coeff_sum0'].fillna(np.mean(df[df.date_time < '2021-10-02']['coeff_sum0']), inplace=True)

df['rate1'].fillna(np.mean(df[df.date_time < '2021-10-02']['rate1']), inplace=True)
df['g1'].fillna(np.mean(df[df.date_time < '2021-10-02']['g1']), inplace=True)
df['coeff_sum1'].fillna(np.mean(df[df.date_time < '2021-10-02']['coeff_sum1']), inplace=True)

In [6]:
one_encoder = OneHotEncoder()
scaller = StandardScaler()

In [7]:
data = sparse.hstack([
    one_encoder.fit_transform(df[['zone_id']]),
    one_encoder.fit_transform(df[['banner_id']]),
    one_encoder.fit_transform(df[['oaid_hash']]),
    one_encoder.fit_transform(df[['zone_id']]),
    one_encoder.fit_transform(df[['os_id']]),
    one_encoder.fit_transform(df[['country_id']]),
    sparse.csr_matrix(scaller.fit_transform(df[['campaign_clicks']])),
    sparse.csr_matrix(scaller.fit_transform(df[['rate0']])),
    sparse.csr_matrix(scaller.fit_transform(df[['g0']])),
    sparse.csr_matrix(scaller.fit_transform(df[['coeff_sum0']])),
    sparse.csr_matrix(scaller.fit_transform(df[['rate1']])),
    sparse.csr_matrix(scaller.fit_transform(df[['g1']])),
    sparse.csr_matrix(scaller.fit_transform(df[['coeff_sum1']])),
])

data = data.tocsr()

In [8]:
train_index_max = df[df['date_time'] < '2021-10-02'].index
test_index_max = df[df['date_time'] >= '2021-10-02'].index

x_train, x_test = data[train_index_max], data[test_index_max]
y_train, y_test = df.iloc[train_index_max]['clicks'], df.iloc[test_index_max]['clicks']

In [9]:
from pyfm import pylibfm

In [10]:
fm = pylibfm.FM(
    num_factors=10,
    num_iter=30,
    verbose=True,
    task='classification',
    initial_learning_rate=0.001,
    learning_rate_schedule="optimal"
)

In [11]:
fm.fit(x_train, y_train.values)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 0.10670
-- Epoch 2
Training log loss: 0.10488
-- Epoch 3
Training log loss: 0.10442
-- Epoch 4
Training log loss: 0.10408
-- Epoch 5
Training log loss: 0.10379
-- Epoch 6
Training log loss: 0.10352
-- Epoch 7
Training log loss: 0.10324
-- Epoch 8
Training log loss: 0.10297
-- Epoch 9
Training log loss: 0.10267
-- Epoch 10
Training log loss: 0.10241
-- Epoch 11
Training log loss: 0.10212
-- Epoch 12
Training log loss: 0.10188
-- Epoch 13
Training log loss: 0.10162
-- Epoch 14
Training log loss: 0.10135
-- Epoch 15
Training log loss: 0.10108
-- Epoch 16
Training log loss: 0.10085
-- Epoch 17
Training log loss: 0.10058
-- Epoch 18
Training log loss: 0.10035
-- Epoch 19
Training log loss: 0.10013
-- Epoch 20
Training log loss: 0.09988
-- Epoch 21
Training log loss: 0.09961
-- Epoch 22
Training log loss: 0.09939
-- Epoch 23
Training log loss: 0.09916
-- Epoch 24
Training log loss: 0.098

In [16]:
preds = fm.predict(x_test)

In [17]:
from sklearn.metrics import roc_auc_score, log_loss

In [18]:
log_loss(y_test.values, preds)

0.12903765559972247

In [19]:
roc_auc_score(y_test.values, preds)

0.8005871025839967

Получили хорошие метрики, но какой ценой? Очень долго учится, я не знаю какие выводы еще можно сделать, но работает

### В общем, тут стало понятно, что для ffm надо все совсем под другому, поэтому хоть это и в одном ноутбуке, но это неправда, тут все будет совсем по-другому) 

In [3]:
df = pd.read_csv('../../../../data.csv')

Функцию подготовки возьмем с открытого источника на гитхабе

In [4]:
import json
import math


def _convert_to_ffm(path, df, type, target, numerics, categories, features, encoder):
    print('convert_to_ffm - START')
    for x in numerics:
        if(x not in encoder['catdict']):
            print(f'UPDATING CATDICT: numeric field - {x}')
            encoder['catdict'][x] = 0
    for x in categories:
        if(x not in encoder['catdict']):
            print(f'UPDATING CATDICT: categorical field - {x}')
            encoder['catdict'][x] = 1

    nrows = df.shape[0]
    with open(path + str(type) + "_ffm.txt", "w") as text_file:

        for n, r in enumerate(range(nrows)):
            datastring = ""
            datarow = df.iloc[r].to_dict()
            datastring += str(int(datarow[target]))

            for i, x in enumerate(encoder['catdict'].keys()):
                if(encoder['catdict'][x] == 0):
                    if math.isnan(datarow[x]) is not True:
                        datastring = datastring + " "+str(i)+":" + str(i)+":" + str(datarow[x])
                else:

                    if(x not in encoder['catcodes']):
                        print(f'UPDATING CATCODES: categorical field - {x}')
                        encoder['catcodes'][x] = {}
                        encoder['currentcode'] += 1
                        print(f'UPDATING CATCODES: categorical value for field {x} - {datarow[x]}')
                        encoder['catcodes'][x][datarow[x]] = encoder['currentcode']  # encoding the feature
                    elif(datarow[x] not in encoder['catcodes'][x]):
                        encoder['currentcode'] += 1
                        print(f'UPDATING CATCODES: categorical value for field {x} - {datarow[x]}')
                        encoder['catcodes'][x][datarow[x]] = encoder['currentcode']  # encoding the feature

                    code = encoder['catcodes'][x][datarow[x]]
                    datastring = datastring + " "+str(i)+":" + str(int(code))+":1"

            datastring += '\n'
            text_file.write(datastring)
    return encoder

In [5]:
good_columns = [
    'zone_id',
    'banner_id',
    'campaign_clicks',
    'os_id',
    'country_id',
    'impressions',
    'clicks',
    'oaid_hash'
]
df = df[[*good_columns, 'date_time']]

In [6]:
def feature_prepare(df: pd.DataFrame, rules) -> pd.DataFrame:
    for name, count in rules:
        mask = df[name].map(df[name].value_counts()) < count
        df[name] = df[name].mask(mask, -1)
    return df

In [7]:
df = feature_prepare(df, [
    ('oaid_hash', int(np.percentile(df.oaid_hash.value_counts(), 75))), 
    ('banner_id', int(np.percentile(df.banner_id.value_counts(), 80))),
    ('zone_id', int(np.percentile(df.zone_id.value_counts(), 90)))
])

In [8]:
target = 'clicks'
num_features = ['campaign_clicks']
cat_features = [feature for feature in good_columns if (feature!='campaign_clicks') and (feature != 'clicks') and (feature != 'date_time')]

In [9]:
test_df = df[df['date_time'] >= '2021-10-02'].drop(['date_time'], axis=1)
train_df = df[df['date_time'] < '2021-10-02'].drop(['date_time'], axis=1)

In [11]:
np.save('true.npy', np.array(test_df['clicks']))

In [29]:
encoder = {
    'currentcode': 1,
    'catdict': {},
    'catcodes': {}
}

encoder = _convert_to_ffm('data/', train_df, 'train', target,
                          num_features,
                          cat_features,
                          good_columns,
                          encoder)

encoder = _convert_to_ffm('data/', test_df, 'test', target,
                          num_features,
                          cat_features,
                          good_columns,
                          encoder)

convert_to_ffm - START
UPDATING CATDICT: numeric field - campaign_clicks
UPDATING CATDICT: categorical field - zone_id
UPDATING CATDICT: categorical field - banner_id
UPDATING CATDICT: categorical field - os_id
UPDATING CATDICT: categorical field - country_id
UPDATING CATDICT: categorical field - impressions
UPDATING CATDICT: categorical field - oaid_hash
UPDATING CATCODES: categorical field - zone_id
UPDATING CATCODES: categorical value for field zone_id - 0
UPDATING CATCODES: categorical field - banner_id
UPDATING CATCODES: categorical value for field banner_id - 0
UPDATING CATCODES: categorical field - os_id
UPDATING CATCODES: categorical value for field os_id - 0
UPDATING CATCODES: categorical field - country_id
UPDATING CATCODES: categorical value for field country_id - 0
UPDATING CATCODES: categorical field - impressions
UPDATING CATCODES: categorical value for field impressions - 1
UPDATING CATCODES: categorical field - oaid_hash
UPDATING CATCODES: categorical value for field oa

UPDATING CATCODES: categorical value for field oaid_hash - 3354612649540968810
UPDATING CATCODES: categorical value for field oaid_hash - 287070909624614747
UPDATING CATCODES: categorical value for field oaid_hash - 4882457832362949033
UPDATING CATCODES: categorical value for field oaid_hash - 3826213987559901507
UPDATING CATCODES: categorical value for field os_id - 7
UPDATING CATCODES: categorical value for field oaid_hash - 972317677657366055
UPDATING CATCODES: categorical value for field oaid_hash - 6281239119278787386
UPDATING CATCODES: categorical value for field oaid_hash - 4995445639763870787
UPDATING CATCODES: categorical value for field oaid_hash - 3197340394554347254
UPDATING CATCODES: categorical value for field oaid_hash - 5351436589773950064
UPDATING CATCODES: categorical value for field banner_id - 762
UPDATING CATCODES: categorical value for field oaid_hash - 4913983047583624636
UPDATING CATCODES: categorical value for field zone_id - 433
UPDATING CATCODES: categorical 

UPDATING CATCODES: categorical value for field oaid_hash - 8643230964272366646
UPDATING CATCODES: categorical value for field oaid_hash - 3249707210889861989
UPDATING CATCODES: categorical value for field oaid_hash - 1174374597128820528
UPDATING CATCODES: categorical value for field oaid_hash - 7006378580775407747
UPDATING CATCODES: categorical value for field oaid_hash - 7711800443439145115
UPDATING CATCODES: categorical value for field oaid_hash - 4577408917020591949
UPDATING CATCODES: categorical value for field oaid_hash - 4776946477878230855
UPDATING CATCODES: categorical value for field oaid_hash - 1344133003877088572
UPDATING CATCODES: categorical value for field oaid_hash - 3768650544931667276
UPDATING CATCODES: categorical value for field oaid_hash - 1733314009195517029
UPDATING CATCODES: categorical value for field oaid_hash - 2016336998165050001
UPDATING CATCODES: categorical value for field oaid_hash - 462829502957740138
UPDATING CATCODES: categorical value for field oaid_h

UPDATING CATCODES: categorical value for field oaid_hash - 3517544510565085465
UPDATING CATCODES: categorical value for field oaid_hash - 5973264706983239258
UPDATING CATCODES: categorical value for field oaid_hash - 171178432041620512
UPDATING CATCODES: categorical value for field oaid_hash - 8116879314568007045
UPDATING CATCODES: categorical value for field oaid_hash - 7749700942332224657
UPDATING CATCODES: categorical value for field oaid_hash - 5567771039949182762
UPDATING CATCODES: categorical value for field oaid_hash - 3845793820964985993
UPDATING CATCODES: categorical value for field oaid_hash - 7145421494689319769
UPDATING CATCODES: categorical value for field oaid_hash - 5921034806370841283
UPDATING CATCODES: categorical value for field oaid_hash - 7699859695061718767
UPDATING CATCODES: categorical value for field oaid_hash - 5329212593085883375
UPDATING CATCODES: categorical value for field oaid_hash - 9015951821784273289
UPDATING CATCODES: categorical value for field oaid_h

UPDATING CATCODES: categorical value for field oaid_hash - 4699381085132656219
UPDATING CATCODES: categorical value for field oaid_hash - 4471770358015522303
UPDATING CATCODES: categorical value for field oaid_hash - 324595206116710282
UPDATING CATCODES: categorical value for field oaid_hash - 1921991733196495262
UPDATING CATCODES: categorical value for field oaid_hash - 7368020748352834387
UPDATING CATCODES: categorical value for field oaid_hash - 777813202827899938
UPDATING CATCODES: categorical value for field oaid_hash - 3560340907211074936
UPDATING CATCODES: categorical value for field oaid_hash - 1499319146049968958
UPDATING CATCODES: categorical value for field oaid_hash - 3666060003713728256
UPDATING CATCODES: categorical value for field oaid_hash - 3764257162291603143
UPDATING CATCODES: categorical value for field oaid_hash - 743630346660569924
UPDATING CATCODES: categorical value for field oaid_hash - 6274362223259238559
UPDATING CATCODES: categorical value for field oaid_has

UPDATING CATCODES: categorical value for field oaid_hash - 168845849578306797
UPDATING CATCODES: categorical value for field oaid_hash - 6940204625974551719
UPDATING CATCODES: categorical value for field oaid_hash - 1203710198943829066
UPDATING CATCODES: categorical value for field oaid_hash - 2088003111748613964
UPDATING CATCODES: categorical value for field oaid_hash - 5428218156002953543
UPDATING CATCODES: categorical value for field oaid_hash - 6476385190060308254
UPDATING CATCODES: categorical value for field oaid_hash - 3768397815720559446
UPDATING CATCODES: categorical value for field oaid_hash - 7950003243697908137
UPDATING CATCODES: categorical value for field oaid_hash - 6548764686004721894
UPDATING CATCODES: categorical value for field oaid_hash - 2952170242784490890
UPDATING CATCODES: categorical value for field oaid_hash - 3238946825870492233
UPDATING CATCODES: categorical value for field oaid_hash - 3736971407928914348
UPDATING CATCODES: categorical value for field oaid_h

UPDATING CATCODES: categorical value for field oaid_hash - 2063965024525088011
UPDATING CATCODES: categorical value for field oaid_hash - 4603120335026494214
UPDATING CATCODES: categorical value for field oaid_hash - 3310385092451799263
UPDATING CATCODES: categorical value for field oaid_hash - 2504036653511735292
UPDATING CATCODES: categorical value for field oaid_hash - 1394598191889933309
UPDATING CATCODES: categorical value for field oaid_hash - 6514722018738650487
UPDATING CATCODES: categorical value for field oaid_hash - 5799089252269430078
UPDATING CATCODES: categorical value for field oaid_hash - 5890074852771910860
UPDATING CATCODES: categorical value for field oaid_hash - 7813904499421248521
UPDATING CATCODES: categorical value for field oaid_hash - 1127410679445046025
UPDATING CATCODES: categorical value for field oaid_hash - 2951489434406308513
UPDATING CATCODES: categorical value for field oaid_hash - 95385929150525446
UPDATING CATCODES: categorical value for field oaid_ha

UPDATING CATCODES: categorical value for field oaid_hash - 6013249610111953004
UPDATING CATCODES: categorical value for field oaid_hash - 6401502075813802920
UPDATING CATCODES: categorical value for field oaid_hash - 1610177020672218619
UPDATING CATCODES: categorical value for field oaid_hash - 6381139797334576049
UPDATING CATCODES: categorical value for field oaid_hash - 719460664893389117
UPDATING CATCODES: categorical value for field oaid_hash - 6047851747402965883
UPDATING CATCODES: categorical value for field oaid_hash - 7180046407772142200
UPDATING CATCODES: categorical value for field oaid_hash - 5528686776373405929
UPDATING CATCODES: categorical value for field oaid_hash - 3581714727706996417
UPDATING CATCODES: categorical value for field oaid_hash - 6248065659806529852
UPDATING CATCODES: categorical value for field oaid_hash - 4071849540293041988
UPDATING CATCODES: categorical value for field oaid_hash - 695297188321965043
UPDATING CATCODES: categorical value for field oaid_ha

In [12]:
import xlearn as xl

In [15]:
ffm_model = xl.create_ffm()
ffm_model.setTrain("data/train_ffm.txt")
ffm_model.setValidate("data/test_ffm.txt")

param = {
    'task': 'binary',
    'lr': 1e-2,
    'lambda': 0.002,
    'metric': 'auc',
    'k':7
}


ffm_model.fit(param, 'model.out')


ffm_model.setTest("data/test_ffm.txt")
ffm_model.setSigmoid()


ffm_model.predict('model.out', 'predict.txt')

y_pred = pd.read_csv('predict.txt', header=None)
trues = np.load('true.npy')
logloss = log_loss(trues, np.squeeze(np.array(y_pred)))

In [16]:
logloss

0.14100359514155159

Сильно упростил для ffm, но при этом значение logloss тоже приемлимо, да и по времени сильно лучше, чем просто fm

In [17]:
roc_auc_score(trues, y_pred)

0.7563869497292568

RocAuC тоже норм, можно еще фичи наверное погенерить и даже просто fm обогнать, в общем однозначно лайк