In [1]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [2]:
data = pd.read_csv('../../../data/data.csv')
#let's remove columns we can't use
data=data[['date_time', 'zone_id', 'banner_id', 'campaign_clicks',
       'os_id', 'country_id', 'impressions', 'clicks', 'oaid_hash']]
data.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks,oaid_hash
0,2021-09-27 00:01:30.000000,0,0,0,0,0,1,1,5664530014561852622
1,2021-09-26 22:54:49.000000,1,1,0,0,1,1,1,5186611064559013950
2,2021-09-26 23:57:20.000000,2,2,3,0,0,1,1,2215519569292448030
3,2021-09-27 00:04:30.000000,3,3,0,1,1,1,1,6262169206735077204
4,2021-09-27 00:06:21.000000,4,4,0,1,0,1,1,4778985830203613115


In [3]:
data['oaid_hash'].value_counts()

308174966294367527     5243
2890718152668627077    2511
2521895603443866206    2289
8212556321845734673    1974
3375698397737628939    1959
                       ... 
4246377695842597056       1
1683601233397340403       1
7979441231022932095       1
5832726667150953660       1
9144315809595125484       1
Name: oaid_hash, Length: 6510316, dtype: int64

In [4]:
mask1 = data['oaid_hash'].map(data['oaid_hash'].value_counts()) < 100

In [5]:
mask1.sum()

15219867

Остается слишком мало вариантов хешей

In [6]:
mask = data['oaid_hash'].map(data['oaid_hash'].value_counts()) < 10 #new users - will change them to id -1

In [7]:
mask.sum()

11017473

In [8]:
len(data)

15821472

Я решила, что история в 10 кликов - для человека история вполне адекватная, поэтому тут трешхол в 10, а не в 100, как для баннеров. Зато остается адекватное количество разных хешей :)

Остальной анализ такой же, как и в первой домашке, я не стала переделывать. С анализом там было все в порядке)

Кроме oaid_hash все остается таким же, за исключением того, что не делаем dummies - данные потом генерятся отдельно

Здесь разрешили не использовать TimeSeriesSplit, чтобы не умереть от старости во время обучения :)

In [9]:
def feature_engineering(data):
    
    #let's leave only known banners - that had >100 shows on the train set
    train_data = data.loc[(pd.to_datetime(data['date_time']).dt.month==9) & (pd.to_datetime(data['date_time']).dt.day!=1)]
    known_banner_ids = set(train_data['banner_id'].value_counts().loc[lambda x: x > 100].index)
    data.loc[~data.banner_id.isin(known_banner_ids), 'banner_id']=-1
#     data = pd.get_dummies(data, columns = ['banner_id'], drop_first=True, prefix=['banner'])
    print("Banner columns generated.")
    
    data.loc[(data.zone_id>4) & (data.zone_id<10), "zone_id"]=-1
    data.loc[(data.zone_id>9) & (data.zone_id<20), "zone_id"]=-2
    data.loc[(data.zone_id>19) & (data.zone_id<50), "zone_id"]=-3
    data.loc[(data.zone_id>49), "zone_id"]=-4
    # we could add categoric features here
#     data = pd.get_dummies(data, columns = ['zone_id'], drop_first=True, prefix=['zone'])
    print("Zone columns generated.")
    
    #normalize campaign clicks
    data.loc[(data.campaign_clicks>50), 'campaign_clicks']=50
    data.loc[:,'campaign_clicks']=data['campaign_clicks']/50
    
    #new users are not very predictable
    mask = data['oaid_hash'].map(data['oaid_hash'].value_counts()) < 10
    data['oaid_hash']=data['oaid_hash'].mask(mask, -1)

    
    data.loc[data.os_id>6, "os_id"] = 7
#     data = pd.get_dummies(data, columns = ['os_id'], drop_first=True, prefix=['os'])
    print("OS columns generated.")
    
#     data = pd.get_dummies(data, columns = ['country_id'], drop_first=True, prefix=['country'])
    print("Country columns generated.")
    
    data = data.drop(columns=['impressions'])
    
    #we'll use TimeSeries Split this time so no validation dataset
    train_data = data.loc[(pd.to_datetime(data['date_time']).dt.month==9) & (pd.to_datetime(data['date_time']).dt.day!=1)]
    val_data = data.loc[(pd.to_datetime(data['date_time']).dt.month==10) & (pd.to_datetime(data['date_time']).dt.day==1)]
    test_data = data.loc[(pd.to_datetime(data['date_time']).dt.month==10) & (pd.to_datetime(data['date_time']).dt.day==2)]
    
    train_data = train_data.drop(columns=['date_time'])  
    val_data = val_data.drop(columns=['date_time']) 
    test_data = test_data.drop(columns=['date_time']) 
    
    
    return train_data, val_data, test_data


In [10]:
train_data, val_data, test_data = feature_engineering(data)

Banner columns generated.
Zone columns generated.
OS columns generated.
Country columns generated.


На этот раз посчитаем бейзлайн

In [11]:
train_columns = [column for column in train_data.columns if column!='clicks']
train_X = train_data[train_columns]
train_y=train_data['clicks']
test_X = test_data[train_columns]
test_y = test_data['clicks']

In [12]:
train_data['clicks'].value_counts()

0    11770857
1      278188
Name: clicks, dtype: int64

In [13]:
print(f"Log loss for mean number of clicks: {log_loss(test_y, np.full_like(test_y, train_data['clicks'].mean()))}")
print(f"Log loss for median number of clicks: {log_loss(test_y, np.full_like(test_y, train_data['clicks'].median()))}")
print(f"Log loss for random of clicks: {log_loss(test_y, np.random.rand(len(test_y)))}")

Log loss for mean number of clicks: 1.2226106923947828
Log loss for median number of clicks: 1.2226106923947828
Log loss for random of clicks: 0.9995558180624815


В первой домашке получился лог лосс на тесте 0.1434375425967045

Сделаем датасеты для train, val и test

In [None]:
# we could use anything so copypasted from here: https://github.com/wngaw/blog/blob/master/xlearn_example/src/utils.py
import json
import math


def _convert_to_ffm(path, df, type, target, numerics, categories, features, encoder):
    # Flagging categorical and numerical fields
    print('convert_to_ffm - START')
    for x in numerics:
        if(x not in encoder['catdict']):
            print(f'UPDATING CATDICT: numeric field - {x}')
            encoder['catdict'][x] = 0
    for x in categories:
        if(x not in encoder['catdict']):
            print(f'UPDATING CATDICT: categorical field - {x}')
            encoder['catdict'][x] = 1

    nrows = df.shape[0]
    with open(path + str(type) + "_ffm.txt", "w") as text_file:

        # Looping over rows to convert each row to libffm format
        for n, r in enumerate(range(nrows)):
            datastring = ""
            datarow = df.iloc[r].to_dict()
            datastring += str(int(datarow[target]))  # Set Target Variable here

            # For numerical fields, we are creating a dummy field here
            for i, x in enumerate(encoder['catdict'].keys()):
                if(encoder['catdict'][x] == 0):
                    # Not adding numerical values that are nan
                    if math.isnan(datarow[x]) is not True:
                        datastring = datastring + " "+str(i)+":" + str(i)+":" + str(datarow[x])
                else:

                    # For a new field appearing in a training example
                    if(x not in encoder['catcodes']):
#                         print(f'UPDATING CATCODES: categorical field - {x}')
                        encoder['catcodes'][x] = {}
                        encoder['currentcode'] += 1
#                         print(f'UPDATING CATCODES: categorical value for field {x} - {datarow[x]}')
                        encoder['catcodes'][x][datarow[x]] = encoder['currentcode']  # encoding the feature

                    # For already encoded fields
                    elif(datarow[x] not in encoder['catcodes'][x]):
                        encoder['currentcode'] += 1
#                         print(f'UPDATING CATCODES: categorical value for field {x} - {datarow[x]}')
                        encoder['catcodes'][x][datarow[x]] = encoder['currentcode']  # encoding the feature

                    code = encoder['catcodes'][x][datarow[x]]
                    datastring = datastring + " "+str(i)+":" + str(int(code))+":1"

            datastring += '\n'
            text_file.write(datastring)
    print("File written")

    # print('Encoder Summary:')
    # print(json.dumps(encoder, indent=4))
    return encoder

In [12]:
GOAL = 'clicks'
NUMERICAL_FEATURES = ['campaign_clicks']
CATEGORICAL_FEATURES = [feature for feature in train_columns if feature!='campaign_clicks']
ALL_FEATURES = train_columns
NUM_THREADS = 7

In [14]:
#done in previous entries

# encoder = {"currentcode": len(NUMERICAL_FEATURES),  # Unique index for each numerical field or categorical variables
#            "catdict": {},  # Dictionary that stores numerical and categorical variables
#            "catcodes": {}}  # Dictionary that stores index for each categorical variables per categorical field

# encoder = _convert_to_ffm('data/', train_data, 'train', GOAL,
#                           NUMERICAL_FEATURES,
#                           CATEGORICAL_FEATURES,
#                           ALL_FEATURES,
#                           encoder)

# encoder = _convert_to_ffm('data/', val_data, 'val', GOAL,
#                           NUMERICAL_FEATURES,
#                           CATEGORICAL_FEATURES,
#                           ALL_FEATURES,
#                           encoder)
# encoder = _convert_to_ffm('data/', test_data, 'test', GOAL,
#                           NUMERICAL_FEATURES,
#                           CATEGORICAL_FEATURES,
#                           ALL_FEATURES,
#                           encoder)

convert_to_ffm - START
UPDATING CATDICT: numeric field - campaign_clicks
UPDATING CATDICT: categorical field - zone_id
UPDATING CATDICT: categorical field - banner_id
UPDATING CATDICT: categorical field - os_id
UPDATING CATDICT: categorical field - country_id
UPDATING CATDICT: categorical field - oaid_hash
File written
convert_to_ffm - START
File written
convert_to_ffm - START
File written


In [103]:
# !pip install cmake

In [104]:
# !pip install xlearn

Временами у меня падает канал, так что, так как данные записаны, буду начинать отсюда

In [19]:
import os
import shutil
import xlearn as xl
from sklearn.metrics import log_loss
import pandas as pd
#https://github.com/aksnzhy/xlearn/blob/master/demo/classification/scikit_learn_demo/example_FFM_criteo.py

def cv(dims, lrs, suffix=""):  
    best_model = None
    best_logloss = 10000
    for dim in dims:
        for lr in lrs:
            ffm_model = xl.create_ffm()
            ffm_model.setTrain("data/train_ffm.txt")
            ffm_model.setValidate("data/val_ffm.txt")

            param = {'task': 'binary',
                     'lr': lr,
                     'lambda': 0.002, #default value
                     'metric': 'auc',
                     'k':dim
                     }

            # Start to train
            ffm_model.fit(param, f'trained_models{suffix}/model_lr_{lr}_dim_{dim}.out')

            # Cross Validation
            ffm_model.cv(param)

            # Prediction task
            ffm_model.setTest("data/val_ffm.txt")  # Test data
            ffm_model.setSigmoid()  # Convert output to 0-1

            # Start to predict

            ffm_model.predict(f"trained_models{suffix}/model_lr_{lr}_dim_{dim}.out", f"output{suffix}/predictions_lr_{lr}_dim_{dim}.txt")
            y_pred = pd.read_csv(f"output{suffix}/predictions_lr_{lr}_dim_{dim}.txt", header=None)
            logloss = log_loss(np.array(val_data['clicks']), np.squeeze(np.array(y_pred)))
            print(f"Log loss for model with learning rate {lr} and hidden dim size {dim}: {logloss}")
            if logloss<best_logloss:
                best_logloss=logloss
                best_model = ffm_model
                print("This model becomes new best model.")
            print("----------")
                
    return best_model
    
        

        
    
    

In [14]:
# делать прогон с большим количеством параметров долго
# посмотрим, как что заходит

dims = [2, 4, 8]
lrs = [0.2, 0.1, 0.05]

In [15]:
%%time
best_model = cv(dims, lrs)

Log loss for model with learning rate 0.2 and hidden dim size 2: 0.15690044454616778
This model becomes new best model.
----------
Log loss for model with learning rate 0.1 and hidden dim size 2: 0.158885377442876
----------
Log loss for model with learning rate 0.05 and hidden dim size 2: 0.16067262589790215
----------
Log loss for model with learning rate 0.2 and hidden dim size 4: 0.15673029284902773
This model becomes new best model.
----------
Log loss for model with learning rate 0.1 and hidden dim size 4: 0.15862258229806955
----------
Log loss for model with learning rate 0.05 and hidden dim size 4: 0.1605145064051948
----------
Log loss for model with learning rate 0.2 and hidden dim size 8: 0.15673880696293324
----------
Log loss for model with learning rate 0.1 and hidden dim size 8: 0.15864537915948765
----------
Log loss for model with learning rate 0.05 and hidden dim size 8: 0.1605187551877174
----------
CPU times: user 3h 16min 57s, sys: 37.4 s, total: 3h 17min 35s
Wall

Лучший лосс был для модели с размерностью 4. Добавим embedding dim 6 и бОльшие lr

In [20]:
%%time
dims = [2, 4, 6, 8]
lrs = [1.0, 0.5, 0.2, 0.1]
best_model2 = cv(dims, lrs, suffix="_1")

Log loss for model with learning rate 1.0 and hidden dim size 2: 0.15446655679916366
This model becomes new best model.
----------
Log loss for model with learning rate 0.5 and hidden dim size 2: 0.15493156943236067
----------
Log loss for model with learning rate 0.2 and hidden dim size 2: 0.15692496732608574
----------
Log loss for model with learning rate 0.1 and hidden dim size 2: 0.15878769164945403
----------
Log loss for model with learning rate 1.0 and hidden dim size 4: 0.15485140138083003
----------
Log loss for model with learning rate 0.5 and hidden dim size 4: 0.15502533606601304
----------
Log loss for model with learning rate 0.2 and hidden dim size 4: 0.15708164384069057
----------
Log loss for model with learning rate 0.1 and hidden dim size 4: 0.15868039296181002
----------
Log loss for model with learning rate 1.0 and hidden dim size 6: 0.15492777464182358
----------
Log loss for model with learning rate 0.5 and hidden dim size 6: 0.15547847490532185
----------
Log l

In [21]:
best_model2.setTest("data/test_ffm.txt")  # Test data
best_model2.setSigmoid()  # Convert output to 0-1

# Start to predict

best_model2.predict(f"trained_models_1/model_lr_1.0_dim_2.out", f"output_1/best_model_on_test.txt")
y_pred = pd.read_csv(f"output_1/best_model_on_test.txt", header=None)
logloss = log_loss(np.array(test_data['clicks']), np.squeeze(np.array(y_pred)))

print(f"Log loss for best model on test set is {logloss}")

Log loss for best model on test set is 0.13783525487012696


***Лучший результат на текущий момент:***

Validation:
Log loss for model with learning rate 1.0 and hidden dim size 2: 0.15446655679916366

Test:
Log loss for best model on test set is 0.13783525487012696

В первой домашке получился лог лосс на тесте 0.1434375425967045 . Мы побили его!

Это уже бьет результат для задания 1, но попробуем еще регуляризацию

In [14]:
import os
import shutil
import xlearn as xl
from sklearn.metrics import log_loss
import pandas as pd
#https://github.com/aksnzhy/xlearn/blob/master/demo/classification/scikit_learn_demo/example_FFM_criteo.py

def cv_v2(dims, lrs, lambdas, suffix=""):  
    best_model = None
    best_logloss = 10000
    for dim in dims:
        for lr in lrs:
            for lambdaa in lambdas:
                ffm_model = xl.create_ffm()
                ffm_model.setTrain("data/train_ffm.txt")
                ffm_model.setValidate("data/val_ffm.txt")

                param = {'task': 'binary',
                         'lr': lr,
                         'lambda': lambdaa,
                         'metric': 'auc',
                         'k':dim
                         }

                # Start to train
                ffm_model.fit(param, f'trained_models{suffix}/model_lr_{lr}_dim_{dim}_lambda_{lambdaa}.out')

                # Cross Validation
                ffm_model.cv(param)

                # Prediction task
                ffm_model.setTest("data/val_ffm.txt")  # Test data
                ffm_model.setSigmoid()  # Convert output to 0-1

                # Start to predict

                ffm_model.predict(f"trained_models{suffix}/model_lr_{lr}_dim_{dim}_lambda_{lambdaa}.out", f"output{suffix}/predictions_lr_{lr}_dim_{dim}_lambda_{lambdaa}.txt")
                y_pred = pd.read_csv(f"output{suffix}/predictions_lr_{lr}_dim_{dim}_lambda_{lambdaa}.txt", header=None)
                logloss = log_loss(np.array(val_data['clicks']), np.squeeze(np.array(y_pred)))
                print(f"Log loss for model with learning rate {lr}, hidden dim size {dim}, and lambda {lambdaa}: {logloss}")
                if logloss<best_logloss:
                    best_logloss=logloss
                    best_model = ffm_model
                    print("This model becomes new best model.")
                print("----------")
                
    return best_model
    

In [15]:
dims = [2, 4, 6, 8]
lrs = [1.0, 0.5, 0.2]
lambdas=[0.02, 0.002, 0.0002, 0.00002]
best_model3 = cv_v2(dims, lrs, lambdas, suffix="_2")

Log loss for model with learning rate 1.0, hidden dim size 2, and lambda 0.02: 0.1642317308964012
This model becomes new best model.
----------
Log loss for model with learning rate 1.0, hidden dim size 2, and lambda 0.002: 0.15617884284396336
This model becomes new best model.
----------
Log loss for model with learning rate 1.0, hidden dim size 2, and lambda 0.0002: 0.15190371031951777
This model becomes new best model.
----------
Log loss for model with learning rate 1.0, hidden dim size 2, and lambda 2e-05: 0.1517230807683679
This model becomes new best model.
----------
Log loss for model with learning rate 0.5, hidden dim size 2, and lambda 0.02: 0.16478820008752781
----------
Log loss for model with learning rate 0.5, hidden dim size 2, and lambda 0.002: 0.1554917996232026
----------
Log loss for model with learning rate 0.5, hidden dim size 2, and lambda 0.0002: 0.1519127724658203
----------
Log loss for model with learning rate 0.5, hidden dim size 2, and lambda 2e-05: 0.15197

In [15]:
#поресетилось прямо во время исполнения(неправильно написала имя файла). По счастью, все сохраняется
best_model3 = xl.create_ffm()
best_model3.setTest("data/test_ffm.txt")  # Test data
best_model3.setSigmoid()  # Convert output to 0-1

# Start to predict

best_model3.predict(f"trained_models_2/model_lr_0.5_dim_6_lambda_2e-05.out", f"output_2/best_model_on_test.txt")
y_pred = pd.read_csv(f"output_2/best_model_on_test.txt", header=None)
logloss = log_loss(np.array(test_data['clicks']), np.squeeze(np.array(y_pred)))

print(f"Log loss for best model on test set is {logloss}")

Log loss for best model on test set is 0.13751008401444992


***Лучший результат-2:***

Validation:
Log loss for model with learning rate 0.5, hidden dim size 6, and lambda 2e-05: 0.15114008892927278

Test:
Log loss for best model on test set is 0.13751008401444992

В первой домашке получился лог лосс на тесте 0.1434375425967045 . Мы побили его!

Предыдущий результат(без подбора lambda): 0.13783525487012696

Значение стало лучше, но не сильно. В идеале можно было бы попробовать lambda = 1e-6, но это долго, и непонятно, стоит ли того :) Принцип, в общем-то, понятен :)

Метрику тоже смотреть уже не буду - все и так работало очень долго (прогон последнего Grid Search - около трех часов), и ее не было в задании :) 