In [8]:
import pandas as pd
import numpy as np

import xlearn as xl

In [9]:
# готовим данные для xlearn
# разбиваем на "последний день" и "остальное"
# разбиваем "остальное" на train (90%), val1 (5%) и val2 (5%)

# все разбиения по времени
# то есть train раньше val1 раньше val2, они все раньше последнего для

from pathlib import Path
from tqdm import tqdm


def last_day_eval_split(data):
    last_event = data.date_time.iloc[-1]
    day = last_event.day
    month = last_event.month
    year = last_event.year
    k = ((data.date_time.dt.day == day) & (data.date_time.dt.month == month) & (data.date_time.dt.year == year)).sum()
    
    data = data.iloc[:-k]
    lastday = data.iloc[-k:]
    
    return data, lastday
    
    
def train_val_split(data):
    n = len(data)
    
    # 5% for both validation sets
    k = n // 20
    
    train = data.iloc[:-(k * 2)]
    val1 = data.iloc[-(k * 2):-k]
    val2 = data.iloc[-k:]
    
    return train, val1, val2


def to_libffm(dataframe):
    for _, row in dataframe.iterrows():
        yield f"{row.clicks} 0:{row.banner_id}:1 1:{row.zone_id}:1 2:{row.country_id}:1 3:{row.os_id}:1 4:{row.oaid_trimmed}:1"


def prepare_dataframe(dataframe, path: Path):
    # shuffle the dataframe just in case
    dataframe = dataframe.sample(frac = 1)
    with path.open("w") as f:
        for line in tqdm(to_libffm(dataframe), total=len(dataframe)):
            f.write(f"{line}\n")

        
def preprocess():
    data = pd.read_csv('../data/data.csv')
    data['date_time'] = pd.to_datetime(data['date_time'])
    data.sort_values(by='date_time', inplace=True)
    
    # shift feature numbers so they wouldn't intersect
    data.zone_id = data.zone_id + data.banner_id.max() + 1
    data.country_id = data.country_id + data.zone_id.max() + 1
    data.os_id = data.os_id + data.country_id.max() + 1
    
    def trim_user_id(user_id):
        # we have over 6M different userid hashes
        # so let's take top 0.1% for additional 6k features
        counts = user_id.value_counts()
        lower_bound = np.percentile(counts, 99.9)
        
        userid_map = {}
        for userid, count in counts.iteritems():
            if count > lower_bound:
                userid_map.setdefault(userid, len(userid_map))
        
        return user_id.apply(lambda x: userid_map.get(x, len(userid_map)))
    
    data["oaid_trimmed"] = trim_user_id(data.oaid_hash) + data.os_id.max() + 1
    
    data, lastday = last_day_eval_split(data)
    train, val1, val2 = train_val_split(data)
    
    prepare_dataframe(train, Path("../data/train.txt"))
    prepare_dataframe(val1, Path("../data/val1.txt"))
    prepare_dataframe(val2, Path("../data/val2.txt"))
    prepare_dataframe(lastday, Path("../data/lastday.txt"))

In [10]:
# занимает около 13 мин на моем пк
# preprocess()

In [11]:
from sklearn.metrics import log_loss

def validate(model_path, test_set="../data/val2.txt") -> float:
    ffm_model = xl.create_ffm()
    ffm_model.setSigmoid()
    ffm_model.setTest(test_set)
    ffm_model.predict(model_path, "./output.txt")
    
    y_pred = []
    with open("./output.txt") as f:
        for line in f:
            prob = float(line.strip())
            y_pred.append(prob)
            
    y_true = []
    with open(test_set) as f:
        for line in f:
            label, _ = line.split(maxsplit=1)
            y_true.append(float(label))
    
    return log_loss(y_true, y_pred)

In [12]:
# подбираем параметры при помощи bayesian optimization
# k от 4 до 32
# lr от 0.2 до 0.005
# lambda от 0.00002 до 0.002
# все три параметра в логарифмической шкале

# предлагаем три "стартовые" точки:
# k=4 lr=0.2 lambda=0.00002
# k=8 lr=0.2 lambda=0.00002
# k=8 lr=0.2 lambda=0.0002

# максимум 100 эпох

from bayes_opt import BayesianOptimization

class Trainer:
    def __init__(self):
        self.bayesopt = None
        self.models = []
        self.params = []
        
        self.valset = "../data/val1.txt"
        
    def start(self, n_iter):
        pbounds = {
            'k_': [np.log2(4), np.log2(32)],
            'lr_': [np.log10(0.005), np.log10(0.02)],
            'l2_coeff_': [np.log10(0.00002), np.log10(0.002)]
        }
        self.bayesopt = BayesianOptimization(f=self.blackbox, pbounds=pbounds)
        self.bayesopt.probe(
            params={"k_": np.log2(8), "lr_": np.log10(0.2), "l2_coeff_": np.log10(0.00002)},
            lazy=True,
        )
        self.bayesopt.probe(
            params={"k_": np.log2(8), "lr_": np.log10(0.2), "l2_coeff_": np.log10(0.0002)},
            lazy=True,
        )
        self.bayesopt.probe(
            params={"k_": np.log2(4), "lr_": np.log10(0.2), "l2_coeff_": np.log10(0.00002)},
            lazy=True,
        )
        
        self.bayesopt.maximize(
            init_points=5,
            n_iter=n_iter,
        )                        
                                             
    def blackbox(self, k_, lr_, l2_coeff_):
        return self.train(int(2 ** k_), 10 ** lr_, 10 ** l2_coeff_)
    
    def train(self, k, lr, l2_coeff) -> float:
        ffm_model = xl.create_ffm()
        ffm_model.setTrain("../data/train.txt")
        ffm_model.setValidate("../data/val1.txt")

        param = {"task": "binary", "k": k, "lr": lr, "epoch": 100, "lambda": l2_coeff}
        model_path = f"./model_k={k}_lr={lr}_lambda={l2_coeff}.out"
        
        if not Path(model_path).exists():
            ffm_model.fit(param, model_path)

        self.models.append(model_path)
        self.params.append(param)
        
        score = validate(model_path, self.valset)
        
        return -score

In [13]:
trainer = Trainer()

In [14]:
trainer.start(n_iter=24)

|   iter    |  target   |    k_     | l2_coeff_ |    lr_    |
-------------------------------------------------------------
[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 8 threads for prediction task.
[32m[1m[ ACTION     ] Load model ...[0m
[32m[------------] [0mLoad model from ./model_k=8_lr=0.20000000000000004_lambda=2e-05.out
[32m[------------] [0mLoss function: cross-entropy
[32m[------------] [0mScore function: ffm
[32m[------------] [0mNumber of Feature: 11606
[32m[------------] [0mNumber of K: 8
[32m[------------] [0mNumber of field: 5
[32m[---------

KeyboardInterrupt: 

In [15]:
# не дождался всех 24 шагов

# выбираем лучшую модель по val2
models = trainer.models
params = trainer.params

best_model = None
best_score = 10e6
best_param = None
for model_path, param in zip(models, params):
    score = validate(model_path, "../data/val2.txt")
    print(f"{param} has logloss={score} on val2 set")
    if score < best_score:
        best_score = score
        best_model = model_path
        best_param = param

print(f"we take {best_param} with logloss={best_score:.4f} as the best model")

# применяем лучшую модель на тестовых данных (последний день)
test_score = validate(best_model, test_set="../data/lastday.txt")

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 8 threads for prediction task.
[32m[1m[ ACTION     ] Load model ...[0m
[32m[------------] [0mLoad model from ./model_k=8_lr=0.20000000000000004_lambda=2e-05.out
[32m[------------] [0mLoss function: cross-entropy
[32m[------------] [0mScore function: ffm
[32m[------------] [0mNumber of Feature: 11606
[32m[------------] [0mNumber of K: 8
[32m[------------] [0mNumber of field: 5
[32m[------------] [0mTime cost for loading model: 0.00 (sec)
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst 

In [16]:
print(f"the best model has logloss={test_score:.4f} on last day data")

the best model has logloss=0.1447 on last day data


In [None]:
# у модели из первого дз логлосс на последнем дне 0.1378

# слишком разный логлосс на трейне и тесте - переобучаемся