In [1]:
import itertools

import lightgbm as lgb
import numpy as np
import pandas as pd
import psutil
from lightfm import LightFM
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.metrics import average_precision_score, roc_auc_score



In [2]:
# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# reused as the LightGBM 'seed' parameter and the LightFM random_state below.
seed = 42
np.random.seed(seed)

# Dataset
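* p: per-user base probability, drawn uniformly from [0, 1]
* q: per-item base probability, drawn uniformly from [0, 1]
* x: user feature, drawn uniformly from [0, 1]
* y: item feature, drawn uniformly from [0, 1]
* target probability = p * q * (1.0 - |x - y|); each (user, item) target is a Bernoulli draw with this probability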

In [3]:
n_user = 2000
n_item = 10000

# Per-user / per-item latent probabilities and features, all uniform on [0, 1].
# The order of these draws is kept so the seeded random stream is unchanged.
ps = np.random.uniform(0.0, 1.0, n_user)
qs = np.random.uniform(0.0, 1.0, n_item)
xs = np.random.uniform(0.0, 1.0, n_user)
ys = np.random.uniform(0.0, 1.0, n_item)

# rs[i, j] = p_i * q_j * (1 - |x_i - y_j|) is the probability that
# target[i, j] is 1; targets are independent Bernoulli draws.
rs = np.outer(ps, qs) * (1.0 - np.abs(np.subtract.outer(xs, ys)))
target = np.random.binomial(1, rs.flatten()).reshape(n_user, n_item)

# Long-format frame with one row per (user, item) pair in user-major order.
# Built from index arrays instead of n_user * n_item Python dicts, which
# yields the same rows without materialising millions of dict objects.
user_idx = np.repeat(np.arange(n_user), n_item)
item_idx = np.tile(np.arange(n_item), n_user)
df = pd.DataFrame({
    'user': user_idx,
    'item': item_idx,
    'x': xs[user_idx],
    'y': ys[item_idx],
    'p': ps[user_idx],
    'q': qs[item_idx],
    'target': target.ravel(),
})
del user_idx, item_idx
features = ['x', 'y', 'p', 'q']

# used by lightfm: one row per user / item, columns are (probability, feature)
user_features = np.column_stack([ps, xs])
item_features = np.column_stack([qs, ys])

In [4]:
def evaluate(df):
    """Compute pooled and per-user ranking metrics for scored interactions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user', 'target' (binary label) and
        'pred' (model score).

    Returns
    -------
    dict
        'auc' / 'ap'   : ROC AUC and average precision over all rows pooled.
        'mauc' / 'map' : the same metrics computed per user and averaged.
        Users whose targets do not take exactly two distinct values are
        skipped; both means are NaN when no user qualifies.
    """
    auc = roc_auc_score(df['target'], df['pred'])
    ap = average_precision_score(df['target'], df['pred'])

    aucs = []
    aps = []
    # A single groupby pass replaces one full-frame boolean scan per user.
    # sort=False keeps users in first-appearance order, like unique().
    for _, user_rows in df.groupby('user', sort=False):
        if len(user_rows['target'].unique()) != 2:
            continue
        aucs.append(roc_auc_score(user_rows['target'], user_rows['pred']))
        aps.append(average_precision_score(user_rows['target'], user_rows['pred']))

    return {
        'auc': auc,
        # Explicit NaN avoids the "mean of empty slice" RuntimeWarning.
        'mauc': np.mean(aucs) if aucs else np.nan,
        'ap': ap,
        'map': np.mean(aps) if aps else np.nan,
    }

In [5]:
test_ratio = 0.3
train_sampling_ratio = 0.3

# Split by user id: ids above n_user * test_ratio are training users, the rest
# are held out entirely. Training rows are subsampled, then re-sorted by user
# so that each user's rows are contiguous again.
df_train = (
    df.query("user > @n_user * @test_ratio")
    .sample(frac=train_sampling_ratio)
    .sort_values(by='user')
    .reset_index(drop=True)
)
df_test = (
    df.query("user <= @n_user * @test_ratio")
    .reset_index(drop=True)
)

# The full cross-product frame is no longer needed once both splits exist.
del df

In [6]:
def run_length_encoding(sequence):
    """Run-length encode a 1-D sequence.

    Parameters
    ----------
    sequence : array-like
        1-D sequence of values; lists and tuples are converted to an array.

    Returns
    -------
    (values, lengths) : tuple of numpy.ndarray
        values[k] is the value of the k-th run of equal consecutive elements
        and lengths[k] is the number of elements in that run. Both arrays
        are empty when the input is empty.
    """
    # Plain lists would compare as a single bool below, so coerce first.
    sequence = np.asarray(sequence)
    if sequence.size == 0:
        # No runs; indexing position 0 below would raise IndexError.
        return sequence, np.array([], dtype=np.intp)
    # Run boundaries: position 0, every position where the value changes,
    # and one past the end (so that the last run has a length).
    boundaries, = np.concatenate(
        ([True], sequence[1:] != sequence[:-1], [True])
    ).nonzero()
    return sequence[boundaries[:-1]], np.ediff1d(boundaries)

def get_query_group(df):
    """Return the sizes of consecutive same-user row runs in df.

    Each run of adjacent rows sharing one 'user' value produces one entry, so
    the result matches one-group-per-user only when every user's rows are
    contiguous in df.
    """
    run_lengths = run_length_encoding(df['user'].values)[1]
    return list(run_lengths)
    
# Query-group sizes (rows per user) consumed by the LightGBM lambdarank cell
group_train, group_test = (
    get_query_group(frame) for frame in (df_train, df_test)
)

In [7]:
# Feature matrix and binary label for each split
X_train, y_train = df_train[features], df_train['target']
X_test, y_test = df_test[features], df_test['target']

# LightGBM (binary loss)

In [8]:
# Pointwise baseline: every (user, item) row is an independent binary example.
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'learning_rate': 0.001,
    'metric': 'auc',
    'seed': seed,
    'verbose': 0,
}
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

# Both the training and the held-out set are monitored; training stops after
# 10 rounds without improvement, up to at most 1000 boosting rounds.
model = lgb.train(
    params=lgb_params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Score the held-out rows with the best iteration found by early stopping.
df_test['pred'] = model.predict(X_test, num_iteration=model.best_iteration)
result_lgbm_binary = evaluate(df_test)
print(result_lgbm_binary)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[143]	training's auc: 0.789812	valid_1's auc: 0.79284
{'auc': 0.7928398537661724, 'mauc': 0.6968214668279816, 'ap': 0.43387864922580366, 'map': 0.28523645868577435}


# LightGBM (lambdarank loss)

In [9]:
# Listwise variant: rows are grouped per user and ranked within each group.
lgb_params = {
    'objective': 'lambdarank',
    'boosting_type': 'gbdt',
    'learning_rate': 0.001,
    'metric': 'map',
    'seed': seed,
    'verbose': 0,
}
lgb_train = lgb.Dataset(data=X_train, label=y_train, group=group_train)
lgb_eval = lgb.Dataset(
    data=X_test, label=y_test, group=group_test, reference=lgb_train,
)

# Both the training and the held-out set are monitored; training stops after
# 10 rounds without improvement, up to at most 1000 boosting rounds.
model = lgb.train(
    params=lgb_params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Score the held-out rows with the best iteration found by early stopping.
df_test['pred'] = model.predict(X_test, num_iteration=model.best_iteration)
result_lgbm_lambdarank = evaluate(df_test)
print(result_lgbm_lambdarank)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[6]	training's map@1: 0.362402	training's map@2: 0.320586	training's map@3: 0.297435	training's map@4: 0.280796	training's map@5: 0.27127	valid_1's map@1: 0.371048	valid_1's map@2: 0.313644	valid_1's map@3: 0.290996	valid_1's map@4: 0.276484	valid_1's map@5: 0.266578
{'auc': 0.6603071639958251, 'mauc': 0.718404820304217, 'ap': 0.2919098600681591, 'map': 0.3003598531645943}


# LightFM

In [10]:
# Interaction matrix over the full (user, item) id space; only the sampled
# training pairs are written, every other cell stays empty.
mat_train = lil_matrix((n_user, n_item))
mat_train[df_train['user'], df_train['item']] = df_train['target']

# Convert the dense feature arrays to CSR once and reuse them for both
# fit() and predict() instead of rebuilding them for every call.
user_features_csr = csr_matrix(user_features)
item_features_csr = csr_matrix(item_features)

# psutil.cpu_count(logical=False) returns None when the number of physical
# cores cannot be determined; fall back to a single thread in that case.
n_threads = psutil.cpu_count(logical=False) or 1

model = LightFM(
    no_components=8,
    learning_schedule='adadelta',
    loss='warp',
    # NOTE(review): LightFM documents learning_rate as the initial rate of
    # the adagrad schedule; presumably it has no effect with adadelta - confirm.
    learning_rate=0.001,
    random_state=seed,
)
model.fit(
    mat_train,
    user_features=user_features_csr,
    item_features=item_features_csr,
    epochs=100,
    num_threads=n_threads,
)

# Score every held-out (user, item) pair using the same feature matrices.
df_test['pred'] = model.predict(
    user_ids=df_test['user'].values,
    item_ids=df_test['item'].values,
    user_features=user_features_csr,
    item_features=item_features_csr,
)
result_lightfm = evaluate(df_test)
print(result_lightfm)

{'auc': 0.33015735393666834, 'mauc': 0.731042048722827, 'ap': 0.11518122981098311, 'map': 0.31616077049112423}


In [11]:
# Tag each metrics dict with its model name, then show them side by side.
for label, result in (
    ('lgbm_binary', result_lgbm_binary),
    ('lgbm_lambdarank', result_lgbm_lambdarank),
    ('lightfm', result_lightfm),
):
    result['name'] = label

df_result = pd.DataFrame(
    [result_lgbm_binary, result_lgbm_lambdarank, result_lightfm]
)
df_result[['name', 'auc', 'mauc', 'ap', 'map']]

Unnamed: 0,name,auc,mauc,ap,map
0,lgbm_binary,0.79284,0.696821,0.433879,0.285236
1,lgbm_lambdarank,0.660307,0.718405,0.29191,0.30036
2,lightfm,0.330157,0.731042,0.115181,0.316161
