In [1]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
import numpy as np
import pickle
import scipy.sparse as sp

In [2]:
# Path of the pickled training data; it is read with pickle.load and then
# filtered on its "labels" column.
# NOTE(review): the file name suggests sampled negatives are included — confirm.
train_file = "../datasets/AOW-private/TRAIN_AOW_100negative"

In [48]:
# Output path for the single fitted model pickled together with its id mappings.
model_file = "models/lightfm_model_cleaned_AOW_data_warp_loss.pkl"
# Output path for the pickled list of models fitted over the lr / epoch grid.
models_file = "models/lightfm_models_with_grid_cleaned_AOW_data_warp_loss.pkl"

In [4]:
# Load the pickled training data. The context manager closes the file handle
# as soon as loading finishes instead of leaving it to garbage collection.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open(train_file, "rb") as train_fh:
    df_train_neg = pickle.load(train_fh)

In [5]:
# Keep only the rows whose "labels" value equals 1.
df_train = df_train_neg.loc[df_train_neg["labels"].eq(1)]

In [6]:
# NOTE(review): this BPR-loss model is never fitted or evaluated. `model` is
# rebound to a WARP-loss LightFM instance further down before its first use,
# so this assignment looks like a leftover.
model = LightFM(learning_rate=0.05, loss='bpr')

In [7]:
# Preview the first five rows of the label-1 subset.
df_train.head(5)

Unnamed: 0,id_user,id_item,labels
0,1,241,1
1,1,1066,1
2,1,954,1
3,1,161,1
4,1,1503,1


In [8]:
# Number of distinct raw user ids present in df_train.
users_count = len(df_train["id_user"].unique())

In [9]:
# Show the number of distinct users.
users_count

163566

In [10]:
# Number of distinct raw item ids present in df_train.
items_count = len(df_train["id_item"].unique())

In [11]:
# Show the number of distinct items.
items_count

1828

In [12]:
# Largest raw user id. Comparing it with users_count shows whether the raw ids
# are contiguous; if they are not, they must be remapped to dense indices.
df_train["id_user"].max()

361571

In [13]:
# Range of raw item ids: smallest first, then largest.
print(df_train["id_item"].min())
print(df_train["id_item"].max())

1
1841


In [14]:
# Map each raw user id to a dense 0-based row index, in order of first appearance.
user_id_mappings = {
    raw_id: dense_idx
    for raw_id, dense_idx in zip(df_train["id_user"].unique(), range(users_count))
}

In [28]:
# NOTE(review): displaying the whole mapping emits one entry per user;
# consider showing only a small slice to keep the notebook output readable.
user_id_mappings

{1: 0,
 2: 1,
 3: 2,
 5: 3,
 6: 4,
 7: 5,
 8: 6,
 10: 7,
 11: 8,
 12: 9,
 13: 10,
 14: 11,
 16: 12,
 17: 13,
 18: 14,
 20: 15,
 21: 16,
 22: 17,
 23: 18,
 24: 19,
 25: 20,
 26: 21,
 27: 22,
 28: 23,
 29: 24,
 30: 25,
 31: 26,
 32: 27,
 33: 28,
 34: 29,
 35: 30,
 36: 31,
 37: 32,
 38: 33,
 39: 34,
 40: 35,
 41: 36,
 42: 37,
 44: 38,
 45: 39,
 46: 40,
 47: 41,
 48: 42,
 49: 43,
 262194: 44,
 52: 45,
 53: 46,
 54: 47,
 56: 48,
 57: 49,
 59: 50,
 60: 51,
 61: 52,
 62: 53,
 64: 54,
 65: 55,
 66: 56,
 67: 57,
 68: 58,
 69: 59,
 262214: 60,
 71: 61,
 72: 62,
 73: 63,
 75: 64,
 77: 65,
 78: 66,
 79: 67,
 80: 68,
 81: 69,
 83: 70,
 84: 71,
 262230: 72,
 87: 73,
 88: 74,
 89: 75,
 90: 76,
 91: 77,
 94: 78,
 262239: 79,
 96: 80,
 97: 81,
 98: 82,
 99: 83,
 102: 84,
 103: 85,
 104: 86,
 105: 87,
 106: 88,
 107: 89,
 108: 90,
 262162: 91,
 110: 92,
 111: 93,
 112: 94,
 262257: 95,
 115: 96,
 116: 97,
 117: 98,
 118: 99,
 120: 100,
 262265: 101,
 122: 102,
 123: 103,
 124: 104,
 125: 105,
 126: 106,

In [15]:
# Map each raw item id to a dense 0-based column index, in order of first appearance.
item_id_mappings = {
    raw_id: dense_idx
    for raw_id, dense_idx in zip(df_train["id_item"].unique(), range(items_count))
}

In [16]:
def build_interaction_matrix(rows, cols, data, user_map=None, item_map=None):
    """Build a sparse user x item matrix from (user id, item id, label) rows.

    Parameters
    ----------
    rows, cols : int
        Shape of the resulting matrix (number of users, number of items).
    data : pandas.DataFrame
        Frame whose ``.values`` yields exactly three entries per row, in the
        order raw user id, raw item id, label.
    user_map, item_map : dict, optional
        Raw id -> row / column index. When omitted, the notebook-level
        ``user_id_mappings`` / ``item_id_mappings`` are used, so existing
        three-argument calls behave as before.

    Returns
    -------
    scipy.sparse.coo_matrix
        int32 matrix of shape ``(rows, cols)`` holding the labels.

    Raises
    ------
    KeyError
        If a user or item id in ``data`` is missing from its mapping.
    """
    # Fall back to the notebook globals only when no explicit mapping is given,
    # so the function can also be used (and tested) without hidden state.
    if user_map is None:
        user_map = user_id_mappings
    if item_map is None:
        item_map = item_id_mappings

    # Fill a LIL matrix cell by cell, then convert once to COO for the caller.
    mat = sp.lil_matrix((rows, cols), dtype=np.int32)

    for uid, iid, label in data.values:
        # A repeated (user, item) pair overwrites the cell: the last label wins.
        mat[user_map[uid], item_map[iid]] = label

    return mat.tocoo()


In [17]:
# Sparse users_count x items_count matrix built from the df_train rows.
train_matrix = build_interaction_matrix(
    rows=users_count,
    cols=items_count,
    data=df_train,
)

In [37]:
# Fit the WARP-loss model on the full training matrix.
# random_state seeds LightFM's internal RNG so repeated runs of this cell
# start from the same initialisation and sampling sequence.
model = LightFM(learning_rate=0.05, loss='warp', random_state=42)
model.fit(train_matrix, epochs=20)

<lightfm.lightfm.LightFM at 0x7f897681c518>

In [38]:
# Mean precision@10, evaluated on the same matrix the model was fitted on.
# This measures fit to the training data, not held-out performance.
train_precision = precision_at_k(model, train_matrix, k=10).mean()

In [39]:
# Show the mean training precision@10.
train_precision

0.49585974

In [40]:
# Mean precision@5 on the training matrix.
# NOTE(review): this overwrites the precision@10 value stored under the same name.
train_precision = precision_at_k(model, train_matrix, k=5).mean()

In [41]:
# Show the mean training precision@5.
train_precision

0.5538364

In [42]:
# Mean AUC score, again computed on the training matrix itself.
train_auc = auc_score(model, train_matrix).mean()

In [43]:
# Show the mean training AUC.
train_auc

0.9363851

In [44]:
# Bundle the fitted model with the raw-id -> index mappings required to
# translate ids when the model is used after unpickling.
model_dict = {
    'user_id_mappings': user_id_mappings,
    # Correctly spelled key, consistent with 'user_id_mappings'.
    'item_id_mappings': item_id_mappings,
    # Legacy misspelled key, kept so existing readers of the pickle keep working.
    'item_id_mapings': item_id_mappings,
    'model': model
}

In [45]:
# Persist the model bundle. The context manager flushes and closes the file
# even if pickle.dump raises, so no handle is leaked and no partial buffer
# is left unwritten.
with open(model_file, "wb") as model_fh:
    pickle.dump(model_dict, model_fh)

In [46]:
# Hyper-parameter grid: every learning rate is combined with every epoch count.
lrs = [ 0.01, 0.05, 0.1 ]
epochs = [ 10, 20 ]


In [47]:
# Fit one WARP-loss model per (learning rate, epoch count) pair, with the
# learning rate as the outer loop. Each entry records the hyper-parameters next
# to the fitted model. A fixed random_state gives every configuration the same
# RNG seed, so differences between entries come from the hyper-parameters
# rather than from random initialisation.
models = [
    {
        'lr': lr,
        'epoch': epoch,
        # LightFM.fit returns the fitted instance, so it can be stored directly.
        'model': LightFM(learning_rate=lr, loss='warp', random_state=42).fit(
            train_matrix, epochs=epoch
        ),
    }
    for lr in lrs
    for epoch in epochs
]

In [49]:
# Persist the grid-search results. The context manager flushes and closes the
# file even if pickle.dump raises.
with open(models_file, "wb") as models_fh:
    pickle.dump(models, models_fh)