In [2]:
!pip install lightfm
!pip install pandas pyarrow

Defaulting to user installation because normal site-packages is not writeable
Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m571.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25ldone
[?25h  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=464742 sha256=c67e0a63023758af6f2813f1b212b1a315396ac6c9fd66c4146ea5ae9da354b8
  Stored in directory: /home/ap7641/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17
Defaulting to user installation because normal site-packages is not writeable


In [1]:
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k
from scipy.sparse import coo_matrix
import pandas as pd
from time import perf_counter

In [2]:
# Read the train/test interaction tables from parquet files in the working directory.
df_train = pd.read_parquet(path='train_als.parquet')
df_test = pd.read_parquet(path='test_als.parquet')

In [3]:
# Preview the training interactions; the recorded output lists the columns
# user_id, avg_rating and recording_id_index.
df_train

Unnamed: 0,user_id,avg_rating,recording_id_index
0,243,0.000019,91438.0
1,243,0.000037,85165.0
2,243,0.000019,47472.0
3,243,0.000019,134027.0
4,243,0.000019,219370.0
...,...,...,...
17347430,21819,0.000020,8305.0
17347431,21819,0.000059,8218.0
17347432,21819,0.000138,15830.0
17347433,21819,0.000020,302175.0


In [4]:
# Order the test rows by user, and within each user from the highest to the
# lowest avg_rating.
df_test = df_test.sort_values(
    by=["user_id", "avg_rating"],
    ascending=[True, False],
)

In [5]:
# Preview the test interactions after sorting by user_id (ascending) and
# avg_rating (descending).
df_test

Unnamed: 0,user_id,avg_rating,recording_id_index
788939,1,0.020073,311045.0
789040,1,0.020073,257557.0
788884,1,0.018248,218594.0
788894,1,0.018248,234874.0
789031,1,0.018248,158486.0
...,...,...,...
3137327,22705,0.007246,12951.0
3137328,22705,0.007246,79181.0
3137329,22705,0.007246,46607.0
3137330,22705,0.007246,32.0


In [6]:
# Both id columns are used as sparse-matrix row/column indices further down,
# so cast them to integers in the train and the test frame alike.
for frame in (df_train, df_test):
    for column in ('user_id', 'recording_id_index'):
        frame[column] = frame[column].astype(int)

In [7]:
# Build the user x item interaction matrices:
# rows are user_id, columns are recording_id_index, values are avg_rating.
interactions_train = coo_matrix((df_train['avg_rating'],
                                 (df_train['user_id'], df_train['recording_id_index'])))

# Pin the test matrix to the training matrix's shape. Without an explicit
# `shape`, coo_matrix sizes the matrix from the largest indices found in the
# test data, which can differ from the dimensions the model is trained on and
# makes the evaluation fail on a shape mismatch.
# If the test data holds an index outside the training range, coo_matrix raises
# a ValueError here; such rows have to be dropped from df_test first.
interactions_test = coo_matrix((df_test['avg_rating'],
                                (df_test['user_id'], df_test['recording_id_index'])),
                               shape=interactions_train.shape)

In [8]:
# Fetch LightFM's MovieLens example dataset and show the shape of its 'train'
# entry, presumably as a reference point for our own interaction matrix.
# NOTE(review): `data` is not fed to the model in the cells shown here — confirm
# this cell is still needed.
data = fetch_movielens(min_rating=5.0)
data['train'].shape

(943, 1682)

In [9]:
# Dimensions of the training interaction matrix as (rows, columns); rows are
# indexed by user_id and columns by recording_id_index.
interactions_train.shape

(22706, 373151)

In [10]:
# Shape of the MovieLens 'train' matrix again (same expression as in the cell
# that fetched `data`).
data['train'].shape

(943, 1682)

In [24]:
# Instantiate and train the model.
# WARP loss with 300 latent components. `random_state` fixes the seed so that
# re-running the notebook starts from the same random initialisation.
# NOTE(review): with num_threads > 1 the fitted weights may still vary slightly
# between runs — confirm if exact reproducibility is required.
model = LightFM(no_components=300, loss='warp', random_state=42)

# Time only the fit call; perf_counter gives wall-clock seconds.
start_time = perf_counter()
model.fit(interactions_train, epochs=10, num_threads=8)
time_take = perf_counter() - start_time

In [23]:
# Seconds elapsed during model.fit, measured with perf_counter in the training cell.
time_take

157.76215547975153

In [25]:
# Evaluate the trained model: precision@100 on the test interactions, averaged
# over the scores that precision_at_k returns.
# precision_at_k defaults to a single thread; use the same thread count as
# model.fit so the ranking runs in parallel (the scores themselves are unchanged).
test_precision = precision_at_k(model, interactions_test, k=100, num_threads=8).mean()

In [26]:
# Report the mean precision@100 computed in the evaluation cell.
print(test_precision)

0.17397659


In [17]:
# Collect the distinct user and item ids of each split, then report whether the
# test split contains any id that the training split does not.
train_user_ids = set(df_train['user_id'].unique())
train_item_ids = set(df_train['recording_id_index'].unique())
test_user_ids = set(df_test['user_id'].unique())
test_item_ids = set(df_test['recording_id_index'].unique())

# A split is fully covered exactly when its ids form a subset of the training ids.
if not test_user_ids.issubset(train_user_ids):
    print("There are users in the test set not in the training set")
if not test_item_ids.issubset(train_item_ids):
    print("There are items in the test set not in the training set")


In [15]:
# Find the user and item ids that occur in the test set but not in the training set
unseen_user_ids = test_user_ids - train_user_ids
unseen_item_ids = test_item_ids - train_item_ids

# Remove rows in the test set with unseen users or items (single pass, one mask)
has_unseen_id = (df_test['user_id'].isin(unseen_user_ids)
                 | df_test['recording_id_index'].isin(unseen_item_ids))
df_test = df_test[~has_unseen_id]

# Recreate the interactions_test matrix with the training matrix's shape.
# Without an explicit `shape`, coo_matrix would size it from the largest indices
# left in df_test, which can differ from the dimensions the model was trained on.
# Every remaining test id also occurs in the training data, so all indices fit.
interactions_test = coo_matrix((df_test['avg_rating'],
                                (df_test['user_id'], df_test['recording_id_index'])),
                               shape=interactions_train.shape)


In [19]:
# Largest user and item indices present in the training data; adding one to
# each gives a matrix dimension large enough to hold every training index.
max_user_id_train = df_train['user_id'].max()
max_recording_id_train = df_train['recording_id_index'].max()

# Rebuild the test interaction matrix, pinning its shape to those dimensions
# instead of letting coo_matrix infer it from the test data.
test_rows = df_test['user_id']
test_cols = df_test['recording_id_index']
train_shape = (max_user_id_train + 1, max_recording_id_train + 1)
interactions_test = coo_matrix((df_test['avg_rating'], (test_rows, test_cols)),
                               shape=train_shape)
