In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%config Completer.use_jedi = False

In [3]:
import warnings
from optuna.exceptions import ExperimentalWarning
# Silence both warning categories for the rest of the notebook session.
for ignored_category in (UserWarning, ExperimentalWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

In [4]:
# Shared notebook settings.
# K: number of items held out per test user by the splitter, length of the
#    recommendation list requested from the model, and the metric cutoff.
K = 10
# SEED: random seed passed to the splitter and to the LightFM model.
SEED = 1234

# The notebook contains an example of LightFM model usage and dataset preprocessing with RePlay, including:
1. Data loading
2. Features preprocessing with pyspark
3. Building LightFM model based on interaction matrix and features
4. Model evaluation

# 1) Data loading

We will use the MovieLens 10M dataset from the rs_datasets package, which provides a collection of recommendation datasets.

In [5]:
from rs_datasets import MovieLens

# Load the "10m" variant of MovieLens through rs_datasets.
data = MovieLens("10m")
# Print a short preview of every table held by the dataset object
# (ratings, items and tags in the recorded output below).
data.info()

ratings


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392



items


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance



tags


Unnamed: 0,user_id,item_id,tag,timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867





### Convert interaction log to RePlay format

In [6]:
from replay.data_preparator import DataPreparator

# Column mapping handed to DataPreparator. The values are columns of the ratings
# table; the keys are presumably the names RePlay expects (TODO confirm).
log_columns = dict(
    user_id="user_id",
    item_id="item_id",
    relevance="rating",
    timestamp="timestamp",
)

# Convert the ratings table into the interaction log used by the rest of the notebook.
log = DataPreparator().transform(data=data.ratings, columns_names=log_columns)

### Data split

In [7]:
from replay.splitters import UserSplitter

# Splitter shared by the train/test and train/validation splits below.
# With these sizes the recorded test set has 5000 rows, i.e. presumably
# 500 users x K items per user (TODO confirm against the UserSplitter docs).
user_random_splitter = UserSplitter(
    # size of the hold-out
    user_test_size=500,
    item_test_size=K,
    # randomised but reproducible selection
    shuffle=True,
    seed=SEED,
    # cold-start handling
    drop_cold_users=True,
    drop_cold_items=True,
)

In [8]:
# Split the full interaction log into train and test parts with the splitter built above.
train, test = user_random_splitter.split(log)
# Row counts of both parts, displayed as the cell output.
train.count(), test.count()

(9995054, 5000)

In [9]:
# Split the training part once more with the same splitter to carve out a validation set.
# NOTE(review): train_opt / val_opt are not referenced again in the visible part of the
# notebook; presumably meant for hyperparameter optimisation. Confirm or remove.
train_opt, val_opt = user_random_splitter.split(train)
# Row counts of both parts, displayed as the cell output.
train_opt.count(), val_opt.count()

(9990054, 5000)

# 2) Features preprocessing with pyspark

### Convert features to RePlay format

In [10]:
%%time
# Convert the items table with DataPreparator; only the id column is mapped,
# and the recorded output below shows `genres` and `title` carried along as-is.
item_features = DataPreparator().transform(
    data=data.items, columns_names={"item_id": "item_id"}
)

CPU times: user 37.6 ms, sys: 4.1 ms, total: 41.7 ms
Wall time: 215 ms


In [11]:
# Peek at the first two rows of the converted items table.
item_features.show(2)

+-------+--------------------+----------------+
|item_id|              genres|           title|
+-------+--------------------+----------------+
|      1|Adventure|Animati...|Toy Story (1995)|
|      2|Adventure|Childre...|  Jumanji (1995)|
+-------+--------------------+----------------+
only showing top 2 rows



#### Year

In [12]:
from pyspark.sql import functions as sf
from pyspark.sql.types import IntegerType

In [13]:
# Titles end with the release year in parentheses (e.g. "Toy Story (1995)"),
# so the four characters starting five from the end of the title are the year digits.
year = item_features.select(
    'item_id',
    sf.substring(sf.col('title'), -5, 4).astype(IntegerType()).alias('year'),
)
year.show(2)

+-------+----+
|item_id|year|
+-------+----+
|      1|1995|
|      2|1995|
+-------+----+
only showing top 2 rows



#### Genres

In [14]:
from replay.session_handler import State
from pyspark.sql.functions import split

# Build a Spark DataFrame with one row per item and its genres as an array column.
# `data.items["genres"]` holds pipe-separated genre names (e.g. "Comedy|Romance").
# The separator is written as a raw string: `split` interprets it as a regular
# expression, so the pipe has to be escaped, and a non-raw "\|" is an invalid
# Python escape sequence (DeprecationWarning, SyntaxWarning on newer interpreters).
genres = (
    State().session.createDataFrame(data.items[["item_id", "genres"]])
    .select(
        "item_id",
        split("genres", r"\|").alias("genres")
    )
)

In [15]:
# Display the item -> genre-array table (the recorded output shows the top 20 rows).
genres.show()

+-------+--------------------+
|item_id|              genres|
+-------+--------------------+
|      1|[Adventure, Anima...|
|      2|[Adventure, Child...|
|      3|   [Comedy, Romance]|
|      4|[Comedy, Drama, R...|
|      5|            [Comedy]|
|      6|[Action, Crime, T...|
|      7|   [Comedy, Romance]|
|      8|[Adventure, Child...|
|      9|            [Action]|
|     10|[Action, Adventur...|
|     11|[Comedy, Drama, R...|
|     12|    [Comedy, Horror]|
|     13|[Animation, Child...|
|     14|             [Drama]|
|     15|[Action, Adventur...|
|     16|      [Crime, Drama]|
|     17|[Comedy, Drama, R...|
|     18|[Comedy, Drama, T...|
|     19|            [Comedy]|
|     20|[Action, Comedy, ...|
+-------+--------------------+
only showing top 20 rows



In [16]:
from pyspark.sql.functions import explode

# Collect the distinct genre names into a plain Python list:
# one row per (item, genre), de-duplicated, without the placeholder used for
# items that have no genre, then pulled to the driver through pandas.
# NOTE(review): the list order is whatever order Spark returns after `distinct()`.
genres_list = (
    genres
    .select(explode("genres").alias("genre"))
    .distinct()
    .filter('genre <> "(no genres listed)"')
    .toPandas()["genre"]
    .tolist()
)

In [17]:
# Display the collected genre names.
genres_list

['Documentary',
 'IMAX',
 'Adventure',
 'Animation',
 'Comedy',
 'Thriller',
 'Sci-Fi',
 'Musical',
 'Horror',
 'Action',
 'Fantasy',
 'War',
 'Mystery',
 'Drama',
 'Film-Noir',
 'Crime',
 'Western',
 'Romance',
 'Children']

In [18]:
from pyspark.sql.functions import col, lit, array_contains
from pyspark.sql.types import IntegerType

# One-hot encode the genres: one 0/1 integer column per name in `genres_list`,
# in list order, next to `item_id`. The array column `genres` itself is not kept.
# All indicator columns are produced by a single `select` rather than by calling
# `withColumn` in a loop, which adds one projection to the query plan per call.
# Note: `item_features` is rebound here. From this cell on it holds the genre
# indicator table, not the title/genres table produced by DataPreparator earlier.
item_features = genres.select(
    "item_id",
    *[
        array_contains(col("genres"), genre).astype(IntegerType()).alias(genre)
        for genre in genres_list
    ],
).cache()
# Trigger an action on the cached frame and show the number of items.
item_features.count()

10681

In [19]:
# Peek at the first two rows of the genre indicator table.
item_features.show(2)

+-------+-----------+----+---------+---------+------+--------+------+-------+------+------+-------+---+-------+-----+---------+-----+-------+-------+--------+
|item_id|Documentary|IMAX|Adventure|Animation|Comedy|Thriller|Sci-Fi|Musical|Horror|Action|Fantasy|War|Mystery|Drama|Film-Noir|Crime|Western|Romance|Children|
+-------+-----------+----+---------+---------+------+--------+------+-------+------+------+-------+---+-------+-----+---------+-----+-------+-------+--------+
|      1|          0|   0|        1|        1|     1|       0|     0|      0|     0|     0|      1|  0|      0|    0|        0|    0|      0|      0|       1|
|      2|          0|   0|        1|        0|     0|       0|     0|      0|     0|     0|      1|  0|      0|    0|        0|    0|      0|      0|       1|
+-------+-----------+----+---------+---------+------+--------+------+-------+------+------+-------+---+-------+-----+---------+-----+-------+-------+--------+
only showing top 2 rows



In [20]:
# Attach the release year to the genre indicators; the inner join keeps only
# items present in both frames.
# NOTE(review): this cell rebinds `item_features` from itself, so running it a
# second time joins `year` onto a frame that already has a `year` column.
item_features = item_features.join(year, how='inner', on='item_id').cache()
# Trigger an action on the cached frame and show the number of items.
item_features.count()

10681

# 3) Building LightFM model based on interaction matrix and features

In [21]:
from replay.models import LightFMWrap

# LightFM wrapper configured with the WARP loss, 128 components and the
# notebook-wide random seed.
model_feat = LightFMWrap(
    no_components=128,
    loss='warp',
    random_state=SEED,
)

In [22]:
%%time
# Fit the model on the training interactions together with the item feature table.
# The recorded run below took about 19 minutes of wall time.
model_feat.fit(train, item_features=item_features)

CPU times: user 13h 43min 26s, sys: 59.3 s, total: 13h 44min 25s
Wall time: 18min 55s


In [23]:
%%time
# Users to score: every distinct user that appears in the test split.
test_users = test.select('user_id').distinct()

# Top-K recommendations for those users; `train` is passed as the log and
# filter_seen_items=True is set so already-seen items are filtered out.
recs = model_feat.predict(
    log=train,
    users=test_users,
    k=K,
    item_features=item_features,
    filter_seen_items=True,
)

CPU times: user 8.44 s, sys: 2.99 s, total: 11.4 s
Wall time: 1min


# 4) Model evaluation

In [24]:
from replay.metrics import HitRate, NDCG, MAP, Coverage
from replay.experiment import Experiment

# Metric -> cutoff(s). HitRate is reported at both 1 and K; Coverage is
# constructed from the training log.
metric_cutoffs = {
    NDCG(): K,
    MAP(): K,
    HitRate(): [1, K],
    Coverage(train): K,
}

# Experiment that scores recommendation frames against the test split.
metrics = Experiment(test, metric_cutoffs)
 

In [25]:
# Score the LightFM recommendations and register them under a readable row label.
metrics.add_result("LightFM_item_features", recs)
# Display the accumulated results table.
metrics.results

Unnamed: 0,Coverage@10,HitRate@1,HitRate@10,MAP@10,NDCG@10
LightFM_item_features,0.07193,0.348,0.796,0.113104,0.221282
