In [1]:
import pathlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import faiss
import polars as pl
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from sklearn.metrics import ndcg_score, average_precision_score
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV



In [8]:
%%time
chunk_paths = [path for path in pathlib.Path("../data/raw/steam-reviews-dataset/").glob("*.csv")]
dfs = []
for i in chunk_paths:
    dfs.append(pd.read_csv(i))
df = pd.concat(dfs)
df.head()

CPU times: user 52.6 s, sys: 10.8 s, total: 1min 3s
Wall time: 1min 9s


Unnamed: 0,steamid,appid,voted_up,votes_up,votes_funny,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review,unix_timestamp_created,unix_timestamp_updated
0,76561198024044792,342550,False,4,0,0.571289,39,39,574,22,"While this game seems promising, it's really j...",1601155855,1601155855
1,76561198225145856,342550,False,0,0,0.0,103,103,52,12,"Fuck this game. No updates, no nothing. Just T...",1592746747,1592746747
2,76561198066087993,342550,False,1,0,0.47619,191,191,149,2,Nothing about this game should have legally be...,1590110583,1590110583
3,76561198118355028,342550,True,1,0,0.524829,1679,1679,98,33,The game looks interesting whit that Lovercraf...,1582193846,1582193846
4,76561198038091775,342550,False,0,0,0.0,107,107,604,61,Got this game when it was a paid for. I enjoye...,1579007301,1579007301


In [3]:
print(df.shape)

(15437471, 13)


In [9]:
df["unix_timestamp_created"] = pd.to_datetime(df["unix_timestamp_created"], unit="s")
df["unix_timestamp_updated"] = pd.to_datetime(df["unix_timestamp_updated"], unit="s")
# df["unix_timestamp_created"] = 
df["voted_up"] = df["voted_up"].astype(int)

Let's take train from min timestamp to July of 2020. Test from July 2020 to December 2020. Validation from January 2021 to max timestamp.

In [10]:
columns = ["steamid", "appid", "voted_up", "playtime_at_review", "unix_timestamp_created"]
dataset_df = df[columns]

In [11]:
train_df = dataset_df[dataset_df["unix_timestamp_created"] < "2020-06-01"]
test_df = dataset_df[(dataset_df["unix_timestamp_created"] >= "2020-06-01") & (dataset_df["unix_timestamp_created"] < "2021-01-01")]
validation_df = dataset_df[dataset_df["unix_timestamp_created"] >= "2021-01-01"]

In [12]:
print(train_df.shape)
print(test_df.shape)
print(validation_df.shape)

(12231699, 5)
(2080330, 5)
(1125442, 5)


### train/test/val split

Remove cold start games and users from test and validation sets

In [13]:
test_df = test_df[test_df.steamid.isin(train_df.steamid) & test_df.appid.isin(train_df.appid)]
validation_df = validation_df[validation_df.steamid.isin(train_df.steamid) & validation_df.appid.isin(train_df.appid)]

In [14]:
print(train_df.shape)
print(test_df.shape)
print(validation_df.shape)

(12231699, 5)
(1035993, 5)
(443294, 5)


To form csr matrix we need to do indices for steam_id and app_id from [0...n] and [0...m] respectively

In [15]:
steamid_to_index = {steam_id: index for index, steam_id in enumerate(train_df["steamid"].unique())}
app_id_to_index = {app_id: index for index, app_id in enumerate(train_df["appid"].unique())}

Map indices


In [16]:
for dataframe in (train_df, test_df, validation_df):
    dataframe.loc[:, "user_id"] = dataframe["steamid"].map(steamid_to_index)
    dataframe.loc[:, "item_id"] = dataframe["appid"].map(app_id_to_index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.loc[:, "user_id"] = dataframe["steamid"].map(steamid_to_index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.loc[:, "item_id"] = dataframe["appid"].map(app_id_to_index)


Let's form train matrix with implicit feedback with pure collaborative filtration without any user or item features

In [17]:
train_df.head()

Unnamed: 0,steamid,appid,voted_up,playtime_at_review,unix_timestamp_created,user_id,item_id
2,76561198066087993,342550,0,191,2020-05-22 01:23:03,0,0
3,76561198118355028,342550,1,1679,2020-02-20 10:17:26,1,0
4,76561198038091775,342550,0,107,2020-01-14 13:08:21,2,0
5,76561198432925868,342550,1,143,2019-10-05 10:42:53,3,0
6,76561198102945378,342550,0,56,2019-07-30 00:57:56,4,0


## Pure collaborative filtering with implicit feedback

In [18]:
%%time
# positive_reviews = dataset.query("voted_up == 1")
train_user_item_matrix = coo_matrix(
    (
        train_df["playtime_at_review"].to_numpy(),
        (
          train_df["user_id"].to_numpy(),
          train_df["item_id"].to_numpy(),
        )
    ),
)

CPU times: user 30.9 ms, sys: 68.3 ms, total: 99.1 ms
Wall time: 145 ms


In [19]:
N_EMBEDDING_DIMENSIONS = 64

In [20]:
%%time
model = LightFM(loss="warp", random_state=20242812, no_components=N_EMBEDDING_DIMENSIONS)
model.fit(train_user_item_matrix, epochs=30, num_threads=6)

CPU times: user 8min 42s, sys: 2min 13s, total: 10min 55s
Wall time: 12min 40s


<lightfm.lightfm.LightFM at 0x6302b3220>

In [21]:
test_user_item_matrix = coo_matrix(
    (test_df["playtime_at_review"].to_numpy(),
     (test_df["user_id"].to_numpy(),
      test_df["item_id"].to_numpy())),
)

Let's evaluate precision@k, mrr@k, mAp@k and ndcg@k

In [22]:
%%time
item_biases, item_embeddings = model.get_item_representations()
n_voronoi_cells = 8
quantizer = faiss.IndexFlatIP(N_EMBEDDING_DIMENSIONS)  # the other index
item_vector_index = faiss.IndexIVFFlat(quantizer, N_EMBEDDING_DIMENSIONS, n_voronoi_cells)
# item_vector_index = faiss.index_factory(N_EMBEDDING_DIMENSIONS, "IVF65536_HNSW32,Flat")
item_vector_index.nprobe = 1
item_vector_index.train(item_embeddings)
item_vector_index.add(item_embeddings)

CPU times: user 5.86 ms, sys: 36.3 ms, total: 42.2 ms
Wall time: 148 ms


actually we need to consider item biases and user biases and add (1, b_u) and (b_i, 1) to each user and item vector respectively

In [23]:
user_biases, user_train_embeddings = model.get_user_representations()

Predicting train 

In [24]:
%%time
train_user_to_recommended_items = []
batch_size = 512
K = 5
for i in range(0, user_train_embeddings.shape[0], batch_size):
    distances, labels = item_vector_index.search(user_train_embeddings[i:i+batch_size], k=K)
    for j in range(0, labels.shape[0]):
        train_user_to_recommended_items.extend(
            [(i+j, label, rank) for rank, label in enumerate(labels[j])]
        )

CPU times: user 1min 5s, sys: 30.1 s, total: 1min 35s
Wall time: 26.3 s


In [25]:
train_df_predicted = pd.DataFrame(data=np.array(train_user_to_recommended_items), columns=["user_id", "item_id", "rank"])
train_df_predicted.head()

Unnamed: 0,user_id,item_id,rank
0,0,4393,0
1,0,2133,1
2,0,3819,2
3,0,31,3
4,0,2181,4


In [26]:
## metrics functions
def mrr_at_k_score(
    df_true, df_pred,
    predicted_col="rank",
    *, user_col, item_col, k=10,
):
    return df_true.set_index([user_col, item_col]).join(
        df_pred.set_index([user_col, item_col]),
        how="left",
        lsuffix="_true",
        rsuffix="_predicted"
    ).sort_values(by=[user_col, predicted_col]).assign(
        reciprocal_rank=lambda x: 1 / x[predicted_col],
    ).groupby(level=user_col).max().fillna(0).mean()


def map_at_k_score(df_true, df_pred, predicted_col="rank", *, user_col="user_id", item_col="item_id", k=10):
    a = df_true.set_index([user_col, item_col]).join(
        df_pred.set_index([user_col, item_col]),
        how="left",
        lsuffix="_true",
        rsuffix="_predicted"
    ).sort_values(by=[user_col, predicted_col]).assign(
        cumulative_rank=lambda x: x.groupby(level=user_col).cumcount() + 1
    ).assign(
        cumulative_rank=lambda x: x["cumulative_rank"] / x["rank"]
    ).assign(
        user_item_count=lambda x: x.groupby(level=user_col)["rank"].transform(np.size)
    )
    users_count = a.index.get_level_values(user_col).nunique()
    return (a["cumulative_rank"] / a["users_item_count"]).sum() / users_count


In [29]:
%%time
print(f"""Train MRR@5: {mrr_at_k_score(
    train_df, train_df_predicted,
    predicted_col='rank', user_col='user_id',
    item_col='item_id', k=5,
)}""")

KeyboardInterrupt: 

In [None]:
%%time
print(f"""Train mAp@5: {map_at_k_score(
    train_df, train_df_predicted,
    predicted_col="rank", user_col="user_id",
    item_col="item_id", k=5,
)}""")

In [None]:
%%time
train_metrics_df = (
    train_df.set_index(["user_id", "item_id"])
    .join(
        train_df_predicted.set_index(["user_id", "item_id"]),
        how="left",
        lsuffix="_true",
        rsuffix="_predicted",
    )
    # [["user_id_gt]
)
train_metrics_df.head()

In [64]:
%%time
test_user_to_recommended_items = []
batch_size = 512
K = 5
for i in range(0, user_train_embeddings.shape[0], batch_size):
    distances, labels = item_vector_index.search(user_train_embeddings[i:i+batch_size], k=K)
    for j in range(0, labels.shape[0]):
        train_user_to_recommended_items.extend(
            [(i+j, label, rank) for rank, label in enumerate(labels[j])]
        )

KeyboardInterrupt: 

In [None]:
train_predicted_df = pd.DataFrame(
    {"user_id": np.array(train_user_to_recommended_items)[:, 0],
     "item_id": np.array(train_user_to_recommended_items)[:, 1],
     "rank": np.array(train_user_to_recommended_items)[:, 2],}
)
train_eval_df = train_df[["user_id", "item_id"]].join(train_predicted_df, on="user_id", how="left")

In [None]:
%%time
print("Train precision: %.2f" % precision_at_k(model, train_user_item_matrix, k=5, num_threads=6).mean())
print("Test precision: %.2f" % precision_at_k(model, test_user_item_matrix, k=5, num_threads=6).mean())

## Hybrid lightfm model

!? predict forever playtime and train model on it?

In [None]:
def load_steam_games_complete_dataset(csv_path="../data/raw/steam-games-complete-dataset/steam_games.csv"):
    def convert_price_to_float(x):
        if isinstance(x, float):
            return x
        elif x.startswith("$"):
            return float(x.lstrip("$"))
        elif x == "Free":
            return 0
        else:
            return None

    # int(x.lstrip("$") if x.startswith("$") else (if )

    df_games_complete = (
        pd.read_csv(csv_path)
        .query("types == 'app' | types == 'bundle'")
        .rename(columns={"name": "title", "achievements": "n_game_achievements"})
        .assign(refined_price=lambda df: df["original_price"].apply(convert_price_to_float))
    )
    df_games_complete["app_id"] = (
        df_games_complete.url
        .str.removeprefix("https://store.steampowered.com/")
        .str.split("/")
        .apply(lambda x: x[1]).astype(int)
    )
    df_games_complete["app_id"] = df_games_complete["app_id"].astype(int)
    return df_games_complete

In [None]:
df_games_complete = (
        pd.read_csv(csv_path)
        .query("types == 'app' | types == 'bundle'")
        .rename(columns={"name": "title", "achievements": "n_game_achievements"})
        .assign(refined_price=lambda df: df["original_price"].apply(convert_price_to_float))
    )
    df_games_complete["app_id"] = (
        df_games_complete.url
        .str.removeprefix("https://store.steampowered.com/")
        .str.split("/")
        .apply(lambda x: x[1]).astype(int)
    )

In [None]:
developers = []
genres = []

In [None]:
columns = ["steamid", "appid", "voted_up", "playtime_at_review", "unix_timestamp_created"]
user_feature_columns = [
    "num_reviews_upon_review", "total_reviews_items_playtime_upon_review", "mean_review_score_upon_review",
    "sum_votes_up_upon_review", "mean_votes_up_upon_review", "num_games_owned_upon_review",
]

game_feature_columns = [
    *[f"is_developer_{x}" for dev in developers], # one hot
    *[f"is_genre_{x}" for genre in genres], # one hot
    *[f"desc_multilingual_embed_component_{x}" for component in range(0, k_dim)], # dense embedding
    *[f"is_publisher_{x}" for publisher in publishers], # one hot
    *[f"language_{x}_support" for lang in languages], # one hot
    # "n_days_from_release", "n_months_from_release", "n_years_from_release",
    "n_game_achievements",
    "win_support", "mac_support", "linux_support", "steam_deck_support",
)