In [25]:
import pathlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from implicit import als, lmf
from scipy.sparse import csr_matrix

In [26]:
%%time
chunk_paths = [path for path in pathlib.Path("../data/raw/steam-reviews-dataset/").glob("*.csv")]
dfs = []
for i in chunk_paths:
    dfs.append(pd.read_csv(i))
df = pd.concat(dfs)
# df = pd.read_csv("../data/raw/steam-reviews-dataset/reviews-1-115.csv")
df.head()

CPU times: user 59 s, sys: 9.75 s, total: 1min 8s
Wall time: 1min 12s


Unnamed: 0,steamid,appid,voted_up,votes_up,votes_funny,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review,unix_timestamp_created,unix_timestamp_updated
0,76561198024044792,342550,False,4,0,0.571289,39,39,574,22,"While this game seems promising, it's really j...",1601155855,1601155855
1,76561198225145856,342550,False,0,0,0.0,103,103,52,12,"Fuck this game. No updates, no nothing. Just T...",1592746747,1592746747
2,76561198066087993,342550,False,1,0,0.47619,191,191,149,2,Nothing about this game should have legally be...,1590110583,1590110583
3,76561198118355028,342550,True,1,0,0.524829,1679,1679,98,33,The game looks interesting whit that Lovercraf...,1582193846,1582193846
4,76561198038091775,342550,False,0,0,0.0,107,107,604,61,Got this game when it was a paid for. I enjoye...,1579007301,1579007301


In [27]:
print(df.shape)

(15437471, 13)


In [28]:
df["unix_timestamp_created"] = pd.to_datetime(df["unix_timestamp_created"], unit="s")
df["unix_timestamp_updated"] = pd.to_datetime(df["unix_timestamp_updated"], unit="s")
# df["unix_timestamp_created"] = 
df["voted_up"] = df["voted_up"].astype(int)

In [31]:
print(df["unix_timestamp_created"].min())
print(df["unix_timestamp_created"].max())

2010-10-15 22:27:49
2021-04-23 04:35:33


Let's take train from min timestamp to July of 2020. Test from July 2020 to December 2020. Validation from January 2021 to max timestamp.

0         False
1         False
2          True
3          True
4          True
          ...  
623037     True
623038     True
623039     True
623040     True
623041     True
Name: unix_timestamp_created, Length: 15437471, dtype: bool

In [37]:
train_df = df[df["unix_timestamp_created"] < "2020-06-01"]
test_df = df[(df["unix_timestamp_created"] >= "2020-06-01") & (df["unix_timestamp_created"] < "2021-01-01")]
validation_df = df[df["unix_timestamp_created"] >= "2021-01-01"]

In [38]:
print(train_df.shape)
print(test_df.shape)
print(validation_df.shape)

(12231699, 13)
(2080330, 13)
(1125442, 13)


Remove cold start games and users from test and validation sets

In [39]:
test_df = test_df[test_df.steamid.isin(train_df.steamid) & test_df.appid.isin(train_df.appid)]
validation_df = validation_df[validation_df.steamid.isin(train_df.steamid) & validation_df.appid.isin(train_df.appid)]

In [42]:
print(train_df.shape)
print(test_df.shape)
print(validation_df.shape)

(12231699, 13)
(1035993, 13)
(443294, 13)


Form dataset

In [43]:
columns = ["steamid", "appid", "voted_up", "playtime_at_review", "unix_timestamp_created"]

In [21]:
dataset = df[["steamid", "appid", "voted_up", "playtime_at_review", "unix_timestamp_created"]]
steamid_to_index = {steam_id: index for index, steam_id in enumerate(dataset["steamid"].values)}
app_id_to_index = {app_id: index for index, app_id in enumerate(dataset["appid"].values)}
dataset.loc[:, "user_id"] = dataset["steamid"].map(steamid_to_index)
dataset.loc[:, "item_id"] = dataset["appid"].map(app_id_to_index)
# dataset.to_csv("../data/interim/steam-games-review-interim.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.loc[:, "user_id"] = dataset["steamid"].map(steamid_to_index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.loc[:, "item_id"] = dataset["appid"].map(app_id_to_index)


Add train-test-split on datetime

## Explicit feedback

In [10]:
%%time
# positive_reviews = dataset.query("voted_up == 1")
user_item_matrix = csr_matrix(
    (dataset["voted_up"].to_numpy(), (dataset["user_id"].to_numpy(), dataset["item_id"].to_numpy())),
    # shape=(dataset["steamid"].nunique(), dataset["appid"].nunique()),
)

CPU times: user 27.6 ms, sys: 16.9 ms, total: 44.5 ms
Wall time: 43.3 ms


In [12]:
%%time
# Initialize and train the ALS model
model = als.AlternatingLeastSquares(factors=50, regularization=0.1, random_state=21122024) #factors is the number of latent factors
model.fit(user_item_matrix)

  0%|          | 0/15 [00:00<?, ?it/s]

CPU times: user 45 s, sys: 952 ms, total: 46 s
Wall time: 47.2 s


In [60]:
model.save("../models/als-all-reviews-f-50-r-0.1-21-12-2024.npz")

Calculate mAp@10, calculate diversity and novelty. Cold start on games or users? 

In [None]:
# Recommend items for a user
user_id = 0
recommendations = model.recommend(user_id, user_item_matrix[user_id])
print(recommendations)

# Get similar items
item_id = 0
similar = model.similar_items(item_id)
print(similar)

## Implicit feedback

In [22]:
user_item_matrix_implicit = csr_matrix(
    (dataset["playtime_at_review"].to_numpy(), (dataset["user_id"].to_numpy(), dataset["item_id"].to_numpy())),
    # shape=(dataset["steamid"].nunique(), dataset["appid"].nunique()),
)

In [24]:
model_implicit = lmf.LogisticMatrixFactorization(factors=30, regularization=0.1, random_state=21122024)
model_implicit.fit(user_item_matrix_implicit)

  0%|          | 0/30 [00:00<?, ?it/s]