<a href="https://colab.research.google.com/github/reemchaaban/ArcherPipeline/blob/main/game_rec_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GAME RECOMMENDATION SYSTEM

## PREPROCESSING

In [None]:
import os
import json
import sys
import warnings
import pandas as pd
!pip install numpy==1.26.4
import numpy as np
from google.colab import drive, files

warnings.filterwarnings("ignore")

from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from scipy.sparse import coo_matrix, csr_matrix

!pip install lightfm optuna
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k

import optuna

!pip install gensim
from gensim.models import Word2Vec




In [None]:
# Mount Google Drive so the training data stored there is reachable from Colab.
drive.mount('/content/drive')

# Location of the project folder and of the training JSON inside it.
drive_base_path = '/content/drive/My Drive/503Nproj/game-rec'
file_path = os.path.join(drive_base_path, 'synthetic_training_data.json')

# Read the whole file and parse it into `data` (iterated later as a mapping
# of player -> details).
with open(file_path, 'r') as json_handle:
    data = json.loads(json_handle.read())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Parallel accumulators for the interaction matrix: entry k of each list
# describes the same (player, game, hours) observation.
player_ids, game_ids, hours_played = [], [], []

# Mapping filled later with one feature record per game id.
game_features = dict()

In [None]:
# data processing
# Walk every player's game library once, recording the interaction triple and,
# the first time a game is seen, its feature record.
for player_key, profile in data.items():
    for gid, record in profile["games"].items():
        # one (player, game, hours) observation for the interaction matrix
        player_ids.append(player_key)
        game_ids.append(gid)
        hours_played.append(record["hours"])

        # game features are captured only on the first occurrence of the game
        if gid in game_features:
            continue
        game_features[gid] = {
            "rating_ratio": record["rating_ratio"],
            "price": float(record["price"]),
            "genre": record["genre"],
            "tags": record["tags"],
        }

In [None]:
# creation of interaction matrix
# Index players and games in first-seen order instead of via set(): set
# iteration order is arbitrary, whereas the game feature table is built from
# `game_features`, which keeps insertion (first-seen) order. Column j of the
# interaction matrix has to refer to the same game as row j of the feature
# table; otherwise the model is trained with mismatched item features and
# predicted item indices are decoded to the wrong game ids.
player_idx = {player: i for i, player in enumerate(dict.fromkeys(player_ids))}
game_idx = {game: i for i, game in enumerate(dict.fromkeys(game_ids))}
assert list(game_idx) == list(game_features), "game index order must match game feature order"

rows = [player_idx[p] for p in player_ids]
cols = [game_idx[g] for g in game_ids]

# Pass `hours_played` directly rather than rebinding `data`, so the parsed JSON
# stays available and the processing cell above can be re-run.
interaction_matrix = coo_matrix(
    (hours_played, (rows, cols)),
    shape=(len(player_idx), len(game_idx)),
).tocsr()

In [None]:
# Build the game feature table: one row per game id (taken from the keys of
# `game_features`), one column per field of the per-game record.
game_df = pd.DataFrame.from_dict(
    game_features,
    orient='index',
)

In [None]:
# Word2Vec embedding

# Each game's tag list is treated as one "sentence"; anything that is not a
# list (e.g. a missing value) contributes an empty sentence.
tag_sentences = list(game_df['tags'].apply(lambda x: x if isinstance(x, list) else []))
word2vec = Word2Vec(sentences=tag_sentences, vector_size=50, window=5, min_count=1, workers=4)

def get_embedding(tags):
    """Return the mean Word2Vec vector of a game's tags.

    Args:
        tags: List of tag strings. Any non-list value is treated as "no tags",
            mirroring how `tag_sentences` is built, instead of raising when the
            value is not iterable.

    Returns:
        1-D array of length `word2vec.vector_size`: the element-wise mean of
        the vectors of all tags known to the model, or a zero vector when no
        tag is known.
    """
    if not isinstance(tags, list):
        tags = []
    vectors = [word2vec.wv[tag] for tag in tags if tag in word2vec.wv]
    # Size the fallback from the model so it cannot drift from vector_size.
    return np.mean(vectors, axis=0) if vectors else np.zeros(word2vec.vector_size)

game_df['tag_embedding'] = game_df['tags'].apply(get_embedding)


In [None]:
# Sanity check on the embedding width configured for the Word2Vec model.
embedding_width = word2vec.vector_size
print("Word2vec vector size = ", embedding_width)  # Should be >3

# Eyeball the feature table, including the new tag_embedding column.
print(game_df)  # Check what features look like


Word2vec vector size =  50
         rating_ratio   price  \
1203220         2.363     0.0   
433850          1.258     0.0   
250900         36.967   899.0   
1063730         2.188  4199.0   
582010          7.299  2999.0   
...               ...     ...   
255710         13.356   299.0   
289070          6.586   599.0   
438100          3.089     0.0   
108600         16.160  1099.0   
8930           22.865  2999.0   

                                                     genre  \
1203220         [Action, Adventure, Massively Multiplayer]   
433850   [Action, Adventure, Free To Play, Massively Mu...   
250900                                            [Action]   
1063730    [Action, Adventure, Massively Multiplayer, RPG]   
582010                                            [Action]   
...                                                    ...   
255710                              [Simulation, Strategy]   
289070                                          [Strategy]   
438100   [Adventur

In [None]:
# drop original lists
# Rebind instead of dropping in place and tolerate already-removed columns, so
# re-running this cell does not raise KeyError on the second execution.
game_df = game_df.drop(columns=['genre', 'tags'], errors='ignore')

# price & rating ratio normalization
# MinMaxScaler rescales each column independently to the [0, 1] range.
scaled_columns = ['rating_ratio', 'price']
scaler = MinMaxScaler()
game_df[scaled_columns] = scaler.fit_transform(game_df[scaled_columns])

print("Interaction matrix shape:", interaction_matrix.shape)
print("Game features shape:", game_df.shape)

Interaction matrix shape: (50000, 99)
Game features shape: (99, 3)


In [None]:
# Turn the column of per-game embedding vectors into a 2-D array with one row
# per game and one column per embedding dimension.
embedding_rows = game_df['tag_embedding'].tolist()
game_embeddings = np.vstack(embedding_rows)

# The vectors now live in `game_embeddings`, so the object column is removed.
game_df = game_df.drop(columns=['tag_embedding'])

# Concatenate the remaining scalar features with the embedding columns.
scalar_features = game_df.to_numpy()
game_feature_matrix = np.concatenate([scalar_features, game_embeddings], axis=1)

print("Game Features Shape:", game_feature_matrix.shape)  # Should be (99, 52) if 50-dim Word2Vec + 2 other features

Game Features Shape: (99, 52)


In [None]:
def objective(trial):
    """Optuna objective: fit a LightFM model and score it on the test split.

    Args:
        trial: Optuna trial used to sample the loss, number of components,
            item/user regularisation strengths and learning rate.

    Returns:
        Mean precision@5 over the users of the notebook-level `test` matrix.

    Reads the notebook-level `train`, `test` and `game_feature_matrix`.
    """
    loss = trial.suggest_categorical("loss", ["warp", "bpr", "logistic"])
    components = trial.suggest_int("components", 10, 100)
    # suggest_loguniform is deprecated in Optuna; suggest_float(..., log=True)
    # samples the same log-uniform range under the same parameter name.
    item_alpha = trial.suggest_float("item_alpha", 1e-6, 1e-2, log=True)
    user_alpha = trial.suggest_float("user_alpha", 1e-6, 1e-2, log=True)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True)

    # Convert once and reuse for both fitting and evaluation.
    item_features = csr_matrix(game_feature_matrix)

    model = LightFM(
        loss=loss,
        no_components=components,
        item_alpha=item_alpha,
        user_alpha=user_alpha,
        learning_rate=learning_rate,
    )
    model.fit(train, epochs=10, num_threads=4, item_features=item_features)

    test_precision = precision_at_k(model, test, k=5, item_features=item_features).mean()
    return test_precision

In [None]:
# train-test split (80-20)
# Seed the split and the Optuna sampler so a fresh "Restart & Run All" draws
# the same train/test partition and the same sequence of sampled parameters.
# NOTE: the LightFM fits inside `objective` are not seeded here, so individual
# trial scores may still vary slightly between runs.
SEARCH_SEED = 42
train, test = random_train_test_split(
    interaction_matrix,
    test_percentage=0.2,
    random_state=np.random.RandomState(SEARCH_SEED),
)

# optuna for hyperparameter tuning
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED),
)
study.optimize(objective, n_trials=20)

best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

[I 2025-04-04 12:40:09,227] A new study created in memory with name: no-name-3dab8d1d-df2d-47f5-8dc6-ca1d83e37c4a
[I 2025-04-04 12:40:46,602] Trial 0 finished with value: 0.021655529737472534 and parameters: {'loss': 'warp', 'components': 13, 'item_alpha': 9.646951934108638e-06, 'user_alpha': 0.0007107310782189796, 'learning_rate': 0.0758958974145939}. Best is trial 0 with value: 0.021655529737472534.
[I 2025-04-04 12:41:35,878] Trial 1 finished with value: 0.01624983176589012 and parameters: {'loss': 'logistic', 'components': 38, 'item_alpha': 0.00028408446715785183, 'user_alpha': 1.5256429804318574e-05, 'learning_rate': 0.002532793505306361}. Best is trial 0 with value: 0.021655529737472534.
[I 2025-04-04 12:42:30,509] Trial 2 finished with value: 0.020335469394922256 and parameters: {'loss': 'warp', 'components': 21, 'item_alpha': 1.096128998780821e-06, 'user_alpha': 0.00018306552747785157, 'learning_rate': 0.07440319660146542}. Best is trial 0 with value: 0.021655529737472534.
[I 2

Best hyperparameters: {'loss': 'warp', 'components': 53, 'item_alpha': 3.405269389037201e-05, 'user_alpha': 0.007227539049266429, 'learning_rate': 0.00015445194789740532}


In [None]:
# Marker cell: prints a fixed message once the hyperparameter search is done.
done_message = "nice!"
print(done_message)

nice!


In [None]:
# training with best hyperparameters

# LightFM is given the item features as a sparse matrix; the name is rebound
# so the evaluation cells below reuse the same sparse object.
game_feature_matrix = csr_matrix(game_feature_matrix)

# Instantiate the model from the values selected by the Optuna study.
final_model = LightFM(
    loss=best_params['loss'],
    no_components=best_params['components'],
    item_alpha=best_params['item_alpha'],
    user_alpha=best_params['user_alpha'],
    learning_rate=best_params['learning_rate'],
)

# Left as the last expression so the notebook shows the fitted model's repr.
final_model.fit(
    train,
    epochs=10,
    num_threads=4,
    item_features=game_feature_matrix,
)

<lightfm.lightfm.LightFM at 0x7c726fb85e90>

In [None]:
# model evaluation
# Mean precision@5 across users, computed separately on each split with the
# same item features used for training.
precision_kwargs = dict(k=5, item_features=game_feature_matrix)
train_precision = precision_at_k(final_model, train, **precision_kwargs).mean()
test_precision = precision_at_k(final_model, test, **precision_kwargs).mean()

print(f"Train precision at k=5: {train_precision:.4f}")
print(f"Test precision at k=5: {test_precision:.4f}")

Train precision at k=5: 0.0592
Test precision at k=5: 0.0207


In [None]:
# recommend games
def recommend_games(player_id, model, interaction_matrix, game_df, top_n=5, item_features=None):
    """Return the ids of the `top_n` highest-scoring games for one player.

    Args:
        player_id: Row index of the player in `interaction_matrix` (the model's
            internal user index, not the raw player key from the dataset).
        model: Fitted model exposing
            `predict(user_ids, item_ids, item_features=...)`.
        interaction_matrix: Player x game matrix; only its number of columns
            is used, to enumerate every item index.
        game_df: DataFrame whose index holds the game ids. Its row order is
            assumed to match the column order of `interaction_matrix`.
        top_n: Maximum number of games to return.
        item_features: Item feature matrix forwarded to `model.predict`. When
            omitted, the notebook-level `game_feature_matrix` is used, which
            keeps existing four-argument calls working.

    Returns:
        List of game ids ordered from highest to lowest predicted score.
    """
    if item_features is None:
        # Backward-compatible default: fall back to the notebook global.
        item_features = game_feature_matrix

    all_items = np.arange(interaction_matrix.shape[1])
    scores = model.predict(player_id, all_items, item_features=item_features)
    top_games = np.argsort(-scores)[:top_n]  # retrieve top N game indices

    # Index the labels in one vectorised lookup instead of rebuilding
    # list(game_df.index) for every recommended item.
    recommended_games = game_df.index[top_games].tolist()

    return recommended_games

# Example query. The value is a row index of the interaction matrix (player 0),
# not a raw player key from the dataset.
player_to_recommend = 0  # example: player 0
recommended_games = recommend_games(
    player_to_recommend,
    final_model,
    interaction_matrix,
    game_df,
)
print("Recommended game IDs:", recommended_games)

Recommended game IDs: ['1938090', '1063730', '2358720', '899770', '553850']


In [None]:
# Report the dimensions of the interaction matrix and of the feature table.
interaction_shape = interaction_matrix.shape
game_table_shape = game_df.shape
print("Interaction matrix Shape:", interaction_shape)
print("Game features Shape:", game_table_shape)

Interaction matrix Shape: (50000, 99)
Game features Shape: (99, 2)


In [None]:
from lightfm.evaluation import recall_at_k

# Mean recall@5 across users for each split, using the same item features the
# model was trained with.
recall_kwargs = dict(k=5, item_features=game_feature_matrix)
train_recall = recall_at_k(final_model, train, **recall_kwargs).mean()
test_recall = recall_at_k(final_model, test, **recall_kwargs).mean()

print(f"Train Recall at k=5: {train_recall:.4f}")
print(f"Test Recall at k=5: {test_recall:.4f}")


Train Recall at k=5: 0.0573
Test Recall at k=5: 0.0594


In [None]:
from lightfm.evaluation import reciprocal_rank

# Mean reciprocal rank across users for each split, using the same item
# features the model was trained with.
mrr_kwargs = dict(item_features=game_feature_matrix)
train_mrr = reciprocal_rank(final_model, train, **mrr_kwargs).mean()
test_mrr = reciprocal_rank(final_model, test, **mrr_kwargs).mean()

print(f"Train MRR: {train_mrr:.4f}")
print(f"Test MRR: {test_mrr:.4f}")


Train MRR: 0.1821
Test MRR: 0.0854


In [None]:
# Install the Hugging Face client libraries into the notebook runtime.
!pip install huggingface_hub transformers
from huggingface_hub import login
# Called without arguments; in this notebook it rendered an interactive widget
# (see the cell output), so no token is hard-coded in the source.
login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import joblib
import os

# Local directory that receives the serialized model.
model_path = "game-rec-model"
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, replacing the separate exists-check followed by makedirs.
os.makedirs(model_path, exist_ok=True)

# joblib serializes the fitted model via pickle; only load this file back from
# a trusted source. Left as the last expression so the written path is shown.
joblib.dump(final_model, os.path.join(model_path, 'game-rec.pkl'))


['game-rec-model/game-rec.pkl']

In [None]:
from huggingface_hub import HfApi

# Target repository on the Hugging Face Hub.
repo_id = "reemchaaban/game-rec"

api = HfApi()
# exist_ok=True lets the cell be re-run after the repository has been created.
api.create_repo(repo_id=repo_id, exist_ok=True)

# Upload the serialized model written to `model_path`; left as the last
# expression so the returned commit info is displayed.
local_model_file = os.path.join(model_path, 'game-rec.pkl')
api.upload_file(
    path_or_fileobj=local_model_file,
    path_in_repo="game-rec.pkl",
    repo_id=repo_id,
)


CommitInfo(commit_url='https://huggingface.co/reemchaaban/game-rec/commit/fb2840a9d39f578981ec0146a9cbfd963e26ecb9', commit_message='Upload game-rec.pkl with huggingface_hub', commit_description='', oid='fb2840a9d39f578981ec0146a9cbfd963e26ecb9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/reemchaaban/game-rec', endpoint='https://huggingface.co', repo_type='model', repo_id='reemchaaban/game-rec'), pr_revision=None, pr_num=None)