<a href="https://colab.research.google.com/github/reemchaaban/game_system/blob/main/data-processing/game_rec_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
from google.colab import drive, files, userdata
import shutil


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

!pip install lightfm
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.preprocessing import MinMaxScaler
from lightfm.evaluation import precision_at_k, recall_at_k, reciprocal_rank



In [None]:
# Mount Google Drive so the project data folder is readable from this runtime.
drive.mount('/content/drive')
drive_base_path = '/content/drive/My Drive/503Nproj/game-rec/200'

# Both inputs live in the same Drive folder: the games table (CSV) and the
# per-player library dump (JSON).
games_csv_path = os.path.join(drive_base_path, 'game_library_data.csv')
file_path = os.path.join(drive_base_path, 'synthetic_training_data.json')

games_df = pd.read_csv(games_csv_path)

with open(file_path, 'r') as json_handle:
    player_data = json.load(json_handle)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# flatten player libraries into a dataframe: one row per (player, owned game)
# note: player_id and game_id are JSON object keys, so both are strings here
records = []
for player_id, pdata in player_data.items():
    for game_id, gdata in pdata['games'].items():
        records.append({
            'player_id': player_id,
            'game_id': game_id,
            'game_name': gdata['name'],
            'rating_ratio': float(gdata['rating_ratio']),
            'price': float(gdata['price']),
            'genres': gdata['genre'], #list
            'tags': gdata['tags'], #list
            'playtime_hours': gdata['hours']
        })

player_libraries = pd.DataFrame(records)

#normalizing playtime per player, where most-played game by player = 1.0 & others are relative
#a player whose maximum playtime is 0 would produce 0/0 = NaN (and NaN interaction weights),
#so those rows are set to 0.0 instead
max_hours_per_player = player_libraries.groupby('player_id')['playtime_hours'].transform('max')
player_libraries['normalized_playtime'] = (
    player_libraries['playtime_hours']
    .div(max_hours_per_player)
    .where(max_hours_per_player > 0, 0.0)
)

In [None]:
#preview the first five flattened library rows
player_libraries.head(n=5)

Unnamed: 0,player_id,game_id,game_name,rating_ratio,price,genres,tags,playtime_hours,normalized_playtime
0,0,1551360,Forza Horizon 5,7.785,5999.0,"[Action, Adventure, Racing, Simulation, Sports]","[Racing, Open World, Driving, Multiplayer, Aut...",49,0.15655
1,0,244210,Assetto Corsa,12.532,1999.0,"[Indie, Racing, Simulation, Sports]","[Racing, Automobile Sim, Simulation, Driving, ...",235,0.750799
2,0,284160,BeamNG.drive,37.755,2499.0,"[Racing, Simulation, Early Access]","[Simulation, Driving, Physics, Realistic, Dest...",313,1.0
3,0,252950,Rocket League,6.945,0.0,"[Action, Indie, Racing, Sports]","[Multiplayer, Football (Soccer), Competitive, ...",92,0.29393
4,0,211500,RaceRoom Racing Experience,3.045,0.0,"[Racing, Simulation, Sports, Free To Play]","[Free to Play, Racing, Automobile Sim, Realist...",3,0.009585


In [None]:
#register every known player and game with the LightFM id mapper
#game ids in player_libraries are strings (they come from JSON object keys), while the
#ids read from the CSV are numeric; without the cast the same game would be registered
#twice (e.g. 1551360 and '1551360'), leaving one copy with no interactions or features
#NOTE(review): confirm that consumers of the saved item_id_map look games up by string id
dataset = Dataset()
dataset.fit(
    users=player_libraries['player_id'],
    items=games_df['game_id'].astype(str)
)

In [None]:
#local output folder for the pickled artifacts written by the following cells; no error if it already exists
os.makedirs("game-rec", exist_ok=True)

In [None]:
#normalizing price & rating ratio
#both columns are min-max scaled in one call; the fitted scaler is pickled for later reuse
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(player_libraries[['price', 'rating_ratio']])
player_libraries[['norm_price', 'norm_rating_ratio']] = scaled_values
with open("game-rec/scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)


#combining features
def _row_features(row):
    """Build the list of feature names describing the game on one library row."""
    features = []
    features.extend(f'genre:{g}' for g in row['genres'])  #multi-hot categorical feature
    features.extend(f'tag:{t}' for t in row['tags'])  #multi-hot categorical feature
    #the scaled numbers are written into the feature name itself, rounded to 4 decimals
    #NOTE(review): each distinct rounded value therefore becomes its own feature token
    #rather than a numeric weight - confirm this is intended
    features.append(f'norm_price:{row.norm_price:.4f}')
    features.append(f'norm_rating_ratio:{row.norm_rating_ratio:.4f}')
    return features


player_libraries['feature_list'] = player_libraries.apply(_row_features, axis=1)

In [None]:
#preview the first five rows, now including the scaled columns and feature_list
player_libraries.head(n=5)

Unnamed: 0,player_id,game_id,game_name,rating_ratio,price,genres,tags,playtime_hours,normalized_playtime,norm_price,norm_rating_ratio,feature_list
0,0,1551360,Forza Horizon 5,7.785,5999.0,"[Action, Adventure, Racing, Simulation, Sports]","[Racing, Open World, Driving, Multiplayer, Aut...",49,0.15655,0.749969,0.092774,"[genre:Action, genre:Adventure, genre:Racing, ..."
1,0,244210,Assetto Corsa,12.532,1999.0,"[Indie, Racing, Simulation, Sports]","[Racing, Automobile Sim, Simulation, Driving, ...",235,0.750799,0.249906,0.156498,"[genre:Indie, genre:Racing, genre:Simulation, ..."
2,0,284160,BeamNG.drive,37.755,2499.0,"[Racing, Simulation, Early Access]","[Simulation, Driving, Physics, Realistic, Dest...",313,1.0,0.312414,0.495093,"[genre:Racing, genre:Simulation, genre:Early A..."
3,0,252950,Rocket League,6.945,0.0,"[Action, Indie, Racing, Sports]","[Multiplayer, Football (Soccer), Competitive, ...",92,0.29393,0.0,0.081498,"[genre:Action, genre:Indie, genre:Racing, genr..."
4,0,211500,RaceRoom Racing Experience,3.045,0.0,"[Racing, Simulation, Sports, Free To Play]","[Free to Play, Racing, Automobile Sim, Realist...",3,0.009585,0.0,0.029144,"[genre:Racing, genre:Simulation, genre:Sports,..."


In [None]:
#collapse the library rows to one row per game ID, keeping that game's feature list
#(genre, tag, price, rating ratio)
def _first_feature_list(feature_lists):
    """Return the first feature list in a game's group (features are the same for each game)."""
    return feature_lists.iloc[0]


item_features_df = (
    player_libraries
    .groupby('game_id')
    .agg({'feature_list': _first_feature_list})
    .reset_index()
)

In [None]:
#collect the distinct feature names used by any game
all_features = set()
for feats in item_features_df['feature_list']:
    all_features.update(feats)

#register the games and feature names with the dataset
#the names are passed in sorted order: iterating a set of strings has no stable order
#across kernel restarts, which would give every run a different feature-index mapping
dataset.fit_partial(
    items=item_features_df['game_id'],
    item_features=sorted(all_features)
)

In [None]:
#pair every game id with its feature names and let the dataset encode them as a feature matrix
item_features_tuples = list(
    item_features_df[['game_id', 'feature_list']].itertuples(index=False, name=None)
)
item_features_matrix = dataset.build_item_features(item_features_tuples)
#persist the encoded matrix next to the other artifacts
with open("game-rec/item_features_matrix.pkl", "wb") as matrix_file:
    pickle.dump(item_features_matrix, matrix_file)

In [None]:
#building interaction matrix for LightFM w/ weights
#each entry is (player_id, game_id, weight) with the per-player normalized playtime as weight;
#itertuples yields plain tuples straight from the three columns and avoids the per-row Series
#construction that iterrows performs
(interactions, weights) = dataset.build_interactions(
    player_libraries[['player_id', 'game_id', 'normalized_playtime']]
    .itertuples(index=False, name=None)
)

In [None]:
#training
#random_state fixes the seed of the model's random number generator so repeated runs start
#from the same state; with num_threads > 1 the parallel updates may still cause small
#run-to-run differences
model = LightFM(loss='warp', random_state=42)
model.fit(
    interactions,
    sample_weight=weights,
    item_features=item_features_matrix,
    epochs=60,
    num_threads=4
)

<lightfm.lightfm.LightFM at 0x7e4a5424d050>

Model evaluation

In [None]:
# NOTE: the metrics below are computed on the same `interactions` matrix that was passed to
# model.fit, so they describe how well the model fits its training data, not held-out performance.

# precision at k = fraction of the top-k recommended items that are relevant
precision = precision_at_k(model, interactions, item_features=item_features_matrix, k=5).mean()
print(f'Precision at k=5: {precision:.4f}')

# recall at k = fraction of a player's relevant items that appear in the top-k recommendations
recall = recall_at_k(model, interactions, item_features=item_features_matrix, k=5).mean()
print(f'Recall at k=5: {recall:.4f}')

# mean reciprocal rank = average over players of 1 / rank of the highest-ranked relevant item
mrr = reciprocal_rank(model, interactions, item_features=item_features_matrix).mean()
# fix: this line previously formatted `recall`, so the reported MRR was just the recall value
print(f'MRR: {mrr:.4f}')

Precision at k=5: 0.4324
Recall at k=5: 0.1412
MRR: 0.1412


Generate recommendations

In [None]:
#sample player ID
#SAMPLE_ROW is the row position of the example player; it is clamped to the last row so a
#smaller dataset does not raise IndexError
SAMPLE_ROW = 98375
TOP_N = 5
sample_player_id = player_libraries['player_id'].iloc[min(SAMPLE_ROW, len(player_libraries) - 1)]

all_game_ids = games_df['game_id'].tolist()

#map player/game IDs to LightFM indices
user_id_map, _, item_id_map, item_feature_map = dataset.mapping()
with open("game-rec/dataset_mappings.pkl", "wb") as f:
    pickle.dump({
        "user_id_map": user_id_map,
        "item_id_map": item_id_map,
        "item_feature_map": item_feature_map
    }, f)
user_index = user_id_map[sample_player_id]

#predict scores for all games
scores = model.predict(
    user_ids=user_index,
    item_ids=np.arange(len(item_id_map)),
    item_features=item_features_matrix
)

#mask out games in player's library already
#ids are compared as strings: library ids are strings (JSON keys) while ids registered from
#games_df may be numeric, so one game can sit in item_id_map under both forms and every
#form has to be masked
played_game_ids = player_libraries[player_libraries['player_id'] == sample_player_id]['game_id'].tolist()
played_keys = {str(g) for g in played_game_ids}
played_game_indices = [idx for gid, idx in item_id_map.items() if str(gid) in played_keys]
scores[played_game_indices] = -np.inf

#reverse mapping of item_index to game_id
inv_item_id_map = {v: k for k, v in item_id_map.items()}

#top N recommendations: walk the ranking from best to worst score, skipping a game that was
#already picked under its other id form, and stop once only masked games remain
top_indices = []
recommended_game_ids = []
for idx in np.argsort(-scores):
    if np.isneginf(scores[idx]):
        break
    gid = int(inv_item_id_map[idx])
    if gid in recommended_game_ids:
        continue
    top_indices.append(idx)
    recommended_game_ids.append(gid)
    if len(recommended_game_ids) == TOP_N:
        break

#print names of recommended games
print("Recommended games for player with ID", sample_player_id)
for gid in recommended_game_ids:
    match = games_df[games_df['game_id'] == gid]
    if not match.empty:
        name = match['name'].values[0]
    else:
        name = f"(Game ID {gid} not found)"
    print(f" - {name}")

Recommended games for player with ID 6145
 - Business Tour - Board Game with Online Multiplayer
 - BattleBlock Theater
 - Doki Doki Literature Club!
 - Sven Co-op
 - A Story About My Uncle


Save model

In [None]:
#serialize the trained model to game-rec.pkl in the current working directory
with open('game-rec.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
#install the Hugging Face Hub client and authenticate this session
!pip install huggingface_hub
from huggingface_hub import login

#the access token is read from the Colab secret named HF_TOKEN rather than hardcoded here
HF_TOKEN = userdata.get("HF_TOKEN")
login(token=HF_TOKEN)



In [None]:
#place a copy of the pickled model inside the game-rec folder with the other artifacts
shutil.copy(src="game-rec.pkl", dst="game-rec/game-rec.pkl")

'game-rec/game-rec.pkl'

In [None]:
from huggingface_hub import upload_folder

#push the contents of the local game-rec folder to the root of the model repo on the Hub
upload_folder(
    folder_path="game-rec",
    path_in_repo=".",
    repo_id="reemchaaban/game-rec",
    repo_type="model",
)

dataset_mappings.pkl:   0%|          | 0.00/558k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

game-rec.pkl:   0%|          | 0.00/6.73M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/reemchaaban/game-rec/commit/975ea53ef0139e763f2ef7446019d8a749531665', commit_message='Upload folder using huggingface_hub', commit_description='', oid='975ea53ef0139e763f2ef7446019d8a749531665', pr_url=None, repo_url=RepoUrl('https://huggingface.co/reemchaaban/game-rec', endpoint='https://huggingface.co', repo_type='model', repo_id='reemchaaban/game-rec'), pr_revision=None, pr_num=None)

In [None]:
#repository coordinates and commit identity used for the push below
GITHUB_USERNAME = "reemchaaban"
GITHUB_EMAIL = "reem.chaabann@gmail.com"
REPO_NAME = "game_system"
BRANCH = "main"
TARGET_SUBDIR = "IEP2"

#the personal access token is read from the Colab secret GITHUB_PAT rather than hardcoded here
token = userdata.get('GITHUB_PAT')

!git config --global user.email "{GITHUB_EMAIL}"
!git config --global user.name "{GITHUB_USERNAME}"

#start from a fresh clone on every run
#NOTE(review): the token is embedded in the clone URL, so it is presumably stored in the
#clone's remote configuration - confirm this is acceptable for this runtime
!rm -rf {REPO_NAME}
!git clone https://{token}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git

#replace the model folder in the clone with the freshly built artifacts
!rm -rf {REPO_NAME}/{TARGET_SUBDIR}/model
!mkdir -p {REPO_NAME}/{TARGET_SUBDIR}/model
!cp game-rec/dataset_mappings.pkl {REPO_NAME}/{TARGET_SUBDIR}/model/
!cp game-rec/item_features_matrix.pkl {REPO_NAME}/{TARGET_SUBDIR}/model/

#NOTE(review): game-rec/scaler.pkl is not copied although the commit message below mentions
#the scaler - confirm whether it should be included
!cp game-rec.pkl {REPO_NAME}/{TARGET_SUBDIR}/model/game-rec.pkl

#stage, commit and push from inside the clone, then step back out to the parent directory
%cd {REPO_NAME}
!git add .
!git commit -m "Update IEP2 model and scaler from Colab"
!git push origin {BRANCH}

%cd ..

Cloning into 'game_system'...
remote: Enumerating objects: 108, done.[K
remote: Counting objects: 100% (108/108), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 108 (delta 33), reused 102 (delta 31), pack-reused 0 (from 0)[K
Receiving objects: 100% (108/108), 10.47 MiB | 22.43 MiB/s, done.
Resolving deltas: 100% (33/33), done.
/content/game_system/game_system
[main f8c774f] Update IEP2 model and scaler from Colab
 3 files changed, 0 insertions(+), 0 deletions(-)
Enumerating objects: 13, done.
Counting objects: 100% (13/13), done.
Delta compression using up to 2 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 3.89 MiB | 3.26 MiB/s, done.
Total 7 (delta 4), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K
To https://github.com/reemchaaban/game_system.git
   433090d..f8c774f  main -> main
/content/game_system
