<a href="https://colab.research.google.com/github/reemchaaban/game_system/blob/main/data-processing/game_rec_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install lightfm
!pip install huggingface_hub
!pip install mlflow
!pip install pyngrok

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=831166 sha256=53fbe2a9d7c9d2e2968976b394af68cb1b8ae4a76595c8bc1e998fc11c6ab9ea
  Stored in directory: /root/.cache/pip/wheels/b9/0d/8a/0729d2e6e3ca2a898ba55201f905da7db3f838a33df5b3fcdd
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17
Collecting mlflow
  Downloading mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.3 (from mlflow)
  Downloading mlflow_skinny-2.21.3

In [2]:
import os
import json
from google.colab import drive, files, userdata
import shutil
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import pickle
from scipy.sparse import coo_matrix

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, reciprocal_rank, auc_score
from sklearn.preprocessing import MinMaxScaler

from huggingface_hub import login, upload_folder
from pyngrok import ngrok
import mlflow
from tqdm.notebook import trange

In [3]:
HF_TOKEN = userdata.get("HF_TOKEN")
NGROK_AUTH_TOKEN = userdata.get('NGROK_AUTH_TOKEN')

login(token=HF_TOKEN)

In [4]:
drive.mount('/content/drive')
drive_base_path = '/content/drive/My Drive/503Nproj/datasets'

games_df = pd.read_csv(os.path.join(drive_base_path, 'game_library_data.csv'))

file_path = os.path.join(drive_base_path, 'synthetic_training_data.json')

with open(file_path, 'r') as file:
    player_data = json.load(file)

Mounted at /content/drive


In [5]:
# flatten player libraries into a dataframe
records = []
for player_id, pdata in player_data.items():
    for game_id, gdata in pdata['games'].items():
        records.append({
            'player_id': player_id,
            'game_id': game_id,
            'game_name': gdata['name'],
            'rating_ratio': float(gdata['rating_ratio']),
            'price': float(gdata['price']),
            'genres': gdata['genre'], #list
            'tags': gdata['tags'], #list
            'playtime_hours': gdata['hours']
        })

player_libraries = pd.DataFrame(records)

#normalizing playtime per player, where most-played game by player = 1.0 & others are relative
player_libraries['normalized_playtime'] = player_libraries.groupby('player_id')['playtime_hours'].transform(lambda x: x / x.max())

In [6]:
player_libraries.head()

Unnamed: 0,player_id,game_id,game_name,rating_ratio,price,genres,tags,playtime_hours,normalized_playtime
0,0,1551360,Forza Horizon 5,7.785,5999.0,"[Action, Adventure, Racing, Simulation, Sports]","[Racing, Open World, Driving, Multiplayer, Aut...",49,0.15655
1,0,244210,Assetto Corsa,12.532,1999.0,"[Indie, Racing, Simulation, Sports]","[Racing, Automobile Sim, Simulation, Driving, ...",235,0.750799
2,0,284160,BeamNG.drive,37.755,2499.0,"[Racing, Simulation, Early Access]","[Simulation, Driving, Physics, Realistic, Dest...",313,1.0
3,0,252950,Rocket League,6.945,0.0,"[Action, Indie, Racing, Sports]","[Multiplayer, Football (Soccer), Competitive, ...",92,0.29393
4,0,211500,RaceRoom Racing Experience,3.045,0.0,"[Racing, Simulation, Sports, Free To Play]","[Free to Play, Racing, Automobile Sim, Realist...",3,0.009585


In [7]:
dataset = Dataset()
dataset.fit(
    users=player_libraries['player_id'],
    items=games_df['game_id']
)

In [8]:
os.makedirs("game-rec", exist_ok=True)

In [10]:
#normalizing price & rating ratio
scaler = MinMaxScaler()
player_libraries[['norm_price', 'norm_rating_ratio']] = scaler.fit_transform(player_libraries[['price', 'rating_ratio']])
with open("game-rec/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)


#combining features
player_libraries['feature_list'] = player_libraries.apply(
    lambda row: [f'genre:{g}' for g in row['genres']] + #multi-hot categorical feature
                [f'tag:{t}' for t in row['tags']] + #multi-hot categorical feature
                [f'norm_price:{row.norm_price:.4f}'] +
                [f'norm_rating_ratio:{row.norm_rating_ratio:.4f}'], axis=1)

In [11]:
player_libraries.head()

Unnamed: 0,player_id,game_id,game_name,rating_ratio,price,genres,tags,playtime_hours,normalized_playtime,norm_price,norm_rating_ratio,feature_list
0,0,1551360,Forza Horizon 5,7.785,5999.0,"[Action, Adventure, Racing, Simulation, Sports]","[Racing, Open World, Driving, Multiplayer, Aut...",49,0.15655,0.749969,0.092774,"[genre:Action, genre:Adventure, genre:Racing, ..."
1,0,244210,Assetto Corsa,12.532,1999.0,"[Indie, Racing, Simulation, Sports]","[Racing, Automobile Sim, Simulation, Driving, ...",235,0.750799,0.249906,0.156498,"[genre:Indie, genre:Racing, genre:Simulation, ..."
2,0,284160,BeamNG.drive,37.755,2499.0,"[Racing, Simulation, Early Access]","[Simulation, Driving, Physics, Realistic, Dest...",313,1.0,0.312414,0.495093,"[genre:Racing, genre:Simulation, genre:Early A..."
3,0,252950,Rocket League,6.945,0.0,"[Action, Indie, Racing, Sports]","[Multiplayer, Football (Soccer), Competitive, ...",92,0.29393,0.0,0.081498,"[genre:Action, genre:Indie, genre:Racing, genr..."
4,0,211500,RaceRoom Racing Experience,3.045,0.0,"[Racing, Simulation, Sports, Free To Play]","[Free to Play, Racing, Automobile Sim, Realist...",3,0.009585,0.0,0.029144,"[genre:Racing, genre:Simulation, genre:Sports,..."


In [12]:
#grouping by game ID & aggregate features, i.e., genre, tag, price, rating ratio
item_features_df = player_libraries.groupby('game_id').agg({
    'feature_list': lambda lists: lists.iloc[0]  #features are the same for each game
}).reset_index()

In [13]:
#fill with all item features
all_features = set()
for feats in item_features_df['feature_list']:
    all_features.update(feats)

dataset.fit_partial(
    items=item_features_df['game_id'],
    item_features=list(all_features)
)

In [14]:
item_features_tuples = list(zip(item_features_df['game_id'], item_features_df['feature_list']))
item_features_matrix = dataset.build_item_features(item_features_tuples)
with open("game-rec/item_features_matrix.pkl", "wb") as f:
    pickle.dump(item_features_matrix, f)

In [15]:
#building interaction matrix for LightFM w/ weights
(interactions, weights) = dataset.build_interactions([
    (row['player_id'], row['game_id'], row['normalized_playtime'])
    for _, row in player_libraries.iterrows()
])

In [16]:
#training
model = LightFM(loss='warp')

num_epochs = 60
num_threads = 4

with mlflow.start_run(run_name="game-recommender"):
  mlflow.log_param("loss", "warp")
  mlflow.log_param("components", 30)

  for epoch in trange(num_epochs, desc="Training epochs"):
      model.fit_partial(
          interactions,
          sample_weight=weights,
          item_features=item_features_matrix,
          epochs=1,
          num_threads=num_threads
      )

      # Evaluate after this epoch
      precision = precision_at_k(model,
                                interactions,
                                k=5,
                                item_features=item_features_matrix).mean()

      recall = recall_at_k(model,
                          interactions,
                          k=5,
                          item_features=item_features_matrix).mean()

      auc = auc_score(model,
                      interactions,
                      item_features=item_features_matrix).mean()

      # Log with step=epoch
      mlflow.log_metric("Precision", precision, step=epoch)
      mlflow.log_metric("Recall", recall, step=epoch)
      mlflow.log_metric("AUC", auc, step=epoch)

Training epochs:   0%|          | 0/60 [00:00<?, ?it/s]

Model evaluation

In [17]:
# precision at k = how many of top-K recommended items were relevant
precision = precision_at_k(model, interactions, item_features=item_features_matrix, k=5).mean()
print(f'Precision at k=5: {precision:.4f}')

# recall at k = how many of top-k recommended items were relevant & successfully recommended
recall = recall_at_k(model, interactions, item_features=item_features_matrix, k=5).mean()
print(f'Recall at k=5: {recall:.4f}')

# mean reciprocal rank = measures rank position of 1st relevant item in rec list
mrr = reciprocal_rank(model, interactions, item_features=item_features_matrix).mean()
print(f'MRR: {recall:.4f}')

Precision at k=5: 0.4301
Recall at k=5: 0.1404
MRR: 0.1404


Ngrok

In [18]:
! pkill -f "mlflow ui"

get_ipython().system_raw("mlflow ui --port 5000 &") # run tracking UI in the background

In [21]:
# terminate any open tunnels
ngrok.kill()

ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# open HTTP tunnel on port 5000 = http://localhost:5000
public_url = ngrok.connect(5000, proto="http")

print(f" * Ngrok tunnel: {public_url}")

 * Ngrok tunnel: NgrokTunnel: "https://7057-34-147-121-171.ngrok-free.app" -> "http://localhost:5000"


Generate recommendations

In [22]:
#sample player ID
sample_player_id = player_libraries['player_id'].iloc[98375]

all_game_ids = games_df['game_id'].tolist()

#map player/game IDs to LightFM indices
user_id_map, _, item_id_map, item_feature_map = dataset.mapping()
with open("game-rec/dataset_mappings.pkl", "wb") as f:
    pickle.dump({
        "user_id_map": user_id_map,
        "item_id_map": item_id_map,
        "item_feature_map": item_feature_map
    }, f)
user_index = user_id_map[sample_player_id]

#predict scores for all games
scores = model.predict(
    user_ids=user_index,
    item_ids=np.arange(len(item_id_map)),
    item_features=item_features_matrix
)

#get top N game indices (that are not in the player's library)
played_game_ids = player_libraries[player_libraries['player_id'] == sample_player_id]['game_id'].tolist()
played_game_indices = [item_id_map[g] for g in played_game_ids if g in item_id_map]

#mask out games in player's library already
scores[played_game_indices] = -np.inf

#top 5 recommendations
top_indices = np.argsort(-scores)[:5]

#reverse mapping of item_index to game_id
inv_item_id_map = {v: k for k, v in item_id_map.items()}
recommended_game_ids = [inv_item_id_map[i] for i in top_indices]
recommended_game_ids = [int(gid) for gid in recommended_game_ids]

#print names of recommended games
print("Recommended games for player with ID", sample_player_id)
for gid in recommended_game_ids:
    match = games_df[games_df['game_id'] == gid]
    if not match.empty:
        name = match['name'].values[0]
    else:
        name = f"(Game ID {gid} not found)"
    print(f" - {name}")

Recommended games for player with ID 6145
 - Business Tour - Board Game with Online Multiplayer
 - A Story About My Uncle
 - Doki Doki Literature Club!
 - BattleBlock Theater
 - Sven Co-op


Save model

In [24]:
with open('game-rec.pkl', 'wb') as f:
    pickle.dump(model, f)

shutil.copy("game-rec.pkl", "game-rec/game-rec.pkl")

'game-rec/game-rec.pkl'

In [25]:
upload_folder(
    repo_id="reemchaaban/game-rec",
    folder_path="game-rec",
    path_in_repo=".",
    repo_type="model"
)

dataset_mappings.pkl:   0%|          | 0.00/558k [00:00<?, ?B/s]

game-rec.pkl:   0%|          | 0.00/6.73M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/reemchaaban/game-rec/commit/42ace7e6bc7b93604b69af3af28afb6a607e85e8', commit_message='Upload folder using huggingface_hub', commit_description='', oid='42ace7e6bc7b93604b69af3af28afb6a607e85e8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/reemchaaban/game-rec', endpoint='https://huggingface.co', repo_type='model', repo_id='reemchaaban/game-rec'), pr_revision=None, pr_num=None)

In [26]:
GITHUB_USERNAME = "reemchaaban"
GITHUB_EMAIL = "reem.chaabann@gmail.com"
REPO_NAME = "game_system"
BRANCH = "main"
TARGET_SUBDIR = "IEP2"

token = userdata.get('GITHUB_PAT')

!git config --global user.email "{GITHUB_EMAIL}"
!git config --global user.name "{GITHUB_USERNAME}"

!rm -rf {REPO_NAME}
!git clone https://{token}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git

!rm -rf {REPO_NAME}/{TARGET_SUBDIR}/model
!mkdir -p {REPO_NAME}/{TARGET_SUBDIR}/model
!cp game-rec/dataset_mappings.pkl {REPO_NAME}/{TARGET_SUBDIR}/model/
!cp game-rec/item_features_matrix.pkl {REPO_NAME}/{TARGET_SUBDIR}/model/

!cp game-rec.pkl {REPO_NAME}/{TARGET_SUBDIR}/model/game-rec.pkl

%cd {REPO_NAME}
!git add .
!git commit -m "Update IEP2 model and scaler from Colab"
!git push origin {BRANCH}

%cd ..

Cloning into 'game_system'...
remote: Enumerating objects: 161, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 161 (delta 20), reused 16 (delta 5), pack-reused 114 (from 1)[K
Receiving objects: 100% (161/161), 16.95 MiB | 26.45 MiB/s, done.
Resolving deltas: 100% (57/57), done.
/content/game_system
[main b6b5bd8] Update IEP2 model and scaler from Colab
 3 files changed, 0 insertions(+), 0 deletions(-)
Enumerating objects: 13, done.
Counting objects: 100% (13/13), done.
Delta compression using up to 2 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 3.89 MiB | 6.07 MiB/s, done.
Total 7 (delta 4), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K
To https://github.com/reemchaaban/game_system.git
   3afe87e..b6b5bd8  main -> main
/content


In [None]:
GITHUB_USERNAME = "reemchaaban"
GITHUB_EMAIL = "reem.chaabann@gmail.com"
REPO_NAME = "game_system"
BRANCH = "main"

token = userdata.get('GITHUB_PAT')

!git config --global user.email "{GITHUB_EMAIL}"
!git config --global user.name "{GITHUB_USERNAME}"

!rm -rf {REPO_NAME}
!git clone https://{token}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git

!rm -rf {REPO_NAME}/data-processing/datasets
!mkdir -p {REPO_NAME}/data-processing/datasets

!cp /content/drive/MyDrive/503Nproj/game-rec/200/game_library_data.csv {REPO_NAME}/data-processing/datasets
!cp /content/drive/MyDrive/503Nproj/game-rec/200/processed_player_data.json {REPO_NAME}/data-processing/datasets
!cp /content/drive/MyDrive/503Nproj/game-rec/200/synthetic_training_data.json {REPO_NAME}/data-processing/datasets
!cp /content/drive/MyDrive/503Nproj/player-count-history/player_count_history.csv {REPO_NAME}/data-processing/datasets

%cd {REPO_NAME}
!git add .
!git commit -m "Update IEP2 model and scaler from Colab"
!git push origin {BRANCH}

%cd ..

Cloning into 'game_system'...
remote: Enumerating objects: 131, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 131 (delta 6), reused 0 (delta 0), pack-reused 114 (from 1)[K
Receiving objects: 100% (131/131), 14.44 MiB | 22.95 MiB/s, done.
Resolving deltas: 100% (43/43), done.
/content/game_system/game_system
[main c8569e5] Update IEP2 model and scaler from Colab
 4 files changed, 3532933 insertions(+)
 create mode 100644 data-processing/datasets/game_library_data.csv
 create mode 100644 data-processing/datasets/player_count_history.csv
 create mode 100644 data-processing/datasets/processed_player_data.json
 create mode 100644 data-processing/datasets/synthetic_training_data.json
Enumerating objects: 10, done.
Counting objects: 100% (10/10), done.
Delta compression using up to 2 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (8/8), 66.97 MiB | 15.26 MiB/s, done.
Total 8 (delta 2), reused 