In [62]:
import pandas as pd
import json
import pickle
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the games table from the CSV file and preview its first rows.
df = pd.read_csv(filepath_or_buffer="games.csv")
df.head(n=5)

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True


In [51]:
# Read the games metadata (JSON Lines: one JSON object per line) into a dataframe.
meta_data = pd.read_json(path_or_buf="games_metadata.json", lines=True)

In [52]:
# Number of metadata rows.
meta_data.shape[0]

50872

In [53]:
# Count rows with missing metadata, printed in this order:
#   1. rows with an empty tag list
#   2. rows with an empty description
#   3. rows where both are empty
has_no_tags = meta_data['tags'].str.len() == 0
has_no_description = meta_data["description"] == ""
for missing_mask in (has_no_tags, has_no_description, has_no_tags & has_no_description):
    print(int(missing_mask.sum()))

1244
10373
1229


In [54]:
# Keep every row that has at least one tag or a non-empty description,
# i.e. drop only the rows where both the tag list and the description are empty.
both_missing = (meta_data['tags'].str.len() == 0) & (meta_data["description"] == "")
meta_filtered = meta_data[~both_missing]

In [55]:
# Preview the filtered metadata.
meta_filtered.head(n=5)

Unnamed: 0,app_id,description,tags
0,13500,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
1,22364,,[Action]
2,113020,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
3,226560,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
4,249050,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra..."


In [56]:
# Attach description/tags to each game by app_id, then drop every row that
# still has a missing value in any column.
merged = pd.merge(df, meta_filtered, on="app_id", how="left").dropna()

# Title-based filters, all evaluated on the merged frame:
titles = merged["title"].str
is_soundtrack = titles.contains(r"[Ss]oundtrack")  # title mentions "Soundtrack"/"soundtrack"
is_dlc = titles.contains(r"DLC")                   # title mentions "DLC"
has_lowercase = titles.contains(r"[a-z]")          # title has at least one lowercase a-z letter

df_complete = merged[~is_soundtrack & ~is_dlc & has_lowercase]

In [57]:
# Preview the merged and filtered games table.
df_complete.head(n=5)

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,description,tags
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True,,[Action]
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra..."


In [58]:
# Combine Tags and Descriptions
# Build one text field per game: the description followed by the space-joined
# tags (rows whose tags are not a list contribute an empty tag string).
#
# df_complete was produced by boolean filtering, so writing a column into it
# with df_complete[...] = ... can raise SettingWithCopyWarning. `assign`
# returns a new frame instead, which also keeps this cell safe to re-run.
tag_text = df_complete['tags'].apply(lambda tags: ' '.join(tags) if isinstance(tags, list) else '')
df_complete = (
    df_complete
    .assign(combined_features=df_complete['description'].fillna('') + ' ' + tag_text)
    # Renumber rows 0..n-1 so row positions and index labels coincide.
    .reset_index(drop=True)
)

In [11]:
# Vectorize Combined Features
# Bag-of-words counts over the combined text, limited to 10,000 vocabulary
# entries and ignoring English stop words.
count_vectorizer = CountVectorizer(stop_words='english', max_features=10_000)
feature_texts = df_complete['combined_features'].values.astype("U")
sparse_counts = count_vectorizer.fit_transform(feature_texts)
# NOTE(review): .toarray() materialises a dense (n_games x vocabulary) matrix.
# cosine_similarity also accepts the sparse matrix directly, which would use
# far less memory - consider dropping the conversion if nothing else needs a
# dense array.
combined_features_vector = sparse_counts.toarray()

In [12]:
# Calculate Cosine Similarity
# Pairwise similarity between every pair of games; entry [i, j] compares the
# feature vectors of rows i and j.
cosine_similarities = cosine_similarity(X=combined_features_vector)

In [32]:
# Get Recommendations
def get_combined_recommendations(game_title, cosine_similarities, df, top_n=5):
    """Return the titles of the games most similar to ``game_title``.

    Parameters
    ----------
    game_title : str
        Exact title to look up in ``df['title']``. If several rows share the
        title, the first matching row is used.
    cosine_similarities : 2-D array-like
        Square similarity matrix. Row/column ``i`` must correspond to the
        ``i``-th row of ``df`` by position (not by index label).
    df : pandas.DataFrame
        Frame with a ``'title'`` column, in the same row order as the matrix.
    top_n : int, default 5
        Maximum number of recommendations; values below 0 are treated as 0.

    Returns
    -------
    list
        Up to ``top_n`` titles ordered from most to least similar. The queried
        game itself is never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    # Locate the game by row position so the lookup stays aligned with the
    # similarity matrix even when df's index is not 0..n-1.
    matches = (df['title'] == game_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No game titled {game_title!r} found in df['title']")
    game_index = int(matches[0])

    scores = cosine_similarities[game_index]
    # Exclude the query game explicitly instead of assuming it sorts first:
    # another game with an identical score could otherwise take its place and
    # the query game would be recommended to itself.
    candidates = (i for i in range(len(scores)) if i != game_index)
    # sorted() is stable, so tied scores keep their original row order.
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)
    top_positions = ranked[:max(top_n, 0)]
    return df['title'].iloc[top_positions].tolist()

In [39]:
# Title to look up; it must exactly match an entry in df_complete['title'].
# NOTE(review): an empty string cannot match any row kept by the title filter
# above (kept titles contain at least one lowercase letter), so fill in a real
# title before running this cell.
game_title = ""
get_combined_recommendations(
    game_title=game_title,
    cosine_similarities=cosine_similarities,
    df=df_complete,
)

['VAIL VR Founder Pack',
 'Team Fortress Classic',
 'Operation swat',
 'Netherverse',
 'Death Toll']