<a href="https://colab.research.google.com/github/renadalahmadi/BigData-and-AI/blob/main/recommendation_systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Build a content-based recommendation system for Steam video games

- Renad Alahmadi

In [None]:
# Install scikit-surprise (imported below as `surprise`). The recorded output of
# this cell shows version 1.1.1 being built and installed.
!pip install scikit-surprise

# Alternative installation for a conda environment (left disabled):
#conda update --all
#conda install -c conda-forge scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.3 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633976 sha256=0e97509e809fea03c54800cbf482be51d2cb26fe9e499f8444ee20a130b49cc0
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
# imports
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from ast import literal_eval

from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

from surprise import Reader, Dataset, SVD # for collabrotative filtering and matrix factorization
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)

## Content Based Recommender

In [None]:
# Load the Steam games table; the path points at the Colab session's /content directory.
dataset = pd.read_csv('/content/Steam Dataset.csv')
dataset
# Disabled earlier attempt at coercing Game_id to int. As written it would replace
# the whole frame with just the Game_id column, so it is kept only for reference:
# dataset = dataset[dataset['Game_id'].notnull()]['Game_id'].astype('int')

Unnamed: 0,Positive,Negative,Game_id,Game_name,Positive_Rate,links,Game_description
0,285421,3487,620,Portal 2,0.987930,https://store.steampowered.com/app/620,"<div class=""game_area_description"" id=""game_ar..."
1,102805,1640,400,Portal,0.984298,https://store.steampowered.com/app/400,"<div class=""game_area_description"" id=""game_ar..."
2,439665,8619,413150,Stardew Valley,0.980773,https://store.steampowered.com/app/413150,"<div class=""game_area_description"" id=""game_ar..."
3,476762,9701,431960,Wallpaper Engine,0.980058,https://store.steampowered.com/app/431960,"<div class=""game_area_description"" id=""game_ar..."
4,910629,19342,105600,Terraria,0.979202,https://store.steampowered.com/app/105600,"<div class=""game_area_description"" id=""game_ar..."
...,...,...,...,...,...,...,...
95,1108250,875814,578080,PUBG: BATTLEGROUNDS,0.558576,https://store.steampowered.com/app/578080,"<div class=""game_area_description"" id=""game_ar..."
96,115650,93024,433850,Z1 Battle Royale,0.554214,https://store.steampowered.com/app/433850,"<div class=""game_area_description"" id=""game_ar..."
97,1315,1176,439700,Z1 Battle Royale: Test Server,0.527900,https://store.steampowered.com/app/439700,"<div class=""game_area_description"" id=""game_ar..."
98,30501,29952,1089350,NBA 2K20,0.504541,https://store.steampowered.com/app/1089350,"<div class=""game_area_description"" id=""game_ar..."


In [None]:
# Count missing values per column.
print(dataset.isna().sum())  # every column reports 0 in the recorded run

Positive            0
Negative            0
Game_id             0
Game_name           0
Positive_Rate       0
links               0
Game_description    0
dtype: int64


In [None]:
# Coerce Game_id values to int. The null check above reports no missing values,
# so this is a defensive step: any id that cannot be parsed becomes NaN and the
# corresponding row is dropped afterwards.
def convert_int(x):
    """Return ``x`` converted to ``int``, or ``np.nan`` if it cannot be converted.

    Only conversion failures are absorbed (``TypeError``, ``ValueError``,
    ``OverflowError``). Unlike a bare ``except:``, this lets ``KeyboardInterrupt``,
    ``SystemExit`` and unrelated bugs propagate instead of silently becoming NaN.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

# Parse every Game_id, then print the number of unparseable ids before and after
# discarding the affected rows.
dataset['Game_id'] = dataset['Game_id'].apply(convert_int)
print(dataset['Game_id'].isna().sum())
dataset = dataset.dropna(subset=['Game_id'])
print(dataset['Game_id'].isna().sum())

Let's transform the data into something the machine can understand.

In [None]:
# Extract features from the text descriptions with TF-IDF over unigrams and
# bigrams, ignoring English stop words.
# min_df=1 (the default) keeps every term, exactly as min_df=0 did; an integer
# min_df below 1 is rejected by scikit-learn's parameter validation (1.2+).
# NOTE(review): the displayed descriptions begin with raw HTML
# (`<div class="game_area_description" ...`), so markup tokens are presumably part
# of the vocabulary — consider stripping tags before vectorizing.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(dataset['Game_description'])

In [None]:
tfidf_matrix.shape

(100, 21959)

### Cosine Similarity

We will be using the Cosine Similarity to calculate a numeric quantity that denotes the similarity between two Games. Mathematically, it is defined as follows:

$$\text{cosine}(x, y) = \frac{x \cdot y^\intercal}{\lVert x \rVert \, \lVert y \rVert}$$

Because the TF-IDF vectorizer L2-normalises each description vector by default, every vector has unit length, so the dot product of two vectors is already their cosine similarity. We therefore use sklearn's `linear_kernel` instead of `cosine_similarity`: it skips the redundant normalisation step and is faster.

In [None]:
# Pairwise dot products between all description vectors. The vectorizer's norm
# setting is not overridden above (scikit-learn's default is L2), so these dot
# products equal the cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Sanity check: the matrix is square (one row and one column per game), and row 0
# holds game 0's similarity to every game — its first entry, the self-similarity,
# is 1 in the recorded output.
print(cosine_sim.shape)
print(cosine_sim[0])

(100, 100)
[1.         0.10467383 0.27041798 0.14488692 0.14849364 0.03103094
 0.18031773 0.10143294 0.16191556 0.20345069 0.05874203 0.05797221
 0.08736863 0.0418697  0.12424839 0.04388966 0.21795118 0.17790132
 0.18511241 0.05038385 0.12150246 0.14585019 0.09200489 0.11391628
 0.10138283 0.03924035 0.19778871 0.08757517 0.26690213 0.04152594
 0.06882484 0.30170971 0.14407873 0.14928214 0.23056317 0.11592873
 0.33866032 0.00961042 0.06047891 0.19261151 0.10302336 0.18877852
 0.01978548 0.22373538 0.15391474 0.0821589  0.0471858  0.2939425
 0.05394429 0.02431148 0.1149811  0.06852005 0.13991711 0.03194006
 0.03811705 0.18095348 0.19657051 0.03565375 0.1164851  0.10511097
 0.09545167 0.21810919 0.05833022 0.08958913 0.05889727 0.11348202
 0.04641729 0.01944042 0.04955943 0.09059759 0.0222264  0.10744498
 0.03317934 0.23020859 0.07597496 0.19302998 0.05213717 0.02721882
 0.05148509 0.14385143 0.12679521 0.05595783 0.20313094 0.02724275
 0.05885664 0.03775161 0.07650762 0.0614583  0.15584

In [None]:
type(cosine_sim)

numpy.ndarray



We now have a pairwise cosine similarity matrix for all the games in our dataset. The next step is to write a function that returns the 30 most similar games based on the cosine similarity score.


In [25]:
# Renumber the rows 0..n-1 so row labels line up with the positional rows/columns
# of cosine_sim; the previous index is kept as an extra `index` column.
sgm = dataset.reset_index()
# Game titles by row position, used to turn similarity positions back into names.
Game_names = sgm['Game_name']

In [26]:
Game_names

0                          Portal 2
1                            Portal
2                    Stardew Valley
3                  Wallpaper Engine
4                          Terraria
                  ...              
95              PUBG: BATTLEGROUNDS
96                 Z1 Battle Royale
97    Z1 Battle Royale: Test Server
98                         NBA 2K20
99                 Battlefield 2042
Name: Game_name, Length: 100, dtype: object

In [None]:
sgm

Unnamed: 0,index,Positive,Negative,Game_id,Game_name,Positive_Rate,links,Game_description
0,0,285421,3487,620,Portal 2,0.987930,https://store.steampowered.com/app/620,"<div class=""game_area_description"" id=""game_ar..."
1,1,102805,1640,400,Portal,0.984298,https://store.steampowered.com/app/400,"<div class=""game_area_description"" id=""game_ar..."
2,2,439665,8619,413150,Stardew Valley,0.980773,https://store.steampowered.com/app/413150,"<div class=""game_area_description"" id=""game_ar..."
3,3,476762,9701,431960,Wallpaper Engine,0.980058,https://store.steampowered.com/app/431960,"<div class=""game_area_description"" id=""game_ar..."
4,4,910629,19342,105600,Terraria,0.979202,https://store.steampowered.com/app/105600,"<div class=""game_area_description"" id=""game_ar..."
...,...,...,...,...,...,...,...,...
95,95,1108250,875814,578080,PUBG: BATTLEGROUNDS,0.558576,https://store.steampowered.com/app/578080,"<div class=""game_area_description"" id=""game_ar..."
96,96,115650,93024,433850,Z1 Battle Royale,0.554214,https://store.steampowered.com/app/433850,"<div class=""game_area_description"" id=""game_ar..."
97,97,1315,1176,439700,Z1 Battle Royale: Test Server,0.527900,https://store.steampowered.com/app/439700,"<div class=""game_area_description"" id=""game_ar..."
98,98,30501,29952,1089350,NBA 2K20,0.504541,https://store.steampowered.com/app/1089350,"<div class=""game_area_description"" id=""game_ar..."


In [None]:
# Reverse lookup: game title -> row position in sgm (and therefore in cosine_sim).
# NOTE(review): assumes titles are unique; a duplicated title would make
# indices[title] return a Series instead of a single position — confirm.
indices = pd.Series(sgm.index, index=sgm['Game_name'])

In [None]:
indices

Game_name
Portal 2                          0
Portal                            1
Stardew Valley                    2
Wallpaper Engine                  3
Terraria                          4
                                 ..
PUBG: BATTLEGROUNDS              95
Z1 Battle Royale                 96
Z1 Battle Royale: Test Server    97
NBA 2K20                         98
Battlefield 2042                 99
Length: 100, dtype: int64

In [27]:
def get_recommendations(Game_name, top_n=30):
    """Return the titles of the games most similar to ``Game_name``.

    Similarity is read from the module-level ``cosine_sim`` matrix; ``indices``
    maps the title to its row and ``Game_names`` maps rows back to titles.

    Args:
        Game_name: Title to look up in ``indices``.
        top_n: Maximum number of recommendations to return. Defaults to 30,
            the value that used to be hard-coded.

    Returns:
        The ``Game_names`` entries of up to ``top_n`` *other* games, ordered
        from most to least similar.

    Raises:
        KeyError: If ``Game_name`` is not present in ``indices``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    idx = indices[Game_name]
    # sorted() is stable, so games with equal scores keep ascending row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the queried game by position rather than assuming it sorts first:
    # its self-similarity can be tied with (or, through floating-point rounding,
    # fall just below) the score of a game with an identical description, in
    # which case slicing off element 0 would drop a real neighbour and
    # recommend the queried game itself.
    game_indices = [row for row, _ in ranked if row != idx][:top_n]
    return Game_names.iloc[game_indices]

In [28]:
# Raw (row position, similarity) pairs for 'Stardew Valley' against every game,
# in row order (not yet sorted by score).
sim_scores = list(enumerate(cosine_sim[indices['Stardew Valley']]))

In [29]:
sim_scores

[(0, 0.27041798225887437),
 (1, 0.02601164529020126),
 (2, 0.9999999999999993),
 (3, 0.17122837753552306),
 (4, 0.17011420500589633),
 (5, 0.0045577077389344295),
 (6, 0.20845961581674116),
 (7, 0.08737991074182282),
 (8, 0.17584121867460784),
 (9, 0.22317060129915173),
 (10, 0.07265496242309076),
 (11, 0.01078517368880706),
 (12, 0.12654791082039507),
 (13, 0.08424122830513286),
 (14, 0.1380830524974362),
 (15, 0.012386312713232142),
 (16, 0.2395166734458289),
 (17, 0.21153332377897927),
 (18, 0.24192958505617468),
 (19, 0.011786593610520384),
 (20, 0.135691731316898),
 (21, 0.1706538442057924),
 (22, 0.1159781270385868),
 (23, 0.13818998949289424),
 (24, 0.1413363850789764),
 (25, 0.05489525023163465),
 (26, 0.21140502241186473),
 (27, 0.10757190866068149),
 (28, 0.3169791208469001),
 (29, 0.06883791307779402),
 (30, 0.10509412838867077),
 (31, 0.28101048794824324),
 (32, 0.13344289315324578),
 (33, 0.17856854265853075),
 (34, 0.27208543260017004),
 (35, 0.10588284894405281),
 (36, 0

# Let's look at the games most similar to 'Stardew Valley'.

# Also, the games most similar to 'NBA 2K20'.

In [30]:
get_recommendations('Stardew Valley').head(10)

36                          Unturned
47                     7 Days to Die
94            Street Warriors Online
28                         Grim Dawn
31    Call of Duty: Modern Warfare 2
34                          Among Us
0                           Portal 2
43              No More Room in Hell
73                         Fallout 4
61                        Brawlhalla
Name: Game_name, dtype: object

In [32]:
get_recommendations('NBA 2K20').head(10)

23     The Elder Scrolls V: Skyrim
50    Mount & Blade II: Bannerlord
58           Monster Hunter: World
20                     Dying Light
24                Human: Fall Flat
91               Heroes & Generals
96                Z1 Battle Royale
40                        Warframe
2                   Stardew Valley
92    Counter-Strike Nexon: Studio
Name: Game_name, dtype: object