In [1]:
!python --version

Python 3.7.4


In [2]:
import json
import numpy as np
from tqdm import tqdm

import nltk
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

from gensim.models import KeyedVectors
from scipy import spatial



In [3]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
with open('gamestop/games.txt','r') as f:
    l = json.load(f)

In [5]:
len(l)

1049

In [6]:
stopwords_eng = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def process_text(text):
    text = text.replace("\n"," ").replace("\r"," ")
    text = re.sub(r'“', " '' ", text)
    text = re.sub(r'”', " '' ", text)
    text = re.sub(r'"', " '' ", text)
    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
    
    punc_list = '!"#$%()*+,-./:;<=>?@^_{|}~'
    t = str.maketrans(dict.fromkeys(punc_list," "))
    text = text.translate(t)
    
    t = str.maketrans(dict.fromkeys("'`",""))
    text = text.translate(t)
    
    tokens = regexp_tokenize(text,pattern='\s+',gaps=True)
    cleaned_tokens = []
    
    for t in tokens:
        if t not in stopwords_eng:
            l = lemmatizer.lemmatize(t)
            cleaned_tokens.append(l)
    
    return cleaned_tokens

In [7]:
process_text('Immerse yourself in “The World,” a MMORPG, and find out')

['Immerse', 'The', 'World', 'MMORPG', 'find']

In [8]:
%timeit process_text("Technology News | afr.com Business backs Malcolm Turnbull's 'fresh' look at reform | afr.com")

114 µs ± 5.53 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [9]:
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin",binary=True)

In [10]:
def get_vec(word):
    try:
        return model[word]
    except:
        try:
            return model[int(word)]
        except:
            return np.zeros(300)

In [11]:
# we need name,genres,overview

game_vector = {}
for i in tqdm(l):
    text = i['Name'] + i['Genres'] + i['Overview']
    tokens = process_text(text)
    try:
        vector = sum([get_vec(t) for t in tokens]).tolist()

        game_vector[i['Name']] = vector
    except:
        game_vector[i['Name']] = np.zeros(300)
        
with open('gamestop/game_vector.txt','w') as f:
    json.dump(game_vector,f)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1049/1049 [00:01<00:00, 896.45it/s]


In [12]:
def get_sim(a,b):
    return 1-spatial.distance.cosine(game_vector[a],game_vector[b])

def get_5_sim(to_search):
    sim_measure = []
    for i in game_vector.keys():
        try:
            sim_measure.append(get_sim(to_search,i))
        except:
            sim_measure.append(0)
    sorted_sim_measure = np.argsort(sim_measure)[-6:-1][::-1]
    sim_games = [(l[s]['Name'],sim_measure[s]) for s in sorted_sim_measure]
    return sim_games

In [13]:
game_vector['FIFA 18']

[0.30756378173828125,
 0.37005615234375,
 2.43231201171875,
 -0.515380859375,
 -0.88629150390625,
 -0.2096710205078125,
 1.330078125,
 -2.1842193603515625,
 0.12841796875,
 1.4002685546875,
 -1.5438919067382812,
 -1.0692138671875,
 0.7731475830078125,
 -0.093994140625,
 -1.5003929138183594,
 1.2701416015625,
 1.2314453125,
 1.2930908203125,
 1.257659912109375,
 -1.141845703125,
 1.035888671875,
 1.6077880859375,
 -1.138671875,
 -1.59375,
 0.8523902893066406,
 -1.0439453125,
 -0.6619873046875,
 0.7646045684814453,
 2.3759765625,
 0.252685546875,
 -0.76611328125,
 -0.194091796875,
 -0.605560302734375,
 -0.31414794921875,
 -0.139404296875,
 -0.7406005859375,
 -0.0789794921875,
 -0.6356201171875,
 1.0743408203125,
 1.810882568359375,
 1.68878173828125,
 -0.811767578125,
 0.7783279418945312,
 1.646728515625,
 0.44989013671875,
 -1.635986328125,
 0.47576904296875,
 -0.5446624755859375,
 1.053955078125,
 2.159423828125,
 -0.58477783203125,
 1.4757080078125,
 0.24676513671875,
 -1.198486328125

In [14]:
g = get_5_sim('FIFA 18')
g

[('Need for Speed: Rivals', 0.7932066089120169),
 ('FIFA 17', 0.7908166863032599),
 ('FIFA 20', 0.7853443712550131),
 ('FIFA 15', 0.7814971174335752),
 ('FIFA 16', 0.7799982012665069)]

In [15]:
g = get_5_sim('Call of Duty: WWII')
g

[("The Witcher 3: Wild Hunt Collector's Edition", 0.8499542475722576),
 ('For Honor', 0.8394135023040565),
 ('Call of Duty 4: Modern Warfare Remastered', 0.8328136231669478),
 ('Cyberpunk 2077', 0.8286820375723332),
 ('Call of Duty: Black Ops IIII', 0.8282971809729374)]

In [16]:
g = get_5_sim('Far Cry Primal')
g

[("Assassin's Creed: Odyssey", 0.9136923208047454),
 ('Days Gone', 0.9099964073663812),
 ('Bullet Girls Phantasia', 0.8876354161516203),
 ("Don't Starve", 0.8873280244616404),
 ("Assassin's Creed Chronicles: Russia", 0.8871161420867303)]

Maybe we can use it as an api call for the front end? or just return an output. We can save game_vector in a file for smoother processing.

In [19]:
g[0][0]

"Assassin's Creed: Odyssey"