### Imports and data loading

In [1]:
import json

# Load the games and keywords JSON files.
# The encoding is passed explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding, which can fail on
# (or silently mis-decode) non-ASCII text on some systems.
with open('igdb_games.json', 'r', encoding='utf-8') as file:
    games = json.load(file)

with open('igdb_keywords.json', 'r', encoding='utf-8') as file:
    keywords = json.load(file)

### Term lists

In [2]:
def _read_terms(path):
    """Read one term-list file and return its lines as a list of strings.

    Each line is stripped of leading/trailing whitespace. Blank lines are
    kept as empty strings, so the result has exactly one entry per line of
    the file.
    """
    with open(path) as file:
        return [line.strip() for line in file]


# One list per culture, read from the matching file in culture_term_lists/.
term_list_CHINA = _read_terms('culture_term_lists/china.txt')
term_list_EGYPT = _read_terms('culture_term_lists/egypt.txt')
term_list_GREECE = _read_terms('culture_term_lists/greece.txt')
term_list_JAPAN = _read_terms('culture_term_lists/japan.txt')
term_list_MIDDLEEAST = _read_terms('culture_term_lists/middle_east.txt')
term_list_NORWAY = _read_terms('culture_term_lists/norway.txt')
term_list_ROME = _read_terms('culture_term_lists/rome.txt')

# Flat list of every term from every culture, concatenated in the order
# below. Duplicates across cultures are not removed.
term_lists = []
for culture_terms in (
    term_list_CHINA,
    term_list_EGYPT,
    term_list_GREECE,
    term_list_JAPAN,
    term_list_MIDDLEEAST,
    term_list_NORWAY,
    term_list_ROME,
):
    term_lists.extend(culture_terms)

### Preprocessing

In [3]:
# Lookup table: keyword id -> keyword name, built from the loaded keyword records
keywords_dict = {entry['id']: entry['name'] for entry in keywords}

# The only characters that are kept when a word is cleaned
letters = "abcdefghijklmnopqrstuvwxyz-"

# Words to exclude even when they appear in a term list (one per line in the file)
with open('ignore_list.txt', 'r') as file:
    ignore_list = [raw_line.strip() for raw_line in file]
# The empty string is always ignored as well
ignore_list += [""]

In [4]:
# DATA PREPROCESSING

# 1. Concatenating storyline, summary, and keywords
# 2. Only allowing lowercase letters + hyphen
# 3. Removing all games that have 0 "culture words" from consideration
# 4. Only keeping games with at least MIN_CULTURE_WORDS culture words

# These are done to prepare the data for sklearn TF-IDF vectorizer

# NOTE(review): step 4 used to read "5 or more culture words" while the code
# tested `> 6` (i.e. 7 or more). The threshold below keeps what the code
# actually did; confirm which value was intended before changing it.
MIN_CULTURE_WORDS = 7

# Set copies of the word lists: each word of each game is looked up in both,
# and a set lookup does not have to scan the whole list.
_TERM_SET = set(term_lists)
_IGNORE_SET = set(ignore_list)

# GAME ID -> TEXT
CLEANED_DATA = {}

for game in games:
    # Collect the raw text pieces available for this game.
    pieces = []

    if 'summary' in game:
        pieces.append(game['summary'])

    if 'storyline' in game:
        pieces.append(game['storyline'])

    if 'keywords' in game:
        for keyword_id in game['keywords']:
            # Whitespace inside a keyword name is removed so that a
            # multi-word keyword becomes a single token.
            pieces.append(''.join(keywords_dict[keyword_id].split()))

    FULL_TEXT = ''.join(piece + " " for piece in pieces)  # FULL GAME TEXT

    # Lowercase every word, drop every character outside `letters`, and keep
    # the result only if it is a culture term that is not on the ignore list.
    culture_words = []
    for word in FULL_TEXT.split():
        cleaned = ''.join(char for char in word.lower() if char in letters)
        if cleaned in _TERM_SET and cleaned not in _IGNORE_SET:
            culture_words.append(cleaned)

    # Space-separated culture words, each followed by one space.
    GAME_TEXT = ''.join(word + " " for word in culture_words)  # FINAL GAME TEXT

    if len(culture_words) >= MIN_CULTURE_WORDS:
        CLEANED_DATA[game['id']] = GAME_TEXT

### Algorithm

In [5]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
from sklearn.feature_extraction.text import TfidfVectorizer

# Parallel lists: entry i of GAME_TEXTS is the cleaned text of game GAME_IDS[i]
GAME_IDS = [game_id for game_id in CLEANED_DATA]
GAME_TEXTS = [CLEANED_DATA[game_id] for game_id in GAME_IDS]

# NOTE(review): with default settings TfidfVectorizer tokenizes on runs of two
# or more word characters, so hyphenated terms kept by the cleaning step would
# be split at the hyphen here - confirm that this is intended.
VECTORIZER = TfidfVectorizer()
TFIDF_MATRIX = VECTORIZER.fit_transform(GAME_TEXTS)

# (number of games, number of distinct tokens)
print(TFIDF_MATRIX.shape)

(532, 237)


In [6]:
from sklearn.cluster import KMeans

# Fit a 12-cluster K-Means model on the TF-IDF matrix with a fixed random_state
KMEANS = KMeans(n_clusters=12, random_state=42).fit(TFIDF_MATRIX)

# One cluster label per row of TFIDF_MATRIX, in the same order as GAME_IDS
LABELS = KMEANS.labels_

# Table pairing every game id with the cluster it was assigned to
_cluster_columns = {'GAME_ID': GAME_IDS, 'CLUSTER #': LABELS}
CLUSTERS = pd.DataFrame(_cluster_columns)

print(CLUSTERS)

     GAME_ID  CLUSTER #
0        436          3
1        445         10
2        438         10
3        499          0
4        290         10
5        454          8
6        443          3
7        432          3
8        318          2
9        324          2
10       550         11
11       552          0
12       898          4
13       836          4
14       649          6
15       527          4
16       549         11
17       551          0
18       912          4
19       606          3
20      1284          9
21      1158          4
22      1253          8
23      1423          3
24      1291         11
25      1164          9
26      1271          0
27      1346          0
28      1979          6
29      1971          0
30      1732          4
31      2059          9
32      2060          9
33      2061          9
34      2062          9
35      2266          1
36      2598          1
37      2863          3
38      2901          9
39      2865          4
40      2525    

In [7]:
# Write the game-to-cluster table to disk, leaving out the DataFrame index
CLUSTERS.to_csv(path_or_buf='clusters.csv', index=False)

Clusters:

0 - greek

1 - japan

2 - norse

3 - japan

4 - egypt

5 - norse

6 - china

7 - japan

8 - norse

9 - japan

10 - rome

11 - greek