# Loading files

In [42]:
import json

with open('igdb_games.json', 'r') as file:
    games = json.load(file)
    
with open('igdb_keywords.json', 'r') as file:
    keywords = json.load(file)
    
with open('igdb_characters.json', 'r') as file:
    characters = json.load(file)

# Checking some stats for the data

In [43]:
# checking fields
# Just for the UI, this will not be used in the classification/clustering algorithm.
games_with_covers = [game for game in games if 'cover' in game]
print(f"There are {len(games_with_covers)} games with covers. This is {len(games_with_covers) / len(games) * 100:.4f}%.")

# The rest of these will maybe be used for the classification/clustering algorithm.
games_with_genres = [game for game in games if 'genres' in game]
print(f"There are {len(games_with_genres)} games with genres. This is {len(games_with_genres) / len(games) * 100:.4f}%.")

games_with_keywords = [game for game in games if 'keywords' in game]
print(f"There are {len(games_with_keywords)} games with keywords. This is {len(games_with_keywords) / len(games) * 100:.4f}%.")

games_with_names = [game for game in games if 'name' in game]
print(f"There are {len(games_with_names)} games with names. This is {len(games_with_names) / len(games) * 100:.4f}%.")

games_with_similar_games = [game for game in games if 'similar_games' in game]
print(f"There are {len(games_with_similar_games)} games with similar games. This is {len(games_with_similar_games) / len(games) * 100:.4f}%.")

games_with_storylines = [game for game in games if 'storyline' in game]
print(f"There are {len(games_with_storylines)} games with storylines. This is {len(games_with_storylines) / len(games) * 100:.4f}%.")

games_with_summaries = [game for game in games if 'summary' in game]
print(f"There are {len(games_with_summaries)} games with summaries. This is {len(games_with_summaries) / len(games) * 100:.4f}%.")

games_with_themes = [game for game in games if 'themes' in game]
print(f"There are {len(games_with_themes)} games with themes. This is {len(games_with_themes) / len(games) * 100:.4f}%.")

# Getting the number of games that have a character
games_with_characters = set()
for character in characters:
    if 'games' in character:
        for game in character['games']:
            games_with_characters.add(game)
        
print(f"There are {len(games_with_characters)} games with characters. This is {len(games_with_characters) / len(characters) * 100:.4f}%.")

characters_with_descriptions = [character for character in characters if 'description' in character]
print(f"There are {len(characters_with_descriptions)} characters with descriptions. This is {len(characters_with_descriptions) / len(characters) * 100:.4f}%.")

characters_with_countries = [character for character in characters if 'country_name' in character]
print(f"There are {len(characters_with_countries)} characters with countries. This is {len(characters_with_countries) / len(characters) * 100:.4f}%.")

characters_with_genders = [character for character in characters if 'gender' in character]
print(f"There are {len(characters_with_genders)} characters with genders. This is {len(characters_with_genders) / len(characters) * 100:.4f}%.")

characters_with_species = [character for character in characters if 'species' in character]
print(f"There are {len(characters_with_species)} characters with species. This is {len(characters_with_species) / len(characters) * 100:.4f}%.")

There are 229887 games with covers. This is 77.8737%.
There are 243023 games with genres. This is 82.3235%.
There are 100034 games with keywords. This is 33.8863%.
There are 295205 games with names. This is 100.0000%.
There are 241584 games with similar games. This is 81.8360%.
There are 26266 games with storylines. This is 8.8975%.
There are 253330 games with summaries. This is 85.8149%.
There are 163230 games with themes. This is 55.2938%.
There are 980 games with characters. This is 5.9387%.
There are 201 characters with descriptions. This is 1.2180%.
There are 0 characters with countries. This is 0.0000%.
There are 511 characters with genders. This is 3.0966%.
There are 510 characters with species. This is 3.0905%.


# TF-IDF for summaries + storylines

### Combining storylines and summaries before calculating TF-IDF as this makes sense logically

In [44]:
import math
from collections import deque
# text: game id -> summary + storyline
game_text = {}
# tfidfs: game id -> term -> tf-idf
tfidfs = {}
# ndwet: term -> number of documents with each term (ndwet)
ndwet = {}

letters = "abcdefghijklmnopqrstuvwxyz-"
ignore_list = []

with open('ignore_list.txt', 'r') as file:
    for line in file:
        ignore_list.append(line.strip())
ignore_list.append("")
print(ignore_list)

max_tfidf = 0
total_tfidf = 0
n_tfidf = 0

x = 20
n_greater = 0
queue = deque()

# combining storylines and summaries into a single "document" for each game
for game in games:
    text = ""
    if 'storyline' in game:
        text += game['storyline'] + " "
    if 'summary' in game:
        text += game['summary'] + " "
    if 'storyline' in game or 'summary' in game:
        game_text[game['id']] = text
        
# lowercasing and removing unallowed characters and words
for id in game_text:
    cleaned_text = []
    text = game_text[id].split()
    for i in range(len(text)):
        text[i] = text[i].lower()
        text[i] = ''.join(char for char in text[i] if char in letters)
        
        if text[i] not in ignore_list:
            cleaned_text.append(text[i])

    game_text[id] = cleaned_text

# getting number of documents with each term
for id in game_text:
    text = list(set(game_text[id]))

    for term in text:
        if term not in ndwet:
            ndwet[term] = 1
        else:
            ndwet[term] += 1

# TF calculation, then TF-IDF calculation
for id in game_text:
    tfidfs[id] = {}
    
    text = game_text[id]
    
    for term in text:
        if term not in tfidfs[id]:
            tfidfs[id][term] = 1
        else:
            tfidfs[id][term] += 1
            
    for term in tfidfs[id]:
        # TF-IDF
        tfidfs[id][term] = (math.log(tfidfs[id][term]) + 1) * (math.log((len(game_text) + 1) / (ndwet[term] + 1)) + 1)
        total_tfidf += tfidfs[id][term]
        n_tfidf += 1
        if tfidfs[id][term] > max_tfidf:
            max_tfidf = tfidfs[id][term]
            queue.append(term)
            if len(queue) == 10:
                queue.pop()
        if tfidfs[id][term] > x:
            n_greater += 1

print(f"max terms: {queue}")
print(f"max tf-idf: {max_tfidf:.4f}")
print(f"avg tf-idf: {total_tfidf / n_tfidf:.4f}")
print(f"num tf-idf: {n_tfidf}")
print(f"num tf-idf > x = {x}: {n_greater}; {n_greater / n_tfidf * 100 :.4f}%")

['digital distribution', 'steam', 'steam achievements', 'steam trading cards', 'steam workshop', 'mod support', 'untagged', 'the', 'to', 'of', 'and', 'a', 'in', 'is', 'as', 'you', 'your', 'with', 'for', 'on', 'an', 'by', 'from', 'new', 'that', 'this', 'it', 'are', '']
max terms: deque(['ballblazer', 'scalar', 'aguire', 'powell', 'kerrigan', 'fei', 'methus', 'stocke', 'hendricks'])
max tf-idf: 65.3755
avg tf-idf: 6.6433
num tf-idf: 8365004
num tf-idf > x = 20: 56935; 0.6806%


# Tagging games with keywords and TF-IDF

In [45]:
# tagged_games: game id -> term/keyword -> TF-IDF/max TF-IDF (?)
tagged_games = {}
# keywords_dict: keyword id -> keyword(s)
keywords_dict = {}

for keyword in keywords:
    keywords_dict[keyword['id']] = keyword['name']

# Tagging with storyline + summary TF-IDFs, then keywords (overwriting if necessary)
for game in games:
    game_id = game['id']
    
    if game_id in game_text:
        tagged_games[game_id] = tfidfs[game_id]
        
    if 'keywords' in game:
        if game_id not in tagged_games:
            tagged_games[game_id] = {}
            
        keyword_list = game['keywords']
        for keyword_id in keyword_list:
            tagged_games[game_id][keywords_dict[keyword_id]] = 20

# Term lists

In [46]:
term_lists = []
term_list_CHINA = []
term_list_EGYPT = []
term_list_GREECE = []
term_list_JAPAN = []
term_list_MIDDLEEAST = []
term_list_NORWAY = []
term_list_ROME = []

with open('culture_term_lists/china.txt') as file:
    for line in file:
        term_list_CHINA.append(line.strip())

with open('culture_term_lists/egypt.txt') as file:
    for line in file:
        term_list_EGYPT.append(line.strip())
        
with open('culture_term_lists/greece.txt') as file:
    for line in file:
        term_list_GREECE.append(line.strip())
        
with open('culture_term_lists/japan.txt') as file:
    for line in file:
        term_list_JAPAN.append(line.strip())
        
with open('culture_term_lists/middle_east.txt') as file:
    for line in file:
        term_list_MIDDLEEAST.append(line.strip())
        
with open('culture_term_lists/norway.txt') as file:
    for line in file:
        term_list_NORWAY.append(line.strip())
        
with open('culture_term_lists/rome.txt') as file:
    for line in file:
        term_list_ROME.append(line.strip())
        
term_lists.append(term_list_CHINA)
term_lists.append(term_list_EGYPT)
term_lists.append(term_list_GREECE)
term_lists.append(term_list_JAPAN)
term_lists.append(term_list_MIDDLEEAST)
term_lists.append(term_list_NORWAY)
term_lists.append(term_list_ROME)

# term_lists.extend(term_list_CHINA)
# term_lists.extend(term_list_EGYPT)
# term_lists.extend(term_list_GREECE)
# term_lists.extend(term_list_JAPAN)
# term_lists.extend(term_list_MIDDLEEAST)
# term_lists.extend(term_list_NORWAY)
# term_lists.extend(term_list_ROME)

# Labelling/classifying games with cultures they might be related to

In [47]:
# game_categories: game id -> categories [(category, category value/weight)]
game_categories = {}
# C, E, G, J, M, N, R
counts = [0 for i in range(7)]

for id in tagged_games:
    game_categories[id] = []
    # C, E, G, J, M, N, R
    values = [0 for i in range(7)]
    for term in tagged_games[id]:
        for i in range(7):
            if term in term_lists[i]:
                values[i] += tagged_games[id][term]
    
    for i in range(7):
        if values[i] >= 20:
            # include value for ranking? later
            game_categories[id].append((i, values[i]))
            counts[i] += 1

print(f"China: {counts[0]}")
print(f"Egypt: {counts[1]}")
print(f"Greece: {counts[2]}")
print(f"Japan: {counts[3]}")
print(f"Middle east: {counts[4]}")
print(f"Norway: {counts[5]}")
print(f"Rome: {counts[6]}")

China: 336
Egypt: 548
Greece: 297
Japan: 2048
Middle east: 153
Norway: 271
Rome: 158
