In [1]:
import os

# The notebook is a subfolder, must go back to parent folder
# Note that if if cell is executed twice, the remaining cells
# will fail because the current working directory will be wrong
os.chdir("..")

In [17]:
from src.vectorize import EmbeddingsModel
from tqdm import tqdm
import numpy as np
from pattern.text.en import parse
import pandas as pd
import jsbeautifier
from src.scrape import build_dataframe, normalize_csv, is_a_game, unique_games, get_review, games_abbreviation, look_for_games, get_user_ids 

# Part I: Identify a few games

With word embeddings, a lot of games can be found automatically. 
However, before doing so, we need to identify some games first (we will assume we know none of the games that are being reviewed).

In [18]:
if scrape_games:
    is_a_game()
    unique_games()
    games_abbreviation()
    look_for_games()

Finding games...


# Part II: Create a dataframe

All the relevant information within the json file will be organised into a well-organised pandas DataFrame to make all subsequent steps easier.

The Dataframe will be saved as a csv file which can be reloaded later to accelerate the process. 

The following step needs only to be done once per corpus

In [19]:
make_csv = True
if make_csv:
    build_dataframe()
    normalize_csv()

  0%|          | 0/25799 [00:00<?, ?it/s]

Building the dataframe in extracted/dataframes...


100%|██████████| 25799/25799 [14:18<00:00, 30.05it/s]  



Normalizing csv...
Normalizing text...


  0%|          | 0/466544 [00:00<?, ?it/s]

Creating dictionary for symspell
Creating Dictionary from array...


 60%|██████    | 280980/466544 [03:18<02:11, 1415.70it/s]


KeyboardInterrupt: 

### Find The most mentionned games among the ones that were found

In [20]:
scraped_games = pd.read_table("extracted/lists/game_lines.csv", sep='[')
scraped_games.columns = ["Game", "mentions"]
games = np.array(scraped_games["Game"])
games = np.array([g.strip(",") for g in games])
scraped_games["Game"] = games

new_df = pd.DataFrame(index=games, data=np.array([None for _ in games]).reshape(-1, 1), columns=["n_mentions"])
for i, key in enumerate(range(len(scraped_games["Game"]))):
    new_df["n_mentions"][i] = len(scraped_games["mentions"][i].split(","))

In [21]:
# There is some noise: The Game, Well There, The Launcher et Now 
# Insurgency Really is about a game called Insurgency

new_df = new_df.sort_values(by=['n_mentions'], ascending=False)
new_df[:20]

Unnamed: 0,n_mentions
Team Fortress 2,251
Nuclear Throne,232
Dont Starve,86
Dota 2,79
Insurgency Really,65
Portal,51
Sonic Generations,49
Counter Strike Global Offensive,43
Well There,36
The Culling,27


# Part III: Identify word embeddings (word2vec)


In [24]:
my_model = EmbeddingsModel()
train = False
direc = "saved_models/"
checkpoint = "gensim_model_pretrainedGoogleNews-vectors-negative300.bin.model"
if train:
    my_model.train("steam_australia_norm_w2v.csv", checkpoint=direc + "GoogleNews-vectors-negative300.bin")
    my_model.save_model(checkpoint)
my_model.load_model(direc + checkpoint)

2019-12-17 17:21:38,169 : INFO : loading Word2Vec object from saved_models/gensim_model_pretrainedGoogleNews-vectors-negative300.bin.model


FileNotFoundError: [Errno 2] No such file or directory: 'saved_models/gensim_model_pretrainedGoogleNews-vectors-negative300.bin.model'

In [23]:
tops_games = {
    'Borderlands': None,
    'borderlands': None,
    'FFIV': None,
    'fortress': None,
}
for top in tops_games:
    tops_games[top] = my_model.model.most_similar(top, topn=100)
    tops_games[top] = pd.DataFrame(tops_games[top], columns=["word", "similarity"])

NameError: name 'my_model' is not defined

In [250]:
tops_games['borderlands'][:20]

Unnamed: 0,word,similarity
0,bioshock,0.470961
1,mexicana,0.405006
2,indium_enriched,0.402543
3,sangre,0.398689
4,creed,0.394627
5,portal,0.39319
6,rebellion,0.391285
7,bl,0.390408
8,original,0.389454
9,borderland,0.383267


In [256]:
my_model.model.most_similar("Bioshock", topn=100)

[('BioShock', 0.8794984817504883),
 ('Killzone', 0.7690650820732117),
 ('GoW', 0.7541419267654419),
 ('Crysis', 0.7426438927650452),
 ('Metroid_Prime', 0.7352886199951172),
 ('Timesplitters', 0.7320432066917419),
 ('Mass_Effect', 0.7320175170898438),
 ('Mario_Galaxy', 0.7282251119613647),
 ('Assassin_Creed', 0.7241993546485901),
 ('GTAIV', 0.7229788303375244),
 ('Modern_Warfare', 0.7205285429954529),
 ('GTA4', 0.7203500270843506),
 ('Dead_Space', 0.7179621458053589),
 ('Darksiders', 0.7170871496200562),
 ('Alien_Breed', 0.7164978981018066),
 ('Dynasty_Warriors_Gundam', 0.7117792963981628),
 ('Bungie_Halo', 0.7100509405136108),
 ('Far_Cry', 0.7091567516326904),
 ('Resident_Evil', 0.7088419198989868),
 ('Geometry_Wars', 0.705782949924469),
 ('Twilight_Princess', 0.701901912689209),
 ('GTA_IV', 0.7005239725112915),
 ('Modern_Warfare_2', 0.6992732882499695),
 ('Ghost_Recon_Advanced_Warfighter', 0.6985984444618225),
 ('Dragon_Age', 0.6985704898834229),
 ('PixelJunk_Shooter', 0.6982147097587

In [248]:
my_model.model.most_similar("assassin_creed_brotherhood", topn=100)

KeyError: "word 'assassin_creed_brotherhood' not in vocabulary"

In [106]:
tops_nn = {
    'facets': None,
    'facet': None,
    'game': None,
    'games': None,
    'mode': None,
    'modes': None,
    'adjective': None,
    'collection': None,
    'descriptor': None,
    'description': None,
    'gametype': None,  # GOOD ONE FOR 4- identify types of games
    'classification': None,
    'experience': None,
    'category': None,
    'genre': None,
    'genres': None,
    'gametypes': None,
    'gamemode': None,
    'gamemodes': None,
    'multiplayer': None,
    'characteristics': None,
    'gameplay': None,
    'aspects': None,
    'features': None,   # GOOD ONE FOR 2- identify aspects (features) of a game
    'feature': None,
    'mechanics': None,
    'style': None,
    'cool': None,
    'awesome': None,
    'boring': None,
    'soundtrack': None,
    'elements': None,
    'storyline': None,
    'graphics': None,
    'graphic': None,
    'developer': None,
    'developers': None,
    'artwork': None,
    'music': None,
    'sound': None,
}
for top in tops_nn:
    tops_nn[top] = my_model.model.most_similar(top, topn=100)
    top_words = np.array(tops_nn[top])[:, 0].tolist()
    scores = np.array(tops_nn[top])[:, 1].tolist()
    top_words = [parse(str(k) + "\n") for k in top_words]
    top_nn_indices = [i for i, x in enumerate(top_words) if x.split("/")[1] in ["NN", "NNS"]]

    top_words = np.concatenate((np.array([top_words[i].split("/")[0] for i in top_nn_indices]).reshape(-1, 1), 
                                np.array([np.round(float(scores[i]), 3) for i in top_nn_indices]).reshape(-1, 1)), 1)
    tops_nn[top] = pd.DataFrame(top_words, columns=["word", "similarity"])

# Identify Games

In [112]:
tops_nn['mode'][:20]

Unnamed: 0,word,similarity
0,modes,0.447
1,freeplay_mode,0.389
2,Modes,0.386
3,างๆมากมายและย,0.383
4,alligns,0.382
5,option,0.376
6,difficulty,0.376
7,mission,0.373
8,difficulties,0.369
9,gamemode,0.366


In [107]:
tops_nn['mechanics'][:20]

Unnamed: 0,word,similarity
0,elements,0.477
1,mechanic,0.459
2,concepts,0.417
3,dynamics,0.415
4,controls,0.409
5,features,0.404
6,aspects,0.402
7,visuals,0.402
8,machanics,0.399
9,ideas,0.392


In [104]:
tops_nn['features'][:20]

Unnamed: 0,word,similarity
0,elements,0.53
1,aspects,0.499
2,things,0.449
3,functionality,0.442
4,possiblities,0.436
5,functionalities,0.436
6,concepts,0.432
7,gadgets,0.432
8,ideas,0.427
9,biomes,0.426


In [103]:
tops_nn['gametypes'][:30]

Unnamed: 0,word,similarity
0,pinball_tables,0.523
1,modes,0.494
2,gametype,0.469
3,gameplay_mechanics,0.467
4,variations,0.461
5,addicting_gameplay,0.461
6,CG_cutscenes,0.46
7,player_splitscreen,0.459
8,Overlord_Minions,0.455
9,deathmatch_mode,0.455


In [101]:
tops_nn['artwork'][:10]

Unnamed: 0,word,similarity
0,landscapes,0.533
1,abstract_compositions,0.523
2,sountrack,0.517
3,presentation,0.515
4,narration,0.504
5,backdrops,0.501
6,monochromatic_palette,0.499
7,pixelovej,0.499
8,anime_cutscenes,0.498
9,tenhle,0.496


In [102]:
tops_nn['music'][:10]

Unnamed: 0,word,similarity
0,soundtrack,0.577
1,tunes,0.521
2,acoustic_instrumentation,0.505
3,sonic_landscapes,0.497
4,soothing_melodies,0.491
5,sound,0.49
6,layered_vocals,0.485
7,song,0.483
8,soothing_melody,0.48
9,lilting_melody,0.478


In [93]:
tops_nn['developers'][:10]

Unnamed: 0,word,similarity
0,devs,0.674
1,developer,0.552
2,dev,0.546
3,creators,0.49
4,makers,0.397
5,throutle,0.393
6,people,0.391
7,Developers,0.389
8,valve,0.386
9,vilans,0.385


In [85]:
tops_nn['graphics'][:10]

Unnamed: 0,word,similarity
0,graphics,0.56
1,pixellation,0.442
2,pixilation,0.437
3,alpha_blending,0.437
4,JPEG_compression,0.431
5,bitmap_images,0.428
6,font_smoothing,0.42
7,polygon_counts,0.414
8,visuals,0.412
9,teacher_Rizwanur_Rehman,0.411


In [83]:
tops_nn['storyline'][:10]

Unnamed: 0,word,similarity
0,story,0.62
1,plot,0.539
2,subplots,0.481
3,backstory,0.48
4,soundtrack,0.475
5,gameplay,0.448
6,storylines,0.442
7,subplot,0.439
8,mythos,0.438
9,artwork,0.423


In [79]:
tops_nn['soundtrack'][:10]

Unnamed: 0,word,similarity
0,music,0.577
1,vocals,0.515
2,artwork,0.488
3,storyline,0.475
4,atmosphere,0.467
5,falsetto_vocals,0.467
6,cinematography,0.465
7,inventive_choreography,0.461
8,lush_harmonies,0.46
9,ambient_textures,0.459


In [63]:
tops_nn['gametype'][:20]
# 집짓는다 google translate: Build a house in korean

Unnamed: 0,word,similarity
0,Singleplayer,0.586
1,inexistente,0.582
2,deathmatch_mode,0.572
3,집짓는다,0.562
4,AI_bots,0.558
5,branching_paths,0.557
6,amenos,0.556
7,서버로,0.552
8,dungeon_crawl,0.546
9,Deathmatch,0.545


In [64]:
tops_nn['multiplayer'][:20]

Unnamed: 0,word,similarity
0,singleplayer,0.442
1,multiplayer_mode,0.437
2,AI_bots,0.435
3,matchmaking,0.422
4,gameplay,0.417
5,resalution,0.414
6,game,0.41
7,coop,0.399
8,configs,0.397
9,iWork.com,0.395


In [65]:
tops_nn['mode'][:20]

Unnamed: 0,word,similarity
0,modes,0.447
1,freeplay_mode,0.389
2,Modes,0.386
3,างๆมากมายและย,0.383
4,alligns,0.382
5,option,0.376
6,difficulty,0.376
7,mission,0.373
8,difficulties,0.369
9,gamemode,0.366


In [66]:
tops_nn['modes'][:20]

Unnamed: 0,word,similarity
0,gametypes,0.494
1,Multiplayer_modes,0.448
2,mode,0.447
3,maps,0.443
4,multiplayer_modes,0.436
5,classes,0.433
6,branching_paths,0.433
7,neste,0.43
8,cooperative_multiplayer_mode,0.428
9,freeplay_mode,0.425


In [68]:
tops_nn['gamemode'][:20]

Unnamed: 0,word,similarity
0,deathmatch,0.439
1,severs,0.427
2,continent,0.411
3,channels,0.411
4,versus,0.403
5,class,0.396
6,terrorist,0.394
7,battleblock,0.387
8,deathmatches,0.381
9,şansımız,0.378


In [13]:
vocab = list(my_model.get_vocab().keys())
w = open(direc + "vocab_" + checkpoint, "w+")
with tqdm(total=len(vocab)) as pbar:
    for v in vocab:
        w.write(v + "\n")
        pbar.update(1)
w.close()

#w = open("vocab_word2vec_POS.txt", "w+")
#with tqdm(total=len(vocab)) as pbar:
#    for v in vocab:
#        w.write(tag(v) + "\n")
#        pbar.update(1)
#w.close()

w = open(checkpoint + "_parse.txt", "w+")
with tqdm(total=len(vocab)) as pbar:
    for v in vocab:
        w.write(parse(str(v) + "\n") + "\n")
        pbar.update(1)
w.close()


# These names are in game_names, obtained in draft.py from taking words before the pattern "is a * game"
# The list is small and noisy. We will take a single popular game,
# top100 = my_model.model.most_similar('Borderlands', topn=100)


100%|██████████| 3048327/3048327 [00:04<00:00, 659277.07it/s]
  1%|          | 21959/3048327 [00:09<21:18, 2367.45it/s]


KeyboardInterrupt: 

In [77]:
tops_nn['elements'][:20]

Unnamed: 0,word,similarity
0,features,0.53
1,aspects,0.502
2,mechanics,0.477
3,techniques,0.472
4,ideas,0.457
5,поиграл,0.435
6,concepts,0.433
7,things,0.421
8,influences,0.419
9,วไปเเน,0.412


In [17]:
tops = {
    'Borderlands': None,
    'facets': None,
    'adjective': None,
    'type': None,
    'multiplayer': None,
    'characteristics': None,
    'gameplay': None,   # Good example for 3- find adjectives (or other information) characteristics of a facet of a game
    'god': None,
    'FF7': None,
    'rts': None,
}
for top in tops:
    tops[top] = my_model.model.most_similar(top, topn=1000)
    tops[top] = np.array(tops[top])[:, 0].tolist()
    tops[top] = [parse(k) for k in tops[top]]
    tops[top] = [x.split("/")[0] for x in tops[top] if x.split("/")[1] in ["JJ", "JJR", "JJS"]]




In [18]:
tops["gameplay"]

['interesting',
 'fabletown',
 'audio',
 'undepowered',
 'suggestive',
 'fortissimos',
 'awesomemusic',
 'Densely_packed',
 'satiny',
 'visual',
 'textural',
 'Schubertian',
 'delicate_filigree',
 'narrative',
 'fanlike',
 'dull',
 'gameplay_mechanic',
 'floral_motifs',
 'flowerlike',
 'varied',
 'precise',
 'fried_eggplant',
 'objets_de',
 'violenced',
 'lace_applique_floral',
 'enantiomeric',
 'Service_Medal_Noncommissioned',
 'fernlike',
 'good',
 'fluted_columns',
 'digressive',
 'Lutoslawski',
 'inaccurate',
 'organzas',
 'overworld_map',
 'flimsy',
 'orangey_yellow',
 'Mindquarry_Mindshare_MindTouch_Molecular',
 'honeycomb_mesh',
 'grated_ginger',
 'strummed_acoustic',
 'unispired',
 'Lisztian',
 'crumbled_feta_cheese',
 'orange_zest',
 'Hedgehog_Sonic',
 'mashed_garlic',
 'custardy',
 'roasted_eggplant',
 'Pradeep_Unni_senior',
 'concentric_circular',
 'Ravel_La_Valse',
 'timbral',
 'intricate_geometric',
 'fontina_cheese',
 'geometrical',
 'fresh_basil',
 'clunky',
 'meticulous

In [19]:
tops["multiplayer"]

['online',
 'iPhoneDrive',
 'möglich',
 'onze',
 'sensing_controller',
 'cooperative',
 'LifeBook_MH',
 'anonymising',
 'VIERA_Cast',
 'fullscreen',
 'ZumoCast',
 'MindAlign',
 'FormoPublish',
 'Onlive',
 'L#D',
 'PacketCable_compliant',
 'i5&slash;OS_V#R',
 'Wordscraper',
 'neoTouch',
 'BuzzCast',
 'QuickTime',
 'uniprocessor',
 'IP_Multicast',
 'Postgres_Plus_Advanced',
 'Microsoft_SkyDrive',
 'Deutsche_Bourse',
 'Tomos_Livingstone_Western',
 'DNS_prefetching',
 'MSN_Virtual',
 'v',
 '1GB_Nvidia_GeForce',
 'Orc_Hosted',
 'Moveable_Type',
 'eCosCentric',
 'wаtch_NFL_Live',
 'competitive',
 'chanted_Jahvid',
 'ausschließlich',
 'Ruzan_Harutyunyan',
 'oak_savannas',
 'memcached',
 'VIERA_CAST_enabled',
 'Omnidrive',
 'Windows_Live_Mesh',
 'instant_messenger_buddy',
 'NAT_firewall',
 'Wii_Speak',
 'CD-RW&slash;DVD_combo_drive',
 '1Gbyte',
 'Madeleine_Brindley_Western']

In [20]:
display_closestwords_pcascatterplot(my_model.model, "Borderlands")
display_closestwords_pcascatterplot(my_model.model, "Portals")
display_closestwords_pcascatterplot(my_model.model, "FFVII")

NameError: name 'display_closestwords_pcascatterplot' is not defined