In [1]:
import os

# The notebook is a subfolder, must go back to parent folder
# Note that if if cell is executed twice, the remaining cells
# will fail because the current working directory will be wrong
os.chdir("..")

In [2]:
from src.vectorize import EmbeddingsModel
from tqdm import tqdm
import numpy as np
from pattern.text.en import parse
import pandas as pd
import jsbeautifier
from src.scrape import build_dataframe, normalize_csv, is_a_game, unique_games, get_review, games_abbreviation, look_for_games, get_user_ids 

# Part I: Identify a few games

With word embeddings, a lot of games can be found automatically. 
However, before doing so, we need to identify some games first (we will assume we know none of the games that are being reviewed).

In [3]:
scrape_games = False
if scrape_games:
    is_a_game()
    unique_games()
    games_abbreviation()
    look_for_games()

# Part II: Create a dataframe

All the relevant information within the json file will be organised into a well-organised pandas DataFrame to make all subsequent steps easier.

The Dataframe will be saved as a csv file which can be reloaded later to accelerate the process. 

The following step needs only to be done once per corpus

In [4]:
make_csv = False
if make_csv:
    build_dataframe()
    normalize_csv()

### Find The most mentionned games among the ones that were found

In [5]:
scraped_games = pd.read_table("extracted/lists/game_lines.csv", sep='[')
scraped_games.columns = ["Game", "mentions"]
games = np.array(scraped_games["Game"])
games = np.array([g.strip(",") for g in games])
scraped_games["Game"] = games

new_df = pd.DataFrame(index=games, data=np.array([None for _ in games]).reshape(-1, 1), columns=["n_mentions"])
for i, key in enumerate(range(len(scraped_games["Game"]))):
    new_df["n_mentions"][i] = len(scraped_games["mentions"][i].split(","))
# There is some noise: The Game, Well There, The Launcher et Now 
# Insurgency Really is about a game called Insurgency

new_df = new_df.sort_values(by=['n_mentions'], ascending=False)
new_df

Unnamed: 0,n_mentions
Team Fortress 2,251
Nuclear Throne,232
Dont Starve,86
Dota 2,79
Insurgency Really,65
Portal,51
Sonic Generations,49
Counter Strike Global Offensive,43
Well There,36
The Culling,27


# Part III: Identify word embeddings (word2vec)


In [8]:
my_model = EmbeddingsModel()
train = False
direc = "saved_models/gensim_model"
model_name = "pretrained_none"
checkpoint_fname = "steam_australia_norm_w2v"
chechpoint_path = "{}/{}".format(direc, model_name)
if train:
    my_model.train(checkpoint_fname + ".csv", checkpoint=chechpoint_path)
    my_model.save_model(checkpoint)
my_model.load_model("{}/{}.model".format(chechpoint_path, checkpoint_fname))

2019-12-17 21:40:03,553 : INFO : loading Word2Vec object from saved_models/gensim_model/pretrained_none/steam_australia_norm_w2v.model
2019-12-17 21:40:03,691 : INFO : loading wv recursively from saved_models/gensim_model/pretrained_none/steam_australia_norm_w2v.model.wv.* with mmap=None
2019-12-17 21:40:03,692 : INFO : loading vectors from saved_models/gensim_model/pretrained_none/steam_australia_norm_w2v.model.wv.vectors.npy with mmap=None
2019-12-17 21:40:03,860 : INFO : setting ignored attribute vectors_norm to None
2019-12-17 21:40:03,861 : INFO : loading vocabulary recursively from saved_models/gensim_model/pretrained_none/steam_australia_norm_w2v.model.vocabulary.* with mmap=None
2019-12-17 21:40:03,862 : INFO : loading trainables recursively from saved_models/gensim_model/pretrained_none/steam_australia_norm_w2v.model.trainables.* with mmap=None
2019-12-17 21:40:03,864 : INFO : loading syn1neg from saved_models/gensim_model/pretrained_none/steam_australia_norm_w2v.model.trainab

In [12]:
tops_games = {
    'Team_Fortress': None,
    'Dont_Starve': None,
}
for top in tops_games:
    tops_games[top] = my_model.model.most_similar(top, topn=100)
    tops_games[top] = pd.DataFrame(tops_games[top], columns=["word", "similarity"])

In [13]:
tops_games['Dont_Starve'][:20]

Unnamed: 0,word,similarity
0,exhilarations,0.410287
1,neatherd,0.394578
2,etamine,0.389226
3,tertians,0.372929
4,anicca,0.36615
5,pencils,0.33801
6,Dynasty_Warriors,0.324781
7,gavelock,0.320265
8,Tenpin_Bowling,0.320146
9,okey,0.315712


In [15]:
tops_games['Team_Fortress'][:20]

Unnamed: 0,word,similarity
0,preciously,0.480343
1,atf,0.416674
2,sandburs,0.384585
3,sweepback,0.376213
4,clank,0.370969
5,rea,0.366119
6,waymart,0.363605
7,machineries,0.3612
8,wackiness,0.358608
9,meshier,0.357917


In [14]:
my_model.model.most_similar("dungeon_crawl", topn=100)

KeyError: "word 'dungeon_crawl' not in vocabulary"

In [248]:
my_model.model.most_similar("assassin_creed_brotherhood", topn=100)

KeyError: "word 'assassin_creed_brotherhood' not in vocabulary"

In [16]:
tops_nn = {
    'facets': None,
    'facet': None,
    'game': None,
    'games': None,
    'mode': None,
    'modes': None,
    'adjective': None,
    'collection': None,
    'descriptor': None,
    'description': None,
    'gametype': None,  # GOOD ONE FOR 4- identify types of games
    'classification': None,
    'experience': None,
    'category': None,
    'genre': None,
    'genres': None,
    'gametypes': None,
    'multiplayer': None,
    'characteristics': None,
    'gameplay': None,
    'aspects': None,
    'features': None,   # GOOD ONE FOR 2- identify aspects (features) of a game
    'feature': None,
    'mechanics': None,
    'style': None,
    'cool': None,
    'awesome': None,
    'boring': None,
    'soundtrack': None,
    'elements': None,
    'storyline': None,
    'graphics': None,
    'graphic': None,
    'developer': None,
    'developers': None,
    'artwork': None,
    'music': None,
    'sound': None,
}
for top in tops_nn:
    tops_nn[top] = my_model.model.most_similar(top, topn=100)
    top_words = np.array(tops_nn[top])[:, 0].tolist()
    scores = np.array(tops_nn[top])[:, 1].tolist()
    top_words = [parse(str(k) + "\n") for k in top_words]
    top_nn_indices = [i for i, x in enumerate(top_words) if x.split("/")[1] in ["NN", "NNS"] and x.split("/")[0].lower() == x.split("/")[0]]

    top_words = np.concatenate((np.array([top_words[i].split("/")[0] for i in top_nn_indices]).reshape(-1, 1), 
                                np.array([np.round(float(scores[i]), 3) for i in top_nn_indices]).reshape(-1, 1)), 1)
    tops_nn[top] = pd.DataFrame(top_words, columns=["word", "similarity"])

RuntimeError: generator raised StopIteration

In [28]:
tops_nn['mode'][:20]

Unnamed: 0,word,similarity
0,modes,0.421
1,SUMOylation,0.382
2,Modes,0.375
3,PvP_battles,0.368
4,genialness,0.361
5,pvp,0.36
6,versus,0.358
7,geth,0.358
8,coop,0.358
9,challenges,0.356


In [29]:
tops_nn['mechanics'][:20]

Unnamed: 0,word,similarity
0,elements,0.479
1,concepts,0.455
2,controls,0.42
3,mechanic,0.419
4,features,0.41
5,aspects,0.403
6,pneumatics_hydraulics,0.39
7,tactics,0.388
8,animations,0.386
9,dynamics,0.382


In [30]:
tops_nn['features'][:20]

Unnamed: 0,word,similarity
0,elements,0.505
1,tweaks,0.462
2,additions,0.453
3,aspects,0.448
4,ideas,0.438
5,functionality,0.416
6,mechanics,0.41
7,things,0.408
8,concepts,0.404
9,challenges,0.397


In [31]:
tops_nn['gametypes'][:30]

Unnamed: 0,word,similarity
0,gametype,0.771
1,deathmatch_mode,0.689
2,loadouts,0.678
3,Deathmatch_Team_Deathmatch,0.676
4,mutators,0.669
5,mulitplayer,0.668
6,deathmatches,0.661
7,killstreaks,0.656
8,playstyle,0.655
9,playstyles,0.654


In [102]:
tops_nn['music'][:10]

Unnamed: 0,word,similarity
0,soundtrack,0.577
1,tunes,0.521
2,acoustic_instrumentation,0.505
3,sonic_landscapes,0.497
4,soothing_melodies,0.491
5,sound,0.49
6,layered_vocals,0.485
7,song,0.483
8,soothing_melody,0.48
9,lilting_melody,0.478


In [93]:
tops_nn['developers'][:10]

Unnamed: 0,word,similarity
0,devs,0.674
1,developer,0.552
2,dev,0.546
3,creators,0.49
4,makers,0.397
5,throutle,0.393
6,people,0.391
7,Developers,0.389
8,valve,0.386
9,vilans,0.385


In [85]:
tops_nn['graphics'][:10]

Unnamed: 0,word,similarity
0,graphics,0.56
1,pixellation,0.442
2,pixilation,0.437
3,alpha_blending,0.437
4,JPEG_compression,0.431
5,bitmap_images,0.428
6,font_smoothing,0.42
7,polygon_counts,0.414
8,visuals,0.412
9,teacher_Rizwanur_Rehman,0.411


In [79]:
tops_nn['soundtrack'][:10]

Unnamed: 0,word,similarity
0,music,0.577
1,vocals,0.515
2,artwork,0.488
3,storyline,0.475
4,atmosphere,0.467
5,falsetto_vocals,0.467
6,cinematography,0.465
7,inventive_choreography,0.461
8,lush_harmonies,0.46
9,ambient_textures,0.459


In [33]:
tops_nn['gametype'][:20]
# 집짓는다 google translate: Build a house in korean

Unnamed: 0,word,similarity
0,gametypes,0.771
1,deathmatch_mode,0.731
2,Deathmatch,0.679
3,deathmatch,0.675
4,loadouts,0.672
5,mulitplayer,0.666
6,deathmatches,0.662
7,Gameplay_wise,0.658
8,Achievements_Trophies,0.654
9,playstyle,0.651


In [64]:
tops_nn['multiplayer'][:20]

Unnamed: 0,word,similarity
0,singleplayer,0.442
1,multiplayer_mode,0.437
2,AI_bots,0.435
3,matchmaking,0.422
4,gameplay,0.417
5,resalution,0.414
6,game,0.41
7,coop,0.399
8,configs,0.397
9,iWork.com,0.395


In [83]:
tops_nn['storyline'][:10]

Unnamed: 0,word,similarity
0,story,0.62
1,plot,0.539
2,subplots,0.481
3,backstory,0.48
4,soundtrack,0.475
5,gameplay,0.448
6,storylines,0.442
7,subplot,0.439
8,mythos,0.438
9,artwork,0.423


# Identify Games

In [85]:
tops_games = {
    'Borderlands': None,
    'Super_Meat_Boy': None,
    'soundtrack': None,
    'multiplayer': None,
    'characteristics': None,
    'gameplay': None,   # Good example for 3- find adjectives (or other information) characteristics of a facet of a game
    'Singleplayer': None,   # Good example for 3- find adjectives (or other information) characteristics of a facet of a game
    'singleplayer': None,   # Good example for 3- find adjectives (or other information) characteristics of a facet of a game
    'FF7': None,
    'rts': None,
}
for top in tops_games:
    tops_games[top] = my_model.model.most_similar(top, topn=100)
    top_words = np.array(tops_games[top])[:, 0].tolist()
    scores = np.array(tops_games[top])[:, 1].tolist()
    
    # x.split("/")[0].lower() == x.split("/")[0] removes Caps as they are usually noise and not adjectives
    top_games_indices = [i for i, x in enumerate(top_words) if x.lower() != x]

    top_words = np.concatenate((np.array([top_words[i].split("/")[0] for i in top_games_indices]).reshape(-1, 1), 
                                np.array([np.round(float(scores[i]), 3) for i in top_games_indices]).reshape(-1, 1)), 1)
    tops_games[top] = pd.DataFrame(top_words, columns=["words", "similarity"])

In [88]:
# Games known for their multiplayer mode (not all terms are Games, but some are)
tops_games['Singleplayer'][:10]

Unnamed: 0,words,similarity
0,Online_Multiplayer,0.711
1,Multiplayer_Demo,0.705
2,Metal_Slug_XX,0.704
3,Multiplayer_Modes,0.703
4,CoD_Black_Ops,0.694
5,Onslaught_Mode,0.693
6,Multiplayer_Maps,0.685
7,Multiplayer,0.684
8,Patch_v#.##,0.682
9,X###,0.679


In [89]:
tops_games['singleplayer'][:10]

Unnamed: 0,words,similarity
0,Firefight_mode,0.673
1,Multiplayer,0.672
2,Kane_Wrath,0.669
3,Singleplayer,0.661
4,RE5,0.66
5,Multiwinia,0.652
6,Perseus_Mandate,0.647
7,E3_demo,0.646
8,Civ_IV,0.644
9,GRAW,0.642


In [79]:
# Games known for their multiplayer mode (not all terms are Games, but some are)
tops_games['multiplayer'][:50]

Unnamed: 0,adjectives,similarity
0,Multiplayer,0.838
1,Firefight_mode,0.717
2,Multiplayer_mode,0.7
3,AI_bots,0.69
4,Multiplayer_modes,0.688
5,PvP,0.686
6,StarCraft_II,0.677
7,PvP_combat,0.666
8,Horde_mode,0.661
9,Gameplay,0.659


In [78]:
# Games known for there good gameplay?
tops_games['gameplay'][:50]

Unnamed: 0,adjectives,similarity
0,Gameplay,0.791
1,Gameplay_wise,0.729
2,RPG_gameplay,0.719
3,Multiplayer,0.715
4,FPS_genre,0.713
5,Firefight_mode,0.7
6,FPS_gameplay,0.699
7,2D_platformer,0.692
8,RTS_genre,0.69
9,Ninja_Gaiden,0.687


In [77]:
# Strangely, Dong_Dong is king of relevant because is a a Chinese Gymnast (super meat boy jumps high and stuff)
tops_games['Super_Meat_Boy'][:50]

Unnamed: 0,adjectives,similarity
0,Ben_Hur_proportions,0.469
1,Mortal_Combat,0.468
2,BY_GAIL_WOOD,0.449
3,Milli_Vanilli_Orlowski,0.446
4,Cold_Hearted,0.444
5,Mario_Kart,0.441
6,Dong_Dong,0.441
7,Quack_Quack_Quack,0.44
8,grit_Laimbeer,0.438
9,BY_TRACEY_PRISK,0.437


In [76]:
tops_games['Borderlands'][:50]

Unnamed: 0,adjectives,similarity
0,Alien_Breed,0.663
1,Darksiders,0.658
2,GTAIV,0.655
3,BioShock,0.645
4,GRAW,0.637
5,Assassin_Creed_Brotherhood,0.636
6,ArmA_II,0.636
7,XBLA,0.635
8,Zeno_Clash,0.635
9,Bulletstorm,0.634


In [13]:
vocab = list(my_model.get_vocab().keys())
w = open(direc + "vocab_" + checkpoint, "w+")
with tqdm(total=len(vocab)) as pbar:
    for v in vocab:
        w.write(v + "\n")
        pbar.update(1)
w.close()

#w = open("vocab_word2vec_POS.txt", "w+")
#with tqdm(total=len(vocab)) as pbar:
#    for v in vocab:
#        w.write(tag(v) + "\n")
#        pbar.update(1)
#w.close()

w = open(checkpoint + "_parse.txt", "w+")
with tqdm(total=len(vocab)) as pbar:
    for v in vocab:
        w.write(parse(str(v) + "\n") + "\n")
        pbar.update(1)
w.close()


# These names are in game_names, obtained in draft.py from taking words before the pattern "is a * game"
# The list is small and noisy. We will take a single popular game,
# top100 = my_model.model.most_similar('Borderlands', topn=100)


100%|██████████| 3048327/3048327 [00:04<00:00, 659277.07it/s]
  1%|          | 21959/3048327 [00:09<21:18, 2367.45it/s]


KeyboardInterrupt: 

In [77]:
tops_nn['elements'][:20]

Unnamed: 0,word,similarity
0,features,0.53
1,aspects,0.502
2,mechanics,0.477
3,techniques,0.472
4,ideas,0.457
5,поиграл,0.435
6,concepts,0.433
7,things,0.421
8,influences,0.419
9,วไปเเน,0.412


In [22]:
tops = {
    'Super_Meat_Boy': None,
    'Team_Fortress': None,
    'Dont_Starve': None,
    'soundtrack': None,
}
for top in tops:
    tops[top] = my_model.model.most_similar(top, topn=1000)
    top_words = np.array(tops[top])[:, 0].tolist()
    scores = np.array(tops[top])[:, 1].tolist()
    top_words = [parse(str(k) + "\n") for k in top_words]
    
    # x.split("/")[0].lower() == x.split("/")[0] removes Caps as they are usually noise and not adjectives
    top_jj_indices = [i for i, x in enumerate(top_words) if x.split("/")[1] in ["JJ", "JJR", "JJS"] and x.split("/")[0].lower() == x.split("/")[0]]

    top_words = np.concatenate((np.array([top_words[i].split("/")[0] for i in top_jj_indices]).reshape(-1, 1), 
                                np.array([np.round(float(scores[i]), 3) for i in top_jj_indices]).reshape(-1, 1)), 1)
    tops[top] = pd.DataFrame(top_words, columns=["adjectives", "similarity"])

In [23]:
tops["soundtrack"][:30]

Unnamed: 0,adjectives,similarity
0,lotan,0.375
1,complimentary,0.362
2,addie,0.36
3,crayon,0.356
4,calligraphic,0.346
5,fulgid,0.344
6,wanweird,0.338
7,alrich,0.336
8,expressive,0.329
9,acoustic,0.328


In [60]:
# Enlever les termes avec Caps ou Caps_Caps
tops["gameplay"][:30]

Unnamed: 0,adjectives,similarity
0,gameplay_mechanic,0.746
1,replayable,0.654
2,minigame,0.653
3,overworld_map,0.645
4,sidescroller,0.631
5,side_scroller,0.629
6,fast_paced_arcade,0.624
7,metagame,0.608
8,side_scrolling_platformer,0.608
9,highly_replayable,0.607


In [61]:
tops["Super_Meat_Boy"][:20]

Unnamed: 0,adjectives,similarity
0,noemon,0.486
1,owena,0.464
2,uncharted,0.449
3,stochmal,0.447
4,dareful,0.44
5,virous,0.439
6,appassionate,0.438
7,goustrous,0.437
8,frousty,0.435
9,meinong,0.435


In [56]:
tops["Borderlands"][:20]

Unnamed: 0,adjectives,similarity
0,Dreamfall,0.621
1,Auto_Assault,0.605
2,Jetpac_Refuelled,0.593
3,Planetside,0.585
4,inFamous,0.578
5,Uncharted,0.577
6,Leliana_Song,0.577
7,Rearmed,0.576
8,inFAMOUS,0.572
9,Shin_Megami_Tensei_Nocturne,0.569


In [20]:
display_closestwords_pcascatterplot(my_model.model, "Borderlands")
display_closestwords_pcascatterplot(my_model.model, "Portals")
display_closestwords_pcascatterplot(my_model.model, "FFVII")

NameError: name 'display_closestwords_pcascatterplot' is not defined