In [3]:
import os
import nltk
import pickle
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import community

import networkx as nx

import config
from text_helpers import init_collection, populate_collection

In [4]:
# Download the NLTK "book" collection (quiet=True) and then import nltk.book.
# The download is issued before the import, so keep the two lines in this order.
# NOTE(review): `book` is not referenced in the visible cells — confirm the
# import is still needed.
nltk.download('book', quiet=True)
from nltk import book

In [5]:
# Names of all characters that have a wowhead comment file, i.e. the file name
# without its '.njson' extension. os.path is used instead of splitting on '\\'
# so the name is extracted correctly on POSIX systems as well as on Windows.
chars_with_comments = [
    os.path.splitext(os.path.basename(path))[0]
    for path in glob('./data/char_comments/*.njson')
]

# read in character DataFrame
df = pd.read_csv(config.PATH_RES + 'df_chars.csv')

# keep only the characters that have comments from wowhead
# NOTE(review): the graph cell compares node names with spaces replaced by
# underscores; confirm that df['Name'] uses the same form as the file names.
df = df[df['Name'].isin(chars_with_comments)]

df.head()

Unnamed: 0,Name,Gender,Race,Faction,Status
0,A'dal,Unknown,Naaru,Neutral,Alive
2,Aegwynn,Female,Human,Neutral,Deceased
3,Aessina,Female,Wisp,Neutral,Unknown
5,Agamaggan,Male,Boar,Neutral,Deceased
6,Agatha,Female,Val'Kyr,Neutral,Deceased


In [6]:
# Read the character graph from disk and work on its undirected version.
Gcc = nx.read_gexf(config.PATH_RES + 'Gcc_wow.gexf').to_undirected()

# Collect every node without a wowhead comment file (node names are compared
# with spaces replaced by underscores) and drop them in one call.
nodes_without_comments = [
    name for name in Gcc.nodes()
    if name.replace(' ', '_') not in chars_with_comments
]
Gcc.remove_nodes_from(nodes_without_comments)

n_nodes = Gcc.number_of_nodes()
n_edges = Gcc.number_of_edges()
print(f'Number of\nNodes: {n_nodes}\nEdges: {n_edges}')

Number of
Nodes: 239
Edges: 2410


# Get Communities
Create or load community partitions.

In [8]:
# Create the community partition if no cached one exists, otherwise load it.
# NOTE: despite the '.json' suffix the cache is a pickle file; the name is kept
# so that an already existing cache is still found.
filename = config.PATH_RES + 'Communities.json'
if not os.path.isfile(filename):
    print('Creating new community partition.')
    partition = community.best_partition(Gcc)
    # group node names by community id in a single pass over the partition
    members_by_id = {}
    for name, com_id in partition.items():
        members_by_id.setdefault(com_id, []).append(name)
    # emit the communities in ascending id order so the list is deterministic
    communities = [members_by_id[com_id] for com_id in sorted(members_by_id)]
    # context manager guarantees the file is flushed and closed
    with open(filename, 'wb') as fh:
        pickle.dump(communities, fh)
    print(f'Saved as pickle {filename}')
else:
    print('Loading existing community partition.')
    print(f'from pickle {filename}')
    # pickle can execute arbitrary code: only load caches written by this notebook
    with open(filename, 'rb') as fh:
        communities = pickle.load(fh)

# Name each community after its (up to) three highest-degree members.
degs = list(Gcc.degree())
com_names = []
for com in communities:
    members = set(com)  # constant-time membership test
    # stable sort: ties keep the node order of the graph
    com_sorted = sorted(
        ((n, v) for n, v in degs if n in members),
        key=lambda x: x[1],
        reverse=True,
    )
    top_names = [n for n, _ in com_sorted[:3]]
    com_names.append(', '.join(top_names))

Loading existing community partition.
from pickle ./store/Communities.json


# Prepare Corpus etc.

In [6]:
# word files for wowpedia character pages
file_list = [config.PATH_WORDS + n + '.txt' for n in chars_with_comments]
c_words_wiki = nltk.corpus.PlaintextCorpusReader('', file_list)
t_words_wiki = nltk.Text(c_words_wiki.words())

# word files for wowhead user comments
file_list = [config.PATH_COMMENTS_WORDS + n + '.txt' for n in chars_with_comments]
c_words_comments = nltk.corpus.PlaintextCorpusReader('', file_list)
# build the comment text from the comment corpus (previously this reused the
# wowpedia corpus, making t_words_comments a duplicate of t_words_wiki)
t_words_comments = nltk.Text(c_words_comments.words())

In [7]:
# Attributes to analyse; each one maps to a list of (value, colour) pairs.
attr_lookup = dict(
    Gender=[('Male', '#0B1C51'), ('Female', '#FCB9B2')],
    Faction=[('Alliance', config.COLOR_ALLIANCE), ('Horde', config.COLOR_HORDE)],
)

# Create Collections

In [8]:
# Build one collection per attribute for each text source (wowpedia pages and
# wowhead comments). A collection whose file already exists is not recomputed.
sources = (
    ('wowpedia/', c_words_wiki, config.PATH_WORDS),
    ('wowhead/', c_words_comments, config.PATH_COMMENTS_WORDS),
)
for source, corpus, path_words in sources:
    for attr in attr_lookup:
        save_path = config.PATH_RES + source + attr + '_dict.json'
        already_done = os.path.isfile(save_path)
        if not already_done:
            print(f'\nDoing {attr} for {source}')
            # create collection and save it
            col = init_collection(df, attr, path_words, corpus)
            _ = populate_collection(col, save_path)
        else:
            print(f'\nSkipping {attr} for {source} since it is already done.')


Doing Gender for wowpedia/


Computing values: 100%|██████████| 3/3 [03:45<00:00, 75.26s/it]
Computing wordclouds: 100%|██████████| 3/3 [00:00<00:00, 39.97it/s]



Doing Faction for wowpedia/


Computing values: 100%|██████████| 3/3 [03:22<00:00, 67.51s/it]
Computing wordclouds: 100%|██████████| 3/3 [00:00<00:00, 69.67it/s]



Doing Gender for wowhead/


Computing values: 100%|██████████| 3/3 [07:54<00:00, 158.30s/it]
Computing wordclouds: 100%|██████████| 3/3 [00:00<00:00, 48.27it/s]



Doing Faction for wowhead/


Computing values: 100%|██████████| 3/3 [06:36<00:00, 132.02s/it]
Computing wordclouds: 100%|██████████| 3/3 [00:00<00:00, 18.99it/s]


In [9]:
# Build one collection per community for each text source (wowpedia pages and
# wowhead comments). Nothing is recomputed when the result file already exists.
for source, corpus, path_words in (
    ('wowpedia/', c_words_wiki, config.PATH_WORDS),
    ('wowhead/', c_words_comments, config.PATH_COMMENTS_WORDS),
):
    print(f'Computing collections for communities for {source}')
    save_path = config.PATH_RES + source + 'Louvain_dict.json'
    col = {}
    if not os.path.isfile(save_path):
        # one entry per community, holding the text of all its members' files
        col = {
            idx: {
                'text': nltk.Text(corpus.words([
                    path_words + member.replace(' ', '_') + '.txt'
                    for member in members
                ]))
            }
            for idx, members in enumerate(communities)
        }
        # fill in the collection and save it
        col = populate_collection(col, save_path)

Computing collections for communities for wowpedia/


Computing values: 100%|██████████| 7/7 [14:48<00:00, 126.97s/it]
Computing wordclouds: 100%|██████████| 7/7 [00:00<00:00, 142.78it/s]


Computing collections for communities for wowhead/


Computing values: 100%|██████████| 7/7 [12:39<00:00, 108.44s/it]
Computing wordclouds: 100%|██████████| 7/7 [00:00<00:00, 107.70it/s]


# Top Words
Inspect the top 5 words by TF-IDF for each attribute split and for each community found by the Louvain method.

In [12]:
# Display the five highest-scoring words for every attribute split and source.
for source in ['wowpedia/', 'wowhead/']:
    print(f"\n\nFor {source}")
    for attr in attr_lookup:
        print(f'\nTop 5 for attribute {attr}')
        # the '_dict.json' files are pickles; the context manager closes the
        # handle. Only load files produced by this notebook (pickle is unsafe
        # on untrusted data).
        with open(config.PATH_RES + source + attr + '_dict.json', 'rb') as fh:
            col = pickle.load(fh)
        for split, _ in attr_lookup[attr]:
            # indices of the words ordered by descending 'tfidf' value
            order = np.argsort(col[split]['tfidf'])[::-1]
            top_5 = ', '.join(col[split]['words'][order][:5])
            print(f'\t{split}: {top_5}')



For wowpedia/

Top 5 for attribute Gender
	Male: rexxar, voljin, demon, mannoroth, drekthar
	Female: ysera, elune, draka, talanji, aegwynn

Top 5 for attribute Faction
	Alliance: turalyon, antonidas, thassarian, koltira, faol
	Horde: rokhan, rexxar, nazgrel, bwonsamdi, eitrigg


For wowhead/

Top 5 for attribute Gender
	Male: razorgore, anzu, voljin, amalgamation, rexxar
	Female: whelp, onyxian, yula, lift, ony

Top 5 for attribute Faction
	Alliance: thassarian, koltira, skybreaker, lurid, naaru
	Horde: rexxar, blackhand, rokhan, ya, troll


In [13]:
# Display the five highest-scoring words for every community and source.
for source in ['wowpedia/', 'wowhead/']:
    print(f"\n\nFor {source}")
    print('Top 5 words for each community')
    # the '_dict.json' file is a pickle; the context manager closes the handle.
    # Only load files produced by this notebook (pickle is unsafe on untrusted
    # data).
    with open(config.PATH_RES + source + 'Louvain_dict.json', 'rb') as fh:
        col = pickle.load(fh)
    for i, com_name in enumerate(com_names):
        print(f'\n"{com_name}"')
        words = col[i]['words']
        # NOTE(review): the attribute cell reads a stored 'tfidf' entry, while
        # here the score is computed as tf * idf — confirm the two agree.
        tfidf = col[i]['tf'] * col[i]['idf']
        top_5 = ', '.join(words[np.argsort(tfidf)[::-1]][:5])
        print(top_5)



For wowpedia/
Top 5 words for each community

"Khadgar, Illidan Stormrage, Velen"
halduron, lorthemar, alleria, rommath, muru

"Deathwing, Sargeras, Yogg-Saron"
odyn, tyr, mimiron, thorim, nefarian

"Sylvanas Windrunner, Lich King, Varian Wrynn"
thassarian, koltira, darion, sylvanas, alexandros

"Malfurion Stormrage, Tyrande Whisperwind, Alexstrasza"
malorne, jarod, aviana, eranikus, tyrande

"Thrall, Ner'zhul, Orgrim Doomhammer"
draka, horde, drekthar, maraad, orgrim

"Anzu, Terokk, Talon King Ikiss"
ikiss, terokk, anzu, terokks, rukhmar

"Jaina Proudmoore, Anduin Wrynn, Garrosh Hellscream"
li, chen, garrosh, voljin, horde


For wowhead/
Top 5 words for each community

"Khadgar, Illidan Stormrage, Velen"
gravity, lapse, capernian, pyroblast, phoenix

"Deathwing, Sargeras, Yogg-Saron"
amalgamation, whelp, thorim, ony, tendon

"Sylvanas Windrunner, Lich King, Varian Wrynn"
darion, lich, frostmourne, arthas, thassarian

"Malfurion Stormrage, Tyrande Whisperwind, Alexstrasza"
drelanim, 