# Text Analysis Computations
This notebook computes tf-idf values and wordcloud strings from the word lists of each character, for both the Wowpedia pages and the Wowhead comments.

In [1]:
import os
import nltk
import pickle
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import community

import networkx as nx

import config
from text_helpers import init_collection, populate_collection

In [2]:
# Download the NLTK "book" collection (without progress output) and import `nltk.book`.
# Importing `nltk.book` loads the example texts and prints the banner shown below.
# NOTE(review): `book` itself is not referenced in the visible cells - presumably only the
# downloaded data is needed by the later text processing; confirm before removing the import.
nltk.download('book', quiet=True)
from nltk import book

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


# Load in necessary data

In [3]:
# Get the names of all characters which have wowhead comments, taken from the
# comment file names (`<name>.njson`).
# os.path understands the path separator of the current platform, so the
# directory part is stripped on every OS (splitting on '\\' only worked on Windows
# and left the full path in place everywhere else).
chars_with_comments = [
    os.path.splitext(os.path.basename(path))[0]
    for path in glob('./data/char_comments/*.njson')
]

# read in character DataFrame
df = pd.read_csv(config.PATH_RES + 'df_chars.csv')

# keep only the chars that have comments from wowhead
# NOTE(review): graph node names are matched after replacing ' ' with '_' in a later
# cell, while 'Name' is matched as-is here - confirm 'Name' already uses underscores.
df = df[df['Name'].isin(chars_with_comments)]

df.head()

Unnamed: 0,Name,Gender,Race,Faction,Status
0,A'dal,Unknown,Naaru,Neutral,Alive
2,Aegwynn,Female,Human,Neutral,Deceased
3,Aessina,Female,Wisp,Neutral,Unknown
5,Agamaggan,Male,Boar,Neutral,Deceased
6,Agatha,Female,Val'Kyr,Neutral,Deceased


In [4]:
# Read the stored character graph and work with its undirected version
Gcc = nx.read_gexf(config.PATH_RES + 'Gcc_wow.gexf').to_undirected()

# Collect every node whose name (with spaces turned into underscores) is not
# among the characters with wowhead comments, then drop them all at once
nodes_without_comments = [
    name for name in Gcc.nodes()
    if name.replace(' ', '_') not in chars_with_comments
]
Gcc.remove_nodes_from(nodes_without_comments)

print(f'Number of\nNodes: {Gcc.number_of_nodes()}\nEdges: {Gcc.number_of_edges()}')

Number of
Nodes: 239
Edges: 2410


# Get Communities
Create or load community partitions.

In [5]:
# Create the community partition if it is not cached yet, otherwise load it.
# NOTE: the cache is a pickle even though the file name ends in `.json`; the name
# is kept unchanged so that an already existing cache file is still found.
filename = config.PATH_RES + 'Communities.json'
if not os.path.isfile(filename):
    print('Creating new community partition.')
    # `partition` maps each node name to the id of its community
    partition = community.best_partition(Gcc)
    communities = []
    for p in set(partition.values()):
        names = [n for n in partition if partition[n] == p]
        communities.append(names)
    # the context manager makes sure the file is flushed and closed
    with open(filename, 'wb') as f:
        pickle.dump(communities, f)
    print(f'Saved as pickle {filename}')
else:
    print('Loading existing community partition.')
    print(f'from pickle {filename}')
    # unpickling can execute arbitrary code - only load cache files created by this notebook
    with open(filename, 'rb') as f:
        communities = pickle.load(f)

# get top chars in each community: the 3 members with the highest node degree
degs = list(Gcc.degree())
com_names = []
for com in communities:
    members = set(com)  # set lookup instead of scanning the list for every node
    com_sorted = sorted(
        [(n, v) for n, v in degs if n in members],
        key=lambda x: x[1],
        reverse=True,
    )
    top_names = [n for n, _ in com_sorted[:3]]
    com_names.append(', '.join(top_names))

Loading existing community partition.
from pickle ./store/Communities.json


In [6]:
print('List of top 3 characters for each community based on node degree')
# number the communities starting at 1
for rank, label in enumerate(com_names, start=1):
    print(f'\t{rank}. {label}')

List of top 3 characters for each community based on node degree
	1. Khadgar, Illidan Stormrage, Velen
	2. Deathwing, Sargeras, Yogg-Saron
	3. Sylvanas Windrunner, Lich King, Varian Wrynn
	4. Malfurion Stormrage, Tyrande Whisperwind, Alexstrasza
	5. Thrall, Ner'zhul, Orgrim Doomhammer
	6. Anzu, Terokk, Talon King Ikiss
	7. Jaina Proudmoore, Anduin Wrynn, Garrosh Hellscream


# Prepare Corpus etc.

In [7]:
# word files for wowpedia character pages: one `<name>.txt` per character with comments
file_list = [config.PATH_WORDS + n + '.txt' for n in chars_with_comments]
c_words_wiki = nltk.corpus.PlaintextCorpusReader('', file_list)
t_words_wiki = nltk.Text(c_words_wiki.words())

# word files for wowhead user comments: same characters, different directory
file_list = [config.PATH_COMMENTS_WORDS + n + '.txt' for n in chars_with_comments]
c_words_comments = nltk.corpus.PlaintextCorpusReader('', file_list)
# build the Text from the comments corpus (it was built from the wiki corpus by mistake,
# which made `t_words_comments` a copy of `t_words_wiki`)
t_words_comments = nltk.Text(c_words_comments.words())

In [8]:
# Attributes to analyse, each mapped to the two values whose characters are compared
attr_lookup = dict(
    Gender=['Male', 'Female'],
    Faction=['Alliance', 'Horde'],
    Status=['Alive', 'Deceased'],
)

# Create Collections
For the word lists of both the Wowpedia pages and the Wowhead comments, calculate tf-idf values for each word and create wordcloud strings based on the splits defined in `attr_lookup`, then save the results to `.json` files.

In [9]:
# Build one collection per attribute in `attr_lookup`, first for the wowpedia pages
# and then for the wowhead comments; results that are already on disk are not redone.
sources = [
    ('wowpedia/', c_words_wiki, config.PATH_WORDS),
    ('wowhead/', c_words_comments, config.PATH_COMMENTS_WORDS),
]
for source, corpus, path_words in sources:
    for attr in attr_lookup:
        save_path = config.PATH_RES + source + attr + '_dict.json'

        # nothing to do when the result file is already there
        if os.path.isfile(save_path):
            print(f'\nSkipping {attr} for {source} since it is already done.')
            continue

        print(f'\nDoing {attr} for {source}')

        # create collection and save it
        col = init_collection(df, attr, path_words, corpus)
        _ = populate_collection(col, save_path)


Doing Gender for wowpedia/


Computing values: 100%|██████████| 3/3 [00:58<00:00, 19.59s/it]
Computing wordclouds: 100%|██████████| 3/3 [00:00<00:00, 166.44it/s]



Doing Faction for wowpedia/


Computing values: 100%|██████████| 3/3 [01:05<00:00, 21.78s/it]
Computing wordclouds: 100%|██████████| 3/3 [00:00<00:00, 166.41it/s]



Doing Status for wowpedia/


Computing values: 100%|██████████| 3/3 [01:03<00:00, 21.02s/it]
Computing wordclouds: 100%|██████████| 3/3 [00:00<00:00, 136.23it/s]



Doing Gender for wowhead/


Computing values: 100%|██████████| 3/3 [02:39<00:00, 53.01s/it]
Computing wordclouds: 100%|██████████| 3/3 [00:00<00:00, 115.26it/s]



Doing Faction for wowhead/


Computing values: 100%|██████████| 3/3 [02:39<00:00, 53.33s/it]
Computing wordclouds: 100%|██████████| 3/3 [00:00<00:00, 62.46it/s]



Doing Status for wowhead/


Computing values: 100%|██████████| 3/3 [02:53<00:00, 57.91s/it]
Computing wordclouds: 100%|██████████| 3/3 [00:00<00:00, 111.12it/s]


In [10]:
# Create collections for the communities for both wowpedia pages and wowhead comments.
# Each community gets one entry holding the combined text of all its members.
for source, corpus, path_words in [
    ('wowpedia/', c_words_wiki, config.PATH_WORDS),
    ('wowhead/', c_words_comments, config.PATH_COMMENTS_WORDS)
]:
    save_path = config.PATH_RES + source + 'Louvain_dict.json'

    # report the skip instead of announcing a computation that never happens
    if os.path.isfile(save_path):
        print(f'Skipping communities for {source} since it is already done.')
        continue

    print(f'Computing collections for communities for {source}')
    col = {}
    for i, names in enumerate(communities):
        # word file of every member; file names use '_' where node names have ' '
        paths = [
            path_words + n.replace(' ', '_') + '.txt'
            for n in names
        ]
        # save text for community
        col[i] = {'text': nltk.Text(corpus.words(paths))}

    # create collection and save it
    col = populate_collection(col, save_path)

Computing collections for communities for wowpedia/


Computing values: 100%|██████████| 7/7 [07:30<00:00, 64.37s/it]
Computing wordclouds: 100%|██████████| 7/7 [00:00<00:00, 122.82it/s]


Computing collections for communities for wowhead/


Computing values: 100%|██████████| 7/7 [11:48<00:00, 101.27s/it]
Computing wordclouds: 100%|██████████| 7/7 [00:00<00:00, 98.59it/s]


# Top Words
Inspect the top 5 words according to tf-idf for each attribute split and for each of the communities found by the Louvain method.

In [15]:
# Sanity check: look at the raw tf-idf vector stored for one attribute split.
# The collection is loaded explicitly here because `split` is only defined by the
# display loop in a later cell, so reading `col[split]` fails on a freshly started kernel.
# The source/attribute/split picked below is just an example.
check_attr = 'Status'
check_split = attr_lookup[check_attr][-1]
with open(config.PATH_RES + 'wowhead/' + check_attr + '_dict.json', 'rb') as f:
    check_col = pickle.load(f)
check_col[check_split]['tfidf']

array([4.56725585e-06, 4.56725585e-06, 4.56725585e-06, ...,
       4.56725585e-06, 4.56725585e-06, 4.56725585e-06])

In [20]:
# display top words for attributes: for every source and attribute, print the
# 5 words with the highest tf-idf value (and those values) for each split
for source in ['wowpedia/', 'wowhead/']:
    print(f"\n\nFor {source}")
    for attr in attr_lookup:
        print(f'\nTop 5 for attribute {attr}')
        # load the stored collection; the context manager closes the file handle again
        with open(config.PATH_RES + source + attr + '_dict.json', 'rb') as f:
            col = pickle.load(f)
        for split in attr_lookup[attr]:
            tfidfs = col[split]['tfidf']
            # indices of the 5 largest tf-idf values, largest first
            top_idx = np.argsort(tfidfs)[::-1][:5]
            top_5 = ', '.join(col[split]['words'][top_idx])
            print(f'\t{split}: {top_5}')
            print(f'\t{tfidfs[top_idx]}')



For wowpedia/

Top 5 for attribute Gender
	Male: demon, human, father, jaina, dreadlords
	[0.00094062 0.00072206 0.00055331 0.00049521 0.00046475]
	Female: mother, walker, musha, lady, demon
	[0.00102066 0.00093353 0.00091059 0.00077172 0.00077172]

Top 5 for attribute Faction
	Alliance: alleria, naaru, genn, koltira, eredar
	[0.00160336 0.0006989  0.00061266 0.00055696 0.00053445]
	Horde: bwonsamdi, darkspear, tyrathan, cairne, loa
	[0.00112597 0.00103126 0.00094091 0.00085537 0.00076819]

Top 5 for attribute Status
	Alive: dragon, could, alliance, force, adventurer
	[0.00175341 0.0013104  0.00116149 0.00110565 0.00098652]
	Deceased: dragon, could, first, god, force
	[0.0019276  0.00131693 0.00118948 0.00110983 0.00103549]


For wowhead/

Top 5 for attribute Gender
	Male: razorgore, amalgamation, molten, spine, scion
	[0.00202611 0.00052755 0.0003517  0.0003173  0.0002676 ]
	Female: whelp, yula, lift, ony, tail
	[0.00213983 0.00055569 0.00049764 0.00042299 0.0004147 ]

Top 5 for att

In [12]:
# display top words per community: for every source, print the 5 words with the
# highest tf * idf product for each community, labelled by its top characters
for source in ['wowpedia/', 'wowhead/']:
    print(f"\n\nFor {source}")
    print('Top 5 words for each community')
    # load the stored collection; the context manager closes the file handle again
    with open(config.PATH_RES + source + 'Louvain_dict.json', 'rb') as f:
        col = pickle.load(f)
    for i, com_name in enumerate(com_names):
        print(f'\n"{com_name}"')
        words = col[i]['words']
        # NOTE(review): the attribute cell reads the stored 'tfidf' entry instead;
        # presumably that equals tf * idf - confirm and use one of the two consistently
        tfidf = col[i]['tf'] * col[i]['idf']
        # indices of the 5 largest values, largest first
        top_idx = np.argsort(tfidf)[::-1][:5]
        top_5 = ', '.join(words[top_idx])
        print(top_5)



For wowpedia/
Top 5 words for each community

"Khadgar, Illidan Stormrage, Velen"
rommath, lorthemar, halduron, alleria, aethas

"Deathwing, Sargeras, Yogg-Saron"
tyr, algalon, prestor, dragon, titan

"Sylvanas Windrunner, Lich King, Varian Wrynn"
darion, genn, koltira, sylvanas, muradin

"Malfurion Stormrage, Tyrande Whisperwind, Alexstrasza"
tyrande, jarod, maiev, shandris, malfurion

"Thrall, Ner'zhul, Orgrim Doomhammer"
muln, horde, orgrim, maraad, doomhammer

"Anzu, Terokk, Talon King Ikiss"
ikiss, rukhmar, sethekk, skettis, sethe

"Jaina Proudmoore, Anduin Wrynn, Garrosh Hellscream"
li, chen, garrosh, horde, baine


For wowhead/
Top 5 words for each community

"Khadgar, Illidan Stormrage, Velen"
gravity, lapse, capernian, pyroblast, phoenix

"Deathwing, Sargeras, Yogg-Saron"
amalgamation, whelp, ony, tendon, sara

"Sylvanas Windrunner, Lich King, Varian Wrynn"
darion, frostmourne, arthas, lichking, kel

"Malfurion Stormrage, Tyrande Whisperwind, Alexstrasza"
drelanim, jarod, wh