In [None]:
import os
import re
import json
import nltk
import pickle
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import community

import networkx as nx

import config
from text_helpers import init_collection, populate_collection

In [None]:
# Download the NLTK "book" collection (quiet=True is passed to keep the
# download from cluttering the cell output) and then import nltk.book.
# The download is issued first, presumably so that the data is already on
# disk when the import runs.
# NOTE(review): `book` is not referenced in the cells visible here — confirm
# that the import is still needed.
nltk.download('book', quiet=True)
from nltk import book

In [None]:
# Names of all characters that have a wowhead comment file, derived from the
# '*.njson' file names in ./data/char_comments/.
# os.path.basename/splitext use the path separator of the current platform;
# splitting on '\\' only stripped the directory on Windows and left the whole
# path in place everywhere else, so no name ever matched.
chars_with_comments = [
    os.path.splitext(os.path.basename(path))[0]
    for path in glob('./data/char_comments/*.njson')
]

# read in character DataFrame
df = pd.read_csv(config.PATH_RES + 'df_chars.csv')

# keep only the characters that have comments from wowhead
# (vectorised membership test instead of a Python-level list lookup per row)
df = df[df['Name'].isin(chars_with_comments)]

df.head()

In [None]:
# Read the character graph from disk and drop the edge directions.
Gcc = nx.read_gexf(config.PATH_RES + 'Gcc_wow.gexf').to_undirected()

# A node is kept only if its label, with spaces turned into underscores, is
# one of the characters that have wowhead comments. The nodes to drop are
# collected first and then removed in a single call.
nodes_without_comments = [
    label
    for label in Gcc.nodes()
    if label.replace(' ', '_') not in chars_with_comments
]
Gcc.remove_nodes_from(nodes_without_comments)

print(f'Number of\nNodes: {len(Gcc.nodes())}\nEdges: {len(Gcc.edges())}')

# Get Communities
Compute the community partition of the character graph, or load it from disk if it has already been computed.

In [None]:
# Create the community partition if it is not cached yet, otherwise load it.
# NOTE: despite the '.json' extension the cache file is a pickle; the name is
# kept unchanged so that caches written by earlier runs are still found.
filename = config.PATH_RES + 'Communities.json'
if not os.path.isfile(filename):
    print('Creating new community partition.')
    partition = community.best_partition(Gcc)

    # `partition` maps node name -> community id. Invert it into one list of
    # node names per community with a single pass over the nodes, then order
    # the lists by community id so the result is deterministic.
    members_by_id = {}
    for name, com_id in partition.items():
        members_by_id.setdefault(com_id, []).append(name)
    communities = [members_by_id[com_id] for com_id in sorted(members_by_id)]

    # the context manager closes (and flushes) the file even if dump() raises
    with open(filename, 'wb') as cache_file:
        pickle.dump(communities, cache_file)
    print(f'Saved as pickle {filename}')
else:
    print('Loading existing community partition.')
    print(f'from pickle {filename}')
    # SECURITY: pickle.load can execute arbitrary code. Only load cache files
    # that were written by this notebook, never files from an untrusted source.
    with open(filename, 'rb') as cache_file:
        communities = pickle.load(cache_file)

# Label every community with the names of its three highest-degree members.
degs = list(Gcc.degree())
com_names = []
for com in communities:
    members = set(com)
    # (name, degree) pairs of this community, highest degree first; the sort
    # is stable, so ties keep the order in which the graph reports them
    ranked = sorted(
        [(node, deg) for node, deg in degs if node in members],
        key=lambda pair: pair[1],
        reverse=True,
    )
    com_names.append(', '.join(node for node, _ in ranked[:3]))

# NLTK

In [None]:
# One '<name>.txt' path per character in each of the four text directories
# (wowpedia pages and wowhead comments, each in a "words" and a "clean" form).
f_wiki_words = [config.PATH_WORDS + name + '.txt' for name in chars_with_comments]
f_wiki_clean = [config.PATH_CLEAN + name + '.txt' for name in chars_with_comments]
f_comments_words = [
    config.PATH_COMMENTS_WORDS + name + '.txt' for name in chars_with_comments
]
f_comments_clean = [
    config.PATH_COMMENTS_CLEAN + name + '.txt' for name in chars_with_comments
]

# One corpus reader per file list. The corpus root is the empty string, so
# the paths built above double as the file ids of each corpus.
c_words_wiki, c_clean_wiki, c_words_comments, c_clean_comments = (
    nltk.corpus.PlaintextCorpusReader('', file_ids)
    for file_ids in (f_wiki_words, f_wiki_clean, f_comments_words, f_comments_clean)
)

# One nltk.Text over all words of each corpus.
t_words_wiki, t_clean_wiki, t_words_comments, t_clean_comments = (
    nltk.Text(reader.words())
    for reader in (c_words_wiki, c_clean_wiki, c_words_comments, c_clean_comments)
)

In [None]:
# Character attributes to analyse. Every attribute maps to its splits, given
# as (value, colour) pairs.
# NOTE(review): the second item looks like a plot colour and is ignored by the
# cells visible here — confirm against the plotting code.
attr_lookup = dict(
    Gender=[('Male', '#0B1C51'), ('Female', '#FCB9B2')],
    Faction=[('Alliance', config.COLOR_ALLIANCE), ('Horde', config.COLOR_HORDE)],
)

## Create Collections

In [None]:
# Build one collection per attribute, once from the wowpedia pages and once
# from the wowhead comments. A combination whose result file already exists
# is not recomputed.
for source, corpus in zip(
    ('wowpedia/', 'wowhead/'),
    (c_words_wiki, c_words_comments),
):
    for attr in attr_lookup:
        save_path = config.PATH_RES + source + attr + '_dict.json'

        # guard clause: an existing result file means this one is done
        if os.path.isfile(save_path):
            print(f'\nSkipping {attr} for {source} since it is already done.')
            continue

        print(f'\nDoing {attr} for {source}')

        # create the collection and hand it to populate_collection together
        # with the path it is meant to be saved under
        col = populate_collection(init_collection(df, attr, corpus), save_path)

In [None]:
# Build one collection per community, once from the wowpedia pages and once
# from the wowhead comments. A source whose result file already exists is not
# recomputed.
for source, corpus, path_words in [
    ('wowpedia/', c_words_wiki, config.PATH_WORDS),
    ('wowhead/', c_words_comments, config.PATH_COMMENTS_WORDS)
]:
    save_path = config.PATH_RES + source + 'Louvain_dict.json'

    # Check the cache before announcing any work, so that "Computing ..." is
    # only logged when something is actually computed (same skip message
    # style as the attribute collections).
    if os.path.isfile(save_path):
        print(f'\nSkipping communities for {source} since it is already done.')
        continue

    print(f'Computing collections for communities for {source}')

    col = {}
    for i, names in enumerate(communities):
        # file ids of the community members: spaces in the node names are
        # replaced by underscores to match the text file names
        paths = [
            path_words + n.replace(' ', '_') + '.txt'
            for n in names
        ]
        # save text for community, keyed by the community's index
        col[i] = {'text': nltk.Text(corpus.words(paths))}

    # create collection and save it
    col = populate_collection(col, save_path)

## Top Words
Inspect the top 5 words by TF-IDF score for each attribute split and for each community found by the Louvain method.

In [None]:
# Display the five words with the highest tf-idf for every attribute split,
# separately for the wowpedia pages and the wowhead comments.
for source in ['wowpedia/', 'wowhead/']:
    print(f"\n\nFor {source}")
    for attr in attr_lookup:
        print(f'\nTop 5 for attribute {attr}')
        # The '_dict.json' files are read as pickles despite their extension.
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # produced by this project. The context manager closes the file
        # handle, which the bare open() call used to leak.
        with open(config.PATH_RES + source + attr + '_dict.json', 'rb') as col_file:
            col = pickle.load(col_file)
        for split, _ in attr_lookup[attr]:
            words = col[split]['words']
            tfidf = col[split]['tf'] * col[split]['idf']
            # indices of the five largest tf-idf values, largest first;
            # slicing before indexing avoids reordering the whole vocabulary
            top_idx = np.argsort(tfidf)[::-1][:5]
            top_5 = ', '.join(words[top_idx])
            print(f'\t{split}: {top_5}')

In [None]:
# Display the five words with the highest tf-idf for every community,
# separately for the wowpedia pages and the wowhead comments.
for source in ['wowpedia/', 'wowhead/']:
    print(f"\n\nFor {source}")
    print('Top 5 words for each community')
    # 'Louvain_dict.json' is read as a pickle despite its extension.
    # SECURITY: pickle.load can execute arbitrary code; only load files
    # produced by this project. The context manager closes the file handle,
    # which the bare open() call used to leak.
    with open(config.PATH_RES + source + 'Louvain_dict.json', 'rb') as col_file:
        col = pickle.load(col_file)
    # the collection is keyed by community index, in the order of com_names
    for i, com_name in enumerate(com_names):
        print(f'\n"{com_name}"')
        words = col[i]['words']
        tfidf = col[i]['tf'] * col[i]['idf']
        # indices of the five largest tf-idf values, largest first
        top_idx = np.argsort(tfidf)[::-1][:5]
        top_5 = ', '.join(words[top_idx])
        print(top_5)