In [None]:
import os
import re
import json
import nltk
import pickle
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import config

In [None]:
# Download and import "book"
# Order matters here: the 'book' collection is fetched first (quiet=True is
# passed to keep the download call from cluttering the cell output), and only
# then is the `book` module imported.
# NOTE(review): presumably importing `nltk.book` needs the downloaded data to
# be present - confirm before reordering these two statements.
nltk.download('book', quiet=True)
from nltk import book  # `book.FreqDist` is used by the frequency plot further down

In [None]:
# all_chars = set(map(lambda x: x.split('\\')[-1][:-4], glob('data/wow_chars/*.txt')))
# chars_with_comments = set(map(lambda x: x.split('\\')[-1][:-6], glob('data/char_comments/*.njson')))

In [None]:
# Load the character table (one row per character) from the results folder
# and preview the first rows.
df = pd.read_csv(f'{config.PATH_RES}df_chars.csv')
df.head()

# NLTK on Cleaned Wikipages

In [None]:
# glob() returns paths in arbitrary (OS-dependent) order; sorting the file
# lists makes the token order of the resulting texts reproducible across runs.

# create NLTK text of cleaned texts
file_list = sorted(glob(config.PATH_CLEAN + '*.txt'))
text_corpus = nltk.corpus.PlaintextCorpusReader('', file_list)
wiki_text = nltk.Text(text_corpus.words())

# create NLTK text of words
file_list = sorted(glob(config.PATH_WORDS + '*.txt'))
words_corpus = nltk.corpus.PlaintextCorpusReader('', file_list)
wiki_words = nltk.Text(words_corpus.words())

## Wordclouds of text surrounding specific words

In [None]:
# find occurrences of each word in the cleaned wiki text
# Text.concordance() prints the matches itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" after each word).
word_list = ['sword', 'shield', 'mace']
for word in word_list:
    wiki_text.concordance(word)

## TF-IDF per character attribute

In [None]:
def create_collection(df, attr, save=False):
    """Build per-group token statistics (TC, TF, IDF) for one attribute.

    The rows of ``df`` are grouped by the unique values of column ``attr``.
    For every group, the word files of its characters (``'Name'`` with spaces
    replaced by underscores, under ``config.PATH_WORDS``) are read through the
    notebook-level ``words_corpus`` and treated as a single document.

    Parameters
    ----------
    df : pandas.DataFrame
        Character table; must contain the columns ``attr`` and ``'Name'``.
    attr : str
        Column whose unique values define the documents.
    save : bool, default False
        If True, pickle the collection to
        ``config.PATH_RES + attr + '_dict.json'``.

    Returns
    -------
    dict
        Maps each attribute value to a dict with the keys
        ``'text'`` (``nltk.Text`` of the group), ``'words'`` (unique tokens),
        ``'tc'`` (count per word), ``'tf'`` (``tc`` divided by the number of
        tokens in the group) and ``'idf'`` (``log(N / n_t)`` per word, where
        ``N`` is the number of groups and ``n_t`` the number of groups that
        contain the word). All arrays are aligned with ``'words'``.
    """
    # local import: the notebook's import cell does not provide Counter
    from collections import Counter

    # create collection with one text per unique attribute value
    col = {}
    for at in df[attr].unique():
        # paths of the word files of every character in the current group
        names = df.loc[df[attr] == at, 'Name'].values
        paths = [
            config.PATH_WORDS + n.replace(' ', '_') + '.txt'
            for n in names
        ]
        col[at] = {'text': nltk.Text(words_corpus.words(paths))}

    # document frequency: in how many groups does each token occur?
    # One pass over each vocabulary replaces rescanning every text per word.
    N = len(col)
    doc_freq = Counter()
    for entry in col.values():
        doc_freq.update(set(entry['text'].tokens))

    for entry in col.values():
        tokens = entry['text'].tokens

        # calculate TC and TF
        words, tc = np.unique(tokens, return_counts=True)
        tf = tc / len(tokens)

        # calculate IDF; every word occurs in at least its own group, so n_t >= 1
        n_t = np.array([doc_freq[str(w)] for w in words], dtype=float)
        idf = np.log(N / n_t)

        # store the statistics
        entry['words'] = words
        entry['tc'] = tc
        entry['tf'] = tf
        entry['idf'] = idf  # previously `tf` was stored under this key by mistake

    if save:
        # NOTE: the content is a pickle despite the '.json' suffix; the name is
        # kept because the cache check in this notebook looks for exactly it.
        with open(config.PATH_RES + attr + '_dict.json', 'wb') as f:
            pickle.dump(col, f)
    return col

In [None]:
# Build and pickle the collection of every attribute that has no cached
# file in the results folder yet; attributes with an existing file are skipped.
for attr in ('Faction', 'Gender', 'Race', 'Status'):
    if os.path.exists(f'{config.PATH_RES}{attr}_dict.json'):
        continue
    print(f'Creating collection for {attr}')
    _ = create_collection(df, attr, save=True)
    print(f'\nDone with collection for {attr}\n')

## Leftovers

In [None]:
# Frequency distribution over all tokens of the word corpus
fd = book.FreqDist(wiki_words)

# Cumulative frequency curve of the 75 most common tokens, drawn on an
# explicitly created wide figure
fig, ax = plt.subplots(figsize=(20, 8))
ax.set_title('Cumulative Frequency plot of 75 most common words')
fd.plot(75, cumulative=True)
plt.show()

# NLTK on WoWhead User Comments

# Scratch tests