# Generating global word count vectors

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Select the 'seaborn-muted' matplotlib style sheet, then apply seaborn's
# "whitegrid" theme on top of it.
# NOTE(review): recent matplotlib releases appear to have renamed the
# 'seaborn-*' style sheets -- confirm this name against the installed version.
mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

%matplotlib inline

In [2]:
import bz2
import os
import ujson
import attr
import re

import pandas as pd
import numpy as np
import statsmodels.api as sm

from glob import glob
from collections import Counter, UserDict
from itertools import islice
from tqdm import tqdm_notebook
from sklearn.linear_model import LinearRegression

# (captured cell output, apparently the tail of an import-time warning:)
#   from pandas.core import datetools


In [None]:
# Initial count -- for weeding out words that never
#                  appear in any novel ten or more times

# Directory that is listed further down for '.bz2' part files.
data_path = 'data/novels.100.json'

def record_to_count(record):
    """Count the lower-cased tokens of one record.

    Reads the sequence stored under ``record['token']`` and returns a
    ``Counter`` mapping each lower-cased token to its number of occurrences.
    """
    counts = Counter()
    for token in record['token']:
        counts[token.lower()] += 1
    return counts

# Raw string so that ``\w`` reaches the regex engine as written instead of
# being parsed as an (invalid) string escape.
_u_alpha = re.compile(r'^[\w.,"\'?!;:]+$', re.UNICODE)
def save_record(record, outpath='counts', 
                u_alpha=_u_alpha.match):
    """Write the frequent-word counts of one record to a tab-separated file.

    The file is named ``<record['identifier']>.csv`` and is created inside
    ``outpath``.  Each line holds a word, a tab and its count.  Only words
    that occur at least ten times in the record and that are accepted by
    ``u_alpha`` are written.

    Parameters
    ----------
    record : mapping providing the ``'identifier'`` and ``'token'`` keys.
    outpath : directory receiving the output file; it is created if missing.
    u_alpha : callable returning a truthy value for words that should be kept.
    """
    identifier = record['identifier']
    # Previously a missing output directory made the first open() fail.
    os.makedirs(outpath, exist_ok=True)
    target = os.path.join(outpath, '{}.csv'.format(identifier))
    with open(target, 'w', encoding='utf-8') as op:
        # most_common() is ordered by decreasing count, so the first count
        # below ten means every remaining word is below ten as well.
        for w, ct in record_to_count(record).most_common():
            if ct < 10:
                break
            if u_alpha(w):
                # Escape literal tabs so a word cannot break the column layout.
                op.write('{}\t{}\n'.format(w.replace('\t', '\\t'), ct))

def json_part_map(func, part):
    """Lazily apply ``func`` to every JSON record of a bz2 part file.

    ``part`` is opened with ``bz2.open`` and consumed line by line; each line
    is decoded with ``ujson.loads`` and the decoded object is handed to
    ``func``, whose return value is yielded.  If an ``EOFError`` surfaces
    while the file is being processed, a message naming the file is printed
    and the generator stops.
    """
    try:
        with bz2.open(part) as handle:
            yield from (func(ujson.loads(raw)) for raw in handle)
    except EOFError:
        print('Error on file {}'.format(part))

# Collect the path of every entry in data_path whose name ends in '.bz2'.
all20 = [os.path.join(data_path, f) 
         for f in os.listdir(data_path)
         if f.endswith('.bz2')]
for f in all20:
    # json_part_map is a generator: list() drains it so that save_record is
    # actually invoked for each record of the part file.
    _x = list(json_part_map(save_record, f))

In [6]:
# Second count -- for generating vectors of counts of 
#                 the 10k most common words by document
#                 frequency from among the words selected
#                 in the initial count.

# Directory that is listed further down for '.bz2' part files.
data_path = 'data/novels.100.json'
# Text file read by load_words: one vocabulary entry per line.
words_path = 'counts/10kwords.txt'

def load_words(path):
    """Read the UTF-8 text file at ``path`` and return its lines as a list.

    Each line is stripped of leading and trailing whitespace; the order of
    the file is preserved.
    """
    with open(path, encoding='utf-8') as handle:
        return list(map(str.strip, handle))

def record_to_count_vector(record, words):
    """Build the count vector of one record over a fixed word list.

    The tokens under ``record['token']`` are lower-cased and counted; the
    result is a NumPy array whose i-th entry is the count of ``words[i]``
    (zero when the word does not occur in the record).
    """
    lowered = (token.lower() for token in record['token'])
    frequencies = Counter(lowered)
    return np.array([frequencies[word] for word in words])

def part_record_iter(part):
    """Yield the decoded JSON record of every line in a bz2 part file.

    ``part`` is opened with ``bz2.open`` and each line is decoded with
    ``ujson.loads``.  If an ``EOFError`` surfaces while the file is being
    read, a message naming the file is printed and iteration stops.
    """
    try:
        with bz2.open(part) as handle:
            yield from map(ujson.loads, handle)
    except EOFError:
        print('Error on file {}'.format(part))

# Raw string so that ``\w`` reaches the regex engine as written instead of
# being parsed as an (invalid) string escape.
_u_alpha = re.compile(r'^[\w.,"\'?!;:]+$', re.UNICODE)
def part_to_npz(partpath, words):
    """Convert one bz2 part file into a compressed ``.npz`` of count vectors.

    Every record of ``partpath`` is turned into a count vector over ``words``
    and stored in the archive under the record's ``'identifier'``.  All
    vectors of the part are held in memory until the archive is written.

    Parameters
    ----------
    partpath : path of the part file; a trailing ``'json.bz2'`` is swapped
        for ``'npz'`` to form the output path.
    words : sequence of words defining the vector positions.
    """
    suffix = 'json.bz2'
    # Only rewrite the trailing extension.  str.replace would also rewrite a
    # matching substring elsewhere in the path (e.g. in a directory name).
    if partpath.endswith(suffix):
        outpath = partpath[:-len(suffix)] + 'npz'
    else:
        # NOTE(review): NumPy is documented to append '.npz' itself when the
        # name lacks it, so the input file should not be overwritten -- confirm.
        outpath = partpath

    arrays = {
        r['identifier']: record_to_count_vector(r, words)
        for r in part_record_iter(partpath)
    }

    # Identifiers become keyword names here, so they must be strings.
    np.savez_compressed(outpath, **arrays)

# Collect the path of every entry in data_path whose name ends in '.bz2'.
all20 = [os.path.join(data_path, f) 
         for f in os.listdir(data_path)
         if f.endswith('.bz2')]
# Load the word list once; every part is vectorised against the same list.
words = load_words(words_path)
for f in all20:
    part_to_npz(f, words)