# Loading Word Counts

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: apply matplotlib's 'seaborn-muted' style sheet, then seaborn's
# "whitegrid" theme on top of it.
# NOTE(review): newer matplotlib releases renamed the bundled seaborn style
# sheets (e.g. 'seaborn-v0_8-muted') — confirm against the installed version.
mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

%matplotlib inline

In [3]:
import bz2
import os
import ujson
import attr
import re

import pandas as pd
import numpy as np
import statsmodels.api as sm

from pprint import pprint
from glob import glob
from collections import Counter, UserDict
from itertools import islice
from tqdm import tqdm_notebook
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

  from pandas.core import datetools


# Loading .npz vectors, metadata, and quartiles

In [4]:
# Location of the stored count vectors. It is later handed to load_all_counts(),
# which lists it with os.listdir().
# NOTE(review): despite the '.npz' suffix this is therefore presumably a
# directory containing .npz archives — confirm.
count_npz = '../counts/novels.100.npz'
# Text file with one feature name per line; the names become the column labels
# of the feature matrix built further down.
feature_name_file = '../counts/10kwords.txt'
with open(feature_name_file) as ip:
    # strip() removes the trailing newline (and any surrounding whitespace).
    feature_names = [f.strip() for f in ip]

def load_all_counts(path):
    """Load every ``.npz`` archive found directly inside ``path`` into one dict.

    Parameters
    ----------
    path : str
        Directory to scan (non-recursively) for files whose name ends in
        ``.npz``.

    Returns
    -------
    dict
        Maps each array name stored in the archives to its array. If the same
        name occurs in several archives, the archive whose file name sorts
        last wins.
    """
    # Sort so the merge order (and therefore duplicate-name resolution) does
    # not depend on the arbitrary order in which os.listdir() returns entries.
    files = sorted(os.path.join(path, f)
                   for f in os.listdir(path)
                   if f.endswith('.npz'))
    collected = {}
    for f in files:
        # np.load() on an .npz returns a lazy archive object that keeps the
        # file open; read every array out, then let the context manager
        # close the handle instead of leaking it.
        with np.load(f) as archive:
            for name in archive.files:
                collected[name] = archive[name]
    return collected

In [5]:
# Per-novel metadata table, indexed by its 'identifier' column so that rows can
# be selected by novel identifier in later cells.
md = pd.read_csv('../metadata/novels-metadata.csv',
                 index_col='identifier')

In [6]:
@attr.s
class Dataset:
    """A directory of bz2-compressed files, each holding one JSON record per line."""

    # Directory that contains the ``*.bz2`` files.
    root = attr.ib()

    def paths(self):
        """Return the paths of all ``*.bz2`` files directly under ``root``."""
        pattern = os.path.join(self.root, '*.bz2')
        return glob(pattern)

    def novels(self):
        """Lazily yield every decoded JSON record, one file after another.

        A notebook progress bar tracks progress over the files (not the lines).
        """
        for bz2_path in tqdm_notebook(self.paths()):
            with bz2.open(bz2_path) as lines:
                # Each line of the decompressed stream is one JSON document.
                yield from map(ujson.loads, lines)

In [7]:
# Point the reader at the directory holding the *.bz2 record files.
ds = Dataset('../data/top200-4bins.json/')

In [8]:
# Materialize every record in memory: later cells iterate over `novels`
# several times, which a one-shot generator would not allow.
novels = list(ds.novels())




In [9]:
# This generates quad labels like so:
#     2  3
#     0  1
# 0 == fall, fall
# 1 == rise, fall
# 2 == fall, rise
# 3 == rise, rise

def quartile_to_quad(quartiles, feature):
    """Label each row with a quad code 0-3 from its four values for ``feature``.

    ``quartiles[feature]`` must expose the four values under keys 0..3.
    The code adds 1 when value 0 is strictly below value 1 (first half
    rises) and 2 when value 2 is strictly below value 3 (second half
    rises); ties count as "fall".
    """
    per_quartile = quartiles[feature]
    first_half_rises = per_quartile[0] < per_quartile[1]
    second_half_rises = per_quartile[2] < per_quartile[3]
    return first_half_rises + 2 * second_half_rises.astype(int)

In [14]:
# Feature names are taken from the first record only.
# NOTE(review): assumes every record carries the same 'counts' keys — confirm.
feature_set = sorted(novels[0]['counts'].keys())
# One entry per (feature, position 0..3): {novel identifier: counts[feature][position]}.
quartiles = {(f, i): {n['identifier']: n['counts'][f][i] for n in novels}
            for f in feature_set for i in range(4)}
# Rows are novel identifiers; the tuple keys become the column labels, which is
# what lets quartile_to_quad() select one feature's four columns via quartiles[feature].
quartiles = pd.DataFrame(quartiles)

In [22]:
# Author annotations keyed by novel identifier; records that lack the
# respective field are simply left out.
gender_annotations = {n['identifier']: n['chicago_auth_gender']
                      for n in novels
                      if 'chicago_auth_gender' in n}
canon_annotations = {n['identifier']: n['chicago_auth_canon']
                     for n in novels
                     if 'chicago_auth_canon' in n}

# Word whose quartile shape is the prediction target of the "real" tests.
test_word = 'a'

quad = quartile_to_quad(quartiles, test_word)
# Use the most frequent quad label as the positive class.
test_quad = max(range(4), key=lambda q: (quad == q).sum())
print(test_quad)

# Keep only novels that have both annotations. The key intersection is a set;
# pandas 2.x rejects set indexers in .loc, so pass it as a list.
quad = quad.loc[list(gender_annotations.keys() & canon_annotations.keys())]
# Restrict and reorder the metadata to those novels and attach the quad label.
md = md.loc[quad.index].assign(quad=quad)

# From here on both names hold per-row lists aligned with md.index
# (no longer dicts keyed by identifier).
gender_annotations = [gender_annotations[nid] == 'F' for nid in md.index]
md = md.assign(gender_f=gender_annotations)

canon_annotations = [canon_annotations[nid] for nid in md.index]
md = md.assign(canon_y=canon_annotations)

# Identifiers of the novels that survived the filtering above.
works_available = set(md.index)

0


In [23]:
# Preview the first ten rows of the filtered, annotated metadata.
md.iloc[0:10]

Unnamed: 0,corpus,title,author_first,author_last,pub_year,quad,gender_f,canon_y
25523,chicago,The Jericho iteration,Allen M,Steele,1994,1,False,False
23900,chicago,The snares of death,Kate,Charles,1993,0,True,False
21852,chicago,High midnight,Stuart M,Kaminsky,1981,0,False,False
23248,chicago,Orc's opal,Piers,Anthony,1990,0,False,False
23978,chicago,The throat,Peter,Straub,1993,0,False,False
22922,chicago,The Avenue: Clayton City,C. Eric,Lincoln,1988,0,False,False
21890,chicago,Realities,Marian,Schwartz,1981,2,True,False
22562,chicago,This present darkness,Frank E,Peretti,1986,0,False,False
24646,chicago,The puzzled heart,Carolyn Gold,Heilbrun,1998,2,True,False
22410,chicago,Face,Cecile,Pineda,1985,1,True,False


In [33]:
# Load all stored count vectors and keep only the novels that survived the
# annotation filter.
# NOTE(review): relies on the archive keys comparing equal to md's index
# values — confirm the key types match.
features = load_all_counts(count_npz)
features = {k: v for k, v in features.items() if k in works_available}
# The dict yields one column per novel; transpose to get one row per novel.
features = pd.DataFrame(features).transpose()
features.columns = feature_names
# Put the rows in the same order as quad (md was built from quad.index too).
features = features.loc[quad.index]
assert (features.index == md.index).all()
# Prepend the gender label as column 0. Any later cell that takes the first
# n_features columns therefore also feeds '_gender_f' to the classifier.
features.insert(loc=0, column='_gender_f', value=md.gender_f.values.astype(float))
# Disabled variants: add the canon label as a feature / drop the test word's own column.
# features.insert(loc=0, column='_canon_y', value=md.canon_y.values.astype(float))
# features = features.drop(columns=[test_word])
print(features.shape)

(6820, 10001)


# Constructing a cross-validation dataset

In [25]:
# Row positions (valid for both md and features, which share one row order)
# of the two classes: c0 = novels whose quad equals test_quad, c1 = the rest.
c0_sel, = (md['quad'] == test_quad).values.nonzero()
c1_sel, = (md['quad'] != test_quad).values.nonzero()
min_samples = min([len(s) for s in (c0_sel, c1_sel)])
n_cv_samples = 4000
# Draw the same number of rows from each class. Cap it at the size of the
# smaller class so the sample stays balanced even when one class has fewer
# than n_cv_samples // 2 rows (slicing alone would silently return fewer).
n_per_class = min(n_cv_samples // 2, min_samples)
# Fixed seed so the shuffles, and hence the sample, are reproducible.
np.random.seed(1000)
np.random.shuffle(c0_sel)
np.random.shuffle(c1_sel)
cvdata = np.hstack([c0_sel[0:n_per_class],
                    c1_sel[0:n_per_class]])
# Mix the two classes so a positional train/test split contains both.
np.random.shuffle(cvdata)
cvdata_md = md.iloc[cvdata]
cvdata_features = features.iloc[cvdata]

# Logistic regression (easy test)

In [26]:
# Sanity check of the pipeline: predict author gender. Column 0 of the feature
# matrix is '_gender_f', i.e. the label itself is among the inputs, so a
# near-perfect score is expected here.
n_features = 1000
n_train = len(cvdata_features) // 4 * 3   # leading rows train, remainder tests
predict_key = 'gender_f'
predict_val = True

X = cvdata_features.iloc[:n_train].values[:, :n_features]
# Standardize using statistics of the training rows only; the same shift and
# scale are applied to the test rows below.
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
# A column that is constant over the training rows has zero std; dividing by
# it would yield NaN/inf. Scale such columns by 1 instead.
X_std = np.where(X_std == 0, 1.0, X_std)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

lr = LogisticRegression(C=1.0)
lr.fit(X, y)
# Mean accuracy on the held-out rows (displayed as the cell output).
lr.score(X_test, y_test)


0.999

# Logistic regression (real test)

In [27]:
# Real task: predict whether a novel's quad label for the test word equals
# test_quad, from the first n_features feature columns.
n_features = 1000
n_train = len(cvdata_features) // 4 * 3   # leading rows train, remainder tests
predict_key = 'quad'
predict_val = test_quad

X = cvdata_features.iloc[:n_train].values[:, :n_features]
# Standardize using statistics of the training rows only; the same shift and
# scale are applied to the test rows below.
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
# A column that is constant over the training rows has zero std; dividing by
# it would yield NaN/inf. Scale such columns by 1 instead.
X_std = np.where(X_std == 0, 1.0, X_std)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

# C is the inverse regularization strength, so this tiny value regularizes
# very heavily.
lr = LogisticRegression(C=0.00003)
lr.fit(X, y)
# Mean accuracy on the held-out rows (displayed as the cell output).
lr.score(X_test, y_test)

0.541

In [28]:
# Show the first ten feature columns; column 0 is the inserted '_gender_f' label.
cvdata_features.columns[0:10]

Index(['_gender_f', ',', '.', 'a', 'in', 'to', 'the', 'of', 'and', 'it'], dtype='object')

# Support vector machine (easy test)

In [29]:
# Sanity check of the pipeline with an SVM: predict author gender. Column 0 of
# the feature matrix is '_gender_f', i.e. the label itself is among the inputs.
n_features = 1000
n_train = len(cvdata_features) // 4 * 3   # leading rows train, remainder tests
predict_key = 'gender_f'
predict_val = True

X = cvdata_features.iloc[:n_train].values[:, :n_features]
# Standardize using statistics of the training rows only; the same shift and
# scale are applied to the test rows below.
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
# A column that is constant over the training rows has zero std; dividing by
# it would yield NaN/inf. Scale such columns by 1 instead.
X_std = np.where(X_std == 0, 1.0, X_std)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

# The name `lr` is kept from the logistic-regression cells; here it holds an SVC.
lr = SVC(C=1.0)
lr.fit(X, y)
# Mean accuracy on the held-out rows (displayed as the cell output).
lr.score(X_test, y_test)

0.986

# Support vector machine (real test)

In [30]:
# Real task with an SVM: predict whether a novel's quad label for the test
# word equals test_quad, from the first n_features feature columns.
n_features = 1000
n_train = len(cvdata_features) // 4 * 3   # leading rows train, remainder tests
predict_key = 'quad'
predict_val = test_quad

X = cvdata_features.iloc[:n_train].values[:, :n_features]
# Standardize using statistics of the training rows only; the same shift and
# scale are applied to the test rows below.
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
# A column that is constant over the training rows has zero std; dividing by
# it would yield NaN/inf. Scale such columns by 1 instead.
X_std = np.where(X_std == 0, 1.0, X_std)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

# The name `lr` is kept from the logistic-regression cells; here it holds an SVC.
lr = SVC(C=1.0)
lr.fit(X, y)
# Mean accuracy on the held-out rows (displayed as the cell output).
lr.score(X_test, y_test)

0.531

# MultinomialNB (easy test)

In [31]:
# Naive Bayes sanity check on the author-gender label. The feature values are
# used as they are (no standardization step in this cell).
n_features = 1000
n_train = 3 * (len(cvdata_features) // 4)   # leading rows train, remainder tests
predict_key = 'gender_f'
predict_val = True

# Feature matrices: slice rows and columns directly on the underlying array.
X = cvdata_features.values[:n_train, :n_features]
X_test = cvdata_features.values[n_train:, :n_features]

# Binary targets: 1.0 where the metadata column equals predict_val, else 0.0.
y = (cvdata_md[predict_key].iloc[:n_train] == predict_val).values.astype(float)
y_test = (cvdata_md[predict_key].iloc[n_train:] == predict_val).values.astype(float)

# The name `lr` is kept from the earlier cells; here it holds a MultinomialNB.
lr = MultinomialNB().fit(X, y)
lr.score(X_test, y_test)

0.768

# MultinomialNB (real test)

In [32]:
# Naive Bayes on the real task: does a novel's quad label equal test_quad?
# The feature values are used as they are (no standardization step in this cell).
n_features = 1000
n_train = 3 * (len(cvdata_features) // 4)   # leading rows train, remainder tests
predict_key = 'quad'
predict_val = test_quad

# Feature matrices: slice rows and columns directly on the underlying array.
X = cvdata_features.values[:n_train, :n_features]
X_test = cvdata_features.values[n_train:, :n_features]

# Binary targets: 1.0 where the metadata column equals predict_val, else 0.0.
y = (cvdata_md[predict_key].iloc[:n_train] == predict_val).values.astype(float)
y_test = (cvdata_md[predict_key].iloc[n_train:] == predict_val).values.astype(float)

# The name `lr` is kept from the earlier cells; here it holds a MultinomialNB.
lr = MultinomialNB().fit(X, y)
lr.score(X_test, y_test)

0.505