# Loading Word Counts

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

%matplotlib inline

In [133]:
import bz2
import os
import ujson
import attr
import re

import pandas as pd
import numpy as np
import statsmodels.api as sm

from pprint import pprint
from glob import glob
from collections import Counter, UserDict
from itertools import islice
from tqdm import tqdm_notebook
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Loading .npz vectors, metadata, and terciles

In [61]:
count_npz = '../counts/novels.100.npz'
feature_name_file = '../counts/10kwords.txt'
with open(feature_name_file) as ip:
    feature_names = [f.strip() for f in ip]

def load_all_counts(path):
    files = [os.path.join(path, f) 
             for f in os.listdir(path) 
             if f.endswith('.npz')]
    collected = {}
    for f in files:
        collected.update(np.load(f))
    return collected

In [4]:
md = pd.read_csv('../metadata/novels-metadata.csv',
                 index_col='identifier')

In [5]:
@attr.s
class Dataset:
    
    root = attr.ib()
    
    def paths(self):
        return glob(os.path.join(self.root, '*.bz2'))
    
    def novels(self):
        for path in tqdm_notebook(self.paths()):
            with bz2.open(path) as fh:
                for line in fh:
                    yield ujson.loads(line)

In [6]:
ds = Dataset('../data/top200-3bins.json/')

In [7]:
novels = list(ds.novels())

Widget Javascript not detected.  It may not be installed or enabled properly.





In [51]:
# This generates quad labels like so:
#     2  3
#     0  1
# 0 == fall, fall
# 1 == rise, fall
# 2 == fall, rise
# 3 == rise, rise

def tercile_to_quad(terciles, feature):
    t = terciles[feature]
    return (t[0] < t[1]) + 2 * (t[1] < t[2]).astype(int)

In [9]:
feature_set = sorted(novels[0]['counts'].keys())
terciles = {(f, i): {n['identifier']: n['counts'][f][i] for n in novels}
            for f in feature_set for i in range(3)}
terciles = pd.DataFrame(terciles)

In [146]:
test_word = 'a'
test_quad = 0
quad = tercile_to_quad(terciles, test_word)
md = md.loc[quad.index].assign(quad=quad)
works_available = set(md.index)

In [147]:
features = load_all_counts(count_npz)
features = {k: v for k, v in features.items() if k in works_available}
features = pd.DataFrame(features).transpose()
features.columns = feature_names
# features = features.drop(columns=[test_word])

# Constructing a cross-validation dataset

In [148]:
c0_sel, = (md['quad'] == test_quad).values.nonzero()
c1_sel, = (md['quad'] != test_quad).values.nonzero()
min_samples = min([len(s) for s in (c0_sel, c1_sel)])
n_cv_samples = 8000
np.random.seed(1000)
np.random.shuffle(c0_sel)
np.random.shuffle(c1_sel)
cvdata = np.hstack([c0_sel[0:n_cv_samples // 2], 
                    c1_sel[0:n_cv_samples // 2]])
np.random.shuffle(cvdata)
cvdata_md = md.iloc[cvdata]
cvdata_features = features.iloc[cvdata]

# Logistic regression (easy test)

In [149]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'corpus'
predict_val = 'chicago'

X = cvdata_features.iloc[:n_train].values[:, :n_features]
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

lr = LogisticRegression(C=1.0)
lr.fit(X, y)
lr.score(X_test, y_test)


0.996

# Logistic regression (real test)

In [150]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'quad'
predict_val = test_quad

X = cvdata_features.iloc[:n_train].values[:, :n_features]
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

lr = LogisticRegression(C=0.3)
lr.fit(X, y)
lr.score(X_test, y_test)
sum(y_test) / len(y_test)

0.511

# Support vector machine (easy test)

In [121]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'corpus'
predict_val = 'chicago'

X = cvdata_features.iloc[:n_train].values[:, :n_features]
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

lr = SVC(C=1.0)
lr.fit(X, y)
lr.score(X_test, y_test)

0.981

# Support vector machine (real test)

In [122]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'quad'
predict_val = test_quad

X = cvdata_features.iloc[:n_train].values[:, :n_features]
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

lr = SVC(C=1.0)
lr.fit(X, y)
lr.score(X_test, y_test)

0.587

# MultinomialNB (easy test)

In [123]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'corpus'
predict_val = 'chicago'

X = cvdata_features.iloc[:n_train].values[:, :n_features]
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

lr = MultinomialNB()
lr.fit(X, y)
lr.score(X_test, y_test)

0.958

# MultinomialNB (real test)

In [124]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'quad'
predict_val = test_quad

X = cvdata_features.iloc[:n_train].values[:, :n_features]
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

lr = MultinomialNB()
lr.fit(X, y)
lr.score(X_test, y_test)

0.543