# Loading Word Counts

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.style.use('seaborn-muted')
sns.set(style="whitegrid")

%matplotlib inline

In [104]:
import bz2
import os
import ujson
import attr
import re

import pandas as pd
import numpy as np
import statsmodels.api as sm

from glob import glob
from collections import Counter, UserDict
from itertools import islice
from tqdm import tqdm_notebook
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Loading .npz vectors, metadata, and terciles

In [19]:
count_npz = '../counts/novels.100.npz'

def load_all_counts(path):
    files = [os.path.join(path, f) 
             for f in os.listdir(path) 
             if f.endswith('.npz')]
    collected = {}
    for f in files:
        collected.update(np.load(f))
    return collected

In [5]:
md = pd.read_csv('../metadata/novels-metadata.csv',
                 index_col='identifier')

In [6]:
@attr.s
class Dataset:
    
    root = attr.ib()
    
    def paths(self):
        return glob(os.path.join(self.root, '*.bz2'))
    
    def novels(self):
        for path in tqdm_notebook(self.paths()):
            with bz2.open(path) as fh:
                for line in fh:
                    yield ujson.loads(line)

In [7]:
ds = Dataset('../data/top200-3bins.json/')

In [8]:
novels = list(ds.novels())




In [63]:
def tercile_to_quad(terciles, feature):
    t = terciles[feature]
    return (t[0] < t[1]) + 2 * (t[1] < t[2]).astype(int)

In [20]:
feature_set = sorted(novels[0]['counts'].keys())
terciles = {(f, i): {n['identifier']: n['counts'][f][i] for n in novels}
            for f in feature_set for i in range(3)}
terciles = pd.DataFrame(terciles)

In [71]:
quad = tercile_to_quad(terciles, 'a')
md = md.loc[quad.index].assign(quad=quad)
works_available = set(md_tags.index)

In [69]:
features = load_all_counts(count_npz)
features = {k: v for k, v in features.items() if k in works_available}
features = pd.DataFrame(features).transpose()

# Constructing a cross-validation dataset

In [120]:
q0_sel, = (md['quad'] == 0).values.nonzero()
q1_sel, = (md['quad'] == 1).values.nonzero()
q2_sel, = (md['quad'] == 2).values.nonzero()
q3_sel, = (md['quad'] == 3).values.nonzero()
min_samples = min([len(s) for s in (q0_sel, q1_sel, q2_sel, q3_sel)])  # about 1500
n_cv_samples = 4000
np.random.seed(1000)
np.random.shuffle(q0_sel)
np.random.shuffle(q1_sel)
np.random.shuffle(q2_sel)
np.random.shuffle(q3_sel)
cvdata = np.hstack([q0_sel[0:n_cv_samples // 4], 
                    q1_sel[0:n_cv_samples // 4],
                    q2_sel[0:n_cv_samples // 4],
                    q3_sel[0:n_cv_samples // 4],
                   ])
np.random.shuffle(cvdata)
cvdata_md = md.iloc[cvdata]
cvdata_features = features.iloc[cvdata]

# Logistic regression (easy test)

In [122]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'corpus'
predict_val = 'chicago'

X = cvdata_features.iloc[:n_train].values[:, :n_features]
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

lr = LogisticRegression(C=1.0)
lr.fit(X, y)
lr.score(X_test, y_test)


0.993

# Logistic regression (real test)

In [127]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'quad'
predict_val = 0

X = cvdata_features.iloc[:n_train].values[:, :n_features]
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key]).values.astype(int)       # == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key]).values.astype(int)  # == predict_val).values.astype(float)

lr = LogisticRegression(C=0.3)
lr.fit(X, y)
lr.score(X_test, y_test)

0.285

# MultinomialNB (easy test)

In [116]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'corpus'
predict_val = 'chicago'

X = cvdata_features.iloc[:n_train].values[:, :n_features]
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

lr = MultinomialNB()
lr.fit(X, y)
lr.score(X_test, y_test)


0.945

# MultinomialNB (real test)

In [126]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'quad'
predict_val = 0

X = cvdata_features.iloc[:n_train].values[:, :n_features]
y = (cvdata_md.iloc[:n_train][predict_key]).values.astype(int)       # == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
y_test = (cvdata_md.iloc[n_train:][predict_key]).values.astype(int)  # == predict_val).values.astype(float)

lr = MultinomialNB()
lr.fit(X, y)
lr.score(X_test, y_test)

0.298

# Support vector machine (easy test)

In [86]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'corpus'
predict_val = 'chicago'

X = cvdata_features.iloc[:n_train].values[:, :n_features]
y = (cvdata_md.iloc[:n_train][predict_key] == predict_val).values.astype(float)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
y_test = (cvdata_md.iloc[n_train:][predict_key] == predict_val).values.astype(float)

lr = SVC(C=1.0)
lr.fit(X, y)
lr.score(X_test, y_test)

0.722

# Support vector machine (real test)

In [125]:
n_features = 1000
n_train = len(cvdata_features) // 4 * 3
predict_key = 'quad'

X = cvdata_features.iloc[:n_train].values[:, :n_features]
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X = (X - X_mean) / X_std
y = (cvdata_md.iloc[:n_train][predict_key]).values.astype(int)
X_test = cvdata_features.iloc[n_train:].values[:, :n_features]
X_test = (X_test - X_mean) / X_std
y_test = (cvdata_md.iloc[n_train:][predict_key]).values.astype(int)

lr = SVC(C=1.0)
lr.fit(X, y)
lr.score(X_test, y_test)

0.296