In [2]:
import os
import string

from statistics import mean
from statistics import stdev
from collections import Counter


from spacy.en import English
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import spearmanr

import textacy
import numpy as np

In [3]:
# PATH CONFIG
# Root directory that holds the PAN corpora. It can be overridden with the
# PAN_DATADIR environment variable so the notebook is not tied to one machine.
# os.path.join(..., "") guarantees a trailing separator, which matters because
# the experiment functions build paths with plain `datadir + dataset`.
datadir = os.path.join(os.environ.get("PAN_DATADIR") or "/data/panstuffs/", "")
# Dataset sub-directories (relative to datadir); each keeps its trailing slash.
pan15train = "pan15-authorship-verification-training-dataset-english-2015-04-19/"
pan15test = "pan15-authorship-verification-test-dataset2-english-2015-04-19/"
pan14traine = "pan14-author-verification-training-corpus-english-essays-2014-04-22/"
pan14teste = "pan14-author-verification-test-corpus2-english-essays-2014-04-22/"
pan14trainn = "pan14-author-verification-training-corpus-english-novels-2014-04-22/"
pan14testn = "pan14-author-verification-test-corpus2-english-novels-2014-04-22/"

In [4]:
def read_file(filepath, encoding="utf8"):
    """Return the complete text content of ``filepath``.

    The file is decoded as UTF-8 by default (the same encoding ``get_string``
    uses) instead of whatever the platform locale happens to be, so the same
    corpus yields the same text on every machine.

    Args:
        filepath: path of the file to read.
        encoding: text encoding used to decode the file.

    Returns:
        The file content as a single string.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

class TextPair:
    """Bundle an author identifier with its known text and its unknown text."""

    def __init__(self, author, known, unknown, max_length=1200):
        # The three payload fields are stored exactly as given.
        self.author, self.known, self.unknown = author, known, unknown
        # max_length is only recorded here; nothing in this class applies it.
        self.max_length = max_length

def get_string(filename):
    """Read ``filename`` as UTF-8 text and return its whole content as one string."""
    handle = open(filename, encoding="utf8")
    try:
        return handle.read()
    finally:
        # Always release the file handle, even if decoding fails.
        handle.close()

def get_texts(directory):
    """Build one TextPair per entry of ``directory`` whose name starts with "EN".

    For each such entry the files "known01.txt" and "unknown.txt" inside it are
    read with ``get_string``. Pairs are returned in ``os.listdir`` order.
    """
    def _load_pair(author):
        author_dir = os.path.join(directory, author)
        known_text = get_string(os.path.join(author_dir, "known01.txt"))
        unknown_text = get_string(os.path.join(author_dir, "unknown.txt"))
        return TextPair(author, known_text, unknown_text)

    return [_load_pair(entry) for entry in os.listdir(directory) if entry.startswith("EN")]

def load_pan_data(directory):
    """Load known texts, unknown texts and labels from a PAN-format directory.

    Every sub-directory of ``directory`` is one problem. All files in it whose
    name starts with "known" are concatenated (in sorted filename order, each
    wrapped in newlines) into one known text; "unknown.txt" is the unknown text.
    Labels come from "truth.txt" in ``directory``: the second whitespace-separated
    column of each line, mapped to 1 for "Y" and 0 otherwise.

    Labels are matched to problems by name (first column of truth.txt) whenever
    truth.txt names every problem directory, so the result does not depend on
    the line order of truth.txt. If some directory is not named in truth.txt the
    labels are taken in file order, as before.

    Args:
        directory: path of the dataset directory.

    Returns:
        (known_texts, unknown_texts, y), three lists ordered by sorted problem name.
    """
    authors = sorted(
        x for x in os.listdir(directory) if os.path.isdir(os.path.join(directory, x))
    )
    known_texts = []
    unknown_texts = []
    for author in authors:
        author_dir = os.path.join(directory, author)
        # Sorted so the concatenation order does not depend on os.listdir().
        known_files = sorted(x for x in os.listdir(author_dir) if x.startswith("known"))
        known_texts.append(
            "".join(
                "\n{}\n".format(read_file(os.path.join(author_dir, kf)))
                for kf in known_files
            )
        )
        unknown_texts.append(read_file(os.path.join(author_dir, "unknown.txt")))

    truthfile = os.path.join(directory, "truth.txt")
    # utf-8-sig drops a leading byte-order mark so it cannot stick to the first
    # problem name; blank lines are skipped instead of raising IndexError.
    with open(truthfile, encoding="utf-8-sig") as f:
        rows = [line.split() for line in f if line.strip()]

    label_by_problem = {row[0]: row[1] for row in rows}
    if all(author in label_by_problem for author in authors):
        raw_labels = [label_by_problem[author] for author in authors]
    else:
        # Names do not line up with the directories: fall back to file order.
        raw_labels = [row[1] for row in rows]
    y = [1 if label == "Y" else 0 for label in raw_labels]
    return known_texts, unknown_texts, y

In [5]:
def _normalize_counter(counter, c):
    """Divide all the values in a Counter by a constant and remove padding"""
    for key in counter:
        counter[key] = (counter[key] - 1) / c
    return counter

class TextAnalyser:
    """Extract named per-text features and compare texts through z-scores.

    ``get_named_features`` turns a text into a sorted list of (name, value)
    pairs built from textacy basic counts, punctuation/space character counts,
    spaCy ``pos_`` tag counts and function-word counts, each expressed as a ratio.
    ``calculate_mean_and_std`` and ``calculate_z_scores`` then standardise those
    values against a collection of texts.
    """

    def __init__(self, nlp=None, funcwordsfile="function_words.txt"):
        """Set up the pipeline and the feature vocabularies.

        Args:
            nlp: an already loaded spaCy pipeline to reuse; ``English()`` is
                instantiated when this is None.
            funcwordsfile: path of a text file listing one function word per line.
        """
        # Explicit None check: a pipeline supplied by the caller is always used.
        self.nlp = nlp if nlp is not None else English()

        # characters tracked for the character ratios (punctuation and space only;
        # the previous letters+punctuation assignment was dead code, immediately overwritten)
        self.alphabet = "!?:;,.'- "

        # keys that we care about from textacy.stats
        self.basic_keys = ['n_long_words', 'n_monosyllable_words', 'n_polysyllable_words', 'n_sents', 'n_syllables', 'n_unique_words', 'n_words']

        # keys that we care about for textacy readability stats
        # (kept as an attribute; get_named_features does not currently emit them)
        self.readability_keys = ['automated_readability_index','coleman_liau_index', 'flesch_kincaid_grade_level',
                                 'flesch_readability_ease', 'gulpease_index', 'gunning_fog_index', 'lix',
                                 'wiener_sachtextformel']

        # parts of speech that we care about from spacy (pos_ not tag_)
        self.pos_keys = ['ADJ', 'ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SPACE', 'SYM', 'VERB', 'X']
        self.pos_keys_set = set(self.pos_keys)

        # One word per line; blank lines (e.g. from a trailing newline) and
        # duplicates are dropped so they do not become bogus features.
        with open(funcwordsfile, encoding="utf8") as f:
            words = {line.strip() for line in f}
        words.discard("")
        self.func_words = sorted(words)

    def get_named_features(self, text):
        """Compute the feature list for one text.

        Args:
            text: the raw text. An empty text cannot be normalised because the
                ratios divide by ``len(text)``.

        Returns:
            A list of (feature_name, value) tuples sorted by feature name.
        """
        processed = self.nlp(text, entity=False, tag=True, parse=True)
        basic_stats = textacy.text_stats.TextStats(processed).basic_counts

        # basic counts, normalised by the number of characters in the text
        stats_ratios = {key: (basic_stats[key] / len(text)) for key in self.basic_keys}

        # get only the characters we care about
        # append alphabet so that each character artificially appears once;
        # _normalize_counter subtracts that padding again
        cleaned_text = ''.join(ch for ch in text.lower() + self.alphabet if ch in self.alphabet)
        char_ratios = _normalize_counter(Counter(cleaned_text), len(text))

        # calculate pos ratios (every tag padded once, normalised by len(processed))
        tags = [word.pos_ for word in processed if word.pos_ in self.pos_keys_set] + self.pos_keys
        pos_ratios = _normalize_counter(Counter(tags), len(processed))

        # function-word frequencies relative to len(processed)
        word_counts = Counter(word.text.lower() for word in processed)
        func_words = {fword: (word_counts[fword] / len(processed)) for fword in self.func_words}

        res = stats_ratios
        res.update(char_ratios)
        res.update(pos_ratios)
        res.update(func_words)
        return [(key, res[key]) for key in sorted(res)]

    def calculate_mean_and_std(self, extracted_texts):
        """Compute the per-feature mean and sample standard deviation.

        Args:
            extracted_texts: list of feature lists as returned by
                ``get_named_features``; all lists must share the same feature
                order, since features are matched by position.

        Returns:
            (means, stds): two lists of (feature_name, value) in feature order.
            Both are empty for empty input; with a single text every standard
            deviation is reported as 0.0 because ``stdev`` needs two data points.
        """
        if not extracted_texts:
            return [], []
        means = []
        stds = []
        sample = extracted_texts[0]  # get one text for feature size and names
        # fi = feature index
        for fi in range(len(sample)):
            values = [stat[fi][1] for stat in extracted_texts]
            name = sample[fi][0]
            means.append((name, mean(values)))
            stds.append((name, stdev(values) if len(values) > 1 else 0.0))
        return means, stds

    def calculate_z_scores(self, extracted_text, means, stds):
        """Calculate the z-score of each feature of a single text.

        Args:
            extracted_text: feature list of one text.
            means, stds: output of ``calculate_mean_and_std``, in the same
                feature order as ``extracted_text``.

        Returns:
            A list of (zscore, feature_name) tuples; a feature whose standard
            deviation is zero gets a z-score of 0.
        """
        # z = (X - μ) / σ
        zscores = []
        for fi in range(len(extracted_text)):
            feature_name = extracted_text[fi][0]
            try:
                zscore = (extracted_text[fi][1] - means[fi][1]) / stds[fi][1]
            except ZeroDivisionError:
                zscore = 0
            zscores.append((zscore, feature_name))
        return zscores

In [6]:
def format_unusual_features(zscores, nfeatures=5):
    """Render the lowest and highest z-scored features as a text table.

    Args:
        zscores: list of (zscore, feature_name) tuples.
        nfeatures: number of rows to show at each end. It is clamped to the
            number of available features, so short lists no longer raise
            IndexError.

    Returns:
        A string with the lowest features (ascending), a ". . ." separator line,
        then the highest features (descending), one feature per line.
    """
    ranked = sorted(zscores)
    count = max(0, min(nfeatures, len(ranked)))

    lines = []
    for entry in ranked[:count]:
        fname = entry[1]
        padding = " " * (30 - len(fname))
        lines.append("{} is low {}(zscore: {})\n".format(fname, padding, entry[0]))
    lines.append("    . . .     \n")
    # Highest first: walk the tail of the ranking backwards.
    for entry in ranked[len(ranked) - count:][::-1]:
        fname = entry[1]
        padding = " " * (30 - len(fname))
        lines.append("{} is high{}(zscore: {})\n".format(fname, padding, entry[0]))
    return "".join(lines)

def supports_and_opposes(zscores1, zscores2, sup_threshold=1.5):
    """Split position-aligned z-score pairs into agreeing and disagreeing features.

    A pair "supports" when both z-scores lie beyond ``sup_threshold`` on the same
    side (both above it, or both below its negative) and "opposes" when they lie
    beyond it on opposite sides. All other pairs are ignored.

    Returns:
        (supports, opposes): two lists of (entry_from_zscores1, entry_from_zscores2).
    """
    supports, opposes = [], []
    for position, first in enumerate(zscores1):
        second = zscores2[position]
        first_high = first[0] > sup_threshold
        first_low = first[0] < -sup_threshold
        second_high = second[0] > sup_threshold
        second_low = second[0] < -sup_threshold

        if (first_high and second_high) or (first_low and second_low):
            # both are unusual in the same direction
            supports.append((first, second))
        elif (first_high and second_low) or (first_low and second_high):
            # unusual in opposite directions
            opposes.append((first, second))
    return supports, opposes

def shared_top_bottom(zs1, zs2, n=10):
    """Compare the n highest and n lowest z-scored features of two texts.

    Returns:
        A 4-tuple of feature-name sets:
        (in both tops, in both bottoms, top of zs1 & bottom of zs2,
        top of zs2 & bottom of zs1).
    """
    def _names(entries):
        # entries are (zscore, feature_name) tuples
        return {entry[1] for entry in entries}

    ranked1, ranked2 = sorted(zs1), sorted(zs2)
    top1, bottom1 = _names(ranked1[-n:]), _names(ranked1[:n])
    top2, bottom2 = _names(ranked2[-n:]), _names(ranked2[:n])

    return top1 & top2, bottom1 & bottom2, top1 & bottom2, top2 & bottom1

def _paired_zscores(knowns, unknowns, ta=None):
    """Extract features for both text lists and z-score them against the pooled texts.

    Returns (knownzs, unknownzs), each a list with one z-score list per text.
    """
    if ta is None:
        ta = TextAnalyser()
    knownstats = [ta.get_named_features(text) for text in knowns]
    unknownstats = [ta.get_named_features(text) for text in unknowns]
    # mean/std are computed over known and unknown texts together
    means, stds = ta.calculate_mean_and_std(knownstats + unknownstats)
    knownzs = [ta.calculate_z_scores(ks, means, stds) for ks in knownstats]
    unknownzs = [ta.calculate_z_scores(us, means, stds) for us in unknownstats]
    return knownzs, unknownzs


def get_predictions(knowns, unknowns, threshold=1.5, ta=None):
    """Predict, for each known/unknown pair, whether supports outnumber opposes.

    Args:
        knowns, unknowns: parallel lists of texts (pair i is knowns[i], unknowns[i]).
        threshold: z-score threshold passed to ``supports_and_opposes``.
        ta: optional TextAnalyser to reuse; a new one is created when None.

    Returns:
        A list of booleans, True when a pair has more supporting than opposing
        features.
    """
    knownzs, unknownzs = _paired_zscores(knowns, unknowns, ta)
    preds = []
    for i in range(len(knowns)):
        s, o = supports_and_opposes(knownzs[i], unknownzs[i], sup_threshold=threshold)
        preds.append(len(s) > len(o))
    return preds


def get_predictions_correlation(knowns, unknowns, threshold=1.5, ta=None):
    """Same predictions as ``get_predictions``.

    NOTE(review): despite its name this function never used a correlation; it
    was a line-for-line copy of ``get_predictions``, so it now delegates to it.
    """
    return get_predictions(knowns, unknowns, threshold=threshold, ta=ta)


def compare_texts_example(knowns, unknowns, index=0, ta=None):
    """Print the 10 lowest and 10 highest z-scored features of the pair at ``index``.

    The known text's table is printed first, then the unknown text's.
    """
    knownzs, unknownzs = _paired_zscores(knowns, unknowns, ta)
    print(format_unusual_features(knownzs[index], 10))
    print(format_unusual_features(unknownzs[index], 10))


def compare_texts_s_o(knowns, unknowns, index=0, threshold=0.75, ta=None):
    """Return the supporting and opposing features of the pair at ``index``.

    Args:
        threshold: z-score threshold for ``supports_and_opposes`` (previously
            hard-coded to 0.75, which remains the default).

    Returns:
        (supports, opposes) as produced by ``supports_and_opposes``.
    """
    knownzs, unknownzs = _paired_zscores(knowns, unknowns, ta)
    s, o = supports_and_opposes(knownzs[index], unknownzs[index], threshold)
    return s, o


def get_explanatory_example(knowns, unknowns, threshold=1.5, ta=None):
    """Compute, per pair, the share of supporting features among all flagged ones.

    Returns:
        A list of (support_ratio, pair_index) where
        support_ratio = supports / (supports + opposes). Pairs with no flagged
        feature at all get 0.0 instead of raising ZeroDivisionError.
    """
    knownzs, unknownzs = _paired_zscores(knowns, unknowns, ta)

    ratio_supports = []
    for i in range(len(knowns)):
        s, o = supports_and_opposes(knownzs[i], unknownzs[i], sup_threshold=threshold)
        flagged = len(s) + len(o)
        support_ratio = len(s) / flagged if flagged else 0.0
        ratio_supports.append((support_ratio, i))
    return ratio_supports

In [9]:
def run_experiment(dataset, threshold):
    """Evaluate ``get_predictions`` on one dataset and print the results.

    Prints a header, the distribution of predictions, the classification
    report and the accuracy. ``dataset`` is appended to the global ``datadir``.
    """
    header = "Running on {} with threshold {}".format(dataset, threshold)
    print(header)

    dataset_path = datadir + dataset
    known_texts, unknown_texts, truth = load_pan_data(dataset_path)
    predicted = get_predictions(known_texts, unknown_texts, threshold=threshold)

    print(Counter(predicted))
    print(classification_report(truth, predicted))
    print(accuracy_score(truth, predicted))

In [10]:
# Run threshold experiments

In [11]:
%%time 
run_experiment(pan14traine, 0.1)
run_experiment(pan14trainn, 0.1)
run_experiment(pan14teste, 0.1)
run_experiment(pan14testn, 0.1)
run_experiment(pan15train, 0.1)
run_experiment(pan15test, 0.1)

Running on pan14-author-verification-training-corpus-english-essays-2014-04-22/ with threshold 0.1
Counter({True: 194, False: 6})
             precision    recall  f1-score   support

          0       0.83      0.05      0.09       100
          1       0.51      0.99      0.67       100

avg / total       0.67      0.52      0.38       200

0.52
Running on pan14-author-verification-training-corpus-english-novels-2014-04-22/ with threshold 0.1
Counter({True: 83, False: 17})
             precision    recall  f1-score   support

          0       1.00      0.34      0.51        50
          1       0.60      1.00      0.75        50

avg / total       0.80      0.67      0.63       100

0.67
Running on pan14-author-verification-test-corpus2-english-essays-2014-04-22/ with threshold 0.1
Counter({True: 193, False: 7})
             precision    recall  f1-score   support

          0       0.86      0.06      0.11       100
          1       0.51      0.99      0.68       100

avg / total 

  'precision', 'predicted', average, warn_for)


Counter({True: 492, False: 8})
             precision    recall  f1-score   support

          0       1.00      0.03      0.06       250
          1       0.51      1.00      0.67       250

avg / total       0.75      0.52      0.37       500

0.516
CPU times: user 5min 51s, sys: 27.4 s, total: 6min 19s
Wall time: 6min 29s


In [12]:
%%time 
run_experiment(pan14traine, 0.5)
run_experiment(pan14trainn, 0.5)
run_experiment(pan14teste, 0.5)
run_experiment(pan14testn, 0.5)
run_experiment(pan15train, 0.5)
run_experiment(pan15test, 0.5)

Running on pan14-author-verification-training-corpus-english-essays-2014-04-22/ with threshold 0.5
Counter({True: 153, False: 47})
             precision    recall  f1-score   support

          0       0.70      0.33      0.45       100
          1       0.56      0.86      0.68       100

avg / total       0.63      0.59      0.56       200

0.595
Running on pan14-author-verification-training-corpus-english-novels-2014-04-22/ with threshold 0.5
Counter({True: 68, False: 32})
             precision    recall  f1-score   support

          0       1.00      0.64      0.78        50
          1       0.74      1.00      0.85        50

avg / total       0.87      0.82      0.81       100

0.82
Running on pan14-author-verification-test-corpus2-english-essays-2014-04-22/ with threshold 0.5
Counter({True: 141, False: 59})
             precision    recall  f1-score   support

          0       0.78      0.46      0.58       100
          1       0.62      0.87      0.72       100

avg / tot

In [13]:
%%time 
run_experiment(pan14traine, 0.75)
run_experiment(pan14trainn, 0.75)
run_experiment(pan14teste, 0.75)
run_experiment(pan14testn, 0.75)
run_experiment(pan15train, 0.75)
run_experiment(pan15test, 0.75)

Running on pan14-author-verification-training-corpus-english-essays-2014-04-22/ with threshold 0.75
Counter({True: 146, False: 54})
             precision    recall  f1-score   support

          0       0.74      0.40      0.52       100
          1       0.59      0.86      0.70       100

avg / total       0.66      0.63      0.61       200

0.63
Running on pan14-author-verification-training-corpus-english-novels-2014-04-22/ with threshold 0.75
Counter({True: 64, False: 36})
             precision    recall  f1-score   support

          0       0.94      0.68      0.79        50
          1       0.75      0.96      0.84        50

avg / total       0.85      0.82      0.82       100

0.82
Running on pan14-author-verification-test-corpus2-english-essays-2014-04-22/ with threshold 0.75
Counter({True: 142, False: 58})
             precision    recall  f1-score   support

          0       0.83      0.48      0.61       100
          1       0.63      0.90      0.74       100

avg / t

In [14]:
%%time 
run_experiment(pan14traine, 1)
run_experiment(pan14trainn, 1)
run_experiment(pan14teste, 1)
run_experiment(pan14testn, 1)
run_experiment(pan15train, 1)
run_experiment(pan15test, 1)

Running on pan14-author-verification-training-corpus-english-essays-2014-04-22/ with threshold 1
Counter({True: 147, False: 53})
             precision    recall  f1-score   support

          0       0.79      0.42      0.55       100
          1       0.61      0.89      0.72       100

avg / total       0.70      0.66      0.63       200

0.655
Running on pan14-author-verification-training-corpus-english-novels-2014-04-22/ with threshold 1
Counter({True: 66, False: 34})
             precision    recall  f1-score   support

          0       0.94      0.64      0.76        50
          1       0.73      0.96      0.83        50

avg / total       0.83      0.80      0.79       100

0.8
Running on pan14-author-verification-test-corpus2-english-essays-2014-04-22/ with threshold 1
Counter({True: 128, False: 72})
             precision    recall  f1-score   support

          0       0.75      0.54      0.63       100
          1       0.64      0.82      0.72       100

avg / total     

In [15]:
%%time 
run_experiment(pan14traine, 1.5)
run_experiment(pan14trainn, 1.5)
run_experiment(pan14teste, 1.5)
run_experiment(pan14testn, 1.5)
run_experiment(pan15train, 1.5)
run_experiment(pan15test, 1.5)

Running on pan14-author-verification-training-corpus-english-essays-2014-04-22/ with threshold 1.5
Counter({True: 111, False: 89})
             precision    recall  f1-score   support

          0       0.63      0.56      0.59       100
          1       0.60      0.67      0.64       100

avg / total       0.62      0.61      0.61       200

0.615
Running on pan14-author-verification-training-corpus-english-novels-2014-04-22/ with threshold 1.5
Counter({True: 55, False: 45})
             precision    recall  f1-score   support

          0       0.67      0.60      0.63        50
          1       0.64      0.70      0.67        50

avg / total       0.65      0.65      0.65       100

0.65
Running on pan14-author-verification-test-corpus2-english-essays-2014-04-22/ with threshold 1.5
Counter({True: 117, False: 83})
             precision    recall  f1-score   support

          0       0.64      0.53      0.58       100
          1       0.60      0.70      0.65       100

avg / tot

In [17]:
def _load_all_pan_data():
    """Load and concatenate all six configured PAN datasets.

    Returns (all_knowns, all_unknowns, all_labels) in dataset order.
    """
    all_knowns = []
    all_unknowns = []
    all_labels = []
    datasets = [pan14traine, pan14trainn, pan14teste, pan14testn, pan15train, pan15test]
    for dataset in datasets:
        knowns, unknowns, labels = load_pan_data(datadir + dataset)
        all_knowns += knowns
        all_unknowns += unknowns
        all_labels += labels
    return all_knowns, all_unknowns, all_labels


def run_on_all_data(threshold):
    """Evaluate ``get_predictions`` on the union of all datasets and print the metrics."""
    print("Running on all data with threshold {}".format(threshold))
    # No TextAnalyser is built here: get_predictions creates its own, and the
    # previously constructed instance was never used.
    all_knowns, all_unknowns, all_labels = _load_all_pan_data()
    preds = get_predictions(all_knowns, all_unknowns, threshold=threshold)
    print(Counter(preds))
    print(classification_report(all_labels, preds))
    print(accuracy_score(all_labels, preds))


def run_on_all_data_thresholds(thresholds=None):
    """Evaluate a sweep of thresholds on the union of all datasets.

    Features and z-scores do not depend on the threshold, so they are computed
    once and only the support/oppose comparison is repeated per threshold.
    The predictions are the same ones ``get_predictions`` would produce.

    Args:
        thresholds: iterable of z-score thresholds; defaults to the original
            sweep from 1.75 to 5.
    """
    if thresholds is None:
        thresholds = [1.75, 1.8, 1.9, 2, 2.1, 2.2, 2.3, 2.5, 2.75, 3, 4, 5]

    all_knowns, all_unknowns, all_labels = _load_all_pan_data()

    ta = TextAnalyser()
    knownstats = [ta.get_named_features(text) for text in all_knowns]
    unknownstats = [ta.get_named_features(text) for text in all_unknowns]
    means, stds = ta.calculate_mean_and_std(knownstats + unknownstats)
    knownzs = [ta.calculate_z_scores(ks, means, stds) for ks in knownstats]
    unknownzs = [ta.calculate_z_scores(us, means, stds) for us in unknownstats]

    for threshold in thresholds:
        print("Threshold: {}".format(threshold))
        preds = []
        for i in range(len(all_knowns)):
            s, o = supports_and_opposes(knownzs[i], unknownzs[i], sup_threshold=threshold)
            preds.append(len(s) > len(o))
        print(Counter(preds))
        print(classification_report(all_labels, preds))
        print(accuracy_score(all_labels, preds))

In [18]:
%%time
run_on_all_data(0.1)
run_on_all_data(0.5)
run_on_all_data(0.75)
run_on_all_data(1)
run_on_all_data(1.5)

Running on all data with threshold 0.1


KeyboardInterrupt: 

In [None]:
run_on_all_data(1.75)
run_on_all_data(2)
run_on_all_data(2.5)
run_on_all_data(3)

In [None]:
def get_rhos(knownstats, unknownstats):
    """Run ``spearmanr`` on the feature values of each known/unknown pair.

    Both arguments are lists of feature lists of (name, value) tuples; pair i is
    knownstats[i] with unknownstats[i].

    Returns:
        A list holding the ``spearmanr`` result for every pair.
    """
    def _values(features):
        return [feature[1] for feature in features]

    return [
        spearmanr(_values(known), _values(unknownstats[position]))
        for position, known in enumerate(knownstats)
    ]

def run_correlation_experiment(dataset, threshold):
    """Evaluate the correlation-based predictor on one dataset and print the metrics.

    A pair is predicted 1 when the first element of its ``spearmanr`` result
    exceeds ``threshold``, otherwise 0.
    """
    analyser = TextAnalyser()
    known_texts, unknown_texts, truth = load_pan_data(datadir + dataset)

    known_features = [analyser.get_named_features(text) for text in known_texts]
    unknown_features = [analyser.get_named_features(text) for text in unknown_texts]

    predicted = []
    for result in get_rhos(known_features, unknown_features):
        rho = result[0]
        predicted.append(1 if rho > threshold else 0)

    print(Counter(predicted))
    print(classification_report(truth, predicted))
    print(accuracy_score(truth, predicted))
    
def run_correlation_on_all_data(threshold):
    """Evaluate the correlation-based predictor on all six datasets combined.

    Prints a header, the prediction distribution, the classification report and
    the accuracy. A pair is predicted 1 when the first element of its
    ``spearmanr`` result exceeds ``threshold``, otherwise 0.
    """
    print("Running on all data with threshold {}".format(threshold))
    analyser = TextAnalyser()

    pooled_knowns, pooled_unknowns, pooled_labels = [], [], []
    for name in (pan14traine, pan14trainn, pan14teste, pan14testn, pan15train, pan15test):
        knowns, unknowns, labels = load_pan_data(datadir + name)
        pooled_knowns.extend(knowns)
        pooled_unknowns.extend(unknowns)
        pooled_labels.extend(labels)

    known_features = [analyser.get_named_features(text) for text in pooled_knowns]
    unknown_features = [analyser.get_named_features(text) for text in pooled_unknowns]

    predicted = []
    for result in get_rhos(known_features, unknown_features):
        rho = result[0]
        predicted.append(1 if rho > threshold else 0)

    print(Counter(predicted))
    print(classification_report(pooled_labels, predicted))
    print(accuracy_score(pooled_labels, predicted))

In [None]:
%%time
run_correlation_experiment(pan14traine, 0.65)
run_correlation_experiment(pan14trainn, 0.65)
run_correlation_experiment(pan14teste,  0.65)
run_correlation_experiment(pan14testn,  0.65)
run_correlation_experiment(pan15train,  0.65)
run_correlation_experiment(pan15test,   0.65)

In [None]:
%%time 
run_correlation_experiment(pan14traine, 0.75)
run_correlation_experiment(pan14trainn, 0.75)
run_correlation_experiment(pan14teste,  0.75)
run_correlation_experiment(pan14testn,  0.75)
run_correlation_experiment(pan15train,  0.75)
run_correlation_experiment(pan15test,   0.75)

In [None]:
%%time 
run_correlation_experiment(pan14traine, 0.8)
run_correlation_experiment(pan14trainn, 0.8)
run_correlation_experiment(pan14teste,  0.8)
run_correlation_experiment(pan14testn,  0.8)
run_correlation_experiment(pan15train,  0.8)
run_correlation_experiment(pan15test,   0.8)

In [None]:
%%time
run_correlation_on_all_data(0.65)
run_correlation_on_all_data(0.75)
run_correlation_on_all_data(0.8)

In [145]:
"""Formated example"""
knowns, unknowns, labels = load_pan_data(datadir + pan15test)
r = get_explanatory_example(knowns, unknowns, 0.75)

In [147]:
sorted(r)

[(0.14285714285714285, 232),
 (0.15384615384615385, 374),
 (0.18181818181818182, 342),
 (0.2, 48),
 (0.2, 117),
 (0.21428571428571427, 417),
 (0.23076923076923078, 33),
 (0.23076923076923078, 186),
 (0.23076923076923078, 295),
 (0.23529411764705882, 169),
 (0.23809523809523808, 463),
 (0.25, 96),
 (0.25, 150),
 (0.25, 203),
 (0.25, 428),
 (0.2631578947368421, 394),
 (0.26666666666666666, 143),
 (0.26666666666666666, 273),
 (0.26666666666666666, 337),
 (0.26666666666666666, 480),
 (0.2727272727272727, 496),
 (0.27586206896551724, 255),
 (0.2777777777777778, 119),
 (0.2777777777777778, 259),
 (0.2857142857142857, 5),
 (0.2857142857142857, 155),
 (0.2857142857142857, 163),
 (0.2857142857142857, 237),
 (0.2857142857142857, 475),
 (0.2857142857142857, 479),
 (0.2916666666666667, 311),
 (0.29411764705882354, 408),
 (0.3076923076923077, 491),
 (0.3125, 400),
 (0.3157894736842105, 220),
 (0.3157894736842105, 386),
 (0.32142857142857145, 482),
 (0.3333333333333333, 55),
 (0.3333333333333333, 99

In [168]:
compare_texts_example(knowns, unknowns, index=249)

a is low                              (zscore: -1.3631859377723599)
' is low                              (zscore: -1.330374697989496)
NUM is low                            (zscore: -1.0216623099452453)
PART is low                           (zscore: -0.984767912015898)
it is low                             (zscore: -0.9629009430056533)
them is low                           (zscore: -0.9111994581394077)
like is low                           (zscore: -0.9091884013431722)
n_monosyllable_words is low           (zscore: -0.8766819643031585)
am is low                             (zscore: -0.8429081158257491)
n_words is low                        (zscore: -0.7926908289651476)
    . . .     
their is high                         (zscore: 4.963859653713718)
they is high                          (zscore: 4.027513929464848)
whose is high                         (zscore: 3.7145108573431664)
none is high                          (zscore: 3.3024317477432676)
which is high                         (zs

In [172]:
compare_texts_s_o(knowns, unknowns, 249)

([((-1.330374697989496, "'"), (-1.0085045296188926, "'")),
  ((2.3435246687571607, ':'), (1.1950977492780703, ':')),
  ((-1.3631859377723599, 'a'), (-1.1693968576909042, 'a')),
  ((1.2089331417245845, 'as'), (0.8771542552048426, 'as')),
  ((-0.9091884013431722, 'like'), (-0.9091884013431722, 'like')),
  ((1.499266335077023, 'my'), (3.4135798453393975, 'my')),
  ((1.571859119125854, 'n_long_words'), (0.9289180107168988, 'n_long_words')),
  ((-0.8766819643031585, 'n_monosyllable_words'),
   (-1.3059372616234524, 'n_monosyllable_words')),
  ((1.3234938033943573, 'n_polysyllable_words'),
   (1.982662422040904, 'n_polysyllable_words')),
  ((-0.7926908289651476, 'n_words'), (-0.7939992255209427, 'n_words')),
  ((1.581336717864196, 'no'), (1.7314268102305792, 'no')),
  ((0.8563240671934634, 'once'), (0.9234812478063622, 'once')),
  ((1.962493926685329, 'over'), (0.7539200772422432, 'over')),
  ((2.5120692912518465, 'which'), (4.208417904084811, 'which')),
  ((1.1677228135627893, 'with'), (0.9

In [174]:
barrie = [((-1.330374697989496, "'"), (-1.0085045296188926, "'")),
  ((2.3435246687571607, ':'), (1.1950977492780703, ':')),
  ((-1.3631859377723599, 'a'), (-1.1693968576909042, 'a')),
  ((1.2089331417245845, 'as'), (0.8771542552048426, 'as')),
  ((-0.9091884013431722, 'like'), (-0.9091884013431722, 'like')),
  ((1.499266335077023, 'my'), (3.4135798453393975, 'my')),
  ((1.571859119125854, 'n_long_words'), (0.9289180107168988, 'n_long_words')),
  ((-0.8766819643031585, 'n_monosyllable_words'),
   (-1.3059372616234524, 'n_monosyllable_words')),
  ((1.3234938033943573, 'n_polysyllable_words'),
   (1.982662422040904, 'n_polysyllable_words')),
  ((-0.7926908289651476, 'n_words'), (-0.7939992255209427, 'n_words')),
  ((1.581336717864196, 'no'), (1.7314268102305792, 'no')),
  ((0.8563240671934634, 'once'), (0.9234812478063622, 'once')),
  ((1.962493926685329, 'over'), (0.7539200772422432, 'over')),
  ((2.5120692912518465, 'which'), (4.208417904084811, 'which')),
  ((1.1677228135627893, 'with'), (0.9644714961289469, 'with'))]

In [175]:
len(barrie)

15

In [181]:
for feature in barrie:
    print("{0},{1:.2f},{2:.2f}".format(feature[0][1], feature[0][0], feature[1][0]))

',-1.33,-1.01
:,2.34,1.20
a,-1.36,-1.17
as,1.21,0.88
like,-0.91,-0.91
my,1.50,3.41
n_long_words,1.57,0.93
n_monosyllable_words,-0.88,-1.31
n_polysyllable_words,1.32,1.98
n_words,-0.79,-0.79
no,1.58,1.73
once,0.86,0.92
over,1.96,0.75
which,2.51,4.21
with,1.17,0.96


In [184]:
knowns[249].count(" ")

503

In [185]:
unknowns[249].count(" ")

428