In [1]:
import spacy
# Load spaCy's English pipeline once. `sp` is the shared parser that the
# script loaders and the fan-work loop call to tokenize raw text.
# NOTE(review): `mk_vectors` reads per-token `.has_vector` / `.vector`, so the
# installed 'en' model presumably needs to ship with word vectors -- confirm.
sp = spacy.load('en')

In [39]:
import re
import os
import random
import csv
from pprint import pprint
from collections import Counter

import numpy
import scipy
import pandas
import nearpy
from bs4 import BeautifulSoup
from scipy import signal

import matplotlib
%matplotlib inline

In [40]:
### Script Settings ###

# Everything a reader is expected to tune lives in this cell: where the
# inputs are read from, and the parameters that control matching.

# --- Input locations -------------------------------------------------
# Both inputs are resolved relative to `home_folder`.
home_folder = '../../../../'
original_script_filename = os.path.join(
    home_folder, 'original-scripts/force-awakens/force-awakens-lines.csv')
fan_work_directory = os.path.join(
    home_folder, 'fan-works/force-awakens-fullset/plaintext')

# --- Matching parameters ---------------------------------------------
# Number of consecutive tokens that make up one n-gram window.
window_size = 10

# A fan window and a script window count as a match only when the cosine
# distance between them is strictly below this value.
distance_threshold = 0.25

# --- Approximate nearest neighbor parameters -------------------------
number_of_hashes = 15  # Bigger -> slower (linear), more matches
hash_dimensions = 14   # Bigger -> faster (???), fewer matches

In [41]:
# This cell is commented out until we need Bokeh.

#import bokeh.io
#from bokeh.io import push_notebook, show, output_notebook
#from bokeh.plotting import figure
#from bokeh.resources import INLINE
#from bokeh.models import Range1d

#from time import sleep
#output_notebook(resources=INLINE)
#sleep(1)                  # Otherwise `Run All` messes things up; 
#bokeh.io._nb_loaded=True  # see https://github.com/bokeh/bokeh/issues/4987

In [42]:
# Utility functions

def mk_vectors(sp_txt):
    """Given a parsed text in `spacy`'s native format, produce
    a sequence of vectors, one per token.

    Parameters
    ----------
    sp_txt : sequence of tokens
        Each token must expose `has_vector` and `vector`, and `str(token)`
        must give the token's text.

    Returns
    -------
    numpy.ndarray
        Float array with one row per token. The row width is the length
        of the first token's `vector`. Empty input yields shape `(0, 0)`.
    """
    # Local import keeps this function self-contained.
    import zlib

    rows = len(sp_txt)
    # The conditional must wrap the whole `len(...)` call. Previously the
    # `if/else` sat inside it, so an empty text evaluated `len(0)` and
    # raised TypeError.
    cols = len(sp_txt[0].vector) if rows else 0
    vectors = numpy.empty((rows, cols), dtype=float)

    def stable_slot(text):
        # The built-in `hash()` is salted per process for strings, which
        # made fallback vectors differ between kernel sessions. CRC32 gives
        # the same slot for the same text on every run.
        return zlib.crc32(text.encode('utf-8')) % cols

    for i, word in enumerate(sp_txt):
        if word.has_vector:
            vectors[i] = word.vector
        else:
            # It seems `spacy` doesn't have a pre-trained vector for
            # this word. So we do something pretty dumb here to give
            # the word a vector that is derived from its text and is
            # unlikely to resemble other words: a zero vector with up
            # to three slots set to 1, chosen by hashing the text
            # repeated one, two and three times.
            w_str = str(word)
            vectors[i] = 0
            for repeats in (1, 2, 3):
                vectors[i, stable_slot(w_str * repeats)] = 1.0
    return vectors

def cosine_distance(row_values, col_values):
    """Calculate the cosine distance between two vectors. Also
    accepts matrices and 2-d arrays, and calculates the
    distances over the cross product of rows and columns.

    Parameters
    ----------
    row_values : array-like, 1-d or 2-d
        A 1-d input is treated as a single row. For 2-d input each
        *row* is one vector.
    col_values : array-like, 1-d or 2-d
        A 1-d input is treated as a single column. For 2-d input each
        *column* is one vector.

    Returns
    -------
    numpy.ndarray
        2-d array whose entry `[i, j]` is `1 - cos(row i, column j)`.
        A zero-length vector makes the corresponding entries non-finite,
        because the division by its norm is not guarded.

    Raises
    ------
    ValueError
        If either input has a dimensionality other than 1 or 2.
    """
    verr_msg = '`cosine_distance` is not defined for {}-dimensional arrays.'

    # Work on plain ndarrays so `*` is always elementwise (it is a matrix
    # product for `numpy.matrix`) and so lists are accepted too.
    row_values = numpy.asarray(row_values)
    col_values = numpy.asarray(col_values)

    if row_values.ndim == 1:
        row_values = row_values[None, :]
    elif row_values.ndim != 2:
        raise ValueError(verr_msg.format(row_values.ndim))

    if col_values.ndim == 1:
        col_values = col_values[:, None]
    elif col_values.ndim != 2:
        raise ValueError(verr_msg.format(col_values.ndim))

    row_norm = numpy.sqrt((row_values * row_values).sum(axis=1))[:, None]
    col_norm = numpy.sqrt((col_values * col_values).sum(axis=0))[None, :]

    # Divide out of place: an in-place `/=` on an integer dot product
    # cannot store the floating-point quotient and raises.
    similarity = (row_values @ col_values) / row_norm / col_norm
    return 1 - similarity

In [43]:
# Load original script:

def load_txt_script(filename):
    """Load a plain-text script and parse it with the shared pipeline `sp`.

    Every run of whitespace (newlines included) is collapsed to a single
    space and the text is stripped before parsing.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    The parsed document returned by `sp`. Unlike `load_csv_script`, no
    per-token character list is produced, since a plain-text script
    carries no speaker column.
    """
    with open(filename) as orig_in:
        orig_txt = orig_in.read()
    orig_txt = re.sub(r'\s+', ' ', orig_txt).strip()
    # Previously the parsed document was bound to a local and the function
    # fell off the end, so callers always received None.
    return sp(orig_txt)

def load_csv_script(filename):
    """Load a two-column (character, line) CSV script and parse it.

    The first CSV row is treated as a header and skipped. The remaining
    lines are stripped, joined with single spaces and parsed with `sp`.
    The speaker of each line is then spread over that line's tokens.

    Parameters
    ----------
    filename : str
        Path of the CSV file. Every row must have exactly two columns.

    Returns
    -------
    tokens
        The parsed document returned by `sp` for the joined text.
    characters : list of str
        Speaker name per token, aligned with `tokens` by index.
    """
    # The csv module asks for files to be opened with newline='' so that
    # newlines embedded in quoted fields are handled correctly.
    with open(filename, newline='') as orig_in:
        rows = list(csv.reader(orig_in))[1:]

    # Strip once and reuse the stripped text both for the joined document
    # and for the alignment test below. Comparing an unstripped line with
    # token text can never succeed when the line has padding. Empty lines
    # are dropped because they contribute no tokens and would stall the
    # alignment for every line after them.
    orig_csv = [(char, line.strip()) for char, line in rows]
    orig_csv = [(char, line) for char, line in orig_csv if line]

    orig_txt = ' '.join(line for char, line in orig_csv)
    tokens = sp(orig_txt)

    characters = []
    char_lines = iter(orig_csv)
    char, line = next(char_lines, ('', ''))

    start = 0
    # `end` is an exclusive slice bound, so it has to reach len(tokens);
    # stopping one short left the final line without character labels.
    for end in range(1, len(tokens) + 1):
        tok_line = str(tokens[start:end])
        if line == tok_line:
            characters.extend([char] * (end - start))
            char, line = next(char_lines, ('', ''))
            start = end
    return tokens, characters

orig, characters = load_csv_script(original_script_filename)

In [93]:
# Build the n-gram vectors with a rolling window over the script.
# `orig_vectors` holds one vector per token. Each row of
# `orig_win_vectors` is the concatenation of `window_size`
# consecutive token vectors, so neighbouring rows share all but
# one token.
orig_vectors = mk_vectors(orig)
orig_win_vectors = numpy.array([
    orig_vectors[start:start + window_size, :].ravel()
    for start in range(len(orig_vectors) - window_size + 1)
])

# Set up the approximate nearest neighbor search "engine". Its index
# is later filled with the window vectors of the original script;
# each fan-work window is then used as a query against that index.
#
# The search could run in the opposite direction (index the fan
# text, query with script windows). Unfortunately, the quality of
# the matches found goes down when too many values are added to the
# engine's index, so the smaller text is the one that gets indexed.
vector_dim = orig_win_vectors.shape[1]

# One independent random-projection hash per requested hash table.
hashes = [
    nearpy.hashes.RandomBinaryProjections('rbp{}'.format(hash_ix),
                                          hash_dimensions)
    for hash_ix in range(number_of_hashes)
]

engine = nearpy.Engine(
    vector_dim,
    lshashes=hashes,
    distance=nearpy.distances.CosineDistance(),
)

In [None]:
# Fill the engine's index with every script window. The payload kept
# alongside each vector is `(window start index, window text)`, which is
# what the matching loop unpacks from the search results.
# NOTE(review): re-running this cell without rebuilding `engine` would
# presumably store every window a second time -- confirm before re-running.
for ix, row in enumerate(orig_win_vectors):
    window_text = str(orig[ix: ix + window_size])
    payload = (ix, window_text)
    engine.store_vector(row, payload)

In [None]:
# Counts, per original-script window index, how many fan windows matched it.
orig_match_count = Counter()

# Load fan work:

# `os.listdir` returns entries in arbitrary order, so sort before the
# seeded shuffle; otherwise the "same random sample" depends on the
# filesystem. Entries that are not regular files are skipped because
# they cannot be opened as text.
fan_works = [os.path.join(fan_work_directory, f)
             for f in sorted(os.listdir(fan_work_directory))]
fan_works = [f for f in fan_works if os.path.isfile(f)]
random.seed(4815162342)  # This will always generate the same "random" sample.
random.shuffle(fan_works)

max_fan_works = 500          # Number of fan works taken from the shuffled list.
progress_interval = 100000   # Print a progress report every this many windows.

records = [['FAN_WORK_FILENAME',
            'FAN_WORK_MATCH_INDEX',
            'FAN_WORK_MATCH_TEXT',
            'ORIGINAL_SCRIPT_MATCH_INDEX',
            'ORIGINAL_SCRIPT_MATCH_TEXT',
            'ORIGINAL_SCRIPT_CHARACTERS',
            'MATCH_DISTANCE']]
n_header_rows = len(records)
n_windows_processed = 0
for works_processed, fan_filename in enumerate(fan_works[0:max_fan_works], start=1):
    with open(fan_filename) as fan_file:
        fan = sp(fan_file.read())

    # A work shorter than one window has nothing to compare, and an empty
    # one cannot be vectorized, so skip it.
    if len(fan) < window_size:
        continue

    # Create the fan windows:
    fan_vectors = mk_vectors(fan)
    fan_win_vectors = numpy.array([fan_vectors[i:i + window_size, :].ravel()
                                   for i in range(fan_vectors.shape[0] - window_size + 1)])

    for fan_ix, row in enumerate(fan_win_vectors):
        n_windows_processed += 1

        # Keep only the first candidate the engine returns, and only when it
        # is closer than the threshold.
        # NOTE(review): presumably the engine returns candidates nearest
        # first -- confirm against NearPy's result ordering.
        fast_results = engine.neighbours(row)[0:1]
        fast_results = [(match_ix, match_str, distance)
                        for vec, (match_ix, match_str), distance in fast_results
                        if distance < distance_threshold]

        if n_windows_processed % progress_interval == 0:
            print('* {} texts processed...'.format(works_processed))
            print('* {} fan windows processed...'.format(n_windows_processed))
            # Exclude the header row from the reported match count.
            print('{} matches found'.format(len(records) - n_header_rows))

        for match_ix, match_str, distance in fast_results:
            records.append([fan_filename,
                            fan_ix,
                            str(fan[fan_ix: fan_ix + window_size]),
                            match_ix,
                            match_str,
                            # Distinct speakers covering the matched script window.
                            sorted(set(characters[match_ix: match_ix + window_size])),
                            distance])
            orig_match_count[match_ix] += 1

In [None]:
# Write every match record (header row first) to a CSV file.
# `newline=''` is what the csv module asks for on files it writes to;
# without it, platforms that translate line endings emit a blank row
# after every record.
with open('match-500-trial.csv', 'w', encoding='utf-8', newline='') as out:
    wr = csv.writer(out)
    wr.writerows(records)

In [None]:
# Bucket the matches by cosine distance into five strata, nearest first.
# Each stratum is half-open, `[low, high)`. Only records with a non-empty
# fan-work match text (`r[2]`) are kept.
#
# The first stratum is left open below: a cosine distance computed as
# `1 - cos` can round to a tiny negative number for (near-)identical
# windows, and a lower bound of exactly 0 would silently drop those
# best matches.
strata_bounds = list(zip([float('-inf'), 0.05, 0.10, 0.15, 0.20],
                         [0.05, 0.10, 0.15, 0.20, 0.25]))
match_strata = [[r for r in records[1:] if low <= r[-1] < high and r[2]]
                for low, high in strata_bounds]

print([len(m) for m in match_strata])

In [None]:
# For each distance stratum (farthest first), plot how many matches
# start at each window index of the original script. All strata are
# drawn on the same axes.

# `default=-1` keeps this cell from raising when no matches were recorded.
maxn = max(orig_match_count.keys(), default=-1)
if maxn >= 0:
    ax = None
    for m in match_strata[::-1]:
        c = Counter(r[3] for r in m)
        # Pass the axes along explicitly so every stratum lands on one plot.
        ax = pandas.Series([c[n] for n in range(maxn + 1)]).plot(ax=ax)
    if ax is not None:
        ax.set_xlabel('Original script window index')
        ax.set_ylabel('Number of matching fan windows')
else:
    print('No matches recorded; nothing to plot.')


In [None]:
# Number of context tokens shown on each side of an imitated phrase.
context_width = 5

print("Most often imitated portions of the original script.")
print()
print("The imitated phrase appears between <<brackets>>.")
print("Text outside brackets is provided for context, and ")
print("you can adjust how much context appears by modifying ")
print("`context_width` above.")
print()
print()

# Show the 20 script windows that were matched by the most fan windows.
for ix, n in orig_match_count.most_common(20):
    # Clamp at 0: for windows near the start of the script a negative
    # slice start would wrap around and the leading context would be lost.
    context_start = max(ix - context_width, 0)
    print('{} <<{}>> {}'.format(orig[context_start: ix],
                                orig[ix: ix + window_size],
                                orig[ix + window_size: ix + window_size + context_width]))
    print()