In [1]:
import spacy
sp = spacy.load('en')

In [2]:
import os
import re
import csv

from pprint import pprint
from collections import defaultdict
import bisect

### Script Settings ###

# Modify these to change input files and other parameters.

# Input filenames:
# Every input path below is resolved relative to this folder.
home_folder = '../../../../'

# CSV version of the original script (read by `load_csv_script`).
original_script_filename = os.path.join(
    home_folder, 'original-scripts/force-awakens/force-awakens-lines.csv')

# Plain-text version of the original script with scene numbers
# (read by `load_txt_script`).
original_script_text_filename = os.path.join(
    home_folder, 'original-scripts/force-awakens/force-awakens-with-scene-numbers.txt')

# Marked-up version of the original script (read by `parse_markup_script`).
original_script_markup_filename = os.path.join(
    home_folder, 'original-scripts/force-awakens/force-awakens-markup.txt')

# Directory holding the plain-text fan works.
fan_work_directory = os.path.join(
    home_folder, 'fan-works/force-awakens-fullset/plaintext')

def load_csv_script(filename):
    """Tokenize a (character, line) CSV script and label each token's speaker.

    The first CSV row is treated as a header and skipped. Every remaining row
    must unpack into exactly two cells: the speaking character and the line.
    All lines are stripped, joined with single spaces and run through the
    module-level spaCy pipeline `sp`. The token stream is then walked to find
    the span whose string form equals each line in turn; every token in that
    span is labelled with the line's character.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    (tokens, characters)
        `tokens` is the object returned by `sp(...)` for the joined text.
        `characters` is a list of character names, one per aligned token.
        If a line can never be matched to a token span, the alignment stops
        advancing and `characters` is shorter than `tokens`.
    """
    with open(filename) as orig_in:
        orig_csv = list(csv.reader(orig_in))[1:]

    # The text handed to spaCy is built from *stripped* lines, so the
    # alignment below has to compare against the stripped form as well;
    # comparing against the raw cell would stall on any padded line.
    stripped_csv = [(char, line.strip()) for char, line in orig_csv]
    orig_txt = ' '.join(line for _, line in stripped_csv)
    tokens = sp(orig_txt)

    characters = []
    char_lines = iter(stripped_csv)
    char, line = next(char_lines, ('', ''))

    start = 0
    # `end` is an exclusive slice bound, so it must be allowed to reach
    # len(tokens); otherwise the span ending at the last token is never
    # tested and the final line of the script is left unlabelled.
    for end in range(1, len(tokens) + 1):
        tok_line = str(tokens[start:end])
        if line == tok_line:
            characters.extend([char] * (end - start))
            char, line = next(char_lines, ('', ''))
            start = end
    return tokens, characters
    
    
def load_txt_script(filename):
    """Return the complete contents of the text file at `filename`.

    The text is returned exactly as read; no whitespace normalisation is
    applied (an earlier `re.sub(r'\\s+', ' ', ...)` step is disabled).
    """
    with open(filename) as script_file:
        return script_file.read()
    
# Tokenized CSV script plus the per-token speaker labels.
csvscr, chars = load_csv_script(original_script_filename)
# Raw text of the scene-numbered script; later cells rewrite it into markup.
txtscr = load_txt_script(original_script_text_filename)

In [3]:
# Distinct character names seen in the CSV script.
full_char_list = sorted(set(chars))
# "Weird" names are proper substrings of some other character name;
# they are tagged after all the other ("normal") names.
weird_chars = [c for c in full_char_list if any(c in cc and c != cc for cc in full_char_list)]
normal_chars = sorted(set(full_char_list) - set(weird_chars))

# Tag lines that consist of a character name indented by at least 11 spaces.
# Names come from data, so they are escaped before being placed in the
# pattern: a name containing regex metacharacters (e.g. parentheses or dots)
# would otherwise be interpreted as regex syntax instead of literal text.
for c in normal_chars + weird_chars:
    txtscr = re.sub(r'^[ ]{{11}}\s*(?P<character>{})[ ]*$'.format(re.escape(c)),
                    r'CHARACTER_NAME<<\g<character>>>',
                    txtscr,
                    flags=re.MULTILINE)

# Tag lines indented by exactly 11 spaces followed by a non-space character.
txtscr = re.sub(r'^           (?P<line>\S.*)$',
                r'LINE<<\g<line>>>',
                txtscr,
                flags=re.MULTILINE)

# Tag bracketed scene numbers such as "[12]".
txtscr = re.sub(r'\[(?P<number>\d+)\]',
                r'SCENE_NUMBER<<\g<number>>>',
                txtscr,
                flags=re.MULTILINE)

In [4]:
#with open('testout.txt', 'w', encoding='utf-8') as op:
#    op.write(txtscr)

In [5]:
# Matchers for the three tag kinds found in the markup file. Each captures
# the text between `<<` and `>>`; the captured text may not contain `>`.
_line_rex = re.compile(r'LINE<<(?P<line>[^>]*)>>')
_scene_rex = re.compile(r'SCENE_NUMBER<<(?P<scene>[^>]*)>>')
_char_rex = re.compile(r'CHARACTER_NAME<<(?P<character>[^>]*)>>')


def parse_markup_script(filename):
    """Turn a marked-up script file into one table row per token.

    The file is read line by line. A line containing a scene tag updates the
    current scene number, a line containing a character tag updates the
    current character, and a line containing a LINE tag is tokenized with the
    module-level spaCy pipeline `sp`. Only the first matching tag kind (in
    that order) is acted on for any given line.

    Returns a list of rows. The first row is the header
    ``['LOWERCASE', 'SPACY_ORTH_ID', 'SCENE', 'CHARACTER']``; each later row
    holds a token's `lower_` string, its `lower` ID, and the scene and
    character current at that point (either may be None if no such tag has
    been seen yet).
    """
    table = [['LOWERCASE', 'SPACY_ORTH_ID', 'SCENE', 'CHARACTER']]
    scene = None
    speaker = None
    with open(filename, encoding='utf-8') as markup:
        for raw in markup:
            scene_hit = _scene_rex.search(raw)
            if scene_hit:
                scene = int(scene_hit.group('scene'))
                continue

            char_hit = _char_rex.search(raw)
            if char_hit:
                speaker = char_hit.group('character')
                continue

            line_hit = _line_rex.search(raw)
            if not line_hit:
                continue

            # The second column stores `t.lower`, i.e. the ID of the
            # lowercased form, even though the header calls it an orth ID.
            # A spaCy lexeme can be rebuilt from an ID with
            #     spacy.lexeme.Lexeme(sp.vocab, <id>)
            # where `sp = spacy.load('en')`.
            for tok in sp(line_hit.group('line')):
                table.append([tok.lower_, tok.lower, scene, speaker])
    return table


# Drop the header row; only the per-token rows are used from here on.
script_rows = parse_markup_script(original_script_markup_filename)[1:]

In [6]:
class SubseqFinder(object):
    """Find contiguous subsequences inside one fixed sequence.

    The constructor sorts ``(value, position)`` pairs of the sequence so that
    `find` can binary-search for the positions whose value equals the first
    element of the subsequence, instead of scanning the whole sequence.
    """

    def __init__(self, seq):
        """Store `seq` and build its sorted ``(value, position)`` index."""
        self.original_seq = seq
        self.index_seq = sorted((s, i) for i, s in enumerate(seq))

    def find(self, sub):
        """Return the lowest index at which `sub` occurs in the sequence.

        Returns 0 for an empty `sub` and -1 when `sub` does not occur.
        """
        if not sub:
            return 0

        first_val = sub[0]
        seq_len = len(self.original_seq)
        sub_len = len(sub)

        # Positions are >= 0, so (first_val, -1) sorts just before the first
        # index entry whose value is first_val.
        candidate_ix = bisect.bisect_left(self.index_seq, (first_val, -1))

        # Bound the walk by the index length: stepping past the last entry
        # used to raise IndexError when `first_val` was the largest value in
        # the sequence and none of its occurrences started a match.
        while candidate_ix < len(self.index_seq):
            seq_val, seq_ix = self.index_seq[candidate_ix]
            if seq_val != first_val:
                break
            # A candidate too close to the end cannot hold the whole of `sub`.
            if seq_ix + sub_len <= seq_len and all(
                    self.original_seq[seq_ix + offset] == sub_val
                    for offset, sub_val in enumerate(sub)):
                return seq_ix
            candidate_ix += 1
        return -1

def subseq_find(seq, sub):
    """Return the index of the first occurrence of `sub` within `seq`.

    This is a plain linear scan. It returns 0 for an empty `sub` (the same
    convention as `SubseqFinder.find`) and -1 when `sub` does not occur.
    """
    if not sub:
        # Previously returned True, which is not a usable index.
        return 0

    sub_len = len(sub)
    # The last valid start is len(seq) - len(sub), so the range bound needs
    # the +1; without it a match at the very end of `seq` was never tested.
    for i in range(len(seq) - sub_len + 1):
        if all(y == seq[i + j] for j, y in enumerate(sub)):
            return i
    return -1

# Second column of every script row: the spaCy `lower` ID of each token.
script_words = [row[1] for row in script_rows]
# Hard-coded ID sequence for a sample phrase; presumably produced by
# words = sp('time ago in a galaxy far, far away...')
# words = [t.lower for t in words]
words = [506, 3637, 504, 303039, 1694, 533, 520, 628, 679, 767]
print(script_words[:5])


[506, 824, 646, 1060, 522]


In [7]:
# Column names of the n-gram match CSV and the Python type each column is
# converted to, in column order.
record_structure = {
    'fields': [
        'FAN_WORK_FILENAME',
        'FAN_WORK_MATCH_INDEX',
        'FAN_WORK_MATCH_TEXT',
        'ORIGINAL_SCRIPT_MATCH_INDEX',
        'ORIGINAL_SCRIPT_MATCH_TEXT',
        'ORIGINAL_SCRIPT_CHARACTERS',
        'MATCH_DISTANCE',
        'LEVENSHTEIN_DISTANCE',
        'COMBINED_DISTANCE',
    ],
    'types': [str, int, str, int, str, str, float, int, float],
}

# NOTE(review): the file is opened with the platform default encoding, while
# the word-level output is written as utf-8 -- confirm the input encoding.
with open('match-20k-10gram-20170406.csv') as ip:
    rows = list(csv.reader(ip))

# The first row is kept untouched as the header. Every row whose first cell
# is the header marker is dropped (this also removes repeated headers), and
# the remaining rows are converted cell by cell to the declared types.
column_types = record_structure['types']
records = [rows[0]]
for row in rows:
    if row[0] == 'FAN_WORK_FILENAME':
        continue
    records.append([column_types[i](cell) for i, cell in enumerate(row)])


In [8]:
# Word-level table: the header row first, then one row per aligned token.
word_records = [['FAN_WORK_FILENAME', 
                 'FAN_WORK_MATCH_INDEX', 
                 'FAN_WORK_WORD',
                 'FAN_WORK_ORTH_ID',
                 'ORIGINAL_SCRIPT_MATCH_INDEX',
                 'ORIGINAL_SCRIPT_WORD',
                 'ORIGINAL_SCRIPT_ORTH_ID',
                 'ORIGINAL_SCRIPT_CHARACTER',
                 'ORIGINAL_SCRIPT_SCENE',
                 'BEST_MATCH_DISTANCE',
                 'BEST_LEVENSHTEIN_DISTANCE',
                 'BEST_COMBINED_DISTANCE']]

# NOTE(review): this rebinds `subseq_find`, which an earlier cell defines as a
# plain function taking (seq, sub); from here on it takes only `sub`.
subseq_find = SubseqFinder(script_words).find
for r in records:
    # `records` starts with the CSV header row; it is not a match record and
    # must not be tokenized and looked up like one.
    if r[:1] == ['FAN_WORK_FILENAME']:
        continue

    # Parse the script-side n-gram once and reuse the result for both the
    # lowercase IDs (used for lookup) and the surface strings (used in output).
    rec_sp = sp(r[4])
    rec_words = [t.lower for t in rec_sp]
    script_ix = subseq_find(rec_words)
    if script_ix < 0:
        # The n-gram could not be located in the marked-up script.
        continue

    rec_texts = [t.orth_ for t in rec_sp]
    # Pad dashes and ellipses with spaces before tokenizing the fan text.
    fan_sp = sp(r[2].replace('—', ' - ').replace('-', ' - ').replace('…', '... '))
    fan_words = [t.orth for t in fan_sp]
    fan_texts = [t.orth_ for t in fan_sp]

    (FILENAME, 
     FAN_WORK_MATCH_INDEX, 
     FAN_WORK_MATCH_TEXT,
     ORIGINAL_SCRIPT_MATCH_INDEX,
     ORIGINAL_SCRIPT_MATCH_TEXT,
     ORIGINAL_SCRIPT_CHARACTERS,
     MATCH_DISTANCE,
     LEVENSHTEIN_DISTANCE,
     COMBINED_DISTANCE) = r
    for ngram_ix in range(len(rec_words)):
        word_ix = script_ix + ngram_ix
        word_row = script_rows[word_ix]
        SP_LOWER, SP_ORTH, SCENE, CHAR = word_row

        # Tokens are paired purely by position; when the fan text yields
        # fewer tokens than the script n-gram, report it and skip the token.
        if ngram_ix >= len(fan_texts) or ngram_ix >= len(rec_texts):
            print("Error!")
            print(fan_texts)
            print(rec_texts)
            continue

        word_rec = [FILENAME,
                    int(FAN_WORK_MATCH_INDEX) + ngram_ix,
                    fan_texts[ngram_ix],
                    fan_words[ngram_ix],
                    int(word_ix),
                    rec_texts[ngram_ix],
                    rec_words[ngram_ix],
                    CHAR,
                    int(SCENE),
                    float(MATCH_DISTANCE),
                    int(LEVENSHTEIN_DISTANCE),
                    float(COMBINED_DISTANCE)
                   ]
        word_records.append(word_rec)

# Group data rows by (fan work file, fan word index, script word index).
# The header is kept out of the grouping so that its position in the output
# never depends on dictionary ordering.
word_record_dedupe = defaultdict(list)
for wr in word_records[1:]:
    word_record_dedupe[(wr[0], wr[1], wr[4])].append(wr)

# Keep one row per group, carrying the smallest of each distance column.
# The kept row is copied so the rows in `word_records` are left unmodified.
deduped_rows = [list(word_records[0])]
for dupe in word_record_dedupe.values():
    new_r = list(dupe[0])
    new_r[9] = min(row[9] for row in dupe)
    new_r[10] = min(row[10] for row in dupe)
    new_r[11] = min(row[11] for row in dupe)
    deduped_rows.append(new_r)

print(len(deduped_rows))

Error!
['.What', 'you', 'always', 'do', 'to', 'me', 'it', 'is', 'more']
['.', 'What', 'do', 'we', 'do', 'with', 'her', '?', 'Is', 'there']
702438


In [9]:
# Write the deduplicated word-level rows. newline='' hands line-ending control
# to the csv module; without it, platforms that translate '\n' on write end
# up with '\r\r\n' row terminators.
with open('match-20k-10gram-wordlevel-20170406.csv', 'w', encoding='utf-8', newline='') as op:
    csv.writer(op).writerows(deduped_rows)