In [1]:
import csv
import re
import itertools
import collections
import os

In [88]:
import spacy
# Shared spaCy pipeline; load_markup_script calls it to tokenize script lines.
# NOTE(review): 'en' is a shortcut model name. spaCy 3 dropped shortcut names and
# expects a full package name (e.g. 'en_core_web_sm') -- confirm the installed version.
sp = spacy.load('en')


In [117]:
# Input files: the marked-up script and the two n-gram match tables.
script_file = 'force-awakens-markup.txt'
match10g_file = 'match-20k-10gram-20170406.csv'
match6g_file = 'match-6gram-20170614.csv'

# NOTE(review): load_markup_script is defined in a later cell of this notebook,
# so on a fresh kernel that cell has to be executed before this one.
script = load_markup_script(script_file)
# The first row returned is the column header; keep it apart from the token rows.
script_header = script[0]
script = script[1:]

# newline='' is what the csv module requires of file objects it reads, so that
# line breaks embedded in quoted fields are parsed correctly.
with open(match10g_file, encoding='utf-8', newline='') as ip:
    match10g = list(csv.reader(ip))
with open(match6g_file, encoding='utf-8', newline='') as ip:
    match6g = list(csv.reader(ip))

# Split off each header row, then sort the data rows lexicographically so that
# rows sharing the same first column end up adjacent.
match10g_header = match10g[0]
match10g = match10g[1:]
match10g.sort()
match6g_header = match6g[0]
match6g = match6g[1:]
# Reduce column 0 of the 6-gram rows to its final path component before sorting.
for m in match6g:
    m[0] = os.path.split(m[0])[-1]
match6g.sort()


In [191]:
def load_markup_script(filename,
                        _line_rex=re.compile('LINE<<(?P<line>[^>]*)>>'),
                        _scene_rex=re.compile('SCENE_NUMBER<<(?P<scene>[^>]*)>>'),
                        _char_rex=re.compile('CHARACTER_NAME<<(?P<character>[^>]*)>>')):
    """Read a marked-up script file into one row per token.

    Each line of the file is tested against the three patterns in this order,
    and only the first one that matches is acted on:

    * ``SCENE_NUMBER<<...>>``   -- sets the current scene (converted to int)
    * ``CHARACTER_NAME<<...>>`` -- sets the current character
    * ``LINE<<...>>``           -- the text is tokenized with the module-level
      ``sp`` pipeline and one row is emitted per token

    Lines matching none of the patterns are ignored.

    Returns a list of rows whose first entry is the header
    ``['WORD', 'SPACY_ORTH_ID', 'LOWERCASE', 'LOWERCASE_ORTH_ID', 'SCENE',
    'CHARACTER']``; every following row holds those values for one token.
    """
    rows = [['WORD', 'SPACY_ORTH_ID', 'LOWERCASE', 'LOWERCASE_ORTH_ID', 'SCENE', 'CHARACTER']]
    scene = None
    character = None
    with open(filename, encoding='utf-8') as ip:
        for text in ip:
            scene_match = _scene_rex.search(text)
            if scene_match:
                scene = int(scene_match.group('scene'))
                continue

            char_match = _char_rex.search(text)
            if char_match:
                character = char_match.group('character')
                continue

            line_match = _line_rex.search(text)
            if not line_match:
                continue

            # original Spacy lexeme object can be recreated using
            #     spacy.lexeme.Lexeme(sp.vocab, t.orth)
            # where `sp = spacy.load('en')`
            rows.extend([t.orth_, t.orth, t.lower_, t.lower, scene, character]
                        for t in sp(line_match.group('line')))
    return rows

def group_works(matches):
    """Group match rows by work and order each group by position.

    ``matches`` is an iterable of rows where column 0 identifies the work and
    column 1 is an integer-like position. Returns a dict mapping each work to
    the list of its rows sorted by ``int(row[1])``.

    Rows are accumulated per key rather than with ``itertools.groupby``, which
    only merges *adjacent* equal keys: with unsorted input a later run of the
    same work would silently replace the earlier one. For input that is already
    sorted by column 0 the result (including key order) is unchanged.
    """
    grouped = {}
    for row in matches:
        grouped.setdefault(row[0], []).append(row)
    return {work: sorted(rows, key=lambda r: int(r[1]))
            for work, rows in grouped.items()}

def group_contig(matches):
    """Group match rows per work, then split each group into contiguous runs.

    Returns a dict with the same keys as ``group_works(matches)``; each value
    is the result of ``find_contig`` applied to that work's rows.
    """
    contigs_by_work = {}
    for work, rows in group_works(matches).items():
        contigs_by_work[work] = find_contig(rows)
    return contigs_by_work

def window_contig(matches, n=6):
    """Slide an ``n``-row window over every contiguous run of matches.

    ``matches`` is passed to ``group_contig`` to obtain, per work, a list of
    contiguous runs of rows. Every run is cut into all of its overlapping
    windows of ``n`` consecutive rows, and each window is collapsed into a
    single row by ``wordrows_to_ngramrow``. Runs shorter than ``n`` contribute
    nothing.

    Parameters
    ----------
    matches : iterable of rows
    n : int, default 6
        Window width in rows. The default reproduces the original 6-gram
        behaviour.

    Returns
    -------
    dict mapping each work to the list of its collapsed window rows.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (an empty window cannot be collapsed).
    """
    if n < 1:
        raise ValueError('n must be at least 1, got {!r}'.format(n))
    groups = group_contig(matches)
    new_groups = {}
    for k, v in groups.items():
        new_groups[k] = [wordrows_to_ngramrow(contig[i:i + n])
                         for contig in v
                         # len(contig) - n + 1 start positions give full-width windows
                         for i in range(len(contig) - n + 1)]
    return new_groups

def wordrows_to_ngramrow(rows):
    """Collapse a window of per-word match rows into a single n-gram row.

    The result is a copy of the first row in ``rows`` with two columns replaced:

    * column 2 -- column 2 of every row, joined with single spaces
    * column 5 -- for every row, column 0 of the module-level ``script`` row
      at index ``int(row[4])``, joined with single spaces

    ``rows`` itself is not modified.
    """
    fan_words = ' '.join([r[2] for r in rows])
    script_tokens = [script[int(r[4])][0] for r in rows]
    script_words = ' '.join(script_tokens)

    ngram_row = rows[0].copy()
    ngram_row[2], ngram_row[5] = fan_words, script_words
    return ngram_row

def inspect_dict(d, n=0):
    """Return the ``n``-th ``(key, value)`` pair of ``d`` in iteration order.

    Raises ``StopIteration`` when ``d`` has ``n`` or fewer items.
    """
    pairs = iter(d.items())
    nth_only = itertools.islice(pairs, n, n + 1)
    return next(nth_only)

def find_contig(matches):
    """Split rows into runs whose positions increase by exactly one.

    ``matches`` is an iterable of rows with an integer-like position in
    column 1, expected in the order they should be scanned. A row extends the
    current run when its position is the previous row's position plus one;
    otherwise it starts a new run.

    Returns a list of runs, each a list of the original row objects. An empty
    input yields an empty list.
    """
    contig = []
    # None (rather than a numeric sentinel) marks "no previous row", so no real
    # position -- including -1 -- can be mistaken for a continuation.
    last = None
    for r in matches:
        ix = int(r[1])
        if last is not None and ix == last + 1:
            contig[-1].append(r)
        else:
            contig.append([r])
        last = ix
    return contig

def build_matrix(work_matches, threshold=None):
    """Build a work-by-ngram presence matrix as a list of rows.

    Parameters
    ----------
    work_matches : dict
        Maps a work name to its match rows. Work names are ordered by
        ``int(os.path.splitext(name)[0])``, so the part before the extension
        must be an integer. In each match row, column 4 is the n-gram text
        (used as the column identity), column 3 is the value the columns are
        sorted by, and the last column is compared against ``threshold``.
    threshold : number or None
        When given, only matches whose last column is strictly below it are
        marked. ``None`` marks every match.

    Returns
    -------
    list of lists
        * a header row: ``'FILENAME'`` followed by the n-gram texts with runs
          of whitespace collapsed to one space;
        * one row per work: the work name followed by 0/1 flags;
        * a final ``'Total'`` row with the per-column sums.

        N-gram columns whose total is 0 are removed from every row. With no
        works the result is ``[['FILENAME'], ['Total']]``.
    """
    row_names = sorted(work_matches.keys(),
                       key=lambda x: int(os.path.splitext(x)[0]))

    # n-gram text -> sort value; a later match with the same text overrides an
    # earlier one.
    col_index = {m[4]: m[3]
                 for matches in work_matches.values()
                 for m in matches}
    ngrams = sorted(col_index, key=col_index.get)

    # Slot 0 of every row holds the label, so n-gram columns start at 1.
    # Keys stay un-normalized because they are looked up with the raw m[4].
    col_position = {ngram: i + 1 for i, ngram in enumerate(ngrams)}
    header = ['FILENAME'] + [re.sub(r'\s+', ' ', ngram) for ngram in ngrams]
    width = len(header)

    body = []
    for rn in row_names:
        row = [rn] + [0] * (width - 1)
        for m in work_matches[rn]:
            if threshold is None or m[-1] < threshold:
                row[col_position[m[4]]] = 1
        body.append(row)

    # Sum each n-gram column once over all work rows.
    totals = ['Total'] + [sum(r[c] for r in body) for c in range(1, width)]

    # Always keep the label column; drop n-gram columns nobody matched.
    keep = [True] + [t != 0 for t in totals[1:]]
    return [[cell for cell, kept in zip(r, keep) if kept]
            for r in [header] + body + [totals]]

In [192]:
# Group the 10-gram matches per work, then reshape every row: drop column 5 and
# convert the numeric columns from CSV strings. After this step build_matrix
# reads column 3 as the column sort value, column 4 as the column name and the
# last column as the value compared against its threshold.
match10g_works = group_works(match10g)
for v in match10g_works.values():
    for i, vv in enumerate(v):
        # Slicing builds a new list, so the row object still held in match10g
        # is left untouched; the conversions below act on the new list only.
        v[i] = vv[0:5] + vv[6:]
        vv = v[i]
        vv[1] = int(vv[1])
        vv[3] = int(vv[3])
        vv[5] = float(vv[5])
        vv[6] = int(vv[6])
        vv[7] = float(vv[7])
# Same treatment for the 6-gram matches, which are first cut into windows by
# window_contig. Here columns 0-2, 4-5 and 9 onward are kept, so the converted
# positions (1, 3, 5, 6, 7) refer to the narrowed row, not to the CSV columns.
match6g_works = window_contig(match6g)
for v in match6g_works.values():
    for i, vv in enumerate(v):
        v[i] = vv[0:3] + vv[4:6] + vv[9:]
        vv = v[i]
        vv[1] = int(vv[1])
        vv[3] = int(vv[3])
        vv[5] = float(vv[5])
        vv[6] = int(vv[6])
        vv[7] = float(vv[7])

In [193]:
# Build each presence matrix with and without a threshold and write it to CSV.
# The output files are opened with newline='' as the csv module requires;
# without it, platforms that translate '\n' to '\r\n' get an extra '\r' per row.
# Note that mat_10gram / mat_6gram end up holding the thresholded matrices.
mat_10gram = build_matrix(match10g_works)
with open('match-10gram-matrix-thresh-none.csv', 'w', encoding='utf-8', newline='') as op:
    csv.writer(op).writerows(mat_10gram)
mat_10gram = build_matrix(match10g_works, threshold=0.2)
with open('match-10gram-matrix-thresh-0.2.csv', 'w', encoding='utf-8', newline='') as op:
    csv.writer(op).writerows(mat_10gram)
mat_6gram = build_matrix(match6g_works)
with open('match-6gram-matrix-thresh-none.csv', 'w', encoding='utf-8', newline='') as op:
    csv.writer(op).writerows(mat_6gram)
mat_6gram = build_matrix(match6g_works, threshold=0.05)
with open('match-6gram-matrix-thresh-0.05.csv', 'w', encoding='utf-8', newline='') as op:
    csv.writer(op).writerows(mat_6gram)