In [1]:
from matching_functions import *

# --- Notebook configuration -------------------------------------------------
# Folder holding one '<tcp_id>.pickle' file per text; these are read with
# load_pickle_file() further down.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
EEBO_SHINGLE_FOLDER = '/home/spenteco/0/eebo_shingled/'
# File handed to load_metadata() below.
METADATA_FILE = 'metadata/EEBO_metadata.tsv'
# Passed to build_match_report(); presumably where the HTML reports are
# written -- confirm in matching_functions.
OUTPUT_FOLDER = 'all_to_all_html_outputs/'
# find_text_reuse() writes one '<tcp_id>.pickle' result file per text here.
RESULTS_PICKLE_FOLDER = 'text_reuse_pickle_outputs/'
# SQLite database queried through its 'shingles' table (tcp_id, shingle, offsets).
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
SQLITE3_DATABASE = '/ssd_data/shingles.sqlite3'

# Mapping of tcp_id -> {'year': ..., 'author': ..., 'title': ...}, as shown by
# the cell output below and by the lookups in get_metadata().
metadata = load_metadata(METADATA_FILE)

In [2]:
# Spot-check the metadata records of a handful of texts, in the order in
# which they occur in `metadata`.
for k, v in metadata.items():
    if k not in ('A43441', 'A12777', 'A77689', 'A16884', 'A17310', 'B00290', 'A10675'):
        continue
    print(k, v, end='\n\n')

B00290 {'year': '1553', 'author': 'Church of England.', 'title': 'The booke of common prayer, and adminystracion of the sacramentes, and other rites and ceremonies in the Churche of Englande.|Book of common prayer'}

A16884 {'year': '1600', 'author': 'Albott, Robert, fl. 1600.', 'title': 'Englands Parnassus: or the choysest flowers of our moderne poets, with their poeticall comparisons Descriptions of bewties, personages, castles, pallaces, mountaines, groues, seas, springs, riuers, &c. Whereunto are annexed other various discourses, both pleasaunt and profitable.'}

A12777 {'year': '1590', 'author': 'Spenser, Edmund, 1552?-1599.', 'title': 'The faerie qveene disposed into twelue books, fashioning XII. morall vertues.'}

A10675 {'year': '1562', 'author': 'Whittingham, William, d. 1579.|Gilby, Anthony, ca. 1510-1585.|Sampson, Thomas, 1517?-1589.', 'title': 'The Bible and Holy Scriptures conteyned in the Olde and Newe Testament. Translated according to the Ebrue and Greke, and conferred 

In [3]:
import glob, time, json, sqlite3, pickle
from mako.template import Template
from collections import Counter

# Module-level connection and cursor; find_text_reuse() runs its query through
# the shared cursor `c`. Nothing in the visible code closes the connection.
conn = sqlite3.connect(SQLITE3_DATABASE)
c = conn.cursor()

# Thresholds passed straight to merge_matches(); their exact meaning is
# defined there (presumably gap/length limits for merged runs -- confirm in
# matching_functions).
MAX_GAP_ALLOWED = 5
MIN_MATCH_LENGTH = 6

# Cumulative timing counters updated by actually_check_matches():
# 'n' is the number of times a stage ran, 'time' the summed seconds.
# They are not reset between calls, so printed values accumulate across texts.
shingles_hash_time = {'n': 0, 'time': 0.0}
shingles_to_matches_time = {'n': 0, 'time': 0.0}
merged_matches_time = {'n': 0, 'time': 0.0}
pickle_time = {'n': 0, 'time': 0.0}
final_results_time = {'n': 0, 'time': 0.0}

# actually_check_matches() appends [to_tcp_id, merged_matches] here for every
# candidate with at least one merged match; nothing in the visible code clears
# it, so it grows for the lifetime of the kernel.
all_merged_matches = []

def actually_check_matches(from_tcp_id, file_a, from_shingles, to_tcp_id, to_shingles_list, debug=False):
    """Check one candidate text for merged shingle matches against the source text.

    Parameters
    ----------
    from_tcp_id : str
        Id of the source text. Kept for call-site compatibility; not read here.
    file_a : object
        Loaded source pickle, forwarded unchanged to make_final_results().
    from_shingles : mapping
        Maps a shingle to an iterable of values for the source text
        (presumably offsets -- confirm against the pickle format).
    to_tcp_id : str
        Id of the candidate text.
    to_shingles_list : sequence
        Rows for the candidate text; only index 1 (the shingle) and index 2
        (its decoded offsets) of each row are read. If a shingle occurs in
        more than one row, the last row wins.
    debug : bool, optional
        When truthy, the candidate's pickle is loaded and passed to
        make_final_results() as ``file_b``; otherwise ``file_b`` is None.

    Returns
    -------
    The value returned by make_final_results(), or None when fewer than two
    rows were supplied or merge_matches() produced no merged matches.

    Side effects
    ------------
    Updates the module-level timing dictionaries and appends
    ``[to_tcp_id, merged_matches]`` to ``all_merged_matches`` whenever
    merged matches are found.
    """

    # Nothing to merge with zero or one shared shingle. The empty case really
    # occurs: find_text_reuse() can hand over an empty list when the database
    # query returns no rows, and it must not reach merge_matches().
    if len(to_shingles_list) <= 1:
        return None

    # Stage 1: index the candidate's rows by shingle.
    start_time = time.time()

    to_shingles = {row[1]: row[2] for row in to_shingles_list}

    shingles_hash_time['n'] += 1
    shingles_hash_time['time'] += (time.time() - start_time)

    # Stage 2: pair every source value with every candidate value for each
    # shingle the two texts have in common.
    start_time = time.time()

    matches = [[v_a, v_b]
               for shingle, to_values in to_shingles.items()
               if shingle in from_shingles
               for v_a in from_shingles[shingle]
               for v_b in to_values]

    shingles_to_matches_time['n'] += 1
    shingles_to_matches_time['time'] += (time.time() - start_time)

    # Stage 3: let merge_matches() combine the raw pairs.
    start_time = time.time()

    merged_matches = merge_matches(matches, MAX_GAP_ALLOWED, MIN_MATCH_LENGTH)

    merged_matches_time['n'] += 1
    merged_matches_time['time'] += (time.time() - start_time)

    if len(merged_matches) == 0:
        return None

    all_merged_matches.append([to_tcp_id, merged_matches])

    # Stage 4: the candidate's pickle is only needed in debug mode. The
    # counter is bumped either way so the printed statistics keep their shape.
    start_time = time.time()

    file_b = None
    if debug:
        file_b = load_pickle_file(EEBO_SHINGLE_FOLDER + to_tcp_id + '.pickle')

    pickle_time['n'] += 1
    pickle_time['time'] += (time.time() - start_time)

    # Stage 5: build the final result structure.
    start_time = time.time()

    final_results = make_final_results(merged_matches, file_a, file_b,
                                       debug=debug, return_match_offsets=True)

    final_results_time['n'] += 1
    final_results_time['time'] += (time.time() - start_time)

    return final_results

def get_metadata(tcp_id):
    """Return ``(author, title, year)`` for ``tcp_id`` from ``metadata``.

    Fields are looked up in the order author, title, year. On the first
    KeyError (unknown id or missing field) an error line is printed and the
    remaining fields keep their placeholder values.
    """

    fields = {'author': 'METADATA ERROR', 'title': 'METADATA ERROR', 'year': 'MDER'}

    try:
        for field_name in ('author', 'title', 'year'):
            fields[field_name] = metadata[tcp_id][field_name]
    except KeyError:
        print('ERROR -- metadata?', tcp_id)

    return fields['author'], fields['title'], fields['year']
        
def find_text_reuse(from_tcp_id):
    """Find the texts that share merged shingle matches with ``from_tcp_id``.

    Steps:
      1. Load the source text's pickle from EEBO_SHINGLE_FOLDER.
      2. Query the ``shingles`` table, through the module-level cursor ``c``,
         for every row of any *other* text whose shingle also occurs in the
         source text.
      3. Sort the rows, group them by candidate tcp_id, and pass each group
         to actually_check_matches().
      4. If any candidate produced a result, pickle
         ``[from_tcp_id, from_year, from_author, from_title, all_results]``
         into RESULTS_PICKLE_FOLDER and call build_match_report().

    Progress and the cumulative timing counters are printed along the way.
    Returns None.

    Source-text metadata is read through get_metadata(), so an id missing
    from ``metadata`` is reported and given placeholder values instead of
    raising KeyError.
    """

    from_file = load_pickle_file(EEBO_SHINGLE_FOLDER + from_tcp_id + '.pickle')
    from_shingles = from_file['shingles']

    # The clock starts after the source pickle has been loaded, so the
    # reported timings exclude that load.
    start_time = time.time()

    query = ('select b.tcp_id, b.shingle, b.offsets from shingles a, shingles b '
             'where a.tcp_id = ? and b.tcp_id <> ? and a.shingle = b.shingle ')

    possible_matches = [[row[0], row[1], json.loads(row[2])]
                        for row in c.execute(query, (from_tcp_id, from_tcp_id,))]

    # Sorting puts all rows of one candidate text next to each other.
    possible_matches.sort()

    print(from_tcp_id, len(from_shingles), len(possible_matches))
    print('\t', 'A', (time.time() - start_time))

    all_results = []
    n_actually_check_matches = 0

    def _check_candidate(to_tcp_id, to_shingles):
        # Run the detailed check for one candidate and record any result.
        check_result = actually_check_matches(from_tcp_id, from_file, from_shingles,
                                              to_tcp_id, to_shingles)
        if check_result is not None:
            to_author, to_title, to_year = get_metadata(to_tcp_id)
            all_results.append([[to_tcp_id, to_author, to_title, to_year], check_result])

    to_shingles = []
    last_key = None

    for m in possible_matches:
        # A change of tcp_id closes the current candidate's run of rows.
        if last_key is not None and m[0] != last_key:
            n_actually_check_matches += 1
            _check_candidate(last_key, to_shingles)
            to_shingles = []

        last_key = m[0]
        to_shingles.append(m)

    # Flush the final run -- but only if there was one. With no candidate rows
    # at all there is nothing to check and no valid tcp_id to check against.
    if to_shingles:
        n_actually_check_matches += 1
        _check_candidate(last_key, to_shingles)

    print('\t', 'B', (time.time() - start_time), 'n_actually_check_matches', n_actually_check_matches,
            'len(all_results)', len(all_results))

    if len(all_results) > 0:

        from_author, from_title, from_year = get_metadata(from_tcp_id)

        # 'with' guarantees the file is closed even if pickling fails.
        with open(RESULTS_PICKLE_FOLDER + from_tcp_id + '.pickle', 'wb') as f:
            pickle.dump([from_tcp_id, from_year, from_author, from_title, all_results], f)

        build_match_report(from_tcp_id, EEBO_SHINGLE_FOLDER, RESULTS_PICKLE_FOLDER, OUTPUT_FOLDER)

        print('\t', 'C', (time.time() - start_time))

    # The timing dictionaries are module-level and never reset, so these
    # figures are running totals over every text processed so far.
    print()
    print('\t\t', 'shingles_hash_time', shingles_hash_time)
    print('\t\t', 'shingles_to_matches_time', shingles_to_matches_time)
    print('\t\t', 'merged_matches_time', merged_matches_time)
    print('\t\t', '(no) pickle_time', pickle_time)
    print('\t\t', 'final_results_time', final_results_time)
    print()

    stop_time = time.time()

    print(from_tcp_id, 'done!', (stop_time - start_time), end='\n\n')

In [4]:
# Run the text-reuse search for each selected text, printing its metadata
# record first so the log below is easy to read.
for tcp_id in ('A43441', 'A12777', 'A77689', 'A16884', 'A17310', 'B00290'):
    print()
    print(tcp_id, metadata[tcp_id], end='\n\n')
    find_text_reuse(tcp_id)


A43441 {'year': '1648', 'author': 'Herrick, Robert, 1591-1674.|Marshall, William, fl. 1617-1650.', 'title': 'Hesperides, or, The works both humane & divine of Robert Herrick, Esq.'}

A43441 35214 346242
	 A 3.9747982025146484
	 B 4.755926609039307 n_actually_check_matches 37732 len(all_results) 60
	 C 5.040635108947754

		 shingles_hash_time {'n': 28204, 'time': 0.08287644386291504}
		 shingles_to_matches_time {'n': 28204, 'time': 0.1698627471923828}
		 merged_matches_time {'n': 28204, 'time': 0.3590700626373291}
		 (no) pickle_time {'n': 60, 'time': 3.0517578125e-05}
		 final_results_time {'n': 60, 'time': 0.0007984638214111328}

A43441 done! 5.040891885757446


A12777 {'year': '1590', 'author': 'Spenser, Edmund, 1552?-1599.', 'title': 'The faerie qveene disposed into twelue books, fashioning XII. morall vertues.'}

A12777 77890 570726
	 A 7.124818563461304
	 B 19.28403902053833 n_actually_check_matches 41463 len(all_results) 542
	 C 19.942251682281494

		 shingles_hash_time {'n': 60