## How matching works


### First, some preliminary preprocessing

There's a better example of this in **test_matching_functions_part_1.ipynb**; for now, let's accept it as a given . . . 

In [1]:
from matching_functions import *

SHINGLE_LENGTH = 3

In [2]:
# Preprocess each sample text and cache the result as a pickle so that the
# following cells can load it from 'test_pickles/'.
for file_name in ['sample_a', 'sample_b']:

    file_data = preprocess_one_file('test_adorned_xml/' + file_name + '.xml', SHINGLE_LENGTH)

    # The context manager closes the file even if pickle.dump raises.
    with open('test_pickles/' + file_name + '.pickle', 'wb') as f:
        pickle.dump(file_data, f)

print('Done!')

Done!


In [3]:
# Load both preprocessed samples, in order, from their pickle files.
sample_a, sample_b = map(
    load_pickle_file,
    ('test_pickles/sample_a.pickle', 'test_pickles/sample_b.pickle'),
)

### The results of preprocessing . . . 

We have two very simple texts, which consist of single-character "words":

In [4]:
# Show each sample's tokens on one line, separated by a blank line.
print(' '.join(sample_a['tokens']), ' '.join(sample_b['tokens']), sep='\n\n')

B C G H * * * B C G H F J K L * * * F J K L

B C G H F J K L


### The pre-processed texts consist of . . . 

 . . . five collections of data:
 
 1.  **tokens** which are all the words and punctuation from the source texts (which have been morphadorned).
 2.  **lemmas**; note, however, we replace a number of lemma values (punctuation, stop words, words with non-Latin characters, numbers) with spaces.  Note that in this example, the "\*" tokens in sample_a are replaced with a space in lemmas.  We also lower-case at this point.
 3.  **non_space_lemmas** which has the same values as lemmas, except that we've dropped all the spaces.
 4.  **offsets** which indicate the position of each non_space_lemma within the lists of lemmas and tokens.
 5.  **shingles**, which are ngrams made from non_space_lemmas; each ngram is associated with its starting and ending position in offsets.
 

Here's a very simple example of a text with five one-letter "words": 
 
     tokens: \['B', 'C', '*', 'G', 'H'\]
     lemmas: \['b', 'c', ' ', 'g', 'h'\]
     non_space_lemmas: \['b', 'c', 'g', 'h'\]
     offsets: \[0, 1, 3, 4\]
     shingles: {('b', 'c', 'g'): \[\[0, 2\]\], ('c', 'g', 'h'): \[\[1, 3\]\]} 
     
     

In [5]:
# Dump both preprocessed dictionaries in full, separated by a blank line.
print(sample_a, sample_b, sep='\n\n')

{'tokens': ['B', 'C', 'G', 'H', '*', '*', '*', 'B', 'C', 'G', 'H', 'F', 'J', 'K', 'L', '*', '*', '*', 'F', 'J', 'K', 'L'], 'lemmas': ['b', 'c', 'g', 'h', ' ', ' ', ' ', 'b', 'c', 'g', 'h', 'f', 'j', 'k', 'l', ' ', ' ', ' ', 'f', 'j', 'k', 'l'], 'non_space_lemmas': ['b', 'c', 'g', 'h', 'b', 'c', 'g', 'h', 'f', 'j', 'k', 'l', 'f', 'j', 'k', 'l'], 'offsets': [0, 1, 2, 3, 7, 8, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21], 'shingles': {('b', 'c', 'g'): [[0, 2], [4, 6]], ('c', 'g', 'h'): [[1, 3], [5, 7]], ('g', 'h', 'b'): [[2, 4]], ('h', 'b', 'c'): [[3, 5]], ('g', 'h', 'f'): [[6, 8]], ('h', 'f', 'j'): [[7, 9]], ('f', 'j', 'k'): [[8, 10], [12, 14]], ('j', 'k', 'l'): [[9, 11], [13, 15]], ('k', 'l', 'f'): [[10, 12]], ('l', 'f', 'j'): [[11, 13]]}}

{'tokens': ['B', 'C', 'G', 'H', 'F', 'J', 'K', 'L'], 'lemmas': ['b', 'c', 'g', 'h', 'f', 'j', 'k', 'l'], 'non_space_lemmas': ['b', 'c', 'g', 'h', 'f', 'j', 'k', 'l'], 'offsets': [0, 1, 2, 3, 4, 5, 6, 7], 'shingles': {('b', 'c', 'g'): [[0, 2]], ('c', 'g', '

In [7]:
MAX_GAP_ALLOWED = 1
MIN_MATCH_LENGTH = 4

# Run the matcher in both directions (a -> b, then b -> a) and print every
# matched pair of passages. One loop replaces the two copy-pasted stanzas.
for header, from_sample, to_sample in (
        ('a to b ------------------------------', sample_a, sample_b),
        ('b to a ------------------------------', sample_b, sample_a)):

    # Call the matcher before printing the header, as the original cells did.
    results = match_two_files(from_sample,
                                to_sample,
                                MAX_GAP_ALLOWED,
                                MIN_MATCH_LENGTH)

    print()
    print(header)
    print()

    # Each result is indexable: [0] is the passage from the first argument,
    # [1] the passage from the second.
    for pair in results:
        print(pair[0], '<-->', pair[1])


a to b ------------------------------

BCGH <--> BCGH
BCGHFJKL <--> BCGHFJKL
FJKL <--> FJKL

b to a ------------------------------

BCGH <--> BCGH
BCGHFJKL <--> BCGHFJKL
FJKL <--> FJKL


In [16]:
shingles_a = sample_a['shingles']
shingles_b = sample_b['shingles']

print('sample_a shingles', shingles_a)
print()

print('sample_b shingles', shingles_b)
print()

# For every shingle present in both samples, pair each of its position spans
# in sample_a with each of its position spans in sample_b, then sort the
# pairs (by the sample_a span first, then by the sample_b span).
matches = sorted(
    [span_a, span_b]
    for shingle, spans_a in shingles_a.items()
    if shingle in shingles_b
    for span_a in spans_a
    for span_b in shingles_b[shingle]
)

for m in matches:
    print('match', m)

sample_a shingles {('b', 'c', 'g'): [[0, 2], [4, 6]], ('c', 'g', 'h'): [[1, 3], [5, 7]], ('g', 'h', 'b'): [[2, 4]], ('h', 'b', 'c'): [[3, 5]], ('g', 'h', 'f'): [[6, 8]], ('h', 'f', 'j'): [[7, 9]], ('f', 'j', 'k'): [[8, 10], [12, 14]], ('j', 'k', 'l'): [[9, 11], [13, 15]], ('k', 'l', 'f'): [[10, 12]], ('l', 'f', 'j'): [[11, 13]]}

sample_b shingles {('b', 'c', 'g'): [[0, 2]], ('c', 'g', 'h'): [[1, 3]], ('g', 'h', 'f'): [[2, 4]], ('h', 'f', 'j'): [[3, 5]], ('f', 'j', 'k'): [[4, 6]], ('j', 'k', 'l'): [[5, 7]]}

match [[0, 2], [0, 2]]
match [[1, 3], [1, 3]]
match [[4, 6], [0, 2]]
match [[5, 7], [1, 3]]
match [[6, 8], [2, 4]]
match [[7, 9], [3, 5]]
match [[8, 10], [4, 6]]
match [[9, 11], [5, 7]]
match [[12, 14], [4, 6]]
match [[13, 15], [5, 7]]


In [18]:
# Chain the sorted shingle matches into groups of overlapping/adjacent matches.
# Starting from an empty list (instead of seeding with matches[0]) means an
# empty `matches` list yields no groups rather than raising IndexError.
grouped_matches = []

for m in matches:

    for g in grouped_matches:
        last = g[-1]

        # m extends this group when, in BOTH texts, it starts after the
        # group's last shingle starts and its start (less the allowed gap)
        # falls before that shingle's end.
        if (m[0][0] - MAX_GAP_ALLOWED) < last[0][1] and \
            (m[1][0] - MAX_GAP_ALLOWED) < last[1][1] and \
            m[0][0] > last[0][0] and \
            m[1][0] > last[1][0]:

            # Append to the first qualifying group only.
            g.append(m)
            break
    else:
        # No existing group accepted m, so it starts a new group.
        grouped_matches.append([m])

for m in grouped_matches:
    print('grouped_match', m)

grouped_match [[[0, 2], [0, 2]], [[1, 3], [1, 3]]]
grouped_match [[[4, 6], [0, 2]], [[5, 7], [1, 3]], [[6, 8], [2, 4]], [[7, 9], [3, 5]], [[8, 10], [4, 6]], [[9, 11], [5, 7]]]
grouped_match [[[12, 14], [4, 6]], [[13, 15], [5, 7]]]


In [25]:
MIN_MATCH_LENGTH = 4

# Collapse each group into a single [from_span, to_span] pair, keeping only
# groups whose "from" span is long enough.
merged_matches = []
for g in grouped_matches:

    from_matches = sorted(m[0] for m in g)
    to_matches = sorted(m[1] for m in g)

    from_start = from_matches[0][0]
    from_end = from_matches[-1][1]

    # Spans are inclusive, so (end - start) must reach MIN_MATCH_LENGTH - 1.
    if from_end - from_start < MIN_MATCH_LENGTH - 1:
        continue

    to_start = to_matches[0][0]
    to_end = to_matches[-1][1]
    merged_matches.append([[from_start, from_end], [to_start, to_end]])

for m in merged_matches:
    print('merged_match', m)

merged_match [[0, 3], [0, 3]]
merged_match [[4, 11], [0, 7]]
merged_match [[12, 15], [4, 7]]


In [39]:
# Translate each merged match from non-space-lemma positions to token
# offsets, then join the covered tokens into the matched text of each sample.
final_results = []

for m in merged_matches:
    span_a, span_b = m

    print()
    print('merged_match', m)

    offsets_a = sample_a['offsets']
    a_from_offset, a_to_offset = offsets_a[span_a[0]], offsets_a[span_a[1]]
    print('\tsample_a offsets', a_from_offset, a_to_offset)

    offsets_b = sample_b['offsets']
    b_from_offset, b_to_offset = offsets_b[span_b[0]], offsets_b[span_b[1]]
    print('\tsample_b offsets', b_from_offset, b_to_offset)

    # Offsets are inclusive, hence the + 1 on the slice end.
    text_a = ''.join(sample_a['tokens'][a_from_offset:a_to_offset + 1])
    text_b = ''.join(sample_b['tokens'][b_from_offset:b_to_offset + 1])
    final_results.append([text_a, text_b])

print()
for r in final_results:
    print('final_result', r)


merged_match [[0, 3], [0, 3]]
	sample_a offsets 0 3
	sample_b offsets 0 3

merged_match [[4, 11], [0, 7]]
	sample_a offsets 7 14
	sample_b offsets 0 7

merged_match [[12, 15], [4, 7]]
	sample_a offsets 18 21
	sample_b offsets 4 7

final_result ['BCGH', 'BCGH']
final_result ['BCGHFJKL', 'BCGHFJKL']
final_result ['FJKL', 'FJKL']
