In [44]:
import ast
import json
import logging
import os
import sys

import pandas as pd
import spacy
from pandas.io.json import json_normalize
from spacy.tokenizer import Tokenizer

# Load the spaCy French pipeline; in the code visible here it is only used
# as the source of the vocabulary for the tokenizer below.
nlp = spacy.load('fr_core_news_md')
# Tokenizer constructed from the vocab alone (no other rules are passed in);
# calc_string_match_score uses it to split quote strings into tokens.
tokenizer = Tokenizer(nlp.vocab)

In [45]:
# Directory holding the manual annotation *.json files, and the text encoding
# used when opening them.
MANUAL_ANNO_DIR = '/home/vgautam/GRIM/Miriam/JSON_files_with_gender/'
MANUAL_ANNO_ENCODING = 'Mac Roman'
# Directory holding the system output *.json files, and their text encoding.
SYSTEM_OUT_DIR = '/home/vgautam/GRIM/output/'
SYSTEM_OUT_ENCODING = 'utf-8'

# Thresholds the main loop iterates over; each one is handed to compare_res and
# from there to find_best_match_quote as the minimum match score to accept.
QUOTE_MATCH_THRESHOLDS = [0.3, 0.8]

In [46]:
def _load_json_dir(directory, encoding):
    """Parse every ``*.json`` file in ``directory`` into a dict keyed by file stem.

    Args:
        directory: path of the folder to scan (not recursive).
        encoding: text encoding used to open each file.

    Returns:
        dict mapping the file name without its trailing ``.json`` to the
        parsed JSON content of that file.
    """
    suffix = '.json'
    loaded = {}
    # Sorted so the document order does not depend on the filesystem's
    # directory listing order.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(suffix):
            continue
        with open(os.path.join(directory, file_name), 'r', encoding=encoding) as handle:
            # Strip only the trailing suffix; splitting on '.json' would cut the
            # key short for names that contain '.json' somewhere in the middle.
            loaded[file_name[:-len(suffix)]] = json.load(handle)
    return loaded


manual_anno_dict = _load_json_dir(MANUAL_ANNO_DIR, MANUAL_ANNO_ENCODING)
system_out_dict = _load_json_dir(SYSTEM_OUT_DIR, SYSTEM_OUT_ENCODING)

In [47]:
def get_index(index_string):
    """Parse a textual index such as ``"(982,987)"`` into a Python value.

    Args:
        index_string: string holding a Python literal (the callers in this
            notebook index the result with ``[0]`` and ``[1]``), or an
            empty/blank string or ``None`` when no index is available.

    Returns:
        The parsed literal, or ``None`` when ``index_string`` is ``None``,
        empty, or whitespace only.

    Raises:
        ValueError, SyntaxError: if the text is not a valid Python literal.
    """
    if index_string is None:
        return None
    text = index_string.strip()
    if not text:
        return None
    # The strings come from JSON files on disk. literal_eval only accepts
    # Python literals, so unlike eval() it cannot execute code embedded in them.
    return ast.literal_eval(text)

def calc_index_match_score(indx_string1, indx_string2):
    """Overlap ratio (intersection over union) of two index ranges.

    Each argument is parsed with ``get_index``; the parsed value's first two
    items are used as the ``range(start, stop)`` bounds.

    Args:
        indx_string1: textual index of the first span, e.g. ``"(982,987)"``.
        indx_string2: textual index of the second span.

    Returns:
        ``0`` when either index is missing or when both ranges are empty,
        otherwise ``len(intersection) / len(union)`` of the two ranges.
    """
    span1 = get_index(indx_string1)
    span2 = get_index(indx_string2)

    if span1 is None or span2 is None:
        return 0

    positions1 = set(range(span1[0], span1[1]))
    positions2 = set(range(span2[0], span2[1]))
    covered = positions1 | positions2
    if not covered:
        # Both ranges are empty (e.g. "(5,5)"): there is nothing to overlap,
        # and dividing by the empty union would raise ZeroDivisionError.
        return 0
    return len(positions1 & positions2) / len(covered)
    
def calc_string_match_score(string1, string2):
    """Overlap ratio (intersection over union) of the token sets of two strings.

    Both strings are split with the module-level ``tokenizer``; duplicate
    tokens within one string count once.

    Returns:
        ``0`` when neither string yields any token, otherwise the number of
        shared distinct tokens divided by the number of distinct tokens overall.
    """
    vocab_a = {token.text for token in tokenizer(string1)}
    vocab_b = {token.text for token in tokenizer(string2)}

    combined = vocab_a | vocab_b
    if not combined:
        return 0
    return len(vocab_a & vocab_b) / len(combined)

In [48]:
def compare_quotes(q1, q2, threshold=0.3):
    """Score how well two quote records agree on quote span and speaker span.

    Args:
        q1: quote record (dict) with ``'quote_index'``, ``'speaker_index'``
            and ``'speaker'``; ``'is_floating_quote'`` is optional.
        q2: second quote record with the same keys.
        threshold: quote-span overlap that must be strictly exceeded for the
            two quotes to count as the same quote. Defaults to 0.3, the value
            that used to be hard-coded here.

    Returns:
        ``(match_score, res_obj)``. ``match_score`` is the mean of the quote
        and speaker overlap scores when the quotes match, else ``0``.
        ``res_obj`` holds both records, the two overlap scores and the final
        score rounded to 2 decimals, and the boolean ``quote_match`` /
        ``speaker_match`` verdicts.
    """
    def _floating_vs_blank_speaker(floating_candidate, other):
        # A record flagged as a floating quote agrees with a record whose
        # speaker text is blank. 'speaker' is only read when the flag is set.
        return (bool(floating_candidate.get('is_floating_quote'))
                and len(other['speaker'].strip()) == 0)

    quote_match_score = calc_index_match_score(q1['quote_index'], q2['quote_index'])
    speaker_match_score = calc_index_match_score(q1['speaker_index'], q2['speaker_index'])

    quote_match = quote_match_score > threshold

    # Speakers agree when their spans overlap at all, or when one side is a
    # floating quote and the other side has no speaker text.
    speaker_match = (speaker_match_score > 0
                     or _floating_vs_blank_speaker(q2, q1)
                     or _floating_vs_blank_speaker(q1, q2))

    if quote_match:
        match_score = (quote_match_score + speaker_match_score) / 2
    else:
        match_score = 0

    res_obj = {
        'q_a': q1,
        'q_b': q2,
        'quote_match_score': round(quote_match_score, 2),
        'speaker_match_score': round(speaker_match_score, 2),
        'quote_match': quote_match,
        'speaker_match': speaker_match,
        'match_score': round(match_score, 2)
    }

    return match_score, res_obj

In [49]:
# Spot check on one document: compare its second manual annotation with the
# first quote in the system output.
compare_quotes(manual_anno_dict['5c3ef32e1e67d78e27f52120'][1], system_out_dict['5c3ef32e1e67d78e27f52120'][0])

(1.0,
 {'q_a': {'speaker': 'Higgs',
   'verb': 'prévenu',
   'quote': 'que la province resterait fidèle à son engagement initial de dépenser 10 millions $ pour les Jeux, pas un cent de plus',
   'speaker_index': '(982,987)',
   'verb_index': '(995,1002)',
   'quote_index': '(1003,1121)',
   'reference': 'Blaine Higgs',
   'speaker_gender': 'male'},
  'q_b': {'speaker': 'Le premier ministre Higgs',
   'speaker_index': '(962,987)',
   'quote': 'que la province resterait fidèle à son engagement initial de dépenser 10 millions $ pour les Jeux, pas un cent de plus',
   'quote_index': '(1003,1121)',
   'verb': 'prévenu',
   'verb_index': '(995,1002)',
   'quote_token_count': 23,
   'quote_type': 'SVC',
   'is_floating_quote': False,
   'reference': 'Le premier ministre Higgs'},
  'quote_match_score': 1.0,
  'speaker_match_score': 0.2,
  'quote_match': True,
  'speaker_match': True,
  'match_score': 0.6})

In [50]:
def find_best_match_quote(quote, quote_list, match_threshold):
    """Pick the candidate in ``quote_list`` that best matches ``quote``.

    Args:
        quote: the quote record to match.
        quote_list: candidate quote records.
        match_threshold: a candidate is only accepted when its
            ``compare_quotes`` score is strictly greater than this value.

    Returns:
        ``(best_quote, best_stats, remaining_quotes)``. ``best_quote`` and
        ``best_stats`` are ``None`` when no candidate qualifies.
        ``remaining_quotes`` is a new list with every candidate except the
        selected one, in the original order.
    """
    best_position = None
    best_stats = None
    max_score = 0

    for position, candidate in enumerate(quote_list):
        score, stats = compare_quotes(quote, candidate)
        # Strict '>' keeps the earliest candidate when scores tie.
        if score > match_threshold and score > max_score:
            max_score = score
            best_position = position
            best_stats = stats

    if best_position is None:
        return None, None, list(quote_list)

    # Remove by position, not by equality: comparing dicts with != would also
    # drop every other candidate that merely equals the best match.
    remaining_quotes = [candidate for position, candidate in enumerate(quote_list)
                        if position != best_position]
    return quote_list[best_position], best_stats, remaining_quotes

In [51]:
def compare_res(quotes_a, quotes_b, min_threshold):
    """Match each quote in ``quotes_a`` to a quote in ``quotes_b`` and tally the outcome.

    Quotes from ``quotes_a`` are processed in order. Each one is matched with
    ``find_best_match_quote`` against the candidates of ``quotes_b`` that are
    still unmatched, so a candidate is used at most once.

    Returns:
        dict with the sizes of both inputs, the number of matched pairs
        (``true_positive``), unmatched ``quotes_a`` entries
        (``false_negative``), unmatched ``quotes_b`` entries
        (``false_positive``), how many matched pairs also agree on the
        speaker, the per-pair stats, and the two lists of leftovers.
    """
    pair_stats = []
    unmatched_a = []
    unmatched_b = quotes_b
    speaker_hits = 0

    for candidate in quotes_a:
        hit, hit_stats, unmatched_b = find_best_match_quote(candidate, unmatched_b, min_threshold)
        if hit is None:
            unmatched_a.append(candidate)
            continue
        if hit_stats['speaker_match']:
            speaker_hits += 1
        pair_stats.append(hit_stats)

    return {
        'n_quotes_a': len(quotes_a),
        'n_quotes_b': len(quotes_b),
        'n_speaker_match': speaker_hits,
        'true_positive': len(pair_stats),
        'false_positive': len(unmatched_b),
        'false_negative': len(unmatched_a),
        'stats': pair_stats,
        'remaining_a': unmatched_a,
        'remaining_b': unmatched_b
    }

In [58]:
results_dir = os.path.join('/home/vgautam/GRIM/results/', 'Miriam' + "-" + 'v1.2')
os.makedirs(results_dir, exist_ok=True)

#  ----- Main loop
# For each threshold: compare every manually annotated document with the
# system output of the same id, dump the per-document comparison as JSON, and
# write one Excel sheet ('raw<threshold>') summarising all documents.
with pd.ExcelWriter(os.path.join(results_dir,'french_results.xlsx')) as writer:
    for min_threshold in QUOTE_MATCH_THRESHOLDS:
        all_docs_comp_res = []

        for f_Miriam in manual_anno_dict.keys():
            json_Miriam = manual_anno_dict[f_Miriam]
            # Raises KeyError if there is no system output for this document id.
            json_v1 = system_out_dict[f_Miriam]

            comp_res = compare_res(json_Miriam, json_v1, min_threshold)
            comp_res['id'] = f_Miriam.replace('.json','')
            all_docs_comp_res.append(comp_res)

            # The threshold is part of the file name. Writing '<id>.json' for
            # every threshold made each pass overwrite the previous one, so
            # only the last threshold's comparison was left on disk.
            per_doc_name = f_Miriam + '_' + str(min_threshold) + '.json'
            with open(os.path.join(results_dir, per_doc_name), 'w') as fo:
                json.dump(comp_res, fo, indent=4, sort_keys=True)

            print(f_Miriam, ' Processed!','\n','-'*20)

        # One row per document; the column selection below fixes the sheet layout.
        result_df = json_normalize(all_docs_comp_res)
        result_df = result_df.sort_values(by=['id'])
        result_df = result_df[['id','true_positive','false_negative','false_positive','n_quotes_a','n_quotes_b','n_speaker_match','remaining_a', 'remaining_b', 'stats']]

        result_df.to_excel(writer, sheet_name='raw'+str(min_threshold), index=False)

5c1f4ec11e67d78e279d0505  Processed! 
 --------------------
5c3ef32e1e67d78e27f52120  Processed! 
 --------------------
5c497ebf1e67d78e27205222  Processed! 
 --------------------
5c1ea0a81e67d78e279ae6c5  Processed! 
 --------------------
5c32df6b1e67d78e27cf59df  Processed! 
 --------------------
5c482d841e67d78e271c28fd  Processed! 
 --------------------
5c3d7f2f1e67d78e27f03374  Processed! 
 --------------------
5c53e08b1e67d78e27404c69  Processed! 
 --------------------
5c480de41e67d78e271b5684  Processed! 
 --------------------
5c47f7b01e67d78e271b0b6c  Processed! 
 --------------------
5c29a2d01e67d78e27b6f656  Processed! 
 --------------------
5c3466b21e67d78e27d40c2f  Processed! 
 --------------------
5c14d80b1e67d78e2770471b  Processed! 
 --------------------
5c1dd3051e67d78e27981aa4  Processed! 
 --------------------
5c3e18ac1e67d78e27f24c0d  Processed! 
 --------------------
5c343c861e67d78e27d31ccf  Processed! 
 --------------------
5c535aed1e67d78e273ee007  Processed! 
 -