# Sanity checks for annotation data

In [45]:
import numpy as np
import pandas as pd
import os
import json

In [46]:
ENCODING = 'Mac Roman'

In [47]:
df = pd.read_excel('AnnotationTable_byArticle.xlsx')
people_cols = [column for column in list(df.columns) if 'people' in column and len(column) > len('people')]
source_cols = [column for column in list(df.columns) if 'sources' in column]
json_files = ['JSON_files_with_gender/'+f for f in os.listdir('JSON_files_with_gender/') if f.endswith('.json')]

## Excel Assertions

In [48]:
def excelAssertions(row):

    def customError(key, check, stacktrace=None):
        if stacktrace is None:
            stacktrace = []
        
        errorMsg = 'FAILED ASSERTION: '

        if check is 'subset':
            errorMsg += key + ' not subset of people'
            stacktrace.append(key + ': ' + row[key])
            stacktrace.append('people: ' + row['people'])
        if check is 'superset':
            errorMsg += 'people not superset of ' + key
            stacktrace.append('people: ' + row['people'])
            stacktrace.append(key + ': ' + row[key])
        if check is 'eval':
            errorMsg += 'Error evaluating ' + key + ' as list'
        if check is 'disjoint':
            errorMsg += key + ' not disjoint'

        errorMsg += ', ' + row['_id']
        
        if len(stacktrace) > 0:
            for trace in stacktrace:
                errorMsg += '\n' + str(trace)
        
        return errorMsg + '\n'
    
    excel_dict = {}
    assertion_count = 0
    assertion_errors = ''
    
    people = set()
    try:
        people = set(eval(row['people']))
    except SyntaxError as e:
        assertion_count += 1
        assertion_errors += customError('people', 'eval', [row['people']])
    
    for col in people_cols + source_cols:
        excel_dict[col] = set()
        try:
            excel_dict[col] = set(eval(row[col]))
        except SyntaxError as e:
            assertion_count += 1
            assertion_errors += customError('people', 'eval', [row['people']])
    
    if assertion_count > 0:
        raise AssertionError(assertion_errors)
        
    for i in range(len(people_cols)):
        for j in range(i+1, len(people_cols)):
            assert excel_dict[people_cols[i]].isdisjoint(excel_dict[people_cols[j]]), customError(
                people_cols[i] + ', ' + people_cols[j], 'disjoint')
            
    for i in range(len(source_cols)):
        for j in range(i+1, len(source_cols)):
            assert excel_dict[source_cols[i]].isdisjoint(excel_dict[source_cols[j]]), customError(
                source_cols[i] + ', ' + source_cols[j], 'disjoint')
    
    for key in excel_dict.keys():
        if not excel_dict[key].issubset(people):
            assertion_count += 1
            assertion_errors += customError(key, 'subset')
        if not people.issuperset(excel_dict[key]):
            assertion_count += 1
            assertion_errors += customError(key, 'superset')
    
    if assertion_count > 0:
        raise AssertionError(assertion_errors)

In [49]:
for index,row in df.iterrows():
    try:
        excelAssertions(row)
#         print()
    except AssertionError as e:
        print(index)
        print(e)

## JSON

In [96]:
json_objs = {}

for file in json_files:
    print(file)
    if file.endswith('.json'):
        with open(file, 'r+', encoding=ENCODING) as fo:
            file_str = fo.read().rstrip()
        json_objs[file.split('/')[-1]] = json.loads(file_str)

JSON_files_with_gender/5c1f4ec11e67d78e279d0505.json
JSON_files_with_gender/5c3ef32e1e67d78e27f52120.json
JSON_files_with_gender/5c497ebf1e67d78e27205222.json
JSON_files_with_gender/5c1ea0a81e67d78e279ae6c5.json
JSON_files_with_gender/5c32df6b1e67d78e27cf59df.json
JSON_files_with_gender/5c482d841e67d78e271c28fd.json
JSON_files_with_gender/5c3d7f2f1e67d78e27f03374.json
JSON_files_with_gender/5c53e08b1e67d78e27404c69.json
JSON_files_with_gender/5c480de41e67d78e271b5684.json
JSON_files_with_gender/5c47f7b01e67d78e271b0b6c.json
JSON_files_with_gender/5c29a2d01e67d78e27b6f656.json
JSON_files_with_gender/5c3466b21e67d78e27d40c2f.json
JSON_files_with_gender/5c1dd3051e67d78e27981aa4.json
JSON_files_with_gender/5c3e18ac1e67d78e27f24c0d.json
JSON_files_with_gender/5c343c861e67d78e27d31ccf.json
JSON_files_with_gender/5c535aed1e67d78e273ee007.json
JSON_files_with_gender/5c480a051e67d78e271b45e1.json
JSON_files_with_gender/5c15646b1e67d78e2771b011.json
JSON_files_with_gender/5c3ecf861e67d78e27f448a

In [97]:
index_fields = ['verb_index', 'speaker_index', 'quote_index']

In [98]:
def jsonAssertions(json):
    for quote_record in json:
        fields = ['verb', 'verb_index', 'quote', 'quote_index', 'speaker',
                  'speaker_index', 'speaker_gender', 'reference']
        for field in fields:
            assert field in quote_record.keys(), "FAILED ASSERTION: Fields in quote record: " + field
        
        assert len(quote_record['quote']) > 0, "FAILED ASSERTION: Empty quote"
        assert len(quote_record['quote_index']) > 0, "FAILED ASSERTION: Empty quote-index"
        assert len(quote_record['reference']) > 0, "FAILED ASSERTION: Empty reference"
        
        if len(quote_record['verb']) == 0:
            assert len(quote_record['verb_index']) == 0, "FAILED ASSERTION: verb empty, verb-index non-empty"
        if len(quote_record['verb_index']) == 0:
            assert len(quote_record['verb']) == 0, "FAILED ASSERTION: verb-index non-empty, verb-index empty"
            
        if len(quote_record['speaker']) == 0:
            assert len(quote_record['speaker_index']) == 0, "FAILED ASSERTION: speaker empty, speaker-index non-empty"
        if len(quote_record['speaker_index']) == 0:
            assert len(quote_record['speaker']) == 0, "FAILED ASSERTION: speaker non-empty, speaker-index empty"
            
        assert quote_record['speaker_gender'] in ['male', 'female', 'unknown'], "FAILED ASSERTION: speaker_gender " + quote_record['speaker_gender']
        
        tuples = []
        
        for field in index_fields:
            if len(quote_record[field]) > 0:
                index_tuple = eval(quote_record[field])
                tuples.append(index_tuple)
                
                assert index_tuple[1] > index_tuple[0], "FAILED ASSERTION: bad tuple " + quote_record[field]
                
                assert len(quote_record[field.split('_index')[0]]) == index_tuple[1] - index_tuple[0], (
                                "FAILED ASSERTION: " + field + " length mismatch ") + quote_record[field]
            
        tuple_set = set(tuples)
        assert len(tuple_set) == len(tuples), "FAILED ASSERTION: distinct indices"
            
        for i,t_1 in enumerate(tuples):
            others = tuples[:i] + tuples[i+1:]
            for t_2 in others:
                assert t_1[0] != t_2[1], "FAILED ASSERTION: overlap in indices"
                assert t_1[1] != t_2[0], "FAILED ASSERTION: overlap in indices"
                
                
        if len(quote_record['verb']) > 0:
            assert ' ' not in quote_record['verb'], "FAILED ASSERTION: verb contains space " + quote_record['verb']

In [99]:
for fname in json_objs.keys():
    try:
        jsonAssertions(json_objs[fname])
    except AssertionError as e:
        print(fname)
        print(e)
        print()

5c2851051e67d78e27b39e79.json
FAILED ASSERTION: Fields in quote record: speaker_gender



## Verbs

In [100]:
verb_list = []
for fname in json_objs.keys():
    for quote_record in json_objs[fname]:
        verb = quote_record['verb']
        if len(verb) > 0:
            verb_list.append(verb.rstrip())

In [101]:
print(list(set(verb_list)))

['avoue', 'était', 'remémore', 'entendre', 'tonné', 'valoir', 'relaté', 'est', 'estime', 'raconté', 'observe', 'réfuté', 'enchaîné', 'note', 'demandé', 'avance', 'dise', 'considérant', 'annoncé', 'ordonne', 'adressé', 'cité', 'constaté', 'indiquant', 'dénoncer', 'réagi', 'préciser', 'lâche', 'souri', 'jugé', 'lance', 'signale', 'reconnu', 'estimé', 'évoqué', 'accusent', 'soulève', 'souligné', 'révèle', 'disait', 'remarquer', 'accuse', 'commente', 'suggérer', 'évoquées', 'ajouté', 'plaidé', 'déploré', 's’excite', 'indique', 'laisse', 'confirmé', 'jugent', 'insurgé', 'répliquent', 'donne', 'promet', 'demeurent', 'dénoncé', 'répète', 'plaignent', 'propose', 'répondu', 'déclare', 'expliqué', 'prévoit', 'dit', 'insisté', 'souligne', 'plaidait', 'exige', 'prétextant', 'exprimé', 'justifiant', 'admis', 'rappelant', 'dénoncent', 'constate', 'demande', 'rappelle', 'critique', 'rétorqué', 'espère', 'affirmé', 'a', 'poursuivi', 'précisant', 'croit', 'réjouit', 'disant', 'dite', 'ajoute', 'recomma

In [102]:
with open('verbs_fr.txt', 'w+') as fo:
    for verb in list(set(verb_list)):
        fo.write(verb + '\n')

## Stats on quotes

In [103]:
num_quotes = []
for fname in json_objs.keys():
    num_quotes.append(len(json_objs[fname]))
    
sum(num_quotes)

715

In [104]:
num_syntactic_quotes = []
for fname in json_objs.keys():
    f_quotes = 0
    for quote_record in json_objs[fname]:
        if (len(quote_record['verb']) > 0) and (len(quote_record['speaker']) > 0):
            f_quotes += 1
    num_syntactic_quotes.append(f_quotes)
    
sum(num_syntactic_quotes)

538