# Sanity checks for annotation data

In [10]:
import numpy as np
import pandas as pd
import os
import json

In [11]:
ENCODING = 'utf-8'

In [12]:
df = pd.read_excel('AnnotationTable_byArticle_withReplacementArticles.xlsx')
people_cols = [column for column in list(df.columns) if 'people' in column and len(column) > len('people')]
source_cols = [column for column in list(df.columns) if 'sources' in column]
json_files = ['OutputJsonFiles/'+f for f in os.listdir('OutputJsonFiles/') if f.endswith('.json')]

## Excel Assertions

In [13]:
df.columns

Index(['Unnamed: 0', 'articleID', 'outlet', 'articleBodyText', 'articleURL',
       'authors', 'authorFemale', 'authorMale', 'authorUnknowns', 'people',
       'peopleFemale', 'peopleMale', 'peopleUnknown', 'quotations',
       'sourcesFemale', 'sourcesMale', 'sourcesUnknown', 'articleType',
       'annotatorsComment'],
      dtype='object')

In [14]:
def excelAssertions(row):

    def customError(key, check, stacktrace=None):
        if stacktrace is None:
            stacktrace = []
        
        errorMsg = 'FAILED ASSERTION: '

        if check is 'subset':
            errorMsg += key + ' not subset of people'
            stacktrace.append(key + ': ' + row[key])
            stacktrace.append('people: ' + row['people'])
        if check is 'superset':
            errorMsg += 'people not superset of ' + key
            stacktrace.append('people: ' + row['people'])
            stacktrace.append(key + ': ' + row[key])
        if check is 'eval':
            errorMsg += 'Error evaluating ' + key + ' as list'
        if check is 'disjoint':
            errorMsg += key + ' not disjoint'

        errorMsg += ', ' + row['articleID']
        
        if len(stacktrace) > 0:
            for trace in stacktrace:
                errorMsg += '\n' + str(trace)
        
        return errorMsg + '\n'
    
    excel_dict = {}
    assertion_count = 0
    assertion_errors = ''
    
    people = set()
    try:
        people = set(eval(row['people']))
    except SyntaxError as e:
        assertion_count += 1
        assertion_errors += customError('people', 'eval', [row['people']])
    
    for col in people_cols + source_cols:
        excel_dict[col] = set()
        try:
            excel_dict[col] = set(eval(row[col]))
        except SyntaxError as e:
            assertion_count += 1
            assertion_errors += customError('people', 'eval', [row['people']])
    
    if assertion_count > 0:
        raise AssertionError(assertion_errors)
        
    for i in range(len(people_cols)):
        for j in range(i+1, len(people_cols)):
            assert excel_dict[people_cols[i]].isdisjoint(excel_dict[people_cols[j]]), customError(
                people_cols[i] + ', ' + people_cols[j], 'disjoint')
            
    for i in range(len(source_cols)):
        for j in range(i+1, len(source_cols)):
            assert excel_dict[source_cols[i]].isdisjoint(excel_dict[source_cols[j]]), customError(
                source_cols[i] + ', ' + source_cols[j], 'disjoint')
    
    for key in excel_dict.keys():
        if not excel_dict[key].issubset(people):
            assertion_count += 1
            assertion_errors += customError(key, 'subset')
        if not people.issuperset(excel_dict[key]):
            assertion_count += 1
            assertion_errors += customError(key, 'superset')
    
    if assertion_count > 0:
        raise AssertionError(assertion_errors)

In [15]:
for index,row in df.iterrows():
    try:
        excelAssertions(row)
#         print()
    except AssertionError as e:
        print(index)
        print(e)

41
FAILED ASSERTION: Error evaluating people as list, ObjectId(5c5d292b1e67d78e275d9e9f)
["Ehab Lotayef,"François Huot","Alexandre Bissonnette","Ibrahima Barry","Mamadou Tanour Barry","Khaled Belkacemi","Aboubake Thabti","Abdelkrim Hassane","Azzedine Soufiane","Dylann Roof","Justin Trudeau","Kenza Tarek","Megda Belkacemi"]
FAILED ASSERTION: Error evaluating people as list, ObjectId(5c5d292b1e67d78e275d9e9f)
["Ehab Lotayef,"François Huot","Alexandre Bissonnette","Ibrahima Barry","Mamadou Tanour Barry","Khaled Belkacemi","Aboubake Thabti","Abdelkrim Hassane","Azzedine Soufiane","Dylann Roof","Justin Trudeau","Kenza Tarek","Megda Belkacemi"]
FAILED ASSERTION: Error evaluating people as list, ObjectId(5c5d292b1e67d78e275d9e9f)
["Ehab Lotayef,"François Huot","Alexandre Bissonnette","Ibrahima Barry","Mamadou Tanour Barry","Khaled Belkacemi","Aboubake Thabti","Abdelkrim Hassane","Azzedine Soufiane","Dylann Roof","Justin Trudeau","Kenza Tarek","Megda Belkacemi"]

54
FAILED ASSERTION: Error eva

## JSON

In [16]:
json_objs = {}

for file in json_files:
    print(file)
    if file.endswith('.json'):
        with open(file, 'r+', encoding=ENCODING) as fo:
            file_str = fo.read().rstrip()
        json_objs[file.split('/')[-1]] = json.loads(file_str)

OutputJsonFiles/5c1e0b68795bd2a5d03a49a9.json
OutputJsonFiles/5c2a3d191e67d78e27b8ac72.json
OutputJsonFiles/5c159cb81e67d78e277241fd.json
OutputJsonFiles/5c531ba91e67d78e273e272a.json
OutputJsonFiles/5c146e42795bd2fcce2ea8e5.json
OutputJsonFiles/5c5d292b1e67d78e275d9e9f.json
OutputJsonFiles/5c25054e1e67d78e27aac4ef.json
OutputJsonFiles/5c2ae5f11e67d78e27ba36d7.json
OutputJsonFiles/5c1452701e67d78e276ee126.json
OutputJsonFiles/5c286d031e67d78e27b3f17b.json
OutputJsonFiles/5c2059ec1e67d78e279ea86c.json
OutputJsonFiles/5c1de1661e67d78e27984d34.json
OutputJsonFiles/5c201b371e67d78e279e248a.json
OutputJsonFiles/5c2060d31e67d78e279eb852.json
OutputJsonFiles/5c3e27a31e67d78e27f27b38.json
OutputJsonFiles/5c3f78b21e67d78e27f6a477.json
OutputJsonFiles/5c546d281e67d78e27426e82.json
OutputJsonFiles/5c158f201e67d78e27721ffd.json
OutputJsonFiles/5c32f9841e67d78e27cfa4eb.json
OutputJsonFiles/5c4a89f31e67d78e27233c5d.json
OutputJsonFiles/5c3eec791e67d78e27f51065.json
OutputJsonFiles/5c494e541e67d78e27

In [17]:
index_fields = ['verb_index', 'speaker_index', 'quote_index']

In [22]:
def jsonAssertions(json):
    for quote_record in json:
        fields = ['verb', 'verb_index', 'quote', 'quote_index', 'speaker',
                  'speaker_index', 'reference']
        for field in fields:
            assert field in quote_record.keys(), "FAILED ASSERTION: Fields in quote record: " + field
        
        assert len(quote_record['quote']) > 0, "FAILED ASSERTION: Empty quote"
        assert len(quote_record['quote_index']) > 0, "FAILED ASSERTION: Empty quote-index"
        assert len(quote_record['reference']) > 0, "FAILED ASSERTION: Empty reference"
        
        if len(quote_record['verb']) == 0:
            assert len(quote_record['verb_index']) == 0, "FAILED ASSERTION: verb empty, verb-index non-empty"
        if len(quote_record['verb_index']) == 0:
            assert len(quote_record['verb']) == 0, "FAILED ASSERTION: verb-index non-empty, verb-index empty"
            
        if len(quote_record['speaker']) == 0:
            assert len(quote_record['speaker_index']) == 0, "FAILED ASSERTION: speaker empty, speaker-index non-empty"
        if len(quote_record['speaker_index']) == 0:
            assert len(quote_record['speaker']) == 0, "FAILED ASSERTION: speaker non-empty, speaker-index empty"
        
        tuples = []
        
        for field in index_fields:
            if len(quote_record[field]) > 0:
                index_tuple = eval(quote_record[field])
                tuples.append(index_tuple)
                
                assert index_tuple[1] > index_tuple[0], "FAILED ASSERTION: bad tuple " + quote_record[field]
                
                assert len(quote_record[field.split('_index')[0]]) == index_tuple[1] - index_tuple[0], (
                                "FAILED ASSERTION: " + field + " length mismatch ") + quote_record[field]
            
        tuple_set = set(tuples)
        assert len(tuple_set) == len(tuples), "FAILED ASSERTION: distinct indices"
            
        for i,t_1 in enumerate(tuples):
            others = tuples[:i] + tuples[i+1:]
            for t_2 in others:
                assert t_1[0] != t_2[1], "FAILED ASSERTION: overlap in indices"
                assert t_1[1] != t_2[0], "FAILED ASSERTION: overlap in indices"
                
                
        if len(quote_record['verb']) > 0:
            assert ' ' not in quote_record['verb'], "FAILED ASSERTION: verb contains space " + quote_record['verb']

In [23]:
for fname in json_objs.keys():
    try:
        jsonAssertions(json_objs[fname])
    except AssertionError as e:
        print(fname)
        print(e)
        print()

5c5d292b1e67d78e275d9e9f.json
FAILED ASSERTION: Empty quote

5c25054e1e67d78e27aac4ef.json
FAILED ASSERTION: quote_index length mismatch (8652,8663)

5c5e50711e67d78e27616b23.json
FAILED ASSERTION: quote_index length mismatch (3566,3685)

5c5555d11e67d78e27457a93.json
FAILED ASSERTION: verb empty, verb-index non-empty

5c2a60611e67d78e27b8feef.json
FAILED ASSERTION: speaker_index length mismatch (2121,2198)

5c1efb3d1e67d78e279bd39a.json
FAILED ASSERTION: speaker_index length mismatch (2102,2110)

5c29947f1e67d78e27b6d330.json
FAILED ASSERTION: bad tuple (7937,3943)

5c33859e1e67d78e27d12893.json
FAILED ASSERTION: quote_index length mismatch (2136,2192)

5c54662d1e67d78e27425afa.json
FAILED ASSERTION: quote_index length mismatch (1886,1926)

5c4888ac1e67d78e271d2cdf.json
FAILED ASSERTION: speaker empty, speaker-index non-empty

5c5d532a795bd2d5c282a094.json
FAILED ASSERTION: quote_index length mismatch (8769,8858)



## Verbs

In [24]:
verb_list = []
for fname in json_objs.keys():
    for quote_record in json_objs[fname]:
        verb = quote_record['verb']
        if len(verb) > 0:
            verb_list.append(verb.rstrip())

In [25]:
print(list(set(verb_list)))

['chanted', 'reminded', 'explain', 'wrote', 'sign', 'urge', 'reported', 'announced', 'alleged', 'warned', 'tells', 'promise', 'notes', 'noting', 'assured', 'described', 'joking', 'calls', 'maintains', 'acknowledges', 'advised', 'echoes', 'ruled', 'insists', 'recalled', 'joked', 'stuttered', 'noted', 'claims', 'confirmed', 'believed', 'report', 'stated', 'admitted', 'warns', 'insisting', 'alleges', 'remarks', 'lamented', 'continues', 'called', 'explains', 'asks', 'points', 'acknowledged', 'argues', 'replied', 'writing', 'scolded', '“This can happen again.”', 'confessed', 'indicated', 'shout', 'adds', 'suggests', 'said', 'say', 'responded', 'commented', 'told', 'voiced', 'testifying', 'writes', 'added', 'testified', 'argued', 'stressed', 'describing', 'claimed', 'predicted', 'asked', 'advises', 'states', 'calling', 'says', 'went', 'wondered', 'began', 'sniffed', 'urged', 'tweeted', 'repeated', 'complained', 'arguing', 'admits', 'explained', 'talking', 'conceded', 'saying', 'crowed', 'arg

In [26]:
with open('verbs_fr.txt', 'w+') as fo:
    for verb in list(set(verb_list)):
        fo.write(verb + '\n')

## Stats on quotes

In [27]:
num_quotes = []
for fname in json_objs.keys():
    num_quotes.append(len(json_objs[fname]))
    
sum(num_quotes)

1493

In [28]:
num_syntactic_quotes = []
for fname in json_objs.keys():
    f_quotes = 0
    for quote_record in json_objs[fname]:
        if (len(quote_record['verb']) > 0) and (len(quote_record['speaker']) > 0):
            f_quotes += 1
    num_syntactic_quotes.append(f_quotes)
    
sum(num_syntactic_quotes)

1105