In [1]:
"""
This notebook analyses noise in the datasets from inconsistencies in labelling 
between words and phrases.

"""

#set up
import csv
import pandas as pd
import numpy as np


datasets_per_language = {"english": ["News", "WikiNews", "Wikipedia"],
                         "spanish": ["Spanish"],
                         "german": ["German"],
                         "french": ["French"]}


# fn to define filepaths
def filepaths(language, dataset_name):
    paths = []
    
    if language != "french":  # language = English, Spanish or German
        trainset_path = "../data/raw/{}/{}_Train.tsv".format(language.lower(), dataset_name)
        devset_path = "../data/raw/{}/{}_Dev.tsv".format(language.lower(), dataset_name) 
        paths.append(trainset_path)
        paths.append(devset_path)
   
    testset_path = "../data/raw/{}/{}_Test.tsv".format(language.lower(), dataset_name)
    paths.append(testset_path)
    
    return paths


# fn to read data
def read_dataset(file_path):
        with open(file_path,  encoding="utf8") as file:
            fieldnames = ['hit_id', 'sentence', 'start_offset', 'end_offset', 'target_word', 'native_annots',
                          'nonnative_annots', 'native_complex', 'nonnative_complex', 'gold_label', 'gold_prob']
            reader = csv.DictReader(file, fieldnames=fieldnames, delimiter='\t')

            dataset = [sent for sent in reader]

        return dataset
    

 # fn to make two-level dictionary of all target words in sent plus their labels    
def sents_dic(data): 
    sents_dic = {}
    for sent in data:
        if sent["sentence"] not in sents_dic.keys():
            sents_dic[sent["sentence"]] =  {sent["target_word"]: sent["gold_label"]}
        else:
            sents_dic[sent["sentence"]].update({sent["target_word"]:sent["gold_label"]})

    return sents_dic


# fn to count inconsistencis in multi=phrase labels
def label_analysis(sents):  # analyses label inconsistencies for given dataset
    
    phrase_counter = 0   # number multi-word phrases
    complex_counter = 0  # number of these phrases that are complex
    complex_with_noncomplex_subword_counter = 0  # at least one inconsistently labelled subword
    noncomplex_with_complex_subword_counter = 0  # at least one inconsistently labelled subword
    complex_with_all_noncomplex_subwords_counter = 0  # all subwords inconsistently labelled
    noncomplex_with_all_complex_subwords_counter = 0  # all subwords inconsistently labelled

    for sent in sents.keys():
        for target_word, label in sents[sent].items():
            if len(target_word.split())>1:
                phrase_counter += 1
                if label == '1':
                    complex_counter += 1

                consistency_counter = 0 # number times subword label disagrees with target word label   
                subword_counter = 0  # count subwords
                for other_word, other_label in sents[sent].items():
                    if len(other_word) < len(target_word) and other_word in target_word:
                        subword_counter +=1
                        if label != other_label:
                            consistency_counter +=1

                if consistency_counter != 0:
                    if label =='1':
                        complex_with_noncomplex_subword_counter += 1
                    else: # label == 0
                        noncomplex_with_complex_subword_counter += 1
                        
                if subword_counter > 0 and subword_counter == consistency_counter:
                    if label =='1':
                        complex_with_all_noncomplex_subwords_counter += 1
                    else: # label == 0
                        noncomplex_with_all_complex_subwords_counter += 1
                     
                    
    return [complex_counter,
            phrase_counter - complex_counter,  # number non-complex,
            phrase_counter,
            complex_with_noncomplex_subword_counter + noncomplex_with_complex_subword_counter,
            complex_with_all_noncomplex_subwords_counter + noncomplex_with_all_complex_subwords_counter]
 

# fn to make dictionary of label inconsistency counts
def make_analysis_dic(datasets_per_language):
    datasets_analysis_dic = {"English":np.array([0,0,0,0,0]), 
                             "German":np.array([0,0,0,0,0]), 
                             "French":np.array([0,0,0,0,0]), 
                            "Spanish":np.array([0,0,0,0,0])}

    for language, dataset_names in datasets_per_language.items():

        for dataset_name in dataset_names:
            target_filepaths = filepaths(language, dataset_name)

            for file_path in target_filepaths:
                data = read_dataset(file_path)
                sents = sents_dic(data)
                counters = np.array(label_analysis(sents))
                
                datasets_analysis_dic[language.capitalize()] += counters

    return datasets_analysis_dic  


In [2]:
 # make dic of label inconsistency counts
analysis_dic = make_analysis_dic(datasets_per_language)

# make df
df = pd.DataFrame(data=analysis_dic)
df = df.transpose()
df.columns =["Complex MWP","Non-Complex MWP","Total MWP",'1+ Inconsistent SW', 
         'All Inconsistent SWs']
df.loc['Total']= df.sum()
df

Unnamed: 0,Complex MWP,Non-Complex MWP,Total MWP,1+ Inconsistent SW,All Inconsistent SWs
English,3662,969,4631,3271,921
German,502,0,502,377,179
French,241,0,241,190,82
Spanish,2294,0,2294,1746,763
Total,6699,969,7668,5584,1945


In [3]:
print(df.to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &  Complex MWP &  Non-Complex MWP &  Total MWP &  1+ Inconsistent SW &  All Inconsistent SWs \\
\midrule
English &         3662 &              969 &       4631 &                3271 &                   921 \\
German  &          502 &                0 &        502 &                 377 &                   179 \\
French  &          241 &                0 &        241 &                 190 &                    82 \\
Spanish &         2294 &                0 &       2294 &                1746 &                   763 \\
Total   &         6699 &              969 &       7668 &                5584 &                  1945 \\
\bottomrule
\end{tabular}



In [8]:
#  fn to print out phrases & their subwords w/ labels     
def label_examples(language, dataset_name, data_type, min_phrase_length): 

    all_filepaths = filepaths(language, dataset_name)
    if language != "french":
        if data_type == "Train":
            file_path = all_filepaths[0]
        elif data_type == "Dev":
            file_path = all_filepaths[1]
        else: #data_type = "Test"
            file_path = all_filepaths[2]
            
    if language == 'french':
        if data_type == "Test":
            file_path = all_filepaths[0]
        else:
            raise TypeError("ERROR: French training/dev data does not exist")

    data_name = dataset_name +' '+ data_type
    data = read_dataset(file_path)
    sents = sents_dic(data)
    print(data_name, " instances with minimum phrase length", min_phrase_length)

    phrase_counter = 0   # number multi-word phrases
    complex_counter = 0  # number of these phrases that are complex
    complex_with_noncomplex_subword_counter = 0  # at least one inconsistently labelled subword
    noncomplex_with_complex_subword_counter = 0  # at least one inconsistently labelled subword
    complex_with_all_noncomplex_subwords_counter = 0  # all subwords inconsistently labelled
    noncomplex_with_all_complex_subwords_counter = 0  # all subwords inconsistently labelled

    for sent in sents.keys():
        for target_word, label in sents[sent].items():
            if len(target_word.split())>min_phrase_length-1:
                print("-"*20)
                print(target_word,label)
                phrase_counter += 1
                if label == '1':
                    complex_counter += 1

                consistency_counter = 0 # number times subword label disagrees with target word label   
                subword_counter = 0  # subwords in phrase counter
                for other_word, other_label in sents[sent].items():
                    if len(other_word) < len(target_word) and other_word in target_word:
                        subword_counter +=1 
                        print(other_word, other_label)
                        if label != other_label:
                            consistency_counter +=1

                if consistency_counter != 0:
                    print("-- subwords with inconsistent labels:", consistency_counter, "/", subword_counter)
                    if label =='1':
                        complex_with_noncomplex_subword_counter += 1
                    else: # label == 0
                        noncomplex_with_complex_subword_counter += 1
                        
                        
                if subword_counter > 0 and subword_counter == consistency_counter:
                    if label =='1':
                        complex_with_all_noncomplex_subwords_counter += 1
                    else: # label == 0
                        noncomplex_with_all_complex_subwords_counter += 1
                    print("-- all subwords inconsistent")

    print("\nTotal multiword phrases of length at least", min_phrase_length,":",phrase_counter)
    print("Total complex multiword phrases:", complex_counter)
    print("Total non-complex multiword phrases:", phrase_counter - complex_counter)
    print("Total complex phrases with at least one noncomplex subword:",complex_with_noncomplex_subword_counter)
    print("Total noncomplex phrases with at least one complex subword:", noncomplex_with_complex_subword_counter)
    print("Total phrases with at least inconsistent subword one:", 
          complex_with_noncomplex_subword_counter + noncomplex_with_complex_subword_counter)
    print("Total complex phrases with all noncomplex subwords:",complex_with_all_noncomplex_subwords_counter)
    print("Total noncomplex phrases with all complex subwords:", noncomplex_with_all_complex_subwords_counter)
    print("Total phrases with all inconsistent subwords:", 
          complex_with_all_noncomplex_subwords_counter + noncomplex_with_all_complex_subwords_counter)


In [7]:
# print MWP examples
label_examples("english", "News", "Dev", 5)

News Dev  instances with minimum phrase length 5
--------------------
exposure to the property sector 1
exposure 1
property sector 1
sector 1
property 0
-- subwords with inconsistent labels: 1 / 4
--------------------
lost track of the days 0
lost track 1
lost 0
track 0
days 0
-- subwords with inconsistent labels: 1 / 4
--------------------
dispute over censorship and security 0
dispute 1
censorship 1
security 1
-- subwords with inconsistent labels: 3 / 3
-- all subwords inconsistent
--------------------
vulnerable to a resilient insurgency 0
vulnerable 1
resilient 1
resilient insurgency 1
insurgency 1
-- subwords with inconsistent labels: 4 / 4
-- all subwords inconsistent
--------------------
ambulance in a police car 0
ambulance 1
police 0
car 0
-- subwords with inconsistent labels: 1 / 3
--------------------
scheduled to end combat operations 0
scheduled 1
combat 1
end 0
combat operations 1
operations 0
-- subwords with inconsistent labels: 3 / 5
--------------------
The Syrian Obs

In [9]:
# print MWP examples
label_examples("french", "French", "Test", 5)

French Test  instances with minimum phrase length 5
--------------------
crise des marchés du crédit 1
crise 0
marchés 0
crédit 0
-- subwords with inconsistent labels: 3 / 3
-- all subwords inconsistent
--------------------
campagne de récolte de fonds 1
campagne 0
récolte 0
fonds 0
-- subwords with inconsistent labels: 3 / 3
-- all subwords inconsistent
--------------------
un alcaloïde bi - indolé 1
alcaloïde 1
alcaloïde bi - indolé 1
bi - indolé 1
indolé 0
-- subwords with inconsistent labels: 1 / 4
--------------------
anti - inflammatoire de la caulerpine 1
anti 0
anti - inflammatoire 1
inflammatoire 0
caulerpine 1
-- subwords with inconsistent labels: 2 / 4
--------------------
péritoine induite par des carrageenans 1
induit 1
péritoine 1
induite 0
carrageenans 1
-- subwords with inconsistent labels: 1 / 4
--------------------
inhibition de COX et des phosphilases 1
inhibition 0
COX 0
phosphilases 0
-- subwords with inconsistent labels: 3 / 3
-- all subwords inconsistent
--------