In [20]:
"""
This notebook analyses label noise in the datasets: inconsistencies between the
gold label of a multi-word phrase and the labels of the single words it contains.

"""

# imports
import glob
import pandas as pd
import numpy as np


# set up
languages = ["english", "spanish", "german", "french"]

# Raw data files per language. glob returns paths in arbitrary order, so they are
# sorted to keep the row order (and index) of the combined dataframe reproducible.
filepaths = {
    language: sorted(glob.glob("../data/raw/" + language + "/*"))
    for language in languages
}

# column names for the raw tab-separated files (read with header=None)
fieldnames = ['hit_id', 'sentence', 'start_offset', 'end_offset', 'target_word', 'native_annots',
              'nonnative_annots', 'native_complex', 'nonnative_complex', 'gold_label', 'gold_prob']

# subset of columns kept for the analysis
target_cols = ['hit_id', 'sentence', 'start_offset', 'end_offset', 'target_word', 'gold_label']

# placeholder; the loading cell below builds the real dataframe
df = pd.DataFrame()

In [21]:
# create dataframe
# Read every raw file once, keep the analysis columns and tag the rows with their
# language. The pieces are concatenated in a single call: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
frames = [
    pd.read_csv(path, sep='\t', header=None, names=fieldnames)[target_cols]
      .assign(language=language)  # assign returns a new frame: no copy warnings
    for language, paths in filepaths.items()
    for path in paths
]
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # no raw files found: keep an empty frame with the expected columns
    df = pd.DataFrame(columns=target_cols + ['language'])

# clean df
df = df.dropna()
# number of whitespace-separated single words (SWs) in the target word/phrase
df = df.assign(n_words=df['target_word'].str.split().str.len())

print(df.shape)
df.head(10)

  object.__getattribute__(self, name)
  return object.__setattr__(self, name, value)


(62640, 8)


Unnamed: 0,hit_id,sentence,start_offset,end_offset,target_word,gold_label,language,n_words
0,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,7,13,troops,0,english,1
1,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,0,6,Syrian,0,english,1
2,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,14,21,shelled,1,english,1
3,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,24,34,rebel-held,1,english,1
4,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,51,59,sparking,1,english,1
5,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,35,39,town,0,english,1
6,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,43,49,Monday,0,english,1
7,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,51,67,sparking intense,1,english,2
8,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,51,75,sparking intense clashes,1,english,3
9,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,60,67,intense,1,english,1


In [22]:
# get some initial counts
# boolean masks shared by both summaries
complex_mask = df['gold_label'] == 1
simple_mask = df['gold_label'] == 0
phrase_mask = df["n_words"] > 1

# rows per language, and how many of them are labelled complex
for language in languages:
    lang_mask = df['language'] == language
    n_total = int(lang_mask.sum())
    n_complex = int((lang_mask & complex_mask).sum())
    print(language, n_total, "total", ",", n_complex, "complex")

print("\n")

# multi-word phrases (MWPs) per language, split by gold label
for language in languages:
    lang_phrases = (df['language'] == language) & phrase_mask
    n_complex_mwp = int((lang_phrases & complex_mask).sum())
    n_simple_mwp = int((lang_phrases & simple_mask).sum())
    print(language, n_complex_mwp, "complex MWP", n_simple_mwp, "non-complex MWP")

english 34879 total , 14428 complex
spanish 17605 total , 7015 complex
german 7905 total , 3272 complex
french 2251 total , 657 complex


english 3750 complex MWP 982 non-complex MWP
spanish 2309 complex MWP 0 non-complex MWP
german 502 complex MWP 0 non-complex MWP
french 242 complex MWP 0 non-complex MWP


In [23]:
# makes dictionary of counts of complex and NC MWPS, and MWPs w/ SWs of opposite label
def inconsistency_counter(languages, data=None):
    """Count multi-word phrases (MWPs) whose label disagrees with their sub-spans.

    A row is an MWP when its ``n_words`` is greater than 1. A sub-span of an MWP is
    any row with the same ``hit_id`` and ``sentence`` whose offsets lie inside the
    MWP's offsets and whose ``target_word`` has fewer characters than the MWP
    (the character-length test is what excludes the MWP row itself). A sub-span is
    "irregular" when its ``gold_label`` differs from the MWP's.

    Parameters
    ----------
    languages : iterable of str
        Values of the ``language`` column to report on.
    data : pandas.DataFrame, optional
        Frame with columns ``hit_id``, ``sentence``, ``start_offset``,
        ``end_offset``, ``target_word``, ``gold_label``, ``language`` and
        ``n_words``. Defaults to the notebook-level ``df``.

    Returns
    -------
    dict
        ``{language: (complex MWPs, non-complex MWPs,
        MWPs with at least one irregular sub-span,
        MWPs whose sub-spans are all irregular)}``.
    """
    if data is None:
        data = df  # notebook-level dataframe built by the loading cell

    columns = ['hit_id', 'sentence', 'start_offset', 'end_offset',
               'target_word', 'gold_label', 'n_words']
    counters_dic = {}

    for language in languages:
        rows = list(data.loc[data['language'] == language, columns]
                        .itertuples(index=False))

        # Index the rows by (hit, sentence) once so each MWP is only compared with
        # the rows of its own sentence instead of with every row of the language.
        by_sentence = {}
        for row in rows:
            by_sentence.setdefault((row.hit_id, row.sentence), []).append(row)

        MWP_counter = 0         # number multiword_phrases (MWPs)
        CMWP_counter = 0        # number of these phrases that are complex
        irregSW_counter = 0     # number MWPS with at least one inconsistently labelled SW
        allirregSW_counter = 0  # number MWPs with all SWs inconsistently labelled

        for target in rows:
            if not target.n_words > 1:  # single word: not an MWP
                continue

            MWP_counter += 1
            if target.gold_label == 1:
                CMWP_counter += 1

            # count how many SWs have the opposite label
            target_SW_counter = 0
            target_irregSW_counter = 0
            for other in by_sentence[(target.hit_id, target.sentence)]:
                inside = (other.start_offset >= target.start_offset
                          and other.end_offset <= target.end_offset)
                if inside and len(target.target_word) > len(other.target_word):
                    target_SW_counter += 1
                    if other.gold_label != target.gold_label:
                        target_irregSW_counter += 1

            # update irregular SW counters
            if target_irregSW_counter > 0:
                irregSW_counter += 1
                if target_SW_counter == target_irregSW_counter:
                    allirregSW_counter += 1

        # update dic
        counters_dic[language] = (CMWP_counter, MWP_counter - CMWP_counter,
                                  irregSW_counter, allirregSW_counter)

    return counters_dic

# per-language tuple: (complex MWPs, non-complex MWPs,
#                      MWPs with >= 1 irregular SW, MWPs with all SWs irregular)
counters_dic = inconsistency_counter(languages)

In [24]:
# make latex table of counters_dic
# one row per language, one column per counter
counters_df = pd.DataFrame(data=counters_dic).transpose()
# raw string: "\g" is not a valid escape sequence in a normal string literal
counters_df.columns = ["C MWP", "NC MWP", r'$\geq 1$ Irreg. SW', 'All irreg. SWs']
counters_df.loc['Total'] = counters_df.sum()

# escape=False keeps the math-mode header intact; with escaping on, "$\geq 1$" is
# emitted as "\$\textbackslash geq 1\$" and no longer renders as a formula
print(counters_df.to_latex(escape=False))
counters_df

\begin{tabular}{lrrrr}
\toprule
{} &  C MWP &  NC MWP &  \$\textbackslash geq 1\$ Irreg. SW &  All irreg. SWs \\
\midrule
english &   3750 &     982 &                3315 &             950 \\
spanish &   2309 &       0 &                1747 &             760 \\
german  &    502 &       0 &                 374 &             178 \\
french  &    242 &       0 &                 192 &              82 \\
Total   &   6803 &     982 &                5628 &            1970 \\
\bottomrule
\end{tabular}



Unnamed: 0,C MWP,NC MWP,$\geq 1$ Irreg. SW,All irreg. SWs
english,3750,982,3315,950
spanish,2309,0,1747,760
german,502,0,374,178
french,242,0,192,82
Total,6803,982,5628,1970


In [25]:
# for given language and min number of words in MWPs, prints MWPS and SWs and counts irreg. SWs
def MWP_SW_printer(language, min_length, data=None):
    """Print every phrase of at least ``min_length`` words with its sub-spans.

    For each matching phrase the phrase and its gold label are printed, followed by
    every sub-span and its label. A sub-span is a row with the same ``hit_id`` and
    ``sentence`` whose offsets lie inside the phrase's offsets and whose
    ``target_word`` has fewer characters than the phrase. Sub-spans whose label
    differs from the phrase's are counted as irregular. A summary of the totals is
    printed at the end.

    Parameters
    ----------
    language : str
        Value of the ``language`` column to report on
        (english, spanish, german or french in this notebook).
    min_length : int
        Minimum ``n_words`` a phrase needs in order to be listed.
    data : pandas.DataFrame, optional
        Frame with columns ``hit_id``, ``sentence``, ``start_offset``,
        ``end_offset``, ``target_word``, ``gold_label``, ``language`` and
        ``n_words``. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    if data is None:
        data = df  # notebook-level dataframe built by the loading cell

    print(language, "MWPs with min number of words", min_length, ":\n-")

    columns = ['hit_id', 'sentence', 'start_offset', 'end_offset',
               'target_word', 'gold_label', 'n_words']
    rows = list(data.loc[data['language'] == language, columns]
                    .itertuples(index=False))

    # Index the rows by (hit, sentence) once so each phrase is only compared with
    # the rows of its own sentence; row order inside a sentence is preserved, so
    # sub-spans are printed in the same order as a scan over the whole frame.
    by_sentence = {}
    for row in rows:
        by_sentence.setdefault((row.hit_id, row.sentence), []).append(row)

    MWP_counter = 0         # number multiword_phrases (MWPs)
    CMWP_counter = 0        # number of these phrases that are complex
    irregSW_counter = 0     # number MWPS with at least one inconsistently labelled SW
    allirregSW_counter = 0  # number MWPs with all SWs inconsistently labelled

    for target in rows:
        if not target.n_words >= min_length:  # too short to be listed
            continue

        print("-" * 20)
        MWP_counter += 1
        if target.gold_label == 1:
            CMWP_counter += 1
        print(target.target_word, target.gold_label)

        # count how many SWs have the opposite label
        target_SW_counter = 0
        target_irregSW_counter = 0
        for other in by_sentence[(target.hit_id, target.sentence)]:
            inside = (other.start_offset >= target.start_offset
                      and other.end_offset <= target.end_offset)
            if inside and len(target.target_word) > len(other.target_word):
                print(other.target_word, other.gold_label)
                target_SW_counter += 1
                if other.gold_label != target.gold_label:
                    target_irregSW_counter += 1

        # update irregular labels counter
        if target_irregSW_counter > 0:
            irregSW_counter += 1
            print("Irregular SWs:", target_irregSW_counter, "/", target_SW_counter)
            if target_SW_counter == target_irregSW_counter:
                allirregSW_counter += 1
                print("-- All irregular SWs:")

    # print summary
    print("\nTotal MWPs with at least", min_length, "words:", MWP_counter)
    print("Total complex MWPss:", CMWP_counter)
    print("Total non-complex multiword phrases:", MWP_counter - CMWP_counter)
    print("Total MWPs with at least one opposite label SW:", irregSW_counter)
    print("Total MWPs with all opposite label SWs:", allirregSW_counter)


In [26]:
# French phrases of at least four words, each followed by its sub-spans and their labels
MWP_SW_printer("french", 4)

french MWPs with min number of words 4 :
-
--------------------
crise des marchés du crédit 1
crise 0
marchés 0
crédit 0
Irregular SWs: 3 / 3
-- All irregular SWs:
--------------------
campagne de récolte de fonds 1
campagne 0
récolte 0
fonds 0
Irregular SWs: 3 / 3
-- All irregular SWs:
--------------------
marque néanmoins sa désapprobation 1
marque 0
néanmoins sa désapprobation 1
désapprobation 1
Irregular SWs: 1 / 3
--------------------
un alcaloïde bi - indolé 1
alcaloïde 1
alcaloïde bi - indolé 1
bi - indolé 1
indolé 0
Irregular SWs: 1 / 4
--------------------
alcaloïde bi - indolé 1
alcaloïde 1
bi - indolé 1
indolé 0
Irregular SWs: 1 / 3
--------------------
isomères de la caulerpine 1
isomères 1
caulerpine 1
--------------------
rhodophycées et les chlorophycées 1
rhodophycées 1
chlorophycées 1
--------------------
anti - inflammatoire de la caulerpine 1
anti 0
anti - inflammatoire 1
inflammatoire 0
caulerpine 1
Irregular SWs: 2 / 4
--------------------
péritoine induite par des

In [27]:
# English phrases of at least five words, each followed by its sub-spans and their labels
MWP_SW_printer("english", 5)

english MWPs with min number of words 5 :
-
--------------------
exposure to the property sector 1
exposure 1
property sector 1
sector 1
property 0
Irregular SWs: 1 / 4
--------------------
lost track of the days 0
lost track 1
lost 0
track 0
days 0
Irregular SWs: 1 / 4
--------------------
dispute over censorship and security 0
dispute 1
censorship 1
security 1
Irregular SWs: 3 / 3
-- All irregular SWs:
--------------------
vulnerable to a resilient insurgency 0
vulnerable 1
resilient 1
resilient insurgency 1
insurgency 1
Irregular SWs: 4 / 4
-- All irregular SWs:
--------------------
ambulance in a police car 0
ambulance 1
police 0
car 0
Irregular SWs: 1 / 3
--------------------
scheduled to end combat operations 0
scheduled 1
combat 1
end 0
combat operations 1
operations 0
Irregular SWs: 3 / 5
--------------------
The Syrian Observatory for Human Rights 1
Observatory 1
Syrian 0
Human 0
Rights 0
Irregular SWs: 3 / 4
--------------------
blames the bloodshed on armed gangs 0
blames 1


Serious 0
Suite 1
Crime 0
Antrim 1
police 0
station 0
Irregular SWs: 4 / 8
--------------------
harvesting of endangered marine species 0
harvesting 1
endangered 1
endangered marine species 1
marine 1
species 1
Irregular SWs: 5 / 5
-- All irregular SWs:
--------------------
unsubtle show of the power 0
unsubtle 1
unsubtle show 1
show 0
power 0
Irregular SWs: 2 / 4
--------------------
see the light of a new day on the horizon 0
horizon 1
light 0
day 0
Irregular SWs: 1 / 3
--------------------
contents of this illegitimate document 0
contents 1
illegitimate 1
illegitimate document 1
document 0
Irregular SWs: 3 / 4
--------------------
invading forces and their puppets 0
invading 1
invading forces 1
puppets 1
forces 0
Irregular SWs: 3 / 4
--------------------
the necessary adjustment of bank balance sheets 0
necessary adjustment 1
adjustment 1
adjustment of bank balance sheets 1
bank balance sheets 1
balance sheets 0
bank 0
balance 0
sheets 0
Irregular SWs: 4 / 8
--------------------
adj