In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from pandarallel import pandarallel
import json
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
import matplotlib.pyplot as plt  # To visualize
import json
from collections import Counter, defaultdict

from sklearn.linear_model import LinearRegression
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 200)

# WN COLEX

In [3]:
df = pd.read_csv("../data/colex/wn_colex/wn_synsets.csv", sep="\t")

In [4]:
df["C1"]= df["SYNSET1"].str.split("#").str[0]
df["C2"]= df["SYNSET2"].str.split("#").str[0]

In [5]:
len(df)

6177223

### get only the top senses from synset.

In [6]:
df = df[df["SYNSET1"].str.contains("#1")]
df = df[df["SYNSET2"].str.contains("#1")]

In [7]:
len(df)

2024695

In [8]:
df["diff"]=df["C1"]!=df["C2"]

In [9]:
df = df[df["diff"]==True] # real colex

In [10]:
len(df)

1980472

In [11]:
def get_lower(x):
    try:
        return x.lower()
    except Exception:
        return x

In [12]:
df["lang"]=df["LANG"].apply(get_lower)

In [17]:
# df.to_csv("../data/colex/colex_wn#1.csv", index=False)

In [18]:
len(df)

1980472

# Merge df with VALENCE data

In [13]:
df["C1"]=df["C1"].str.lower()
df["C2"]=df["C2"].str.lower()

In [14]:
df[df["C1"]=="apple_juice"]

Unnamed: 0,SENSE_LEMMA,LANG,SYNSET1,SYNSET2,C1,C2,diff,lang
1492887,äppeljuice,SV,apple_juice#n#1,juice#n#1,apple_juice,juice,True,sv
3294938,appelsap,NL,apple_juice#n#1,mulled_cider#n#1,apple_juice,mulled_cider,True,nl
3294940,苹果汁,ZH,apple_juice#n#1,applesauce#n#1,apple_juice,applesauce,True,zh
3294941,苹果汁,ZH,apple_juice#n#1,mulled_cider#n#1,apple_juice,mulled_cider,True,zh
4460920,jabukovača,HR,apple_juice#n#1,cider#n#1,apple_juice,cider,True,hr
4460921,jabukovača,HR,apple_juice#n#1,applejack#n#1,apple_juice,applejack,True,hr
4460922,сок_од_јабуке,SR,apple_juice#n#1,cider#n#1,apple_juice,cider,True,sr
4460925,عصير_التفاح,AR,apple_juice#n#1,cider#n#1,apple_juice,cider,True,ar
4460926,عصير_التفاح,AR,apple_juice#n#1,applesauce#n#1,apple_juice,applesauce,True,ar
4460927,sok_od_jabuke,HR,apple_juice#n#1,cider#n#1,apple_juice,cider,True,hr


In [15]:
df_wkb = pd.read_csv("../data/colex_affective/datasets/WKB.csv")

In [16]:
df_wkb["len"]=df_wkb["Word"].str.split().str.len()

In [17]:
df_wkb["len"].value_counts()

len
1.0    13812
2.0       98
3.0        4
Name: count, dtype: int64

In [18]:
df_wkb["lemma"]=df_wkb["Word"].str.replace(" ", "_").str.lower()

In [19]:
wkb_cols = [ "lemma","V.Mean.Sum", "V.SD.Sum", "V.Rat.Sum", "A.Mean.Sum", "A.SD.Sum", "A.Rat.Sum", "D.Mean.Sum", "D.SD.Sum", "D.Rat.Sum"]

In [20]:
df_wkb  = df_wkb[wkb_cols]

In [21]:
df_wkb[df_wkb["lemma"]=="jail"]

Unnamed: 0,lemma,V.Mean.Sum,V.SD.Sum,V.Rat.Sum,A.Mean.Sum,A.SD.Sum,A.Rat.Sum,D.Mean.Sum,D.SD.Sum,D.Rat.Sum
6650,jail,1.91,1.44,870,4.47,2.5,38,3.91,2.91,23


In [22]:
len(df_wkb)

13915

In [23]:
len(df)

1980472

In [24]:
df_merge = pd.merge(df, df_wkb, how="left", left_on="C1",right_on="lemma")

In [25]:
renames_c1= {col:"C1."+col for col in wkb_cols}
renames_c2= {col:"C2."+col for col in wkb_cols}

In [26]:
df_merge.rename(columns = renames_c1, inplace=True)

In [27]:
df_merge = pd.merge(df_merge, df_wkb, how="left", left_on="C2",right_on="lemma")

In [28]:
df_merge.rename(columns = renames_c2, inplace=True)

In [120]:
# df_merge.dropna(subset=["C1.lemma", "C2.lemma"], inplace=True)

In [29]:
df_merge

Unnamed: 0,SENSE_LEMMA,LANG,SYNSET1,SYNSET2,C1,C2,diff,lang,C1.lemma,C1.V.Mean.Sum,C1.V.SD.Sum,C1.V.Rat.Sum,C1.A.Mean.Sum,C1.A.SD.Sum,C1.A.Rat.Sum,C1.D.Mean.Sum,C1.D.SD.Sum,C1.D.Rat.Sum,C2.lemma,C2.V.Mean.Sum,C2.V.SD.Sum,C2.V.Rat.Sum,C2.A.Mean.Sum,C2.A.SD.Sum,C2.A.Rat.Sum,C2.D.Mean.Sum,C2.D.SD.Sum,C2.D.Rat.Sum
0,tropidoclonion_lineatum,BG,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,bg,,,,,,,,,,,,,,,,,,,,
1,tropidoclonion_lineatum,RU,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ru,,,,,,,,,,,,,,,,,,,,
2,tropidoclonion_lineatum,UK,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,uk,,,,,,,,,,,,,,,,,,,,
3,tropidoclonion_lineatum,GA,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ga,,,,,,,,,,,,,,,,,,,,
4,tropidoclonion_lineatum,RO,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ro,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981301,menuil,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0
1981302,menuas,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0
1981303,menuil,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0
1981304,menuas,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0


In [30]:
df= df_merge

In [125]:
# df.to_csv("../data/affectiveness/wn_colex_wkb.csv", index=False)

# Get Phonemes

In [31]:
with open("../data/phon/wikipron/data/scrape/lib/languages.json") as f:
    langs = json.load(f)

langs_colex = set(df.lang.tolist())

langs_pron= list()
lang_dict = dict()
for lang, d in langs.items():
    langs_pron.append(lang)
    lang_dict[lang] = (lang, d["iso639_name"])
    
    langs_pron.append(d["wiktionary_code"])
    lang_dict[d["wiktionary_code"]] = (lang, d["iso639_name"])
    
    if "dialect" in d:
        
        dia_d = d["dialect"]
        for k, v in dia_d.items():
            lang_dict[k]= (lang, v)
            langs_pron.append(k)
            

In [32]:
df

Unnamed: 0,SENSE_LEMMA,LANG,SYNSET1,SYNSET2,C1,C2,diff,lang,C1.lemma,C1.V.Mean.Sum,C1.V.SD.Sum,C1.V.Rat.Sum,C1.A.Mean.Sum,C1.A.SD.Sum,C1.A.Rat.Sum,C1.D.Mean.Sum,C1.D.SD.Sum,C1.D.Rat.Sum,C2.lemma,C2.V.Mean.Sum,C2.V.SD.Sum,C2.V.Rat.Sum,C2.A.Mean.Sum,C2.A.SD.Sum,C2.A.Rat.Sum,C2.D.Mean.Sum,C2.D.SD.Sum,C2.D.Rat.Sum
0,tropidoclonion_lineatum,BG,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,bg,,,,,,,,,,,,,,,,,,,,
1,tropidoclonion_lineatum,RU,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ru,,,,,,,,,,,,,,,,,,,,
2,tropidoclonion_lineatum,UK,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,uk,,,,,,,,,,,,,,,,,,,,
3,tropidoclonion_lineatum,GA,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ga,,,,,,,,,,,,,,,,,,,,
4,tropidoclonion_lineatum,RO,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ro,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981301,menuil,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0
1981302,menuas,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0
1981303,menuil,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0
1981304,menuas,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0


In [33]:
lang_dict

{'aar': ('aar', 'Afar'),
 'aa': ('aar', 'Afar'),
 'abk': ('abk', 'Abkhazian'),
 'ab': ('abk', 'Abkhazian'),
 'acw': ('acw', 'Hijazi Arabic'),
 'ady': ('ady', 'Adygei; Adyghe'),
 'afb': ('afb', 'Gulf Arabic'),
 'afr': ('afr', 'Afrikaans'),
 'af': ('afr', 'Afrikaans'),
 'ain': ('ain', 'Ainu (Japan)'),
 'ajp': ('ajp', 'South Levantine Arabic'),
 'akk': ('akk', 'Akkadian'),
 'ale': ('ale', 'Aleut'),
 'alr': ('alr', 'Alutor'),
 'amh': ('amh', 'Amharic'),
 'am': ('amh', 'Amharic'),
 'ang': ('ang', 'Old English (ca. 450-1100)'),
 'aot': ('aot', 'Atong (India)'),
 'apw': ('apw', 'Western Apache'),
 'ara': ('ara', 'Arabic'),
 'ar': ('ara', 'Arabic'),
 'arc': ('arc',
  'Imperial Aramaic (700-300 BCE); Official Aramaic (700-300 BCE)'),
 'ary': ('ary', 'Moroccan Arabic'),
 'arz': ('arz', 'Egyptian Arabic'),
 'asm': ('asm', 'Assamese'),
 'as': ('asm', 'Assamese'),
 'ast': ('ast', 'Asturian'),
 'ayl': ('ayl', 'Libyan Arabic'),
 'aze': ('aze', 'Azerbaijani'),
 'az': ('aze', 'Azerbaijani'),
 'azg': ('

In [34]:
len(langs_pron), len(langs_colex), len(langs_colex.intersection(set(langs_pron)))

(543, 520, 214)

In [127]:
len(langs_pron), len(langs_colex), len(langs_colex.intersection(set(langs_pron)))

(543, 311, 171)

In [45]:
df

Unnamed: 0,SENSE_LEMMA,LANG,SYNSET1,SYNSET2,C1,C2,diff,lang,C1.lemma,C1.V.Mean.Sum,C1.V.SD.Sum,C1.V.Rat.Sum,C1.A.Mean.Sum,C1.A.SD.Sum,C1.A.Rat.Sum,C1.D.Mean.Sum,C1.D.SD.Sum,C1.D.Rat.Sum,C2.lemma,C2.V.Mean.Sum,C2.V.SD.Sum,C2.V.Rat.Sum,C2.A.Mean.Sum,C2.A.SD.Sum,C2.A.Rat.Sum,C2.D.Mean.Sum,C2.D.SD.Sum,C2.D.Rat.Sum,LANG_PRON,LANG_NAME
0,tropidoclonion_lineatum,BG,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,bg,,,,,,,,,,,,,,,,,,,,,bul,Bulgarian
1,tropidoclonion_lineatum,RU,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ru,,,,,,,,,,,,,,,,,,,,,rus,Russian
2,tropidoclonion_lineatum,UK,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,uk,,,,,,,,,,,,,,,,,,,,,ukr,Ukrainian
3,tropidoclonion_lineatum,GA,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ga,,,,,,,,,,,,,,,,,,,,,gle,Irish
4,tropidoclonion_lineatum,RO,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ro,,,,,,,,,,,,,,,,,,,,,ron,Romanian; Moldavian; Moldovan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981301,menuil,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,msa,Malay (macrolanguage)
1981302,menuas,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,msa,Malay (macrolanguage)
1981303,menuil,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,ind,Indonesian
1981304,menuas,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,ind,Indonesian


In [36]:
df["LANG_PRON"]=df["lang"].apply(lambda x: lang_dict[x][0] if x in lang_dict else x)
df["LANG_NAME"] = df["lang"].apply(lambda x: lang_dict[x][1] if x in lang_dict else x)

In [41]:
# df_inter = df[df["lang"].isin(langs_pron)]

In [42]:
# df_inter["LANG_PRON"] = df_inter["lang"].apply(lambda x: lang_dict[x][0])
# df_inter["LANG_NAME"] = df_inter["lang"].apply(lambda x: lang_dict[x][1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inter["LANG_PRON"] = df_inter["lang"].apply(lambda x: lang_dict[x][0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inter["LANG_NAME"] = df_inter["lang"].apply(lambda x: lang_dict[x][1])


In [38]:
df.drop_duplicates()

Unnamed: 0,SENSE_LEMMA,LANG,SYNSET1,SYNSET2,C1,C2,diff,lang,C1.lemma,C1.V.Mean.Sum,C1.V.SD.Sum,C1.V.Rat.Sum,C1.A.Mean.Sum,C1.A.SD.Sum,C1.A.Rat.Sum,C1.D.Mean.Sum,C1.D.SD.Sum,C1.D.Rat.Sum,C2.lemma,C2.V.Mean.Sum,C2.V.SD.Sum,C2.V.Rat.Sum,C2.A.Mean.Sum,C2.A.SD.Sum,C2.A.Rat.Sum,C2.D.Mean.Sum,C2.D.SD.Sum,C2.D.Rat.Sum,LANG_PRON,LANG_NAME
0,tropidoclonion_lineatum,BG,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,bg,,,,,,,,,,,,,,,,,,,,,bul,Bulgarian
1,tropidoclonion_lineatum,RU,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ru,,,,,,,,,,,,,,,,,,,,,rus,Russian
2,tropidoclonion_lineatum,UK,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,uk,,,,,,,,,,,,,,,,,,,,,ukr,Ukrainian
3,tropidoclonion_lineatum,GA,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ga,,,,,,,,,,,,,,,,,,,,,gle,Irish
4,tropidoclonion_lineatum,RO,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ro,,,,,,,,,,,,,,,,,,,,,ron,Romanian; Moldavian; Moldovan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981301,menuil,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,msa,Malay (macrolanguage)
1981302,menuas,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,msa,Malay (macrolanguage)
1981303,menuil,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,ind,Indonesian
1981304,menuas,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,ind,Indonesian


In [45]:
df["LEN"]=df["LANG_PRON"].str.len()

In [52]:
df = df.drop_duplicates(subset=["SENSE_LEMMA", "lang", "SYNSET1", "SYNSET2"])

In [53]:
df

Unnamed: 0,SENSE_LEMMA,LANG,SYNSET1,SYNSET2,C1,C2,diff,lang,C1.lemma,C1.V.Mean.Sum,C1.V.SD.Sum,C1.V.Rat.Sum,C1.A.Mean.Sum,C1.A.SD.Sum,C1.A.Rat.Sum,C1.D.Mean.Sum,C1.D.SD.Sum,C1.D.Rat.Sum,C2.lemma,C2.V.Mean.Sum,C2.V.SD.Sum,C2.V.Rat.Sum,C2.A.Mean.Sum,C2.A.SD.Sum,C2.A.Rat.Sum,C2.D.Mean.Sum,C2.D.SD.Sum,C2.D.Rat.Sum,LANG_PRON,LANG_NAME,LEN
0,tropidoclonion_lineatum,BG,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,bg,,,,,,,,,,,,,,,,,,,,,bul,Bulgarian,3.0
1,tropidoclonion_lineatum,RU,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ru,,,,,,,,,,,,,,,,,,,,,rus,Russian,3.0
2,tropidoclonion_lineatum,UK,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,uk,,,,,,,,,,,,,,,,,,,,,ukr,Ukrainian,3.0
3,tropidoclonion_lineatum,GA,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ga,,,,,,,,,,,,,,,,,,,,,gle,Irish,3.0
4,tropidoclonion_lineatum,RO,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ro,,,,,,,,,,,,,,,,,,,,,ron,Romanian; Moldavian; Moldovan,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981301,menuil,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,msa,Malay (macrolanguage),3.0
1981302,menuas,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,msa,Malay (macrolanguage),3.0
1981303,menuil,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,ind,Indonesian,3.0
1981304,menuas,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,ind,Indonesian,3.0


In [54]:
df["lemma_lang"]= df["SENSE_LEMMA"]+"_"+df["LANG_PRON"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["lemma_lang"]= df["SENSE_LEMMA"]+"_"+df["LANG_PRON"]


In [55]:
len(df)

1980472

In [56]:
pron_folder = "../data/phon/preprocessed"

In [57]:
import glob, os

In [58]:
lemma_dict = {}
for file in glob.glob(f"{pron_folder}/*.tsv"):
    print(file)
    lang = os.path.basename(file).replace(".tsv", "")
    df_pron = pd.read_csv(file, sep="\t")
    
    for lemma, pron in zip(df_pron["lemma"], df_pron["pron"]):
        
        try:
            lemma = lemma.replace(" ", "_")
            lemma_dict[lemma+"_"+lang]=pron
        except Exception:
            print(lemma, lang)
        

../data/phon/preprocessed/wau.tsv
../data/phon/preprocessed/nno.tsv
../data/phon/preprocessed/pjt.tsv
../data/phon/preprocessed/scn.tsv
../data/phon/preprocessed/slv.tsv
../data/phon/preprocessed/sme.tsv
../data/phon/preprocessed/chb.tsv
../data/phon/preprocessed/ary.tsv
../data/phon/preprocessed/ofs.tsv
nan ofs
../data/phon/preprocessed/stq.tsv
../data/phon/preprocessed/dlm.tsv
../data/phon/preprocessed/huu.tsv
../data/phon/preprocessed/ces.tsv
../data/phon/preprocessed/zha.tsv
../data/phon/preprocessed/sco.tsv
../data/phon/preprocessed/lat.tsv
../data/phon/preprocessed/ppl.tsv
../data/phon/preprocessed/nci.tsv
../data/phon/preprocessed/nav.tsv
../data/phon/preprocessed/nld.tsv
../data/phon/preprocessed/yor.tsv
../data/phon/preprocessed/lwl.tsv
../data/phon/preprocessed/ban.tsv
../data/phon/preprocessed/lav.tsv
../data/phon/preprocessed/hts.tsv
../data/phon/preprocessed/mga.tsv
../data/phon/preprocessed/mfe.tsv
../data/phon/preprocessed/arz.tsv
../data/phon/preprocessed/dng.tsv
../dat

In [60]:
lemma_dict

{'agama_wau': 'a ɰ a m a',
 'agapai_wau': 'a ɰ a p a ɪ',
 'agatapai_wau': 'a ɰ a t a p a ɪ',
 'ahala_wau': 'a h a l a',
 'ahamaitsapai_wau': 'a h a m a ɪ t͡s a p a ɪ',
 'ahatain_wau': 'a h a t a ĩ',
 'aitsa_ha_wau': 'a ɪ t͡s a h a',
 'aka_wau': 'a k a',
 'akain_wau': 'a k a ĩ',
 'akainxa_wau': 'a k a ĩ t j a',
 'akamapai_wau': 'a k a m a p a ɪ',
 'akitsatapai_wau': 'a k i t͡s a t a p a ɪ',
 'akixekojatapai_wau': 'a k i t j ɛ k ɨ ʐ a t a p a ɪ',
 'akiyuntuapa_wau': 'a k i j ũ t u a p a',
 'akulukatapai_wau': 'a k u l u k a t a p a ɪ',
 'amakanata_wau': 'a m a k a n a t a',
 'amamitsapai_wau': 'a m a m i t͡s a p a ɪ',
 'amapitsaitsapai_wau': 'a m a p i t͡s a ɪ t͡s a p a ɪ',
 'aminya_wau': 'a m ĩ j a',
 'amunaun_wau': 'a m u n ɐ̃ ʊ',
 'anapi_wau': 'a n a p i',
 'apaka_wau': 'a p a k a',
 'apakata_wau': 'a p a k a t a',
 'apiya_wau': 'a p i j a',
 'apotapai_wau': 'a p ɨ t a p a ɪ',
 'asatapai_wau': 'a s a t a p a ɪ',
 'ate_wau': 'a t ɛ',
 'atokapai_wau': 'a t ɨ k a p a ɪ',
 'atso_wau': 'a 

In [59]:
len(lemma_dict)

1882240

In [61]:
df["PRON"] = df["lemma_lang"].apply(lambda x: lemma_dict.get(x, np.NaN))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["PRON"] = df["lemma_lang"].apply(lambda x: lemma_dict.get(x, np.NaN))


In [153]:
# df.dropna(subset=["PRON"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=["PRON"], inplace=True)


In [62]:
df

Unnamed: 0,SENSE_LEMMA,LANG,SYNSET1,SYNSET2,C1,C2,diff,lang,C1.lemma,C1.V.Mean.Sum,C1.V.SD.Sum,C1.V.Rat.Sum,C1.A.Mean.Sum,C1.A.SD.Sum,C1.A.Rat.Sum,C1.D.Mean.Sum,C1.D.SD.Sum,C1.D.Rat.Sum,C2.lemma,C2.V.Mean.Sum,C2.V.SD.Sum,C2.V.Rat.Sum,C2.A.Mean.Sum,C2.A.SD.Sum,C2.A.Rat.Sum,C2.D.Mean.Sum,C2.D.SD.Sum,C2.D.Rat.Sum,LANG_PRON,LANG_NAME,LEN,lemma_lang,PRON
0,tropidoclonion_lineatum,BG,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,bg,,,,,,,,,,,,,,,,,,,,,bul,Bulgarian,3.0,tropidoclonion_lineatum_bul,
1,tropidoclonion_lineatum,RU,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ru,,,,,,,,,,,,,,,,,,,,,rus,Russian,3.0,tropidoclonion_lineatum_rus,
2,tropidoclonion_lineatum,UK,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,uk,,,,,,,,,,,,,,,,,,,,,ukr,Ukrainian,3.0,tropidoclonion_lineatum_ukr,
3,tropidoclonion_lineatum,GA,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ga,,,,,,,,,,,,,,,,,,,,,gle,Irish,3.0,tropidoclonion_lineatum_gle,
4,tropidoclonion_lineatum,RO,Tropidoclonion#n#1,lined_snake#n#1,tropidoclonion,lined_snake,True,ro,,,,,,,,,,,,,,,,,,,,,ron,Romanian; Moldavian; Moldovan,3.0,tropidoclonion_lineatum_ron,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981301,menuil,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,msa,Malay (macrolanguage),3.0,menuil_msa,
1981302,menuas,MS,jack#v#1,pry#v#1,jack,pry,True,ms,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,msa,Malay (macrolanguage),3.0,menuas_msa,
1981303,menuil,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,ind,Indonesian,3.0,menuil_ind,
1981304,menuas,ID,jack#v#1,pry#v#1,jack,pry,True,id,,,,,,,,,,,pry,3.57,1.86,21.0,5.3,2.58,20.0,4.85,2.25,20.0,ind,Indonesian,3.0,menuas_ind,


In [65]:
from collections import defaultdict

In [70]:
dfp = df.dropna(subset=["PRON"]).drop_duplicates(subset=["lemma_lang", "PRON"])

In [88]:
dfp

Unnamed: 0,SENSE_LEMMA,LANG,SYNSET1,SYNSET2,C1,C2,diff,lang,C1.lemma,C1.V.Mean.Sum,C1.V.SD.Sum,C1.V.Rat.Sum,C1.A.Mean.Sum,C1.A.SD.Sum,C1.A.Rat.Sum,C1.D.Mean.Sum,C1.D.SD.Sum,C1.D.Rat.Sum,C2.lemma,C2.V.Mean.Sum,C2.V.SD.Sum,C2.V.Rat.Sum,C2.A.Mean.Sum,C2.A.SD.Sum,C2.A.Rat.Sum,C2.D.Mean.Sum,C2.D.SD.Sum,C2.D.Rat.Sum,LANG_PRON,LANG_NAME,LEN,lemma_lang,PRON
41,protubérance,FR,gnarl#v#1,tuber#n#1,gnarl,tuber,True,fr,,,,,,,,,,,,,,,,,,,,,fra,French,3.0,protubérance_fra,p ʁ ɔ t y b e ʁ ɑ̃ s
44,enflure,FR,swelling#n#1,tuber#n#1,swelling,tuber,True,fr,,,,,,,,,,,,,,,,,,,,,fra,French,3.0,enflure_fra,ɑ̃ f l y ʁ
48,कंद,HI,bulb#n#1,condyle#n#1,bulb,condyle,True,hi,bulb,5.86,1.68,21.0,3.26,2.40,19.0,4.82,2.48,22.0,,,,,,,,,,,hin,Hindi,3.0,कंद_hin,k ə̃ n̪ d̪
55,ubi,MS,bulb#n#1,tuber#n#1,bulb,tuber,True,ms,bulb,5.86,1.68,21.0,3.26,2.40,19.0,4.82,2.48,22.0,,,,,,,,,,,msa,Malay (macrolanguage),3.0,ubi_msa,u b i
58,củ,VI,tuber#n#1,tuberosity#n#1,tuber,tuberosity,True,vi,,,,,,,,,,,,,,,,,,,,,vie,Vietnamese,3.0,củ_vie,k u ˧˩
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981281,interagire,IT,interact#v#1,intervene#v#1,interact,intervene,True,it,interact,6.00,1.41,20.0,3.78,2.07,23.0,5.46,2.52,28.0,intervene,5.21,1.40,19.0,4.48,2.48,23.0,5.82,2.02,28.0,ita,Italian,3.0,interagire_ita,i n t e r a d͡ʒ i r e
1981286,interfere,EN,interfere#v#1,intervene#v#1,interfere,intervene,True,en,interfere,3.94,1.86,18.0,4.33,2.78,21.0,4.71,2.24,28.0,intervene,5.21,1.40,19.0,4.48,2.48,23.0,5.82,2.02,28.0,eng,English,3.0,interfere_eng,ɪ n t ə f ɪ ə ɹ
1981296,intessere,IT,tat#v#1,weave#v#1,tat,weave,True,it,,,,,,,,,,,weave,5.41,1.22,22.0,3.10,2.05,21.0,6.36,1.89,22.0,ita,Italian,3.0,intessere_ita,i n t ɛ s s e r e
1981298,บุกรุก,TH,intrude#v#1,trespass#v#1,intrude,trespass,True,th,intrude,3.10,1.61,21.0,5.20,1.99,20.0,3.92,2.22,25.0,trespass,3.27,1.61,22.0,4.79,2.30,19.0,5.00,2.19,21.0,tha,Thai,3.0,บุกรุก_tha,b u k̚ ˨˩ r u k̚ ˦˥


In [89]:
dfp.to_csv("../data/finals/colex_wn_phon_dedup.csv", index=False)

In [73]:
len(dfp), len(dfp["LANG_PRON"].value_counts().to_dict())

(139698, 150)

In [75]:
lang2phones = defaultdict(list)
for lang, pron in zip(dfp["LANG_PRON"], dfp["PRON"]):
    lang2phones[lang]+= pron.split()

In [78]:
lang2phon = {}

for lang, phons in lang2phones.items():
    lang2phon[lang]= list(set(phons))

In [87]:
lang2phon["lit"]

['s̪ʲ',
 'ɪ̯̌ː',
 'ʒ͡ʲ',
 'ʊ',
 'i͡ə',
 'âˑ',
 'ä̂ː',
 'ɒ͜úˑ',
 'ɛ̂ˑ',
 't̪',
 's',
 'ə',
 'u͡ə',
 'k',
 'û',
 'ŋ',
 'd̪',
 'd',
 'ʃ',
 'ɐ͡ɪ',
 'ɡ',
 'ŋ́',
 'ǐː',
 'äː',
 'ʊ̯ˑ',
 'à',
 'æ̌ː',
 'ʋ',
 'z',
 'ǒː',
 'ə̯',
 'm',
 'æ̌ˑ',
 'ɣʲ',
 'ɪ̯ˑ',
 'xʲ',
 'ŋʲ',
 'ɪ̂',
 'ɫ',
 'ɪ̌͡ə',
 'ʒ',
 'n',
 'ʂ',
 'ɾ',
 'z̪',
 'i̯',
 'îː',
 'nʲ',
 'sʲ',
 'tʲ',
 'eː',
 'n̪ʲ',
 'j',
 'rː',
 'ɐ',
 'ɪ̯',
 'fʲ',
 'e',
 'ôː',
 'r',
 'd̪ʲ',
 'ɐ͡r',
 'f',
 'ɪ͡ə',
 'jː',
 'ɔ',
 'oː',
 'ɛ̂',
 'bʲ',
 'î',
 'êː',
 'ʑ',
 'iː',
 'aː',
 'v',
 'ʃʲ',
 'vʲ',
 'ǔː',
 's̪',
 'ɕ',
 'äˑ',
 'l',
 'o',
 'ɪ́',
 'ěː',
 'b',
 'u',
 'uː',
 'æ',
 'ɒ',
 'ä̌ː',
 'ʋʲ',
 'ɪ',
 'ɛː',
 'kʲ',
 'lʲ',
 'n̪',
 't͡ʃʲ',
 'â͡l',
 'mʲ',
 'uˑ',
 'ʒʲ',
 'ɡʲ',
 'ɪˑ',
 'rʲ',
 'ɛ',
 'dʲ',
 'ä̂ˑ',
 'ǎː',
 'd͡ʒ',
 'zʲ',
 'ʊ̯',
 'i',
 'ɾʲ',
 'p',
 'æː',
 'ɪ͡r',
 'ä̂',
 't',
 'ʐ',
 'ûː',
 'a',
 'pʲ',
 't̪ʲ',
 't͡sʲ',
 'ɑː']

In [80]:
lang2stats= {}
for lang, phons in lang2phon.items():
    lang2stats[lang]= len(phons)

In [81]:
lang2stats

{'fra': 74,
 'hin': 83,
 'msa': 66,
 'vie': 49,
 'pol': 37,
 'por': 87,
 'ron': 63,
 'jpn': 78,
 'swe': 117,
 'isl': 75,
 'ces': 49,
 'fin': 66,
 'spa': 54,
 'rus': 88,
 'nld': 78,
 'slk': 60,
 'ukr': 68,
 'hbs': 63,
 'deu': 96,
 'bul': 64,
 'tha': 50,
 'gle': 112,
 'eng': 75,
 'cmn': 74,
 'ita': 32,
 'heb': 63,
 'slv': 54,
 'lit': 126,
 'epo': 28,
 'tur': 111,
 'fas': 83,
 'tam': 48,
 'kat': 34,
 'hun': 69,
 'tgl': 42,
 'kor': 63,
 'hye': 40,
 'bak': 49,
 'dan': 119,
 'ind': 59,
 'glg': 55,
 'got': 25,
 'eus': 32,
 'mkd': 39,
 'ara': 53,
 'bel': 41,
 'aze': 51,
 'sqi': 68,
 'nor': 66,
 'cym': 49,
 'ell': 44,
 'mya': 62,
 'haw': 18,
 'mlt': 50,
 'lav': 74,
 'afr': 57,
 'glv': 20,
 'est': 61,
 'grc': 53,
 'khm': 47,
 'dsb': 34,
 'scn': 34,
 'ido': 29,
 'urd': 41,
 'ltz': 48,
 'kmr': 27,
 'kik': 6,
 'mon': 61,
 'gla': 69,
 'nno': 68,
 'ben': 49,
 'ast': 36,
 'bre': 51,
 'hau': 40,
 'asm': 23,
 'fao': 58,
 'kaz': 45,
 'sga': 10,
 'khb': 13,
 'oci': 32,
 'vol': 25,
 'san': 46,
 'mal': 37,


In [83]:
np.min(list(lang2stats.values())), # liv, 

1

In [84]:
np.max(list(lang2stats.values())), # lit

(126,)

In [85]:
np.median(list(lang2stats.values())), # liv, 

(32.5,)

In [64]:
df.to_csv("../data/finals/colex_wn_pron_aff.csv", index=False)

In [63]:
df[df["SENSE_LEMMA"]=="gefängnis"]

Unnamed: 0,SENSE_LEMMA,LANG,SYNSET1,SYNSET2,C1,C2,diff,lang,C1.lemma,C1.V.Mean.Sum,C1.V.SD.Sum,C1.V.Rat.Sum,C1.A.Mean.Sum,C1.A.SD.Sum,C1.A.Rat.Sum,C1.D.Mean.Sum,C1.D.SD.Sum,C1.D.Rat.Sum,C2.lemma,C2.V.Mean.Sum,C2.V.SD.Sum,C2.V.Rat.Sum,C2.A.Mean.Sum,C2.A.SD.Sum,C2.A.Rat.Sum,C2.D.Mean.Sum,C2.D.SD.Sum,C2.D.Rat.Sum,LANG_PRON,LANG_NAME,LEN,lemma_lang,PRON
1350906,gefängnis,DE,jail#n#1,prison#n#1,jail,prison,True,de,jail,1.91,1.44,870.0,4.47,2.5,38.0,3.91,2.91,23.0,prison,1.94,1.55,18.0,5.1,2.34,21.0,3.54,2.65,52.0,deu,German,3.0,gefängnis_deu,ɡ ə f ɛ ŋ n ɪ s


In [167]:
valdict1 = dict()
for lemma, v_mean, v_sd, a_mean, a_sd, d_mean, d_sd in zip(df["C1.lemma"], df["C1.V.Mean.Sum"], df["C1.V.SD.Sum"], 
                                                           df["C1.A.Mean.Sum"], df["C1.A.SD.Sum"], df["C1.D.Mean.Sum"], df["C1.D.SD.Sum"]):
    valdict1[lemma] = (v_mean, v_sd, a_mean, a_sd, d_mean, d_sd)
                                                           
                                                           

In [165]:
valdict2 = dict()
for lemma, v_mean, v_sd, a_mean, a_sd, d_mean, d_sd in zip(df["C2.lemma"], df["C2.V.Mean.Sum"], df["C2.V.SD.Sum"], 
                                                           df["C2.A.Mean.Sum"], df["C2.A.SD.Sum"], df["C2.D.Mean.Sum"], df["C2.D.SD.Sum"]):
    valdict2[lemma] = (v_mean, v_sd, a_mean, a_sd, d_mean, d_sd)                                                  

In [168]:
len(valdict1), len(valdict2)

(8277, 8566)

In [169]:
valdict1.update(valdict2)

In [170]:
len(valdict1)

9552

In [177]:
valdict

NameError: name 'valdict' is not defined

In [176]:
valdict1["gefängnis"]

KeyError: 'gefängnis'

In [172]:
lemmas,langs, prons, vmeans,vsds, ameans, asds, dmeans, dsds  = [],[],[],[],[],[],[],[],[]
for x, group in df.groupby(["SENSE_LEMMA", "LANG_PRON", "PRON"]):
    lemmas.append(x[0])
    langs.append(x[1])
    prons.append(x[2])
    
    concepts = list(set(group["C1"].tolist()+group["C2"].tolist()))
    
    vmeans_ls, vsds_ls, ameans_ls, asds_ls, dmeans_ls, dsds_ls  = [],[],[],[],[],[]
    for concept in concepts:
        vmean, vsd, amen, asd, dmean, dsd = valdict1[concept]
        vmeans_ls.append(vmean)
        vsds_ls.append(vsd)
        ameans_ls.append(amen)
        asds_ls.append(asd)
        dmeans_ls.append(dmean)
        dsds_ls.append(dsd)
        
    vmeans.append(np.mean(vmeans_ls))
    vsds.append(np.mean(vsds_ls))
    ameans.append(np.mean(ameans_ls))
    asds.append(np.mean(asds_ls))
    dmeans.append(np.mean(dmeans_ls))
    dsds.append(np.mean(dsds_ls))
    

In [173]:
phonedf = pd.DataFrame.from_dict({"SENSE_LEMMA": lemmas, "LANG_PRON":langs, "PRON":prons, "V.Mean.Sum":vmeans, "V.SD.Sum": vsds, "A.Mean.Sum": ameans, "A.SD.Sum":asds, 
                                 "D.Mean.Sum":dmeans, "D.SD.Sum":dsds })

In [174]:
phonedf

Unnamed: 0,SENSE_LEMMA,LANG_PRON,PRON,V.Mean.Sum,V.SD.Sum,A.Mean.Sum,A.SD.Sum,D.Mean.Sum,D.SD.Sum
0,a,hun,ɒː,5.040000,1.483333,3.948333,2.381667,5.128333,2.176667
1,a,ita,a,4.340000,1.820000,3.623333,2.483333,5.096667,1.993333
2,aaien,nld,aː i̯ ə n,6.833333,1.446667,4.236667,2.590000,6.256667,1.973333
3,aaltoilu,fin,ɑː l t̪ o̞ i̯ l u,4.575000,1.745000,4.280000,2.145000,5.170000,2.080000
4,aamuhämärä,fin,ɑː m u h æ m æ r æ,6.490000,2.070000,3.650000,2.275000,4.890000,2.445000
...,...,...,...,...,...,...,...,...,...
58943,𐌿𐍂𐍂𐌿𐌽𐍃,got,u r r u n s,5.395000,1.750000,3.375000,2.535000,5.035000,2.615000
58944,𐍃𐌺𐌴𐌹𐌼𐌰,got,s k iː m a,4.935000,1.700000,4.165000,2.195000,5.550000,2.160000
58945,𐍅𐌰𐌹𐌽𐌰𐌷𐍃,got,w ɛː n a h s,2.220000,1.485000,5.080000,2.720000,3.785000,2.170000
58946,𐍅𐍉𐌸𐌴𐌹𐍃,got,w oː θ iː s,7.505000,1.445000,3.525000,2.720000,6.410000,1.965000


In [175]:
phonedf.to_csv("../data/affectiveness/wn_wkb_wikipron.csv", index=False)