In [39]:
import pandas as pd
import numpy as np
from itertools import product
from tqdm import tqdm
import networkx as nx
import pickle

from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Root directory of the stage-1 data files; later cells build their paths from it.
stage1 = "../data/stage1"

# 1. Load files

## load phonology 


In [26]:
# Pairwise PMI language distances; the file is tab-separated despite its .csv extension.
phon_path = f"{stage1}/pmiLanguageDistances.csv"
df_phon = pd.read_csv(phon_path, delimiter="\t")

In [196]:
df_phon

Unnamed: 0,JaegerCode1,JaegerCode2,Glottocode1,Glottocode2,PMI
0,NC.BANTOID.A53_BAFIA_RIKPA,TNG.MADANG.ABASAKUR,bafi1243,pall1244,0.839955
1,NC.BANTOID.A53_BAFIA_RIKPA,Sep.UPPER_SEPIK.ABAU_2,bafi1243,abau1245,0.877663
2,NC.BANTOID.A53_BAFIA_RIKPA,NWC.NORTHWEST_CAUCASIAN.ABAZA,bafi1243,abaz1241,0.883918
3,NC.BANTOID.A53_BAFIA_RIKPA,NWC.NORTHWEST_CAUCASIAN.ABKHAZ,bafi1243,abkh1244,0.882524
4,NC.BANTOID.A53_BAFIA_RIKPA,NWC.NORTHWEST_CAUCASIAN.ABKHAZ_2,bafi1243,abkh1244,0.880243
...,...,...,...,...,...
4331998,NC.BANTOID.ZIGULA,NC.BANTOID.ZOOMBO_2,zigu1244,koon1244,0.671316
4331999,NC.BANTOID.ZIGULA,NC.BANTOID.ZOOMBO_3,zigu1244,koon1244,0.700825
4332000,NC.BANTOID.ZIGULA,NC.BANTOID.ZOOMBO_4,zigu1244,koon1244,0.733510
4332001,NC.BANTOID.ZIGULA,NC.BANTOID.ZULU,zigu1244,zulu1248,0.712642


In [31]:
# Discard every row that has a missing value in any column.
df_phon = df_phon.dropna(axis=0, how="any")

In [None]:
# Map each unordered glottocode pair (stored as a sorted tuple) to its PMI value.
# When the same pair occurs in several rows, the last row's value is the one kept.
phon_dict = {}
# Every glottocode seen in a kept pair (duplicates included).
langs = []

pair_rows = zip(df_phon["Glottocode1"], df_phon["Glottocode2"], df_phon["PMI"])
for code_a, code_b, pmi_value in pair_rows:
    first, second = (code_a, code_b) if code_a <= code_b else (code_b, code_a)
    if first == second:
        # Both sides map to the same glottocode: report the row and do not store it.
        print(first, second, pmi_value)
        continue
    phon_dict[(first, second)] = pmi_value
    langs.extend((first, second))

In [34]:
len(set(langs))

1562

In [37]:
phon_dict[('bafi1243', 'zaca1242')]

0.863057842180655

## load language info

In [131]:
from ast import literal_eval

In [109]:
# Language table indexed by its second CSV column; the leftover "Unnamed: 0" column is dropped.
language_all = (
    pd.read_csv("../data/languages/languages_dataframe.csv", index_col=1)
    .drop(columns=["Unnamed: 0"])
)

In [78]:
# Family/area table turned into a nested dict: {index value: {column name: value}}.
family_area = pd.read_csv("../data/languages/languages_family_area.csv", index_col=0)
langd = family_area.to_dict(orient="index")

In [17]:
# Stage-1 language metadata, indexed by the first CSV column.
df_lang = pd.read_csv(f"{stage1}/languages_info.csv", index_col=0)

In [20]:
# Nested lookup {index value: {column name: value}} built from df_lang.
lang_dict = df_lang.to_dict(orient="index")

In [115]:
langd

{'musk1252': {'name': 'Muskogean',
  'family_id': nan,
  'top-level family': nan,
  'macroarea': nan,
  'latitude': nan,
  'longitude': nan,
  'iso639-3': nan},
 'maba1274': {'name': 'Maban',
  'family_id': nan,
  'top-level family': nan,
  'macroarea': nan,
  'latitude': nan,
  'longitude': nan,
  'iso639-3': nan},
 'guri1248': {'name': 'Guriaso',
  'family_id': nan,
  'top-level family': nan,
  'macroarea': 'Papunesia',
  'latitude': -3.5718,
  'longitude': 141.597,
  'iso639-3': 'grx'},
 'pawa1255': {'name': 'Pawaia',
  'family_id': nan,
  'top-level family': nan,
  'macroarea': 'Papunesia',
  'latitude': -6.88021,
  'longitude': 145.081,
  'iso639-3': 'pwa'},
 'bora1262': {'name': 'Boran',
  'family_id': nan,
  'top-level family': nan,
  'macroarea': nan,
  'latitude': nan,
  'longitude': nan,
  'iso639-3': nan},
 'guam1236': {'name': 'Guamo',
  'family_id': nan,
  'top-level family': nan,
  'macroarea': 'South America',
  'latitude': 8.2351027,
  'longitude': -67.4018753,
  'iso63

In [116]:
# Look up the name of each language's parent in `langd`; parents that are not
# keys of `langd` get NaN.
# np.nan replaces the np.NaN alias, which no longer exists in NumPy 2.0.
language_all["parent_name"] = language_all["parent"].apply(
    lambda code: langd[code]["name"] if code in langd else np.nan
)

In [122]:
# Expose the index as a regular "id" column so it can be used with .apply().
language_all["id"]=language_all.index

In [123]:
# Top-level family of each language according to `langd`; ids that are not
# keys of `langd` get NaN.
# np.nan replaces the np.NaN alias, which no longer exists in NumPy 2.0.
language_all["family"] = language_all["id"].apply(
    lambda code: langd[code]["top-level family"] if code in langd else np.nan
)

In [126]:
# Family id of each language according to `langd`; ids that are not keys of
# `langd` get NaN.
# np.nan replaces the np.NaN alias, which no longer exists in NumPy 2.0.
language_all["family_id"] = language_all["id"].apply(
    lambda code: langd[code]["family_id"] if code in langd else np.nan
)

In [142]:
def _first_macroarea(raw):
    """Return the first element of the literal sequence in `raw`, or NaN when it is empty.

    `raw` is parsed with `ast.literal_eval` exactly once (the original
    expression parsed it twice per row).
    """
    areas = literal_eval(raw)
    # np.nan replaces the np.NaN alias, which no longer exists in NumPy 2.0.
    return areas[0] if len(areas) > 0 else np.nan


# Reduce the "macroareas" column to a single "macroarea" value per language.
language_all["macroarea"] = language_all["macroareas"].apply(_first_macroarea)

In [146]:
# Persist the enriched language table, including the columns added in the cells above.
language_all.to_csv("../data/languages/languages_all.csv")

In [143]:
# Nested lookup {index value: {column name: value}} for every language.
# Later cells read `langdict[...]`, so this assignment has to run; while it was
# commented out, a fresh-kernel run failed with NameError.
langdict = language_all.to_dict(orient="index")

In [144]:
langdict["dutc1256"]["macroarea"]

'Eurasia'

In [87]:
import json

# Contact data for dutc1256. The path is built from `stage1` like the other
# stage-1 loaders (it resolves to the same file as before), and the encoding is
# explicit so that parsing does not depend on the platform default.
with open(f"{stage1}/language_contacts_nr/dutc1256.json", encoding="utf-8") as f:
    d = json.load(f)

In [None]:
# Show the focal language first, then every language whose value in `d` is at most 9.
focus = "dutc1256"
focus_info = lang_dict[focus]
print(focus, focus_info["name"], focus_info["latitude"], focus_info["longitude"])

close_entries = ((lang, dist) for lang, dist in d[focus].items() if dist <= 9)
for lang, dist in close_entries:
    info = lang_dict[lang]
    print(lang, info["name"], dist, info["latitude"], info["longitude"])
# d["dutc1256"]["stan1295"]

## load colex information

In [23]:
# Colexification edge list for the CLICS3 "nuclear" word list (one row per language pair).
df = pd.read_csv(f"{stage1}/colex_pmi/clics3_nuclear.csv")

In [24]:
# Restrict the edge list to the columns read by the following cells.
pmi_columns = ["source", "target", "weight", "target_nr", "source_nr", "pmi"]
df_pmi = df[pmi_columns]

In [46]:
df_pmi.head(2)

Unnamed: 0,source,target,weight,target_nr,source_nr,pmi
0,nege1244,pume1238,2,21,14,5.542637
1,lakk1238,nege1244,2,9,21,6.180067


In [82]:
# Per-language "*_nr" value taken from the edge list; the first value seen for
# a language is kept and later rows never overwrite it.
# NOTE(review): in the two sample rows shown for df_pmi, nege1244 would get 14
# (as source) but 9 (as target); reading target_nr as the source's value and
# source_nr as the target's gives 21 both times. The two columns may be
# swapped in the input file -- TODO confirm against the data.
colex_nr_dict = {}

nr_rows = zip(df_pmi["source"], df_pmi["source_nr"], df_pmi["target"], df_pmi["target_nr"])
for source, source_nr, target, target_nr in nr_rows:
    colex_nr_dict.setdefault(source, source_nr)
    colex_nr_dict.setdefault(target, target_nr)
    

In [165]:
# Map each unordered language pair (stored as a sorted tuple) to (weight, pmi).
# A pair that occurs in several rows keeps the values of its last row.
colex_dict = {}

edge_rows = zip(df_pmi["source"], df_pmi["target"], df_pmi["weight"], df_pmi["pmi"])
for source, target, weight, pmi in edge_rows:
    colex_dict[tuple(sorted((source, target)))] = (weight, pmi)


In [167]:
colex_dict['nege1244', 'pume1238']

(2, 5.542637091247636)

In [83]:
len(colex_nr_dict)

1129

In [101]:
colex_nr_dict["nege1244"]

14

## load geo graph

In [40]:
# Load the pre-built geographic graph.
# NOTE(security): pickle.load can execute arbitrary code while unpickling; only
# load this file if it was produced by a trusted pipeline.
with open(f"{stage1}/language_geo_graph.pickle", "rb") as f:
    graph = pickle.load(f)

In [161]:
graph.edges['guin1254', 'pawa1255']

{'contact': 1242, 'geodist': 17172.307067554568}

In [160]:
graph.nodes["guin1254"]

{'coord': (7.93196, -8.98843)}

In [159]:
len(graph.nodes), len(graph.edges)

(1561, 1217580)

# 2. Populate graph with colex, phon and lang information

In [99]:
# Empty undirected graph; the keyword arguments are stored as graph-level
# attributes (g.graph["dataset"], g.graph["wordlist"]).
g = nx.Graph(dataset="clics3", wordlist="nuclear")

### 2.1 add node attributes

- Nodes are taken from the languages in the colexification data.
- Node attributes: name, family, parent, branch, colex_nr (plus iso3, area, timespan, coord).

In [154]:
literal_eval(langdict["nege1244"]["timespan"])

{'start_year': 1700,
 'start_month': 1,
 'start_day': 1,
 'end_year': 1987,
 'end_month': 1,
 'end_day': 1}

In [156]:
# Add one node per language of the colexification data that also has an entry
# in `langdict`; languages without an entry are skipped.
for lang, colex_nr in colex_nr_dict.items():
    if lang not in langdict:
        continue

    info = langdict[lang]
    attrs = {
        "colex_nr": colex_nr,
        "name": info["name"],
        "family": info["family"],
        "parent": info["parent_name"],
        "branch": info["branch"],
        "iso3": info["iso639_3"],
        "area": info["macroarea"],
    }
    coord = (info["latitude"], info["longitude"])

    # A timespan whose str() is "nan" counts as missing and is stored as an
    # empty tuple; anything else is parsed into (start_year, end_year).
    raw_timespan = info["timespan"]
    if str(raw_timespan) != "nan":
        parsed = literal_eval(raw_timespan)
        years = (parsed["start_year"], parsed["end_year"])
    else:
        years = ()

    g.add_node(lang, **attrs, timespan=years, coord=coord)

In [157]:
g.nodes["nege1244"]

{'colex_nr': 14,
 'name': 'Negerhollands',
 'family': 'Indo-European',
 'parent': 'Zeeuwic',
 'branch': 'Global Dutch',
 'iso3': 'dcr',
 'area': 'North America',
 'timespan': (1700, 1987),
 'coord': (18.3416, -64.8922)}

In [190]:
g.nodes["nepa1254"]

{'colex_nr': 9,
 'name': 'Nepali',
 'family': 'Indo-European',
 'parent': 'Eastern Pahari',
 'branch': 'Eastern Pahari',
 'iso3': 'npi',
 'area': 'Eurasia',
 'timespan': (),
 'coord': (28.0, 85.0)}

In [192]:
g.edges['cofa1242', 'muba1238']

{'phon_pmi': 0.8228045110425997,
 'colex_pmi': 6.654846594833661,
 'weight': 1,
 'contact': 1000,
 'geodist': 15518.142529426157,
 'neighbour': False}

### 2.2 add edges and attributes

In [163]:
phon_dict['bafi1243', 'pall1244']

0.8399546217176125

In [170]:
# Language pairs (not single languages) that have both a colexification entry
# and a phonological distance.
intersected_langs = set(colex_dict) & set(phon_dict)
len(intersected_langs)

34519

In [None]:
intersected_langs

In [171]:
len(set(colex_dict.keys())), len(set(phon_dict.keys()))

(167142, 726975)

In [191]:

# Copy the geographic attributes onto edges that already exist in `g`.
# NOTE: because of the has_edge guard this loop changes nothing while `g` has
# no edges, so it must run after the cells that add the phon and colex edges.
for lang_a, lang_b, geo in graph.edges(data=True):
    contact, geodist = geo["contact"], geo["geodist"]
    if not g.has_edge(lang_a, lang_b):
        continue
    edge_attrs = g.edges[lang_a, lang_b]
    edge_attrs["contact"] = contact
    edge_attrs["geodist"] = geodist
    # Pairs with a contact value below 10 are flagged as neighbours.
    edge_attrs["neighbour"] = bool(contact < 10)
    

In [194]:
len(g.edges), len(g.nodes)

(859598, 2068)

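- Edge attributes: `phon_pmi`, `colex_pmi`, `weight` (colex weight), `contact` (geo contact), `geodist` (geo distance), `neighbour` (geo neighbour); open question: also record whether both languages share a branch?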

In [172]:

# Create (or update) one edge per language pair, carrying its phonological PMI value.
g.add_edges_from(
    (lang_a, lang_b, {"phon_pmi": distance})
    for (lang_a, lang_b), distance in phon_dict.items()
)

In [179]:
('pume1238', 'bafi1243') in g.edges

True

In [184]:
g.edges['pume1238', 'bafi1243']

{'phon_pmi': 0.865793672052637}

In [176]:
g.edges['bafi1243','pume1238']

{'phon_pmi': 0.865793672052637}

In [187]:
# add_edge creates a missing edge and otherwise just updates its attributes, so
# pairs that already carry `phon_pmi` keep it and gain the colex attributes.
for (lang_a, lang_b), (weight, pmi) in colex_dict.items():
    g.add_edge(lang_a, lang_b, colex_pmi=pmi, weight=weight)

In [189]:
g.edges['nepa1254', 'yuga1244']

{'phon_pmi': 0.8599357983068425, 'colex_pmi': 8.34999201330524, 'weight': 1}