In [24]:
import pandas as pd
import json 
from itertools import chain
from collections import defaultdict
import os
import yaml
import numpy as np

try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
from scipy.spatial import distance
from scipy.stats.stats import pearsonr   

In [9]:
with open("../data/TypPred/wals+clics_langs.json") as f:
          clics_langs = json.load(f)

with open("../data/TypPred/wals+wn_langs.json") as f:
          wn_langs = json.load(f)
        
with open("../data/TypPred/wals+uriel_langs.json") as f:
          uriel_langs = json.load(f)

In [12]:
with open("../data/TypPred/wals_features.yaml") as f:
    wals_features = yaml.load(f, Loader)

In [19]:
feature_areas = ["Lexicon", "Complex_Sentences", "Nominal_Categories", "Simple_Cluases"]

In [5]:
data_dir = "../data/TypPred/datasets/"

train_file = os.path.join(data_dir, f"train_dev.csv")
test_file = os.path.join(data_dir, f"test.csv")
train = pd.read_csv(train_file)
test= pd.read_csv(test_file)

df_train_test = pd.concat([train,test], axis=0)


In [14]:
len(clics_langs), len(wn_langs), len(uriel_langs)

(737, 330, 624)

In [10]:
df_clics = df_train_test[df_train_test["ISO"].isin(clics_langs)]
df_wn = df_train_test[df_train_test["ISO"].isin(wn_langs)]
df_uriel = df_train_test[df_train_test["ISO"].isin(uriel_langs)]

In [92]:
stats_df(df_uriel)

****** Complex_Sentences*******
nr of features 7
avg #labels 4
avg #nr_sample 63
****** Lexicon*******
nr of features 13
avg #labels 4
avg #nr_sample 92
****** Morphology*******
****** Nominal_Categories*******
nr of features 29
avg #labels 5
avg #nr_sample 112
****** Nominal_Syntax*******
****** Other*******
****** Phonology*******
****** Sign_Languages*******
****** Simple_Cluases*******
nr of features 26
avg #labels 4
avg #nr_sample 117
****** Verbal_Categories*******
****** Word_Order*******
total nr_features:  190
avg nr labels:  9
avg nr samples:  238
10


In [91]:
stats_df(df_wn)

****** Complex_Sentences*******
nr of features 7
avg #labels 4
avg #nr_sample 56
****** Lexicon*******
nr of features 13
avg #labels 2
avg #nr_sample 58
****** Morphology*******
****** Nominal_Categories*******
nr of features 29
avg #labels 5
avg #nr_sample 92
****** Nominal_Syntax*******
****** Other*******
****** Phonology*******
****** Sign_Languages*******
****** Simple_Cluases*******
nr of features 26
avg #labels 4
avg #nr_sample 89
****** Verbal_Categories*******
****** Word_Order*******
total nr_features:  185
avg nr labels:  8
avg nr samples:  166
10


In [90]:
stats_df(df_clics)

****** Complex_Sentences*******
nr of features 7
avg #labels 4
avg #nr_sample 86
****** Lexicon*******
nr of features 13
avg #labels 4
avg #nr_sample 93
****** Morphology*******
****** Nominal_Categories*******
nr of features 29
avg #labels 5
avg #nr_sample 145
****** Nominal_Syntax*******
****** Other*******
****** Phonology*******
****** Sign_Languages*******
****** Simple_Cluases*******
nr of features 26
avg #labels 4
avg #nr_sample 142
****** Verbal_Categories*******
****** Word_Order*******
total nr_features:  188
avg nr labels:  9
avg nr samples:  288
10


In [89]:
def stats_df(df_clics):
    nr_features = []
    nr_labels =[]
    nr_samples=[]
    for feature_area in wals_features.keys():

        if feature_area in feature_areas:

            features = wals_features[feature_area]
            print(f"****** {feature_area}*******")
            clics_feature_counter = defaultdict(list)
            for idx, feature in features.items():

                clics_counter = df_clics[feature].dropna().value_counts().to_dict()
                if sum(list(clics_counter.values()))>0:
                    clics_feature_counter["feature_nr"].append(1)
                    clics_feature_counter["nr_label_ids"].append(len(clics_counter))
                    clics_feature_counter["nr_samples"].append(len(df_clics[feature].dropna()))
            if sum(clics_feature_counter["feature_nr"])>0:
                print("nr of features" , sum(clics_feature_counter["feature_nr"]))
                # print("median #labels", np.median(clics_feature_counter["nr_label_ids"]))
                # print("max #labels", np.max(clics_feature_counter["nr_label_ids"]))
                # print("min #labels", np.min(clics_feature_counter["nr_label_ids"]))
                print("avg #labels", round(np.mean(clics_feature_counter["nr_label_ids"])))

                # print("median #nr_sample", np.median(clics_feature_counter["nr_samples"]))
                # print("max #nr_sample", np.max(clics_feature_counter["nr_samples"]))
                # print("min #nr_sample", np.min(clics_feature_counter["nr_samples"]))
                print("avg #nr_sample", round(np.mean(clics_feature_counter["nr_samples"])))

                nr_features.append(sum(clics_feature_counter["feature_nr"]))
                nr_labels.append(sum(clics_feature_counter["nr_label_ids"]))
                nr_samples.append(sum(clics_feature_counter["nr_samples"]))
        else:
            features = wals_features[feature_area]
            print(f"****** {feature_area}*******")

            clics_feature_counter_ = defaultdict(list)
            for idx, feature in features.items():
                
                clics_counter = df_clics[feature].dropna().value_counts().to_dict()
                if sum(list(clics_counter.values()))>0:
                    clics_feature_counter_["feature_nr"].append(1)
                    clics_feature_counter_["nr_label_ids"].append(len(clics_counter))
                    clics_feature_counter_["nr_samples"].append(len(df_clics[feature].dropna()))
            
            if sum(clics_feature_counter_["feature_nr"])>0:
                # print("nr of features" , sum(clics_feature_counter_["feature_nr"]))
                # print("median #labels", np.median(clics_feature_counter["nr_label_ids"]))
                # print("max #labels", np.max(clics_feature_counter["nr_label_ids"]))
                # print("min #labels", np.min(clics_feature_counter["nr_label_ids"]))
                # print("avg #labels", round(np.mean(clics_feature_counter_["nr_label_ids"])))

                # print("median #nr_sample", np.median(clics_feature_counter["nr_samples"]))
                # print("max #nr_sample", np.max(clics_feature_counter["nr_samples"]))
                # print("min #nr_sample", np.min(clics_feature_counter["nr_samples"]))
                # print("avg #nr_sample", round(np.mean(clics_feature_counter_["nr_samples"])))


                nr_features.append(sum(clics_feature_counter_["feature_nr"]))
                nr_labels.append(sum(clics_feature_counter_["nr_label_ids"]))
                nr_samples.append(sum(clics_feature_counter_["nr_samples"]))
            


    print("==========total=========")
    LEN = len(nr_features)

    print("total nr_features: ",round(np.sum(nr_features)))

    # print("median nr_features: ",np.median(nr_features))
    # print("median nr labels: ",np.median(nr_labels))
    # print("median nr samples: ",np.median(nr_samples))
    # print("avg nr_features: ",round(np.mean(nr_features)))
    print("avg nr labels: ", round(np.mean(nr_labels)/LEN))
    print("avg nr samples: ", round(np.mean(nr_samples)/LEN))
    # print("max nr samples: ", round(np.max(nr_samples)))
    print(LEN)
                                                       

#### 

In [54]:
df_wals = pd.read_csv("../data/TypPred/wals_by_languages.csv")

In [65]:
len(df_wals)

2371

In [55]:
df_wals

Unnamed: 0,ISO,Consonant_Inventories,Vowel_Quality_Inventories,Consonant-Vowel_Ratio,Voicing_in_Plosives_and_Fricatives,Voicing_and_Gaps_in_Plosive_Systems,Uvular_Consonants,Glottalized_Consonants,Lateral_Consonants,The_Velar_Nasal,...,SNegOV_Order,SONegV_Order,SOVNeg_Order,The_Position_of_Negative_Morphemes_in_Verb-Initial_Languages,Double_negation_in_verb-initial_languages,Verb-Initial_with_Preverbal_Negative,Verb-Initial_with_Negative_that_is_Immediately_Postverbal_or_between_Subject_and_Object,Verb-Initial_with_Clause-Final_Negative,The_Position_of_Negative_Morphemes_in_Object-Initial_Languages,NON_NULL
0,eng,Average,Large (7-14),Low,In both plosives and fricatives,None missing in /p t k b d g/,,No glottalized consonants,"/l/, no obstruent laterals",No initial velar nasal,...,,,,,,,,,,160
1,fra,Average,Large (7-14),Low,In both plosives and fricatives,None missing in /p t k b d g/,Uvular continuants only,No glottalized consonants,"/l/, no obstruent laterals",No velar nasal,...,,,,,,,,,,159
2,deu,Average,Large (7-14),Low,In both plosives and fricatives,None missing in /p t k b d g/,Uvular continuants only,No glottalized consonants,"/l/, no obstruent laterals",No initial velar nasal,...,No SNegOV,Word&NoDoubleNeg,NoSOVNeg,,,,,,,158
3,rus,Moderately large,Average (5-6),High,In both plosives and fricatives,None missing in /p t k b d g/,,No glottalized consonants,"/l/, no obstruent laterals",No velar nasal,...,,,,,,,,,,157
4,fin,Moderately small,Large (7-14),Moderately low,In both plosives and fricatives,None missing in /p t k b d g/,,No glottalized consonants,"/l/, no obstruent laterals",No initial velar nasal,...,,,,,,,,,,156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2366,ulc,,,,,,,,,Initial velar nasal,...,,,,,,,,,,2
2367,psr,,,,,,,,,,...,,,,,,,,,,2
2368,skb,,,,,,,,,Initial velar nasal,...,,,,,,,,,,2
2369,mtk,,,,,,,,,,...,,,,,,,,,,2


In [4]:
df_parameter = pd.read_csv("../data/cldf-datasets-wals-878ea47/raw/parameter.csv")

In [7]:
feature2id = dict(zip([name.replace(" ","_") for name in df_parameter["name"]], df_parameter["id"]))

In [8]:
id2feature = dict(zip([name.replace(" ","_") for name in df_parameter["id"]], df_parameter["name"]))

In [37]:
difference

{'121A',
 '128A',
 '138A',
 '140A',
 '142A',
 '144A',
 '144B',
 '144C',
 '144D',
 '144E',
 '144F',
 '144G',
 '29A',
 '57A',
 '64A',
 '80A',
 '97A'}

In [39]:
[x for x in range(1,20)]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [44]:
phonology_ids = list(chain.from_iterable([[str(x)+"A", str(x)+"B"] for x in range(1,20)]))
morphology_ids = list(chain.from_iterable([[str(x)+"A", str(x)+"B"] for x in range(20,30)]))
nominal_categories_ids = list(chain.from_iterable([[str(x)+"A", str(x)+"B"] for x in range(30,58)]))
nominal_syntax_ids = list(chain.from_iterable([[str(x)+"A", str(x)+"B"] for x in range(58,65)]))
verbal_categories_ids = list(chain.from_iterable([[str(x)+"A", str(x)+"B"] for x in range(65,81)]))
word_order_ids = list(chain.from_iterable([[str(x)+"A", str(x)+"B", str(x)+"C", str(x)+"D", str(x)+"E", str(x)+"F", str(x)+"G"] for x in range(81,98)])) + list(chain.from_iterable([[str(x)+"A", str(x)+"B", str(x)+"C", str(x)+"D", str(x)+"E", str(x)+"F", str(x)+"G"] for x in range(143,145)])) +[str(144)+y for y in ["H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y"]]
simple_clauses_ids = list(chain.from_iterable([[str(x)+"A", str(x)+"B"] for x in range(98,122)]))
complex_sentences_ids = list(chain.from_iterable([[str(x)+"A", str(x)+"B"] for x in range(122,129)]))
lexicon_ids = list(chain.from_iterable([[str(x)+"A", str(x)+"B"] for x in range(129,139)]))
sign_languages_ids = list(chain.from_iterable([[str(x)+"A"] for x in range(139,141)]))
other_ids = list(chain.from_iterable([[str(x)+"A", str(x)+"B"] for x in range(141,143)]))

In [45]:
len(feature2id)

192

In [46]:
feature_dict= defaultdict(dict)
counter= 0
indices = []
for idx, feature in id2feature.items():
    if idx in phonology_ids:
        
        feature_dict["Phonology"][idx]=feature.replace(" ","_")
        indices.append(idx)
        counter+=1
    if idx in morphology_ids:
        feature_dict["Morphology"][idx] = feature.replace(" ","_")
        indices.append(idx)
        counter+=1
    if idx in nominal_categories_ids:
        feature_dict["Nominal_Categories"][idx]=feature.replace(" ","_")
        indices.append(idx)
        counter+=1
    if idx in nominal_syntax_ids:
        feature_dict["Nominal_Syntax"][idx]=feature.replace(" ","_")
        indices.append(idx)
        counter+=1
    
    if idx in verbal_categories_ids:
        feature_dict["Verbal_Categories"][idx]=feature.replace(" ","_")
        indices.append(idx)
        counter+=1
        
    if idx in word_order_ids:
        feature_dict["Word_Order"][idx]=feature.replace(" ","_")
        indices.append(idx)
        counter+=1
    
    if idx in simple_clauses_ids:
        feature_dict["Simple_Cluases"][idx]=feature.replace(" ","_")
        indices.append(idx)
        counter+=1
    
    if idx in complex_sentences_ids:
        feature_dict["Complex_Sentences"][idx]=feature.replace(" ","_")
        indices.append(idx)
        counter+=1
    if idx in lexicon_ids:
        feature_dict["Lexicon"][idx]=feature.replace(" ","_")
        indices.append(idx)
        counter+=1
    if idx in sign_languages_ids:
        feature_dict["Sign_Languages"][idx]=feature.replace(" ","_")
        indices.append(idx)
        counter+=1
    
    if idx in other_ids:
        feature_dict["Other"][idx]=feature.replace(" ","_")
        indices.append(idx)
        counter+=1
        
        

In [47]:
feature_ids = list(id2feature.keys())
difference = set(feature_ids).difference(set(indices))

In [48]:
difference

set()

In [65]:
import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
    
with open("../data/TypPred/preprocessed/wals_features.yaml", "w")as f:
    yaml.dump(feature_dict, f, default_flow_style=False)

In [66]:
len(feature_dict.keys())

11