In [69]:
import json 
import pandas as pd
import os 

import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
    
from collections import defaultdict
import seaborn as sns
sns.set_theme(style="white")

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
with open("../data/TypPred/feature_dict.json") as f:
    feature_dict = json.load(f)

In [5]:

with open("../data/TypPred/wals_features.yaml") as f:
    features = yaml.load(f, Loader=Loader)

In [6]:
features

{'Complex_Sentences': {'122A': 'Relativization_on_Subjects',
  '123A': 'Relativization_on_Obliques',
  '124A': "'Want'_Complement_Subjects",
  '125A': 'Purpose_Clauses',
  '126A': "'When'_Clauses",
  '127A': 'Reason_Clauses',
  '128A': 'Utterance_Complement_Clauses'},
 'Lexicon': {'129A': 'Hand_and_Arm',
  '130A': 'Finger_and_Hand',
  '130B': "Cultural_Categories_of_Languages_with_Identity_of_'Finger'_and_'Hand'",
  '131A': 'Numeral_Bases',
  '132A': 'Number_of_Non-Derived_Basic_Colour_Categories',
  '133A': 'Number_of_Basic_Colour_Categories',
  '134A': 'Green_and_Blue',
  '135A': 'Red_and_Yellow',
  '136A': 'M-T_Pronouns',
  '136B': 'M_in_First_Person_Singular',
  '137A': 'N-M_Pronouns',
  '137B': 'M_in_Second_Person_Singular',
  '138A': 'Tea'},
 'Morphology': {'20A': 'Fusion_of_Selected_Inflectional_Formatives',
  '21A': 'Exponence_of_Selected_Inflectional_Formatives',
  '21B': 'Exponence_of_Tense-Aspect-Mood_Inflection',
  '22A': 'Inflectional_Synthesis_of_the_Verb',
  '23A': 'Locu

In [7]:
lexicon_features = features["Lexicon"]

In [8]:
lexicon_features

{'129A': 'Hand_and_Arm',
 '130A': 'Finger_and_Hand',
 '130B': "Cultural_Categories_of_Languages_with_Identity_of_'Finger'_and_'Hand'",
 '131A': 'Numeral_Bases',
 '132A': 'Number_of_Non-Derived_Basic_Colour_Categories',
 '133A': 'Number_of_Basic_Colour_Categories',
 '134A': 'Green_and_Blue',
 '135A': 'Red_and_Yellow',
 '136A': 'M-T_Pronouns',
 '136B': 'M_in_First_Person_Singular',
 '137A': 'N-M_Pronouns',
 '137B': 'M_in_Second_Person_Singular',
 '138A': 'Tea'}

In [23]:

with open("../data/TypPred/feature_maps.json") as f:
    feature_maps = json.load(f)

In [25]:
feature_maps['Hand_and_Arm']

{'Different': 0, 'Identical': 1}

In [71]:

label_dist = defaultdict(dict)

for feature_id, feature in lexicon_features.items():
    clics_dict = get_datasets_dist(feature_id, "clics")
    total_labels_clics = sum(list(clics_dict[feature_id].values()))
    # ascending order
    wn_dict = get_datasets_dist(feature_id, "wn")

    total_labels_wn = sum(list(wn_dict[feature_id].values()))
    label_dist[feature_id] = {
            "feature": feature,
            "weight_clics": {k: v / total_labels_clics for k, v in clics_dict[feature_id].items()},
            "weight_wn":{k: v / total_labels_wn for k, v in wn_dict[feature_id].items()},
            "id2value": {v: k for k, v in feature_maps[feature].items()}}


In [72]:
label_dist

defaultdict(dict,
            {'129A': {'feature': 'Hand_and_Arm',
              'weight_clics': {0: 0.6666666666666666, 1: 0.3333333333333333},
              'weight_wn': {0: 0.5573770491803278, 1: 0.4426229508196721},
              'id2value': {0: 'Different', 1: 'Identical'}},
             '130A': {'feature': 'Finger_and_Hand',
              'weight_clics': {0: 0.9067357512953368, 1: 0.09326424870466321},
              'weight_wn': {0: 0.9067796610169492, 1: 0.09322033898305085},
              'id2value': {0: 'Different', 1: 'Identical'}},
             '130B': {'feature': "Cultural_Categories_of_Languages_with_Identity_of_'Finger'_and_'Hand'",
              'weight_clics': {0: 0.5294117647058824,
               2: 0.29411764705882354,
               1: 0.17647058823529413},
              'weight_wn': {0: 0.5, 1: 0.3, 2: 0.2},
              'id2value': {0: 'Hunter-gatherers',
               1: 'Farmer-foragers',
               2: 'Full-fledged farmers'}},
             '131A': {'featu

In [76]:


#### compare wn and random
lexicon_values_wn_dict = defaultdict(dict)
lexicon_values_clics_dict = defaultdict(dict)

### get the results from wn/clics/wn_concept/random
for feature_id, d in label_dist.items():
    weight_clics = d["weight_clics"]
    weight_wn = d["weight_wn"]
    id2value = d["id2value"]

    with open(f"../output/clics/oneff_{feature_id}.json") as f:
        random_clics_report = json.load(f)["test"]["report"]

    with open(f"../output/clics/oneff_clics_prone_concat+max_{feature_id}.json") as f:
        clics_report = json.load(f)["test"]["report"]
        
    random_clics_result = dict()
    clics_result = dict()
    
    for value_id in id2value:
        if str(value_id) in clics_report:
            
            clics_result[value_id] = clics_report[str(value_id)]["f1-score"]
        if str(value_id) in random_clics_report:
            random_clics_result[value_id] = random_clics_report[str(value_id)]["f1-score"]
    
    lexicon_values_clics_dict[feature_id] = {
        "feature": d["feature"],
        "id2value": id2value,
        "label_weights_clics":weight_clics,
        "random_clics": (random_clics_result, random_clics_report["macro avg"]["f1-score"]),
        "clics": (clics_result, clics_report["macro avg"]["f1-score"])
    }


    if os.path.exists(f"../output/wn/oneff_{feature_id}.json"):
        with open(f"../output/wn/oneff_{feature_id}.json") as f:
            random_wn_report = json.load(f)["test"]["report"]

        with open(f"../output/wn/oneff_wn_glove_add+avg_{feature_id}.json") as f:
            wn_report = json.load(f)["test"]["report"]

        with open(f"../output/wn/oneff_wn_concept_glove_concat+avg_{feature_id}.json") as f:
            wn_concept_report = json.load(f)["test"]["report"]

    
        random_wn_result = dict()
        wn_result = dict()
        wn_concept_result = dict()

        for value_id in id2value:
            if str(value_id) in clics_report:

                clics_result[value_id] = clics_report[str(value_id)]["f1-score"]
            if str(value_id) in random_clics_report:
                random_clics_result[value_id] = random_clics_report[str(value_id)]["f1-score"]

            if str(value_id) in random_wn_report:
                random_wn_result[value_id] = random_wn_report[str(value_id)]["f1-score"]
            if str(value_id) in wn_report:
                wn_result[value_id] = wn_report[str(value_id)]["f1-score"]
            if str(value_id) in wn_concept_report:
                wn_concept_result[value_id] = wn_concept_report[str(value_id)]["f1-score"]

        lexicon_values_wn_dict[feature_id] = {
            "feature": d["feature"],
            "id2value": id2value,

            "label_weights_wn":weight_wn,
            "random_wn": (random_wn_result, random_wn_report["macro avg"]["f1-score"]),
            "wn": (wn_result, wn_report["macro avg"]["f1-score"]),
            "wn_concept": (wn_concept_result, wn_concept_report["macro avg"]["f1-score"])
        }


In [77]:
lexicon_values_wn_dict

lexicon_values_dict

In [78]:
lexicon_values_wn_dict

defaultdict(dict,
            {'129A': {'feature': 'Hand_and_Arm',
              'id2value': {0: 'Different', 1: 'Identical'},
              'label_weights_wn': {0: 0.5573770491803278,
               1: 0.4426229508196721},
              'random_wn': ({0: 0.0, 1: 0.7368421052631579},
               0.3684210526315789),
              'wn': ({0: 0.625, 1: 0.25}, 0.4375),
              'wn_concept': ({0: 0.3076923076923077, 1: 0.18181818181818182},
               0.24475524475524477)},
             '130A': {'feature': 'Finger_and_Hand',
              'id2value': {0: 'Different', 1: 'Identical'},
              'label_weights_wn': {0: 0.9067796610169492,
               1: 0.09322033898305085},
              'random_wn': ({0: 0.9600000000000001, 1: 0.0},
               0.48000000000000004),
              'wn': ({0: 0.9600000000000001, 1: 0.0}, 0.48000000000000004),
              'wn_concept': ({0: 0.9600000000000001, 1: 0.0},
               0.48000000000000004)},
             '130B': {'featu

In [66]:
lexicon_results_dict= defaultdict(dict)

for feature_id, label_dict in lexicon_values_dict.items():
    
    print(label_dict)
    feature_results = defaultdict(list)
    
    feature = label_dict["feature"]
    label_weights = label_dict["label_weights"]
    id2value = label_dict["id2value"]
    
    f1_scores = list()
    values = list()
    models = list()
    weights = list()
    
    def add_item(model):
        d = label_dict[model][0]
        for v,f in d.items():
            f1_scores.append(f)
            values.append(id2value[v])
            models.append(model)
            weights.append(label_weights[v])
    
    add_item("random")
    add_item("clics")
    add_item("wn")
    add_item("wn_concept")
    
    lexicon_results_dict[feature_id]={
        "Test(F1)": f1_scores,
        "Value": values, 
        "Model":models,
        "Weight": weights
    }
        
        
    
    
    
    

{'feature': 'Hand_and_Arm', 'id2value': {0: 'Different', 1: 'Identical'}, 'value_order_ascend': [1, 0], 'label_weights': {0: 0.6666666666666666, 1: 0.3333333333333333}, 'random': ({1: 0.0, 0: 0.7058823529411764}, 0.3529411764705882), 'clics': ({1: 0.625, 0: 0.0}, 0.3125), 'wn': ({1: 0.25, 0: 0.625}, 0.4375), 'wn_concept': ({1: 0.18181818181818182, 0: 0.3076923076923077}, 0.24475524475524477)}
{'feature': 'Finger_and_Hand', 'id2value': {0: 'Different', 1: 'Identical'}, 'value_order_ascend': [1, 0], 'label_weights': {0: 0.9067357512953368, 1: 0.09326424870466321}, 'random': ({1: 0.0, 0: 0.9743589743589743}, 0.48717948717948717), 'clics': ({1: 0.0, 0: 0.9743589743589743}, 0.48717948717948717), 'wn': ({1: 0.0, 0: 0.9600000000000001}, 0.48000000000000004), 'wn_concept': ({1: 0.0, 0: 0.9600000000000001}, 0.48000000000000004)}
{'feature': 'M-T_Pronouns', 'id2value': {0: 'No M-T pronouns', 1: 'M-T pronouns, paradigmatic', 2: 'M-T pronouns, non-paradigmatic'}, 'value_order_ascend': [2, 1, 0], '

In [70]:
for feature_id, results in lexicon_results_dict.items():
    df_feature_id = pd.DataFrame.from_dict(results)
    
    df_feature_id.to_csv(f"../output/lexicon/{feature_id}.csv",index=False)

In [67]:
lexicon_results_dict

defaultdict(dict,
            {'129A': {'Test(F1)': [0.0,
               0.7058823529411764,
               0.625,
               0.0,
               0.25,
               0.625,
               0.18181818181818182,
               0.3076923076923077],
              'Value': ['Identical',
               'Different',
               'Identical',
               'Different',
               'Identical',
               'Different',
               'Identical',
               'Different'],
              'Model': ['random',
               'random',
               'clics',
               'clics',
               'wn',
               'wn',
               'wn_concept',
               'wn_concept'],
              'Weight': [0.3333333333333333,
               0.6666666666666666,
               0.3333333333333333,
               0.6666666666666666,
               0.3333333333333333,
               0.6666666666666666,
               0.3333333333333333,
               0.6666666666666666]},
             '13

In [19]:
from sklearn.model_selection import train_test_split, KFold

def load_lang_list(langs):
    input_folder = "../data/TypPred/"

    if langs == "uriel":
        filepath = os.path.join(input_folder, "wals+uriel_langs.json")
    elif langs == "clics":
        filepath = os.path.join(input_folder, "wals+clics_langs.json")
    elif langs == "wn":
        filepath = os.path.join(input_folder, "wals+wn_langs.json")
    else:
        filepath = None

    if filepath is not None:
        with open(filepath) as f:
            langs_list = json.load(f)

        return langs_list
    else:
        return None


def get_datasets_dist(feature_id, langs):
    feature = lexicon_features[feature_id]
    train_data_label_dist_dict = defaultdict(dict)
    
    train_file = f"../data/TypPred/datasets/features/train_{feature_id}.csv"
    dev_file = f"../data/TypPred/datasets/features/dev_{feature_id}.csv"
    test_file = f"../data/TypPred/datasets/features/test_{feature_id}.csv"
    
    langs_list = load_lang_list(langs)
    
    train_data = pd.read_csv(train_file)
    dev_data = pd.read_csv(dev_file)
    test_data = pd.read_csv(test_file)

    train_data[feature] = train_data[feature].astype("int")
    dev_data[feature] = dev_data[feature].astype("int")
    test_data[feature] = test_data[feature].astype("int")

    train_data = train_data[train_data["ISO"].isin(langs_list)]
    dev_data = dev_data[dev_data["ISO"].isin(langs_list)]
    
    if len(train_data) > 0 and len(dev_data) > 0:
        langs_train = set(train_data["ISO"].tolist())
        langs_dev = set(dev_data["ISO"].tolist())
        langs_test = set(test_data["ISO"].tolist())

        df_train_test = pd.concat([train_data, test_data], axis=0)
        train_data, test_data = train_test_split(df_train_test, test_size=0.1, shuffle=False)

        df_train_dev = pd.concat([train_data, dev_data], axis=0)
        
        train_data_label_dist_dict[feature_id]=df_train_dev[feature].value_counts().to_dict()
    
    return train_data_label_dist_dict

        