In [65]:
import pandas as pd
from sklearn.metrics.pairwise import nan_euclidean_distances, cosine_similarity
from pathlib import Path
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, OneHotEncoder


# WALS features

In [28]:
# Load the WALS table; besides feature columns it carries the identifier/metadata
# columns (WALS-ID, ISO, Name, ...) that are dropped further below.
df_wals= pd.read_csv("../datasets/typology/wals_dedup.csv")

In [36]:
# Category counts for one feature. value_counts() skips missing values by default,
# so the counts add up to fewer rows than len(df_wals).
df_wals["Consonant Inventories"].value_counts()

Consonant Inventories
Average             195
Moderately small    116
Moderately large     92
Small                86
Large                56
Name: count, dtype: int64

In [37]:
# Category counts for a second feature (missing values are again excluded).
df_wals["Vowel Quality Inventories"].value_counts()

Vowel Quality Inventories
Average (5-6)    278
Large (7-14)     179
Small (2-4)       89
Name: count, dtype: int64

In [38]:
# Total number of rows in the raw WALS table.
len(df_wals)

2432

In [99]:
# Drop identifier/metadata columns; Glottocode is kept for now because it becomes the index below.
df_wals_features = df_wals.drop(columns=['WALS-ID', 'ISO','Name', 'Family', 'Latitude', 'Longitude', 'Genus'])

In [100]:
# Spot-check one feature column after the metadata columns were removed.
df_wals_features["Lateral Consonants"].value_counts()

Lateral Consonants
/l/, no obstruent laterals                     372
No laterals                                     93
/l/ and lateral obstruent                       47
Laterals, but no /l/, no obstruent laterals     29
No /l/, but lateral obstruents                   8
Name: count, dtype: int64

In [101]:
# Use the Glottocode values as row labels (the column itself is removed in the next cell).
df_wals_features.index= df_wals_features["Glottocode"]

In [102]:
# Glottocode now lives in the index, so remove the redundant column.
# Not re-runnable as-is: drop() raises KeyError once the column is gone.
df_wals_features = df_wals_features.drop(columns=["Glottocode"])

In [103]:
# Row count of the feature table.
len(df_wals_features)

2432

In [105]:
# NOTE(review): this encoder is never used in the visible notebook — candidate for removal.
label_encoder_eval = LabelEncoder()

In [86]:
# Replace missing feature values with the sentinel -1.
# NOTE(review): this cell's execution count (In [86]) predates the cell that rebuilds
# df_wals_features (In [99]); confirm whether it should run in a top-to-bottom execution.
df_wals_features = df_wals_features.fillna(-1)

In [92]:
# Cast every value to str; any -1 sentinel present becomes the string "-1".
# NOTE(review): executed as In [92], i.e. before df_wals_features was rebuilt (In [99]);
# confirm whether it should run in a top-to-bottom execution.
df_wals_features = df_wals_features.astype(str)

In [115]:
# Integer-encode every WALS feature column in place.
# Categories are numbered 0..k-1 in order of first appearance and missing values
# become -1, which is the sentinel crop_wals() counts as "no data".
#
# The table may reach this cell with missing values already replaced by -1 and/or
# cast to str (the fillna / astype cells above). Those sentinels are turned back
# into real missing values first; otherwise "-1" would be numbered like a genuine
# category and the coverage filter would never see a gap.
MISSING_SENTINELS = [-1, "-1"]

# Per-column {category label: integer code} lookup, so the codes can be decoded later.
col2vec = {}
for col in df_wals_features.columns:
    raw_values = df_wals_features[col]
    # where() keeps non-sentinel entries and blanks the rest to NaN.
    raw_values = raw_values.where(~raw_values.isin(MISSING_SENTINELS))
    # factorize() numbers values by first appearance and returns -1 for NaN.
    codes, categories = pd.factorize(raw_values)
    col2vec[col] = {category: code for code, category in enumerate(categories)}
    # Assign the plain array positionally; no index alignment is involved.
    df_wals_features[col] = codes

In [119]:
# Same feature after integer encoding; -1 is the code for rows without a value.
df_wals_features["Consonant Inventories"].value_counts()

Consonant Inventories
-1    1887
 0     195
 1     116
 2      92
 4      86
 3      56
Name: count, dtype: int64

In [98]:
# NOTE(review): df_vec is not defined anywhere in the visible notebook (stale kernel
# state?) — this cell fails on a fresh kernel unless it is defined elsewhere.
df_vec["Consonant Inventories"].value_counts()

Consonant Inventories
0    1887
1     195
4     116
3      92
5      86
2      56
Name: count, dtype: int64

In [121]:
# Persist the integer-encoded feature table (the Glottocode index is written as the first column).
df_wals_features.to_csv("../datasets/typology/wals_lang2features.csv")

In [136]:
def crop_wals(gb_df, perc):
    """Keep only the languages whose feature coverage reaches a minimum share.

    A cell counts as "no data" when it equals -1; every other value counts as
    covered. A row is kept when ``covered >= perc * number_of_columns``.

    Parameters
    ----------
    gb_df : pandas.DataFrame
        One row per language, one column per feature, with -1 marking a
        missing value.
    perc : float
        Required coverage as a fraction of the columns (0.25 means 25%),
        not a percentage.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in their original order, **transposed**: features
        on the index, languages on the columns. Callers transpose back with
        ``.T``. If no row qualifies, the result has zero columns rather than
        raising.
    """
    tot_feats = len(gb_df.columns)
    # Number of non-missing features per language, computed for all rows at once.
    covered = gb_df.ne(-1).sum(axis=1)
    # A positional (NumPy) mask avoids label alignment, so duplicate index labels are safe.
    keep_mask = (covered >= perc * tot_feats).to_numpy()
    # Transpose to the features-by-languages layout callers expect, and leave the
    # column axis unnamed so a later ``.T.to_csv(...)`` writes the same header as before.
    return gb_df.loc[keep_mask].T.rename_axis(None, axis="columns")

In [137]:
# NOTE(review): df_features is not defined anywhere in the visible notebook (stale kernel
# state?) — this cell fails on a fresh kernel unless it is defined elsewhere.
len(df_features)

418

In [138]:
# Keep languages with at least 25% of the features present.
# crop_wals returns the result with features as rows and languages as columns.
df_wals_features_cropped = crop_wals(df_wals_features, 0.25)

In [141]:
# Transpose back to one row per language before writing the cropped table.
df_wals_features_cropped.T.to_csv("../datasets/typology/wals_cropped.csv")

# Grambank features

In [2]:
# Load the Grambank language-by-feature table; the first CSV column is used as the row index.
df_grambank = pd.read_csv("../datasets/typology/gb_lang_feat_vals.csv", index_col=0)

In [3]:
# Row count before coverage filtering.
len(df_grambank)

2467

In [4]:
# Names of the Grambank feature columns, i.e. every header that begins with "GB".
gb_feats = list(
    filter(lambda name: name.startswith("GB"), df_grambank.columns.to_list())
)

In [5]:
# Code from https://github.com/esther2000/typdiv-sampling/blob/main/data/compute_all_distances.py
# Credit to the original author.

def crop(gb_df, perc):
    """Keep only the languages whose Grambank feature coverage reaches a minimum share.

    Feature columns are those whose name starts with ``'GB'``. A feature value
    counts as missing when it is ``'no_cov'`` or ``'?'``. Missing values are
    counted over the feature columns only, so markers in any other column do
    not affect a language's coverage.

    Parameters
    ----------
    gb_df : pandas.DataFrame
        One row per language; column names must be strings.
    perc : float
        Required coverage as a fraction of the feature columns (0.25 means
        25%), not a percentage.

    Returns
    -------
    pandas.DataFrame
        The rows with ``covered >= perc * number_of_features``, in their
        original order and with all original columns. The input frame is not
        modified.
    """
    feature_cols = [x for x in gb_df.columns if x.startswith('GB')]
    tot_feats = len(feature_cols)
    # Per-language count of missing feature values, computed for all rows at once.
    no_data = gb_df[feature_cols].isin(['no_cov', '?']).sum(axis=1)
    # Select rows by position rather than dropping by label: label-based drops
    # remove every row sharing a label and fail when that label is dropped twice.
    keep_mask = ((tot_feats - no_data) >= (perc * tot_feats)).to_numpy()
    return gb_df.loc[keep_mask]

In [6]:
gb_matrix = crop(df_grambank, 0.25) # keep languages with at least 25% of the GB features covered

In [7]:
# Row count after coverage filtering.
len(gb_matrix)

2414

In [8]:
# Peek at the first two rows: Lang_ID followed by the GB feature columns.
gb_matrix.head(2)

Unnamed: 0,Lang_ID,GB020,GB021,GB022,GB023,GB024,GB025,GB026,GB027,GB028,...,GB421,GB422,GB430,GB431,GB432,GB433,GB519,GB520,GB521,GB522
0,abad1241,?,?,?,?,2,1,0,0,1,...,?,?,0,0,0,1,?,?,?,1
1,abar1238,1,0,0,1,2,2,0,0,0,...,1,0,?,?,?,?,0,0,1,0


In [9]:
# Turn both missing-value markers into NaN so the NaN-aware distance below can skip them.
gb_matrix = gb_matrix.replace({"no_cov":np.nan, "?":np.nan}) # replace no_cov, and ?

In [12]:
# Map each Lang_ID to the list of its GB feature values, in gb_feats order.
lang_vecs = dict(
    (lang_row["Lang_ID"], lang_row[gb_feats].to_list())
    for _idx, lang_row in gb_matrix.iterrows()
)

In [16]:
# Language ids in table order; this fixes the row/column order of the distance matrix.
langs = gb_matrix["Lang_ID"].tolist()

In [17]:
# Feature vectors lined up with `langs`.
vecs = [lang_vecs[lang] for lang in langs]

In [18]:
# Pairwise Euclidean distances between all language vectors, using scikit-learn's
# NaN-aware variant so that missing feature values are tolerated.
# Note: despite the variable name, the result holds distances, not similarities.
sim_matrix_euclidean = nan_euclidean_distances(vecs,vecs)

In [22]:
# Wrap the pairwise matrix in a frame labelled by Lang_ID on both axes.
# Despite the "sim" name, the values come from nan_euclidean_distances, i.e. they are
# distances (larger = more different). Any NaN entries are replaced by 0.
sim_df = pd.DataFrame(sim_matrix_euclidean, columns=langs, index=langs).fillna(0)  # NOTE: 0 for maximisation only!

In [24]:
# Spot-check a single pair of languages.
sim_df.at["abad1241", "abar1238"]

np.float64(7.881871212174793)

In [25]:
# Persist the language-by-language matrix (Lang_IDs are written as the index column).
sim_df.to_csv("../datasets/lang2lang/grambank_sim.csv")

In [89]:
# For the exported feature table, encode missing values (NaN after the replace step) as -1.
gb_matrix = gb_matrix.fillna(-1)

In [90]:
# Write the feature table without the row index; Lang_ID remains as a regular column.
gb_matrix.to_csv("../datasets/typology/grambank_lang2features.csv",index=False)