# Comparing Wav2Vec vectors with Linguistic Features

In [1]:
import glob
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import stats
import torch

## Loading data

In [2]:
## loading language label mappings
with open("../data/abo16.tsv", "r") as fin:
    abo16_map = {}
    for ln in fin.readlines()[1:]:
        toks = ln.strip().split("\t")
        if len(toks) < 2: continue
        abo16_map[toks[1]] = toks[0]
        if len(toks) >= 3:
            abo16_map[toks[2]] = toks[0]

In [3]:
## loading language features
feat_types = pd.read_csv("../data/langfeats/formosan_lgs_types.csv", index_col=0)
feats = pd.read_csv("../data/langfeats/formosan_lgs.csv", index_col=0).transpose()
feats = feats.rename(columns=lambda x: abo16_map.get(x, f"({x})"))     
keep_cols = sorted([x for x in feats.columns.values if not x.startswith("(")])
feats = feats.loc[:, keep_cols]

In [4]:
feats.head()

Language,Amis,Atayal,Bunun,Cou,Kavalan,Paiwan,Pinuyumayan,Rukai,Saysiyat,Seediq,Thau,Yami
highfrontvowel,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
highcentralvowel,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
highbackvowel,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
Midfrontvowel,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
Midcentralvowel,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


In [5]:
feat_types.head()

Unnamed: 0_level_0,LingType,IsVowel,Sonority
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
highfrontvowel,Phonology,Vowel,Sonorant
highcentralvowel,Phonology,Vowel,Sonorant
highbackvowel,Phonology,Vowel,Sonorant
Midfrontvowel,Phonology,Vowel,Sonorant
Midcentralvowel,Phonology,Vowel,Sonorant


In [6]:
## loading vectors
vec_path = glob.glob("../data/vecs/abo16-an*.pkl")[0]
with open(vec_path, "rb") as fin:
    evals = pickle.load(fin)
print("Loading vectors from: ", vec_path)

Loading vectors from:  ../data/vecs/abo16-an-02.vec.pkl


In [7]:
## building h_proj
ans = evals["ans"]
preds = evals["preds"]
mask = np.array([((x == y) and (x != "misc")) for x, y in zip(ans, preds)])
ans_lst = sorted(set(ans))
ans_lst.remove('misc')
hlast = evals["last"][mask, :]
ans = np.array(ans)[mask]
preds = np.array(preds)[mask]

## projecting to low dimension
pca_h = PCA(5)
h_proj = pca_h.fit_transform(hlast)
h_proj = h_proj / np.linalg.norm(h_proj, axis=1)[:, np.newaxis]

In [8]:
hlast.shape, h_proj.shape

((1871, 1024), (1871, 5))

## Helper functions

In [9]:
def compute_lang_medoids(hvec, sample_ratio=1., rng=None):
    medoids = []
    if not rng:
        rng = np.random.RandomState()
        
    for ans_x in ans_lst:
        mat = hvec[np.array(ans)==ans_x]
        mat_nr = mat.shape[0]
        sample_idxs = np.arange(mat_nr)
        rng.shuffle(sample_idxs)     
        sample_idxs = sample_idxs[:int(mat_nr*sample_ratio)]
        # import pdb; pdb.set_trace()
        sampled_mat = mat[sample_idxs]        
        
        D = squareform(pdist(sampled_mat, metric='cosine'))
        med_idx = np.argmin(D.sum(axis=0))
        medoids.append(mat[med_idx]) 
    return np.vstack(medoids)

In [10]:
def lower_tri(mat):
    return mat[np.tril_indices(mat.shape[0])]

def compute_corr_sim(feat_corr, med_corr):
    ## align features
    med_labels = med_corr.index.values.tolist()
    feat_labels = feat_corr.index.values.tolist()
    lang_both = set(med_labels).intersection(feat_labels)
    lang_both = list(sorted(lang_both))
    feat_corrmat = feat_corr.loc[lang_both].loc[:, lang_both]
    med_corrmat = med_corr.loc[lang_both].loc[:, lang_both]
    feat_corr_vec = lower_tri(feat_corrmat.to_numpy())
    med_corr_vec = lower_tri(med_corrmat.to_numpy())
    return stats.spearmanr(feat_corr_vec, med_corr_vec)

In [11]:
def fivenum(x):
    return {"mean": np.mean(x), 
            "std": np.std(x),
            "median": np.median(x), 
            "Q95": np.quantile(x, .95),
            "Q05": np.quantile(x, .05),
           }

## Feature Partitions

In [12]:
# build randomized ling features
rand_feat_values = feats.values.copy()
feat_rng = np.random.RandomState(333)
for r_idx in range(rand_feat_values.shape[0]):
    rand_row = rand_feat_values[r_idx]
    feat_rng.shuffle(rand_row)
    rand_feat_values[r_idx, :] = rand_row
rand_feats = pd.DataFrame(rand_feat_values, index=feats.index, columns=feats.columns)

In [13]:
FEAT_CATEGORIES = ("full",
    "phono", "morpho", "syntax", 
    "vowel", "consonant", 
    "sonorant", "obstruent")

n_hot = feats.sum(axis=1)
feat_mask = ~((n_hot == 0.) | (n_hot==13.))

def get_feat_mask(feat_cat, feat_val):
    return (feat_mask & 
        feat_types.apply(lambda x: x[feat_cat]==feat_val, axis=1))

full_feat_mask = feat_mask
phono_feat_mask = get_feat_mask("LingType", "Phonology")
morpho_feat_mask = get_feat_mask("LingType", "Morphology")
syntax_feat_mask = get_feat_mask("LingType", "Syntax")
vowel_feat_mask = get_feat_mask("IsVowel", "Vowel")
consonant_feat_mask = get_feat_mask("IsVowel", "Consonant")
sonorant_feat_mask = get_feat_mask("Sonority", "Sonorant")
obstruent_feat_mask = get_feat_mask("Sonority", "Obstruent")

In [14]:
for feat_cat in FEAT_CATEGORIES:    
    print(feat_cat, globals()[f"{feat_cat}_feat_mask"].sum())

full 120
phono 56
morpho 43
syntax 21
vowel 10
consonant 46
sonorant 22
obstruent 31


## Compute CorrSim under Feature Partitions

In [15]:
def compute_corrsim_samples(feat_corr, sample_ratio=.5, n_samples=100):
    spearman_rs = []
    rng = np.random.RandomState(123)
    for idx in tqdm(range(n_samples)):        
        medoids = compute_lang_medoids(h_proj, 0.5, rng)
        Dm = squareform(pdist(medoids, metric='cosine'))
        # convert cosine distance to cosine similarity
        hmed_corr = pd.DataFrame(1-Dm, index=ans_lst, columns=ans_lst)
        sr = compute_corr_sim(feat_corr, hmed_corr)
        spearman_rs.append(sr.correlation)
    return spearman_rs

In [16]:
feat_stats = {}
for feat_cat in FEAT_CATEGORIES:    
    feat_mask = globals()[f"{feat_cat}_feat_mask"]
    feat_corr = feats[feat_mask].corr()
    feat_rs = compute_corrsim_samples(feat_corr)
    feat_stats[feat_cat] = fivenum(feat_rs)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [17]:
from tabulate import tabulate

In [21]:
feat_stats_dfr = pd.DataFrame.from_dict(feat_stats).transpose()
feat_stats_dfr.to_csv("../data/ling_feat_stats.csv")
feat_stats_dfr

Unnamed: 0,mean,std,median,Q95,Q05
full,0.545272,0.022863,0.54548,0.579367,0.51029
phono,0.572864,0.024292,0.574855,0.607586,0.526364
morpho,0.430814,0.023488,0.431962,0.467582,0.39183
syntax,0.347671,0.031784,0.346305,0.407836,0.295362
vowel,0.499212,0.035223,0.496155,0.55815,0.441031
consonant,0.559916,0.024754,0.56704,0.592787,0.514609
sonorant,0.512463,0.027437,0.514575,0.555389,0.464539
obstruent,0.56774,0.029086,0.572806,0.606004,0.516479


## Randomized control

In [19]:
rand_feat_stats = {}
for feat_cat in FEAT_CATEGORIES:    
    feat_mask = globals()[f"{feat_cat}_feat_mask"]
    feat_corr = rand_feats[feat_mask].corr()
    feat_rs = compute_corrsim_samples(feat_corr)
    rand_feat_stats[feat_cat] = fivenum(feat_rs)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [20]:
rand_feat_stats_dfr = pd.DataFrame.from_dict(rand_feat_stats).transpose()
rand_feat_stats_dfr.to_csv("../data/ling_rand_feat_stats.csv")
rand_feat_stats_dfr

Unnamed: 0,mean,std,median,Q95,Q05
full,0.322295,0.028448,0.322338,0.366206,0.276426
phono,0.352515,0.030396,0.35621,0.395579,0.294798
morpho,0.334848,0.024258,0.334968,0.377291,0.290689
syntax,0.360666,0.027919,0.359014,0.419621,0.323616
vowel,0.311926,0.023135,0.313356,0.34906,0.275237
consonant,0.380161,0.031781,0.376015,0.434446,0.319874
sonorant,0.394268,0.025003,0.397022,0.428948,0.357944
obstruent,0.326621,0.026733,0.326755,0.36753,0.280558
