In [13]:
import os
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm

from statistics import mean
from scipy.stats import pearsonr

# Fix NumPy's global random seed.
np.random.seed(9876789)
# Let pandas display up to 500 rows before truncating.
pd.set_option('display.max_rows', 500)
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

In [14]:
# Load the gold per-language scores; the context manager closes the file
# handle as soon as the JSON has been parsed.
with open("../work/lang2count.json") as fin:
    lang2gold = json.load(fin)
# Language keys have every "2" rewritten to "X".
lang2gold = {lang.replace("2", "X"): score for lang, score in lang2gold.items()}
# Order entries by language key; later cells sort their own inputs by key too
# and line values up with this table positionally.
lang2gold = dict(sorted(lang2gold.items(), key=lambda item: item[0]))
# Displayed as the cell output: the sum of the ten largest gold values.
sum(sorted(lang2gold.values(), reverse=True)[:10])

0.5693262685643564

In [15]:
# Model names; each one is substituted into the per-fold result file names
# ("{model}_fold{fold}.json") by the regression loop. Order is significant
# because it determines the row order of the summary table.
models = [
    # base names and their "-action_*" variants
    "Transformer", "LSTM", "RNN",
    "Transformer-action_td", "Transformer-action_lc-as",
    "LSTM-action_td", "LSTM-action_lc-as",
    "RNN-action_td", "RNN-action_lc-as",
    # n-gram names and their "_actions_*" variants
    "3gram", "4gram", "5gram",
    "3gram_actions_td", "4gram_actions_td", "5gram_actions_td",
    "3gram_actions_lc-as", "4gram_actions_lc-as", "5gram_actions_lc-as",
    # RNNG / SRNNG names, with and without the "_beam" suffix
    "RNNG_top_down", "RNNG_top_down_beam",
    "RNNG_in_order", "RNNG_in_order_beam",
    "SRNNG_top_down", "SRNNG_top_down_beam",
    "SRNNG_in_order", "SRNNG_in_order_beam",
    "llama2-7b",
]

In [16]:
def switch_consistency(preds, langs):
    """Pearson r between ``preds`` and each language's majority-digit count.

    For every string in ``langs`` the score is the larger of its number of
    "0" characters and its number of "1" characters. ``preds`` must be
    aligned with ``langs`` element by element.
    """
    majority_counts = []
    for lang in langs:
        zeros = lang.count("0")
        ones = lang.count("1")
        majority_counts.append(max(zeros, ones))
    correlation = pearsonr(preds, majority_counts)
    # Index 0 is the correlation coefficient; the p-value is discarded.
    return correlation[0]

def left_preference(preds, langs):
    """Pearson r between ``preds`` and the number of "0" characters per language.

    ``preds`` must be aligned with ``langs`` element by element.
    """
    zero_counts = []
    for lang in langs:
        zero_counts.append(lang.count("0"))
    correlation = pearsonr(preds, zero_counts)
    # Index 0 is the correlation coefficient; the p-value is discarded.
    return correlation[0]

def micro_correl(preds, lang2gold):
    """Average the within-group Pearson r over the four two-character prefixes.

    Languages are grouped by the first two characters of their key
    ("00", "01", "10", "11"). Within each group the gold values are
    correlated with the matching entries of ``preds``, which must follow
    the iteration order of ``lang2gold``. The four coefficients are then
    averaged.
    """
    group_correls = []
    for prefix in ["00", "01", "10", "11"]:
        gold_group = []
        for lang, gold in lang2gold.items():
            if lang[:2] == prefix:
                gold_group.append(gold)
        pred_group = []
        for lang, pred in zip(lang2gold.keys(), preds):
            if lang[:2] == prefix:
                pred_group.append(pred)
        coefficient = pearsonr(gold_group, pred_group)[0]
        group_correls.append(coefficient)
    return mean(group_correls)

def top3_langs(preds, langs=None):
    """Return the three highest-scoring languages as a comma-separated string.

    Args:
        preds: Scores aligned element by element with ``langs``.
        langs: Language keys to rank. When omitted, the keys of the
            notebook-level ``lang2gold`` table are used, which is what this
            function always did before the parameter existed.

    Returns:
        The top three languages by descending score, joined with ", ".
        In each key "1" is shown as "R", "0" as "L", and "X" is dropped.
        Fewer than three names are returned if fewer languages are given.
    """
    if langs is None:
        langs = lang2gold.keys()
    # sorted() is stable, so tied scores keep their original relative order.
    ranked = sorted(zip(langs, preds), key=lambda pair: pair[1], reverse=True)
    labels = [
        lang.replace("1", "R").replace("0", "L").replace("X", "")
        for lang, _ in ranked[:3]
    ]
    return ", ".join(labels)


def calc_stats(preds: list, lang2gold: dict):
    """Compute all summary statistics of ``preds`` against the gold table.

    Args:
        preds: One score per language, in the iteration order of ``lang2gold``.
        lang2gold: Mapping from language key to gold value.

    Returns:
        A tuple ``(correl, m_correl, consistency, left_pref, top3)``: the
        overall Pearson r with the gold values, the prefix-averaged Pearson r
        from ``micro_correl``, the results of ``switch_consistency`` and
        ``left_preference``, and the ``top3_langs`` string.
    """
    langs = list(lang2gold.keys())
    correl = pearsonr(preds, list(lang2gold.values()))[0]
    m_correl = micro_correl(preds, lang2gold)
    consistency = switch_consistency(preds, langs)
    left_pref = left_preference(preds, langs)
    # Rank against the table passed in, not whatever global happens to exist.
    top3 = top3_langs(preds, langs)
    return correl, m_correl, consistency, left_pref, top3


In [17]:
# Score the gold distribution itself with the helper metrics.
gold_langs = list(lang2gold.keys())
gold_scores = list(lang2gold.values())
for metric in (switch_consistency, left_preference):
    print(metric(gold_scores, gold_langs))
print(top3_langs(gold_scores))

0.3683459382690032
0.10540786110684897
LRRRRL, LRRRRR, LLLRRL


In [18]:
# Fit one OLS regression of the gold values on the transformed per-language
# values from lang_ppl_distributions (perplexities, judging by the names) for
# every combination of transform `k`, model and fold, and save a summary row
# per fit.
y = list(lang2gold.values())
results_summary = pd.DataFrame(columns=["model", "fold", "k", "slope", "delta loglik", "AIC", "correl", "micro_correl", "consistency", "left_pref", "top3_langs"])

for k in [0.5, 1, 2, 3, "log"]:
    for model in models:
        for fold in ["0", "20000", "40000", "60000", "80000"]:
            path = f"../work/results/lang_ppl_distributions/{model}_fold{fold}.json"
            if not os.path.exists(path):
                # No result file for this model/fold: report the path and skip it.
                print(path)
                continue
            # The context manager closes the handle instead of leaking one per iteration.
            with open(path) as fin:
                lang2ppl = json.load(fin)
            # Sort by language key so the values line up positionally with `y`.
            # Assumes this file covers the same languages as lang2gold — TODO confirm.
            lang2ppl = dict(sorted(lang2ppl.items(), key=lambda item: item[0]))
            if k == "log":
                ppls = [np.log(ppl) for ppl in lang2ppl.values()]
            elif k == "exp":
                # Not reachable with the current list of k values; kept as an option.
                ppls = [np.exp(ppl) for ppl in lang2ppl.values()]
            else:
                ppls = [ppl**k for ppl in lang2ppl.values()]
            X = sm.add_constant(ppls)
            linear_model = sm.OLS(y, X)
            results = linear_model.fit()
            # params[0] is the added constant; params[1] is the predictor's coefficient.
            slope = results.params[1]
            # The values are negated before the correlation statistics are computed.
            negative_ppls = [-ppl for ppl in ppls]
            correl, m_correl, consistency, left_pref, top3 = calc_stats(negative_ppls, lang2gold)
            # "delta loglik" is written as a constant 0 in this table.
            results_summary.loc[len(results_summary)] = [model, fold, k, slope, 0, results.aic, correl, m_correl, consistency, left_pref, top3]
results_summary.to_csv("../work/results/regression/results_20230908.csv")

## Stack depth

In [19]:
# Same regression as for the perplexity files, but the predictor comes from
# the lang_stack_depth result files, one per traversal and fold.
y = list(lang2gold.values())
results_summary = pd.DataFrame(columns=["model", "fold", "k", "slope", "delta loglik", "AIC", "correl", "micro_correl", "consistency", "left_pref", "top3_langs"])

for k in [0.5, 1, 2, 3, "log"]:
    for traversal in ["td", "lc-as"]:
        for fold in ["0", "20000", "40000", "60000", "80000"]:
            # The context manager closes the handle instead of leaking one per iteration.
            with open(f"../work/results/lang_stack_depth/{traversal}_fold{fold}.json") as fin:
                lang2ppl = json.load(fin)
            # Variable names are inherited from the perplexity cell; the values
            # here come from the stack-depth files.
            # Sort by language key so the values line up positionally with `y`.
            # Assumes this file covers the same languages as lang2gold — TODO confirm.
            lang2ppl = dict(sorted(lang2ppl.items(), key=lambda item: item[0]))
            if k == "log":
                ppls = [np.log(ppl) for ppl in lang2ppl.values()]
            elif k == "exp":
                # Not reachable with the current list of k values; kept as an option.
                ppls = [np.exp(ppl) for ppl in lang2ppl.values()]
            else:
                ppls = [ppl**k for ppl in lang2ppl.values()]
            X = sm.add_constant(ppls)
            linear_model = sm.OLS(y, X)
            results = linear_model.fit()
            # params[0] is the added constant; params[1] is the predictor's coefficient.
            slope = results.params[1]
            # The values are negated before the correlation statistics are computed.
            negative_ppls = [-ppl for ppl in ppls]
            correl, m_correl, consistency, left_pref, top3 = calc_stats(negative_ppls, lang2gold)
            # The traversal name goes in the "model" column; "delta loglik" is a constant 0.
            results_summary.loc[len(results_summary)] = [traversal, fold, k, slope, 0, results.aic, correl, m_correl, consistency, left_pref, top3]

results_summary.to_csv("../work/results/regression/results_20231217_stack_depth.csv")

## Parseability

In [20]:
# Models for which lang_parseability_distributions result files are read.
rnngs = [
          "RNNG_top_down_beam",
          "RNNG_in_order_beam",
          "SRNNG_top_down_beam",
          "SRNNG_in_order_beam",
          ]

# Same regression as for the perplexity files, but the predictor is the
# "fscore" field of each language's entry in the parseability files.
y = list(lang2gold.values())
results_summary = pd.DataFrame(columns=["model", "fold", "k", "slope", "delta loglik", "AIC", "correl", "micro_correl", "consistency", "left_pref", "top3_langs"])

for k in [0.5, 1, 2, 3, "log"]:
    for model in rnngs:
        for fold in ["0", "20000", "40000", "60000", "80000"]:
            # The context manager closes the handle instead of leaking one per iteration.
            with open(f"../work/results/lang_parseability_distributions/{model}_fold{fold}.json") as fin:
                lang2ppl = json.load(fin)
            # Keep only the "fscore" of each entry, ordered by language key so the
            # values line up positionally with `y`.
            # Assumes this file covers the same languages as lang2gold — TODO confirm.
            lang2ppl = {lang: entry["fscore"] for lang, entry in sorted(lang2ppl.items(), key=lambda item: item[0])}
            if k == "log":
                ppls = [np.log(ppl) for ppl in lang2ppl.values()]
            elif k == "exp":
                # Not reachable with the current list of k values; kept as an option.
                ppls = [np.exp(ppl) for ppl in lang2ppl.values()]
            else:
                ppls = [ppl**k for ppl in lang2ppl.values()]
            X = sm.add_constant(ppls)
            linear_model = sm.OLS(y, X)
            results = linear_model.fit()
            # params[0] is the added constant; params[1] is the predictor's coefficient.
            slope = results.params[1]
            # NOTE(review): these are F-scores, yet they are negated exactly like the
            # perplexities in the earlier cell before computing the correlation
            # statistics — confirm this sign convention is intended.
            negative_ppls = [-ppl for ppl in ppls]
            correl, m_correl, consistency, left_pref, top3 = calc_stats(negative_ppls, lang2gold)
            # "delta loglik" is written as a constant 0 in this table.
            results_summary.loc[len(results_summary)] = [model, fold, k, slope, 0, results.aic, correl, m_correl, consistency, left_pref, top3]

results_summary.to_csv("../work/results/regression/results_20230927_parseability.csv")

## Parseability and predictability

In [21]:
# Models for which both the perplexity and the parseability files are read.
rnngs = [
          "RNNG_top_down_beam",
          "RNNG_in_order_beam",
          "SRNNG_top_down_beam",
          "SRNNG_in_order_beam",
          ]

# For each model and fold, compare a baseline OLS (gold ~ negated ppl value)
# with an extended OLS that adds the parseability F-score as a second predictor.
y = list(lang2gold.values())
results_summary = pd.DataFrame(columns=["model", "fold", "slope", "pvalue", "delta loglik", "AIC_baseline", "AIC",])
for model in rnngs:
    for fold in ["0", "20000", "40000", "60000", "80000"]:

        # Context managers close each handle instead of leaking two per iteration.
        with open(f"../work/results/lang_ppl_distributions/{model}_fold{fold}.json") as fin:
            lang2ppl = json.load(fin)
        # Sort by language key so the values line up positionally with `y`.
        lang2ppl = dict(sorted(lang2ppl.items(), key=lambda item: item[0]))
        ppls = [-ppl for ppl in lang2ppl.values()]
        X = sm.add_constant(ppls)
        baseline_model = sm.OLS(y, X)
        baseline_model = baseline_model.fit()

        with open(f"../work/results/lang_parseability_distributions/{model}_fold{fold}.json") as fin:
            lang2parse = json.load(fin)
        # Both tables are sorted by key independently; the column stack below relies
        # on them covering the same languages — TODO confirm.
        lang2parse = {lang: entry["fscore"] for lang, entry in sorted(lang2parse.items(), key=lambda item: item[0])}
        parses = list(lang2parse.values())
        X = np.column_stack((ppls, parses))
        X = sm.add_constant(X)
        linear_model = sm.OLS(y, X)
        results = linear_model.fit()

        # Column order after add_constant is [const, ppl, fscore], so index 2 is
        # the parseability term.
        slope = results.params[2]
        pvalue = results.pvalues[2]

        # Log-likelihood gained by adding the parseability predictor to the baseline.
        delta_loglik = results.llf - baseline_model.llf
        results_summary.loc[len(results_summary)] = [model, fold, slope, pvalue, delta_loglik, baseline_model.aic, results.aic]

results_summary.to_csv("../work/results/regression/results_20230927_parseability_predictability.csv")