In [1]:
import json
import glob
import pandas as pd

from collections import defaultdict
from statistics import mean


def marginalize_complementizer(lang2score):
    """Average scores over keys that differ only in their third character.

    The character at index 2 of every key is replaced by ``"X"``; the scores of
    all keys that collapse to the same string are converted to ``float`` and
    averaged.

    Args:
        lang2score: Mapping from a language identifier string to a score
            (anything accepted by ``float``).

    Returns:
        A new dict mapping each collapsed identifier to the mean of its scores.
    """
    grouped = {}
    for name in lang2score:
        collapsed = name[:2] + "X" + name[3:]
        grouped.setdefault(collapsed, []).append(float(lang2score[name]))
    return {collapsed: mean(values) for collapsed, values in grouped.items()}

def summarize(dir_path, model_name="", fold=""):
    """Collect per-language perplexities from ``.ppl`` files and dump them to JSON.

    Every file matched by ``dir_path`` is scanned for the first line starting
    with ``"ppl: "``; the second whitespace-separated token of that line is
    taken as the perplexity. Scores are then passed through
    ``marginalize_complementizer`` and written to
    ``../work/lang_ppl_distributions/{model_name}_fold{fold}.json``.

    Args:
        dir_path: Glob pattern (``**`` is expanded recursively) matching the
            perplexity files.
        model_name: Label used in the output file name.
        fold: Fold identifier used in the output file name. When non-empty and
            present as a path component, the component directly before its last
            occurrence is used as the language identifier.

    Raises:
        IndexError: If a matched file has no line starting with ``"ppl: "``.
    """
    lang2ppl = {}
    for file in glob.glob(dir_path, recursive=True):
        parts = file.split("/")
        # A fixed index from the end lands on the fold (or on a sub-directory
        # such as "td") for patterns of the form ".../<lang>/<fold>/.../x.ppl",
        # so anchor on the fold component instead when it is known.
        fold_idx = len(parts) - 1 - parts[::-1].index(fold) if fold and fold in parts else -1
        if fold_idx > 0:
            lang = parts[fold_idx - 1]
        else:
            # No usable fold component: keep the historical positional lookup.
            lang = parts[-3]
        with open(file) as f:
            ppl = [float(line.split()[1]) for line in f if line.startswith("ppl: ")][0]
        lang2ppl[lang] = ppl
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

## N-gram

In [2]:
# Summarize n-gram perplexities for every fold: plain subword models first,
# then the action-sequence models for both "td" and "lc-as" directories.
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    fold_dir = f"../work/tree_per_line/**/{fold}"

    # Building the paths from one template avoids per-line typos in the
    # directory names (previously "ngarm" and "ls-as"), which made glob match
    # nothing and produced empty JSON files without any error.
    for n in (3, 4, 5):
        summarize(f"{fold_dir}/ngram/tst.subword.{n}gram.ppl", f"{n}gram", fold)

    for n in (3, 4, 5):
        for strategy in ("td", "lc-as"):
            summarize(
                f"{fold_dir}/{strategy}/ngram/tst.actions.subword.{n}gram.ppl",
                f"{strategy}_actions_{n}gram",
                fold,
            )

## Neural LMs

In [10]:
# Collect the test perplexity of the fairseq Transformer LM for each fold and
# write one JSON file of marginalized scores per fold.
model_name = "Transformer"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    files = glob.glob(f"../work/results/**/{fold}/fairseq/trans/test.results", recursive=True)
    lang2ppl = {}
    for file in files:
        # Path tail is <lang>/<fold>/fairseq/trans/test.results.
        lang = file.split("/")[-5]
        with open(file) as f:
            lines = f.readlines()
        # The perplexity is the text after "Perplexity:" on the last line.
        ppl_line = lines[-1].strip()
        ppl = float(ppl_line.split("Perplexity:")[-1].strip())
        lang2ppl[lang] = ppl
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [2]:
# Collect the test perplexity of the fairseq LSTM LM for each fold and write
# one JSON file of marginalized scores per fold.
model_name = "LSTM"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    files = glob.glob(f"../work/results/**/{fold}/fairseq/lstm/test.results", recursive=True)

    lang2ppl = {}
    for file in files:
        # Path tail is <lang>/<fold>/fairseq/lstm/test.results.
        lang = file.split("/")[-5]
        with open(file) as f:
            lines = f.readlines()
        # The perplexity is the text after "Perplexity:" on the last line.
        ppl_line = lines[-1].strip()
        ppl = float(ppl_line.split("Perplexity:")[-1].strip())
        lang2ppl[lang] = ppl
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [None]:
# Collect the test perplexity of the fairseq RNN LM for each fold and write
# one JSON file of marginalized scores per fold.
model_name = "RNN"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    files = glob.glob(f"../work/results/**/{fold}/fairseq/rnn/test.results", recursive=True)

    lang2ppl = {}
    for file in files:
        # Path tail is <lang>/<fold>/fairseq/rnn/test.results.
        lang = file.split("/")[-5]
        with open(file) as f:
            lines = f.readlines()
        # The perplexity is the text after "Perplexity:" on the last line.
        ppl_line = lines[-1].strip()
        ppl = float(ppl_line.split("Perplexity:")[-1].strip())
        lang2ppl[lang] = ppl

    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

## Neural PLMs

In [None]:
# Collect the test perplexity of the Transformer trained on "td" action
# sequences for each fold and write one JSON file of marginalized scores per fold.
model_name = "Transformer-action_td"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    files = glob.glob(f"../work/results/**/{fold}/td/fairseq/trans/test.results", recursive=True)
    lang2ppl = {}
    for file in files:
        # Path tail is <lang>/<fold>/td/fairseq/trans/test.results: the extra
        # "td" level puts the fold at index -5, so the directory before the
        # fold is at index -6.
        lang = file.split("/")[-6]
        with open(file) as f:
            lines = f.readlines()
        # The perplexity is the text after "Perplexity:" on the last line.
        ppl_line = lines[-1].strip()
        ppl = float(ppl_line.split("Perplexity:")[-1].strip())
        lang2ppl[lang] = ppl
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [None]:
# Collect the test perplexity of the Transformer trained on "lc-as" action
# sequences for each fold and write one JSON file of marginalized scores per fold.
model_name = "Transformer-action_lc-as"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    files = glob.glob(f"../work/results/**/{fold}/lc-as/fairseq/trans/test.results", recursive=True)
    lang2ppl = {}
    for file in files:
        # Path tail is <lang>/<fold>/lc-as/fairseq/trans/test.results: the
        # extra "lc-as" level puts the fold at index -5, so the directory
        # before the fold is at index -6.
        lang = file.split("/")[-6]
        with open(file) as f:
            lines = f.readlines()
        # The perplexity is the text after "Perplexity:" on the last line.
        ppl_line = lines[-1].strip()
        ppl = float(ppl_line.split("Perplexity:")[-1].strip())
        lang2ppl[lang] = ppl
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [None]:
# Collect the test perplexity of the LSTM trained on "td" action sequences
# for each fold and write one JSON file of marginalized scores per fold.
model_name = "LSTM-action_td"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    files = glob.glob(f"../work/results/**/{fold}/td/fairseq/lstm/test.results", recursive=True)
    lang2ppl = {}
    for file in files:
        # Path tail is <lang>/<fold>/td/fairseq/lstm/test.results: the extra
        # "td" level puts the fold at index -5, so the directory before the
        # fold is at index -6.
        lang = file.split("/")[-6]
        with open(file) as f:
            lines = f.readlines()
        # The perplexity is the text after "Perplexity:" on the last line.
        ppl_line = lines[-1].strip()
        ppl = float(ppl_line.split("Perplexity:")[-1].strip())
        lang2ppl[lang] = ppl
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [None]:
# Collect the test perplexity of the LSTM trained on "lc-as" action sequences
# for each fold and write one JSON file of marginalized scores per fold.
model_name = "LSTM-action_lc-as"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    files = glob.glob(f"../work/results/**/{fold}/lc-as/fairseq/lstm/test.results", recursive=True)
    lang2ppl = {}
    for file in files:
        # Path tail is <lang>/<fold>/lc-as/fairseq/lstm/test.results: the
        # extra "lc-as" level puts the fold at index -5, so the directory
        # before the fold is at index -6.
        lang = file.split("/")[-6]
        with open(file) as f:
            lines = f.readlines()
        # The perplexity is the text after "Perplexity:" on the last line.
        ppl_line = lines[-1].strip()
        ppl = float(ppl_line.split("Perplexity:")[-1].strip())
        lang2ppl[lang] = ppl
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [None]:
# Collect the test perplexity of the RNN trained on "td" action sequences for
# each fold and write one JSON file of marginalized scores per fold.
model_name = "RNN-action_td"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    files = glob.glob(f"../work/results/**/{fold}/td/fairseq/rnn/test.results", recursive=True)
    lang2ppl = {}
    for file in files:
        # Path tail is <lang>/<fold>/td/fairseq/rnn/test.results: the extra
        # "td" level puts the fold at index -5, so the directory before the
        # fold is at index -6.
        lang = file.split("/")[-6]
        with open(file) as f:
            lines = f.readlines()
        # The perplexity is the text after "Perplexity:" on the last line.
        ppl_line = lines[-1].strip()
        ppl = float(ppl_line.split("Perplexity:")[-1].strip())
        lang2ppl[lang] = ppl
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [None]:
# Collect the test perplexity of the RNN trained on "lc-as" action sequences
# for each fold and write one JSON file of marginalized scores per fold.
model_name = "RNN-action_lc-as"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    files = glob.glob(f"../work/results/**/{fold}/lc-as/fairseq/rnn/test.results", recursive=True)
    lang2ppl = {}
    for file in files:
        # Path tail is <lang>/<fold>/lc-as/fairseq/rnn/test.results: the extra
        # "lc-as" level puts the fold at index -5, so the directory before the
        # fold is at index -6.
        lang = file.split("/")[-6]
        with open(file) as f:
            lines = f.readlines()
        # The perplexity is the text after "Perplexity:" on the last line.
        ppl_line = lines[-1].strip()
        ppl = float(ppl_line.split("Perplexity:")[-1].strip())
        lang2ppl[lang] = ppl
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

## RNNG

In [13]:
# Collect test perplexities of the top-down RNNG for each fold and write one
# JSON file of marginalized scores per fold.
model_name = "RNNG_top_down"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    lang2ppl = {}
    if fold == "0":
        # Fold 0 is read from a CSV table instead of the per-run training logs.
        # NOTE(review): the RNNG_in_order cell reads ../work/pre-results/RNNG.csv
        # while this one reads ../work/summary/RNNG.csv; confirm which is intended.
        df = pd.read_csv("../work/summary/RNNG.csv", dtype={"lang": "str"})
        df = df[(df["traversal"] == "top_down") & (df["fold"] == 0)][["lang", "Test ppl"]]
        # Iterate the two columns directly; integer indexing into a
        # label-indexed row Series (row[1][0]) is deprecated in pandas.
        for lang, ppl in zip(df["lang"], df["Test ppl"]):
            # Only identifiers of exactly 7 characters are kept.
            if len(lang) == 7:
                lang2ppl[lang] = ppl
    else:
        files = glob.glob(f"../work/results/**/{fold}/top_down/rnng/model.bin.log", recursive=True)
        for file in files:
            # Path tail is <lang>/<fold>/top_down/rnng/model.bin.log.
            lang = file.split("/")[-5]
            with open(file) as f:
                result_line = [line for line in f if "'Step': 11, 'Test ppl':" in line]
            # Logs without a matching line are skipped.
            if result_line:
                ppl = float(result_line[0].split("'Test ppl':")[1].strip().split(",")[0])
                lang2ppl[lang] = ppl

    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [14]:
# Collect test perplexities of the in-order RNNG for each fold and write one
# JSON file of marginalized scores per fold.
model_name = "RNNG_in_order"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    lang2ppl = {}
    if fold == "0":
        # Fold 0 is read from a CSV table instead of the per-run training logs.
        df = pd.read_csv("../work/pre-results/RNNG.csv", dtype={"lang": "str"})
        df = df[(df["traversal"] == "in_order") & (df["fold"] == 0)][["lang", "Test ppl"]]
        # Iterate the two columns directly; integer indexing into a
        # label-indexed row Series (row[1][0]) is deprecated in pandas.
        for lang, ppl in zip(df["lang"], df["Test ppl"]):
            # Only identifiers of exactly 7 characters are kept.
            if len(lang) == 7:
                lang2ppl[lang] = ppl
    else:
        files = glob.glob(f"../work/results/**/{fold}/in_order/rnng/model.bin.log", recursive=True)
        for file in files:
            # Path tail is <lang>/<fold>/in_order/rnng/model.bin.log.
            lang = file.split("/")[-5]
            with open(file) as f:
                result_line = [line for line in f if "'Step': 11, 'Test ppl':" in line]
            # Logs without a matching line are skipped.
            if result_line:
                ppl = float(result_line[0].split("'Test ppl':")[1].strip().split(",")[0])
                lang2ppl[lang] = ppl

    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [2]:
# Collect beam-search perplexities of the top-down RNNG for each fold, average
# them per language, and write one JSON file of marginalized scores per fold.
model_name = "RNNG_top_down_beam"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    lang2ppl = defaultdict(list)
    files = glob.glob(f"../work/results/**/{fold}/top_down/rnng/model.bin.surprisals.fixed_beam.100_10_1", recursive=True)
    for file in files:
        # Path tail is <lang>/<fold>/top_down/rnng/<surprisal file>.
        lang = file.split("/")[-5]
        with open(file) as f:
            result_line = [line for line in f if 'perplexity' in line]
        # Files without a "perplexity" line are skipped; otherwise the second
        # whitespace-separated token of the first such line is the value.
        if result_line:
            ppl = float(result_line[0].split()[1])
            lang2ppl[lang].append(ppl)
    # Several files may map to the same language; average them first.
    lang2ppl = {lang: mean(ppls) for lang, ppls in lang2ppl.items()}
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [None]:
# Collect beam-search perplexities of the in-order RNNG for each fold, average
# them per language, and write one JSON file of marginalized scores per fold.
model_name = "RNNG_in_order_beam"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    lang2ppl = defaultdict(list)
    files = glob.glob(f"../work/results/**/{fold}/in_order/rnng/model.bin.surprisals.fixed_beam.100_10_1", recursive=True)
    for file in files:
        # Path tail is <lang>/<fold>/in_order/rnng/<surprisal file>.
        lang = file.split("/")[-5]
        with open(file) as f:
            result_line = [line for line in f if 'perplexity' in line]
        # Files without a "perplexity" line are skipped; otherwise the second
        # whitespace-separated token of the first such line is the value.
        if result_line:
            ppl = float(result_line[0].split()[1])
            lang2ppl[lang].append(ppl)
    # Several files may map to the same language; average them first.
    lang2ppl = {lang: mean(ppls) for lang, ppls in lang2ppl.items()}
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [17]:
# Collect test perplexities of the top-down SRNNG for each fold and write one
# JSON file of marginalized scores per fold.
model_name = "SRNNG_top_down"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    lang2ppl = {}
    if fold == "0":
        # Fold 0 is read from a CSV table instead of the per-run training logs.
        df = pd.read_csv("../work/pre-results/RNNG_weak_all.csv", dtype={"lang": "str"})
        df = df[(df["traversal"] == "top_down")][["lang", "Test ppl"]]
        # Iterate the two columns directly; integer indexing into a
        # label-indexed row Series (row[1][0]) is deprecated in pandas.
        for lang, ppl in zip(df["lang"], df["Test ppl"]):
            # Only identifiers of exactly 7 characters are kept.
            if len(lang) == 7:
                lang2ppl[lang] = ppl
    else:
        # NOTE(review): this is the only glob restricted to results/basic/;
        # the other cells search all of ../work/results/. Confirm it is intended.
        files = glob.glob(f"../work/results/basic/**/{fold}/top_down/srnng/model.bin.log", recursive=True)
        for file in files:
            # Path tail is <lang>/<fold>/top_down/srnng/model.bin.log.
            lang = file.split("/")[-5]
            with open(file) as f:
                result_line = [line for line in f if "'Step': 11, 'Test ppl':" in line]
            # Logs without a matching line are skipped.
            if result_line:
                ppl = float(result_line[0].split("'Test ppl':")[1].strip().split(",")[0])
                lang2ppl[lang] = ppl

    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [18]:
# Collect test perplexities of the in-order SRNNG for each fold and write one
# JSON file of marginalized scores per fold.
model_name = "SRNNG_in_order"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    lang2ppl = {}
    if fold == "0":
        # Fold 0 is read from a CSV table instead of the per-run training logs.
        df = pd.read_csv("../work/pre-results/RNNG_weak_all.csv", dtype={"lang": "str"})
        df = df[(df["traversal"] == "in_order")][["lang", "Test ppl"]]
        # Iterate the two columns directly; integer indexing into a
        # label-indexed row Series (row[1][0]) is deprecated in pandas.
        for lang, ppl in zip(df["lang"], df["Test ppl"]):
            # Only identifiers of exactly 7 characters are kept.
            if len(lang) == 7:
                lang2ppl[lang] = ppl
    else:
        files = glob.glob(f"../work/results/**/{fold}/in_order/srnng/model.bin.log", recursive=True)
        for file in files:
            # Path tail is <lang>/<fold>/in_order/srnng/model.bin.log.
            lang = file.split("/")[-5]
            with open(file) as f:
                result_line = [line for line in f if "'Step': 11, 'Test ppl':" in line]
            # Logs without a matching line are skipped.
            if result_line:
                ppl = float(result_line[0].split("'Test ppl':")[1].strip().split(",")[0])
                lang2ppl[lang] = ppl

    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [None]:
# Collect beam-search perplexities of the top-down SRNNG for each fold, average
# them per language, and write one JSON file of marginalized scores per fold.
model_name = "SRNNG_top_down_beam"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    lang2ppl = defaultdict(list)
    files = glob.glob(f"../work/results/**/{fold}/top_down/srnng/model.bin.surprisals.fixed_beam.100_10_1", recursive=True)
    for file in files:
        # Path tail is <lang>/<fold>/top_down/srnng/<surprisal file>.
        lang = file.split("/")[-5]
        with open(file) as f:
            result_line = [line for line in f if 'perplexity' in line]
        # Files without a "perplexity" line are skipped; otherwise the second
        # whitespace-separated token of the first such line is the value.
        if result_line:
            ppl = float(result_line[0].split()[1])
            lang2ppl[lang].append(ppl)

    # Several files may map to the same language; average them first.
    lang2ppl = {lang: mean(ppls) for lang, ppls in lang2ppl.items()}
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

In [None]:
# Collect beam-search perplexities of the in-order SRNNG for each fold, average
# them per language, and write one JSON file of marginalized scores per fold.
model_name = "SRNNG_in_order_beam"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    lang2ppl = defaultdict(list)
    files = glob.glob(f"../work/results/**/{fold}/in_order/srnng/model.bin.surprisals.fixed_beam.100_10_1", recursive=True)
    for file in files:
        # Path tail is <lang>/<fold>/in_order/srnng/<surprisal file>.
        lang = file.split("/")[-5]
        with open(file) as f:
            result_line = [line for line in f if 'perplexity' in line]
        # Files without a "perplexity" line are skipped; otherwise the second
        # whitespace-separated token of the first such line is the value.
        if result_line:
            ppl = float(result_line[0].split()[1])
            lang2ppl[lang].append(ppl)
    # Several files may map to the same language; average them first.
    lang2ppl = {lang: mean(ppls) for lang, ppls in lang2ppl.items()}
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)

## Llama 2

In [6]:
# Collect Llama-2-7b perplexities for each fold, average them per language,
# and write one JSON file of marginalized scores per fold.
model_name = "llama2-7b"
folds = ["0", "20000", "40000", "60000", "80000"]
for fold in folds:
    lang2ppl = defaultdict(list)
    files = glob.glob(f"../work/results/**/{fold}/Llama-2-7b-hf/ppl.txt", recursive=True)
    for file in files:
        # Path tail is <lang>/<fold>/Llama-2-7b-hf/ppl.txt.
        lang = file.split("/")[-4]
        with open(file) as f:
            result_line = f.readlines()
        # Empty files are skipped; otherwise the first line holds the value.
        if result_line:
            ppl = float(result_line[0].strip())
            lang2ppl[lang].append(ppl)
    # Several files may map to the same language; average them first.
    lang2ppl = {lang: mean(ppls) for lang, ppls in lang2ppl.items()}
    lang2ppl = marginalize_complementizer(lang2ppl)
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(f"../work/lang_ppl_distributions/{model_name}_fold{fold}.json", "w") as out_file:
        json.dump(lang2ppl, out_file)