# Code for the analyses in the paper "Improving Generalization of Norwegian ASR with Limited Linguistic Resources"

## Imports and loading of datasets

In [1]:
# Imports
import pandas as pd
from jiwer import wer, cer
from scipy.stats import power_divergence
from collections import Counter
from pathlib import Path
import re
import numpy as np

In [2]:
# Make a df of the transcriptions
w_lm = False
filenamepattern = re.compile(r"^((?:nbtale12|nbtale3|npsc|nst|rundkast)(?:_(?:test|validation))*)_*(.*)")
if w_lm:
    filenamepattern = re.compile(r"^((?:nbtale12|nbtale3|npsc|nst|rundkast)(?:_(?:test|validation))*)_*(.*)_w_lm")
dfs = {}
csvs = Path("transcriptions").glob("*.csv")
if w_lm:
    csvs = Path("transcriptions_w_lm").glob("*.csv")

for fn in csvs:
    match = filenamepattern.match(fn.stem)
    dataset = match.group(1)
    asr_model = match.group(2)
    df = pd.read_csv(fn)
    if dataset not in dfs.keys():
        df["dataset"] = dataset
        df[asr_model] = df["transcription"]
        df.drop("transcription", axis=1, inplace=True)
        dfs[dataset] = df
    else:
        dfs[dataset][asr_model] = df["transcription"]
df = pd.concat(dfs.values())

def clean_mult_space(trans):
    """Clean up instances of multiple spaces"""
    if isinstance(trans, str):
        trans = trans.strip()
        trans = re.sub(r"\s\s+", " ", trans)
        trans = trans.replace("\\", " ")
    return trans

# This does not appear to have any effect on WER. However, it removes empty strings
# from lexical analysis   
df["standardized_text"] = df["standardized_text"].apply(lambda x: clean_mult_space(x))
df["stortinget_model"] = df["stortinget_model"].apply(lambda x: clean_mult_space(x))
df["combined_short_model"] = df["combined_short_model"].apply(lambda x: clean_mult_space(x))
df["combined_long_model"] = df["combined_long_model"].apply(lambda x: clean_mult_space(x))
df["rundkast_model"] = df["rundkast_model"].apply(lambda x: clean_mult_space(x))

In [3]:
df.dataset.unique()

array(['rundkast_test', 'npsc_test', 'nbtale12', 'rundkast_validation',
       'npsc_validation', 'nst_validation', 'nst_test', 'nbtale3'],
      dtype=object)

In [4]:
df = df.query("dataset not in ['npsc_validation', 'rundkast_validation', 'nst_validation']")

In [5]:
df = df.drop('Unnamed: 0', axis=1).dropna(subset=["standardized_text"]).query("region != 'unknown'")

In [6]:
round(df.groupby("dataset").duration.sum()/3600, 1)

dataset
nbtale12         15.6
nbtale3          11.6
npsc_test         9.1
nst_test         25.6
rundkast_test     5.8
Name: duration, dtype: float64

In [7]:
# Doing replacements in a separate df so we can see how much
# these transcription differences contribute to the results

replacements = {
    r"(\b)n r k s(\b)": r"\1nrks\2",
    r"(\b)n r k(\b)": r"\1nrk\2",
    r"(\b)u s a s(\b)": r"\1usas\2",
    r"(\b)u s a(\b)": r"\1usa\2",
    r"(\b)a f p(\b)": r"\1afp\2",
    r"(\b)u d i s(\b)": r"\1udis\2",
    r"(\b)u d i(\b)": r"\1udi\2",
    r"(\b)e u s(\b)": r"\1eus\2",
    r"(\b)e u(\b)": r"\1eu\2",
    r"(\b)s a s(\b)": r"\1sas\2",
    r"(\b)s m s(\b)": r"\1sms\2",
    r"(\b)n t n u(\b)": r"\1ntnu\2",
    r"(\b)d n b s(\b)": r"\1dnbs\2",
    r"(\b)d n b(\b)": r"\1dnb\2",
    r"(\b)o s s e s(\b)": r"\1osses\2",
    r"(\b)o s s e(\b)": r"\1osse\2",
    r"(\b)s v s(\b)": r"\1svs\2",
    r"(\b)s v(\b)": r"\1sv\2",
    r"-": "",
    r"(\b)mmm(\b)": r"\1\2",
    r"(\b)eee(\b)": r"\1\2",
    r"(\b)qqq(\b)": r"\1\2",
}
def clean_string(string):
    if type(string) == str:
        for k, v in replacements.items():
            string = re.sub(k, v, string)
        if string == " ":
            string = ""
    return string

df_cleaned = df.copy()
df_cleaned["standardized_text"] = df_cleaned["standardized_text"].apply(lambda x: clean_string(x))
df_cleaned["stortinget_model"] = df_cleaned["stortinget_model"].apply(lambda x:clean_string(x))
df_cleaned["combined_short_model"] = df_cleaned["combined_short_model"].apply(lambda x: clean_string(x))
df_cleaned["combined_long_model"] = df_cleaned["combined_long_model"].apply(lambda x: clean_string(x))
df_cleaned["rundkast_model"] = df_cleaned["rundkast_model"].apply(lambda x: clean_string(x))
df_cleaned = df_cleaned[~df_cleaned.standardized_text.isin([' ', ''])]

In [8]:
df["word_count"] = df.standardized_text.apply(lambda x: len(x.split(" ")))
df_cleaned["word_count"] = df_cleaned.standardized_text.apply(lambda x: len(x.split(" ")))

In [9]:
def compute_score(gold, pred, method=wer):
    """Score a segment according to a scoring method
    (wer or cer)."""
    try:
        if type(pred) != str:
            pred = ""
        if type(gold) != str:
            gold = ""
        return method(gold, pred)*100
    except ValueError:
        return 100


In [10]:
# Get number of word and character errors per segments
# This makes it easier to calculate WER and CER independent
# of segment length later
df["rundkast_num_word_errors"] = df.apply(lambda row: compute_score(row["standardized_text"], row["rundkast_model"])*row["word_count"], axis=1)
df["stortinget_num_word_errors"] = df.apply(lambda row: compute_score(row["standardized_text"], row["stortinget_model"])*row["word_count"], axis=1)
df["combined_short_num_word_errors"] = df.apply(lambda row: compute_score(row["standardized_text"], row["combined_short_model"])*row["word_count"], axis=1)
df["combined_long_num_word_errors"] = df.apply(lambda row: compute_score(row["standardized_text"], row["combined_long_model"])*row["word_count"], axis=1)
df_native = df.query("region != 'foreign'")
df_foreign = df.query("region == 'foreign'")

df_cleaned["rundkast_num_word_errors"] = df_cleaned.apply(lambda row: compute_score(row["standardized_text"], row["rundkast_model"])*row["word_count"], axis=1)
df_cleaned["stortinget_num_word_errors"] = df_cleaned.apply(lambda row: compute_score(row["standardized_text"], row["stortinget_model"])*row["word_count"], axis=1)
df_cleaned["combined_short_num_word_errors"] = df_cleaned.apply(lambda row: compute_score(row["standardized_text"], row["combined_short_model"])*row["word_count"], axis=1)
df_cleaned["combined_long_num_word_errors"] = df_cleaned.apply(lambda row: compute_score(row["standardized_text"], row["combined_long_model"])*row["word_count"], axis=1)
df_cleaned_native = df_cleaned.query("region != 'foreign'")
df_cleaned_foreign = df_cleaned.query("region == 'foreign'")

## Calculation functions

In [11]:
def get_wer(df, num_error_col):
    return round(df[num_error_col].sum()/df["word_count"].sum(), 1)

In [12]:
def relative_change(initial_val, final_val):
    return round((final_val-initial_val)/initial_val*100, 1)

In [13]:
def get_score_by_column(df, gb_col, stat_col, count_col):
    """group by gb_col in ds and calculate error rate given a stat_col with segmentwise number of errors"""
    return round(df.groupby(gb_col)[stat_col].sum()/df.groupby(gb_col)[count_col].sum(), 1)

In [14]:
def make_grouped_df(df, gb_col):
    """Pass a groubby column and create a df with scoring and relative change for different datasets
    and dataset combinations"""
    grouped_df = pd.DataFrame(get_score_by_column(df, gb_col, "rundkast_num_word_errors", "word_count"))
    grouped_df.columns = ["rundkast_wer"]
    grouped_df["stortinget_wer"] = get_score_by_column(df, gb_col, "stortinget_num_word_errors", "word_count")
    grouped_df["combined_short_wer"] = get_score_by_column(df, gb_col, "combined_short_num_word_errors", "word_count")
    grouped_df["combined_long_wer"] = get_score_by_column(df, gb_col, "combined_long_num_word_errors", "word_count")
    grouped_df["relative_change_rundkast_to_combined_short_wer"] = grouped_df.apply(lambda row: relative_change(row.rundkast_wer, row.combined_short_wer), axis=1)
    grouped_df["relative_change_stortinget_to_combined_short_wer"] = grouped_df.apply(lambda row: relative_change(row.stortinget_wer, row.combined_short_wer), axis=1)
    grouped_df["relative_change_rundkast_to_combined_long_wer"] = grouped_df.apply(lambda row: relative_change(row.rundkast_wer, row.combined_long_wer), axis=1)
    grouped_df["relative_change_stortinget_to_combined_long_wer"] = grouped_df.apply(lambda row: relative_change(row.stortinget_wer, row.combined_long_wer), axis=1)
    grouped_df["relative_change_combined_short_to_combined_long_wer"] = grouped_df.apply(lambda row: relative_change(row.combined_short_wer, row.combined_long_wer), axis=1)

    return grouped_df

In [15]:
def make_grouped_wer_plot(results_df, x_label):
    dataset_plot = results_df[["rundkast_wer", "stortinget_wer", "combined_short_wer", "combined_long_wer"]].plot(kind='bar', figsize=(10,10))
    dataset_plot.set_xlabel(x_label)
    dataset_plot.set_ylabel("wer")

## Scoring on the corpus as a whole

In [16]:
def get_corpus_stats(df):
    return {
    "wer_rundkast": get_wer(df, "rundkast_num_word_errors"),
    "wer_stortinget" : get_wer(df, "stortinget_num_word_errors"),
    "wer_combined_short": get_wer(df, "combined_short_num_word_errors"),
    "wer_combined_long": get_wer(df, "combined_long_num_word_errors"),
    }

def print_corpus_stats(stats_dict):
    print(f"WER Rundkast: {stats_dict['wer_rundkast']}")
    print(f"WER Stortinget: {stats_dict['wer_stortinget']}")
    print(f"WER combined_short: {stats_dict['wer_combined_short']}")
    print(f"WER combined_long: {stats_dict['wer_combined_long']}")
    print(f"Relative WER change from Rundkast to combined_short: {relative_change(stats_dict['wer_rundkast'], stats_dict['wer_combined_short'])}")
    print(f"Relative WER change from Rundkast to combined_long: {relative_change(stats_dict['wer_rundkast'], stats_dict['wer_combined_long'])}")
    print(f"Relative WER change from Stortinget to combined_short: {relative_change(stats_dict['wer_stortinget'], stats_dict['wer_combined_short'])}")
    print(f"Relative WER change from Stortinget to combined_long: {relative_change(stats_dict['wer_stortinget'], stats_dict['wer_combined_long'])}")
    print(f"Relative WER change from combined_short to combined_long: {relative_change(stats_dict['wer_combined_short'], stats_dict['wer_combined_long'])}")

In [17]:
corpus_stats_native = get_corpus_stats(df_native)
corpus_stats_foreign = get_corpus_stats(df_foreign)
corpus_stats_cleaned_native = get_corpus_stats(df_cleaned_native)
corpus_stats_cleaned_foreign = get_corpus_stats(df_cleaned_foreign)
corpus_stats_no_test = get_corpus_stats(df_native.query("dataset not in ['npsc_test', 'rundkast_test']"))

### Scoring -- native speakers

In [18]:
print_corpus_stats(corpus_stats_native)

WER Rundkast: 18.1
WER Stortinget: 16.8
WER combined_short: 14.7
WER combined_long: 13.4
Relative WER change from Rundkast to combined_short: -18.8
Relative WER change from Rundkast to combined_long: -26.0
Relative WER change from Stortinget to combined_short: -12.5
Relative WER change from Stortinget to combined_long: -20.2
Relative WER change from combined_short to combined_long: -8.8


In [19]:
print_corpus_stats(corpus_stats_cleaned_native)

WER Rundkast: 16.5
WER Stortinget: 16.0
WER combined_short: 13.6
WER combined_long: 12.2
Relative WER change from Rundkast to combined_short: -17.6
Relative WER change from Rundkast to combined_long: -26.1
Relative WER change from Stortinget to combined_short: -15.0
Relative WER change from Stortinget to combined_long: -23.8
Relative WER change from combined_short to combined_long: -10.3


### Scoring -- non-native speakers

In [20]:
print_corpus_stats(corpus_stats_foreign)

WER Rundkast: 35.1
WER Stortinget: 35.4
WER combined_short: 32.4
WER combined_long: 31.5
Relative WER change from Rundkast to combined_short: -7.7
Relative WER change from Rundkast to combined_long: -10.3
Relative WER change from Stortinget to combined_short: -8.5
Relative WER change from Stortinget to combined_long: -11.0
Relative WER change from combined_short to combined_long: -2.8


In [21]:
print_corpus_stats(corpus_stats_cleaned_foreign)

WER Rundkast: 33.7
WER Stortinget: 34.8
WER combined_short: 31.1
WER combined_long: 29.9
Relative WER change from Rundkast to combined_short: -7.7
Relative WER change from Rundkast to combined_long: -11.3
Relative WER change from Stortinget to combined_short: -10.6
Relative WER change from Stortinget to combined_long: -14.1
Relative WER change from combined_short to combined_long: -3.9


## Scoring on the individual datasets

In [22]:
dataset_result_native = make_grouped_df(df_native, "dataset")
dataset_result_cleaned_native = make_grouped_df(df_cleaned_native, "dataset")

In [23]:
# Native speakers
dataset_result_native

Unnamed: 0_level_0,rundkast_wer,stortinget_wer,combined_short_wer,combined_long_wer,relative_change_rundkast_to_combined_short_wer,relative_change_stortinget_to_combined_short_wer,relative_change_rundkast_to_combined_long_wer,relative_change_stortinget_to_combined_long_wer,relative_change_combined_short_to_combined_long_wer
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
nbtale12,24.5,25.8,23.5,21.9,-4.1,-8.9,-10.6,-15.1,-6.8
nbtale3,28.9,25.8,24.4,23.0,-15.6,-5.4,-20.4,-10.9,-5.7
npsc_test,19.5,9.3,9.9,7.9,-49.2,6.5,-59.5,-15.1,-20.2
nst_test,10.6,11.2,9.2,8.6,-13.2,-17.9,-18.9,-23.2,-6.5
rundkast_test,17.8,24.0,17.2,15.9,-3.4,-28.3,-10.7,-33.8,-7.6


In [24]:
dataset_result_cleaned_native

Unnamed: 0_level_0,rundkast_wer,stortinget_wer,combined_short_wer,combined_long_wer,relative_change_rundkast_to_combined_short_wer,relative_change_stortinget_to_combined_short_wer,relative_change_rundkast_to_combined_long_wer,relative_change_stortinget_to_combined_long_wer,relative_change_combined_short_to_combined_long_wer
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
nbtale12,22.5,25.3,21.7,19.5,-3.6,-14.2,-13.3,-22.9,-10.1
nbtale3,27.2,24.4,22.8,21.2,-16.2,-6.6,-22.1,-13.1,-7.0
npsc_test,17.4,8.8,9.3,7.2,-46.6,5.7,-58.6,-18.2,-22.6
nst_test,9.7,11.1,8.7,7.8,-10.3,-21.6,-19.6,-29.7,-10.3
rundkast_test,16.0,21.1,15.4,14.1,-3.7,-27.0,-11.9,-33.2,-8.4


### Scoring per dialect region

In [25]:
# Looking at the dialect results with only unplanned speech
region_results_nbtale3 = make_grouped_df(df.query("dataset == 'nbtale3'"), "region")
region_results_nbtale3

Unnamed: 0_level_0,rundkast_wer,stortinget_wer,combined_short_wer,combined_long_wer,relative_change_rundkast_to_combined_short_wer,relative_change_stortinget_to_combined_short_wer,relative_change_rundkast_to_combined_long_wer,relative_change_stortinget_to_combined_long_wer,relative_change_combined_short_to_combined_long_wer
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
east,22.3,21.5,19.3,18.2,-13.5,-10.2,-18.4,-15.3,-5.7
foreign,30.5,31.0,27.4,26.1,-10.2,-11.6,-14.4,-15.8,-4.7
mid,32.0,27.7,27.5,25.5,-14.1,-0.7,-20.3,-7.9,-7.3
north,27.1,24.6,22.9,21.5,-15.5,-6.9,-20.7,-12.6,-6.1
south,25.7,22.3,21.1,20.5,-17.9,-5.4,-20.2,-8.1,-2.8
west,32.7,28.6,27.2,25.8,-16.8,-4.9,-21.1,-9.8,-5.1


## Lexical analysis

In [26]:
def make_freqdist(wordlist):
     return sorted(Counter(wordlist).items(), key=lambda pair: pair[1], reverse=True)

In [27]:
df_native_nbtale_nst = df_native.copy().query("dataset not in ['npsc_test', 'rundkast_test']")
df_native_cleaned_nbtale_nst = df_cleaned_native.copy().query("dataset not in ['npsc_test', 'rundkast_test']")

def get_words_from_col(col):
    sentlist = col.tolist()
    return [x for y in sentlist for x in y.split(" ")]

df_words = df_native_nbtale_nst.dropna(subset=["standardized_text", "rundkast_model", "stortinget_model", "combined_short_model", "combined_long_model"])
reference_wds = get_words_from_col(df_words.standardized_text)
rundkast_wds = get_words_from_col(df_words["rundkast_model"])
stortinget_wds = get_words_from_col(df_words["stortinget_model"])
combined_short_wds = get_words_from_col(df_words["combined_short_model"])
combined_long_wds = get_words_from_col(df_words["combined_long_model"])

In [28]:
#Make frequency word lists (native only)
ref_wdcount = make_freqdist(reference_wds)
rundkast_wdcount = make_freqdist(rundkast_wds)
stortinget_wdcount = make_freqdist(stortinget_wds)
combined_short_wdcount = make_freqdist(combined_short_wds)
combined_long_wdcount = make_freqdist(combined_long_wds)

In [29]:
def make_lexicon_df(list_of_freqdists, list_of_names):
    mydict = {}
    for dist, name in zip(list_of_freqdists, list_of_names):
        for word, n in dist:
            if word not in mydict.keys():
                mydict[word] = {nm: 0 for nm in list_of_names}
            mydict[word][name] += n
    return pd.DataFrame(mydict).transpose()

In [30]:
wordfreq_df = make_lexicon_df([ref_wdcount, rundkast_wdcount, stortinget_wdcount, combined_short_wdcount, combined_long_wdcount], ["reference", "rundkast", "stortinget", "combined_short", "combined_long"])

In [31]:
def make_keyness_df(freq_df, ref_col, compare_col):
    """Keyness computation following this: https://alvinntnu.github.io/NTNU_ENC2036_LECTURES/keyword-analysis.html#computing-keynesss"""
    print()
    keyness_df = freq_df[[ref_col, compare_col]].copy()
    keyness_df.columns = ["reference", "comparison"]
    keyness_df["reference_other"] = keyness_df["reference"].sum()-keyness_df["reference"]
    keyness_df["comparison_other"] = keyness_df["comparison"].sum()-keyness_df["comparison"]
    row_sum_word = keyness_df["reference"] + keyness_df["comparison"]
    row_sum_other = keyness_df["reference_other"] + keyness_df["comparison_other"]
    column_sum_ref = keyness_df["reference"] + keyness_df["reference_other"]
    column_sum_comp = keyness_df["comparison"] + keyness_df["comparison_other"]
    table_sum = row_sum_word + row_sum_other
    keyness_df["reference_exp"] = (row_sum_word * column_sum_ref) / table_sum
    keyness_df["comparison_exp"] = (row_sum_word * column_sum_comp) / table_sum
    keyness_df["reference_other_exp"] = (row_sum_other * column_sum_ref) / table_sum
    keyness_df["comparison_other_exp"] = (row_sum_other * column_sum_comp) / table_sum
    keyness_df["log_likelihood"] = keyness_df.apply(lambda x: power_divergence(f_obs=(x["reference"], x["comparison"], x["reference_other"], x["comparison_other"]), f_exp=(x["reference_exp"], x["comparison_exp"], x["reference_other_exp"], x["comparison_other_exp"]), lambda_=0, ddof=1)[0], axis=1)
    keyness_df["chi_square"] = keyness_df.apply(lambda x: power_divergence(f_obs=(x["reference"], x["comparison"], x["reference_other"], x["comparison_other"]), f_exp=(x["reference_exp"], x["comparison_exp"], x["reference_other_exp"], x["comparison_other_exp"]), lambda_=0, ddof=1)[1], axis=1)
    keyness_df = keyness_df[["reference", "comparison", "log_likelihood", "chi_square"]]
    keyness_df["more_freq_in_comp"] = keyness_df["reference"]/keyness_df["reference"].sum() < keyness_df["comparison"]/keyness_df["comparison"].sum()
    return keyness_df

In [32]:
rundkast_keyness_df = make_keyness_df(wordfreq_df, "reference", "rundkast")
stortinget_keyness_df = make_keyness_df(wordfreq_df, "reference", "stortinget")
combined_short_keyness_df = make_keyness_df(wordfreq_df, "reference", "combined_short")
combined_long_keyness_df = make_keyness_df(wordfreq_df, "reference", "combined_long")




  terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)
  terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)





  terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)
  terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)





  terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)
  terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)





  terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)
  terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)


In [33]:
def highlight_rows(row):
    if row["more_freq_in_comp"]:
        color = '#fa0000' # Red
    else:
        color = '#1F18C0' # Blue
    return ['background-color: {}'.format(color) for r in row]


The log-likelihood score indicates how typical or atypical a term is in the compared corpus relative to the target corpus. Words with higher numbers are more typical/atypical for the compared corpus, while words with low numbers are more similar to the compared corpus. Since both typical and atypical words get high numbers, it makes sense to look at the frequency in both corpora too. 

In [46]:
# Stortinget relative to ground truth
# Red = more frequent in Stortinget

pd.set_option('display.max_rows', 100)
stortinget_keyness_df.sort_values(by="log_likelihood", ascending=False).head(100).style.apply(highlight_rows, axis=1)

Unnamed: 0,reference,comparison,log_likelihood,chi_square,more_freq_in_comp
mmm,994,0,1371.368916,0.0,False
e,16,334,358.101376,0.0,True
apples,241,0,332.165227,0.0,False
itunes-politikk,232,0,319.756929,0.0,False
plageånden,240,5,288.940624,0.0,False
o,6,180,206.371606,0.0,True
ipod-,237,23,203.250228,0.0,False
ee,0,128,178.562387,0.0,True
appels,0,124,172.981397,0.0,True
eee,2833,1963,152.744334,0.0,False


In [47]:
# Rundkast relative to ground truth
# Red = more frequent in Rundkast

pd.set_option('display.max_rows', 100)
rundkast_keyness_df.sort_values(by="log_likelihood", ascending=False).head(100).style.apply(highlight_rows, axis=1)

Unnamed: 0,reference,comparison,log_likelihood,chi_square,more_freq_in_comp
mmm,994,1,1358.520964,0.0,False
én,272,0,375.335676,0.0,False
apples,241,0,332.544884,0.0,False
ipod-,237,0,327.023746,0.0,False
e,16,307,322.581587,0.0,True
itunes-politikk,232,0,320.122405,0.0,False
q,0,222,309.380073,0.0,True
råttent,238,23,204.782416,0.0,False
politikk,17,198,180.471793,0.0,True
ee,0,128,178.359162,0.0,True


In [48]:
combined_short_keyness_df.sort_values(by="log_likelihood", ascending=False).head(100).style.apply(highlight_rows, axis=1)

Unnamed: 0,reference,comparison,log_likelihood,chi_square,more_freq_in_comp
mmm,994,0,1377.843281,0.0,False
e,16,356,384.591127,0.0,True
apples,241,0,333.733889,0.0,False
ipod-,237,0,328.193012,0.0,False
itunes-politikk,232,0,321.266997,0.0,False
én,272,8,315.120987,0.0,False
ee,0,148,205.499681,0.0,True
o,6,170,192.018423,0.0,True
appels,0,136,188.834564,0.0,True
a,6,155,172.27471,0.0,True


In [49]:
combined_long_keyness_df.sort_values(by="log_likelihood", ascending=False).head(100).style.apply(highlight_rows, axis=1)

Unnamed: 0,reference,comparison,log_likelihood,chi_square,more_freq_in_comp
mmm,994,2,1352.801377,0.0,False
apples,241,0,333.983278,0.0,False
ipod-,237,0,328.43826,0.0,False
q,0,232,321.929716,0.0,True
itunes-politikk,232,0,321.507071,0.0,False
én,272,7,321.245618,0.0,False
e,16,288,296.45137,0.0,True
ipod,6,160,178.685144,0.0,True
o,6,156,173.432293,0.0,True
ee,0,124,172.041458,0.0,True


## Hapax analysis

In [50]:
hap_ref = wordfreq_df.query('reference == 1')
hap_rundkast = wordfreq_df.query('rundkast == 1')
hap_stortinget = wordfreq_df.query('stortinget == 1')
hap_combined_short = wordfreq_df.query('combined_short == 1')
hap_combined_long = wordfreq_df.query('combined_long == 1')

In [51]:
print(f"number of hapaces in ground truth: {hap_ref.reference.count()}")
print(f"number of hapaces in Rundkast: {hap_rundkast.reference.count()}")
print(f"number of hapaces in Stortinget: {hap_stortinget.reference.count()}")
print(f"number of hapaces in Combined Short: {hap_combined_short.reference.count()}")
print(f"number of hapaces in Combined Long: {hap_combined_long.reference.count()}")

number of hapaces in ground truth: 9672
number of hapaces in Rundkast: 20215
number of hapaces in Stortinget: 20591
number of hapaces in Combined Short: 18676
number of hapaces in Combined Long: 17613


In [54]:
hap_rundkast.query("reference == 0").sample(100, random_state=1)

Unnamed: 0,reference,rundkast,stortinget,combined_short,combined_long
diplomet,0,1,0,1,0
nabu,0,1,1,1,0
indresogen,0,1,1,1,0
kunstakademis,0,1,0,0,0
norden,0,1,1,2,1
clubstør,0,1,0,0,0
rash,0,1,0,0,0
job,0,1,0,0,0
hemstemvet,0,1,0,0,0
avturgåerne,0,1,0,0,0


In [55]:
hap_stortinget.query("reference == 0").sample(100, random_state=1)

Unnamed: 0,reference,rundkast,stortinget,combined_short,combined_long
hurtigs,0,0,1,1,0
forbillelig,0,0,1,1,1
voyekleivene,0,0,1,1,0
ligene,0,0,1,0,0
molyjessen,0,0,1,0,0
meforskjellige,0,0,1,1,0
løvende,0,0,1,0,0
selvb,0,0,1,0,0
rekerasjonsområder,0,0,1,0,0
oppretten,0,0,1,0,0


In [56]:
hap_combined_short.query("reference == 0").sample(100, random_state=1)

Unnamed: 0,reference,rundkast,stortinget,combined_short,combined_long
festk,0,0,0,1,0
godd,0,1,4,1,1
benene,0,0,1,1,0
dawnport,0,0,0,1,0
oppover-bakker,0,0,0,1,1
leda,0,0,0,1,1
danse-forestillinger,0,1,0,1,1
somnm,0,0,0,1,0
eierselskapet,0,0,1,1,1
suplement,0,0,0,1,0


In [57]:
hap_combined_long.query("reference == 0").sample(100, random_state=1)

Unnamed: 0,reference,rundkast,stortinget,combined_short,combined_long
ltså,0,0,0,2,1
øvne,0,1,1,1,1
lu,0,1,1,1,1
brukke,0,0,0,0,1
insitament,0,1,1,1,1
etvilsomt,0,0,1,0,1
utseden,0,1,0,0,1
aard,0,0,0,0,1
plagån,0,0,0,0,1
huse,0,0,3,1,1
