In [1]:
import warnings
# NOTE(review): with no category/module arguments this installs a blanket
# "ignore" filter, so every warning category (including deprecation warnings
# from the libraries imported below) is silenced for the whole kernel session.
# It presumably sits ahead of the remaining imports so that import-time
# warnings are hidden as well — confirm this is intended.
warnings.filterwarnings("ignore")

import pandas as pd
from pathlib import Path

from src.metrics import lacc_cer_by_freq_classes
from src.inference_utils import PrecomputeBasedModel
from src.data_utils import get_sample_from_row_original

In [2]:
# Every location is resolved against the kernel's current working directory,
# so the notebook only runs when started from the directory that contains the
# "data" and "models" folders. Each check names the missing path so a wrong
# working directory is diagnosed immediately instead of failing with an empty
# AssertionError.
CURDIR = Path.cwd()

# Directory holding the original dataset files (test.csv is read from here).
DATADIR = CURDIR / "data" / "original"
assert DATADIR.exists(), f"Data directory not found: {DATADIR} (cwd: {CURDIR})"

# Root directory under which model paths are looked up.
MODELS_DIR = CURDIR / "models"
assert MODELS_DIR.exists(), f"Models directory not found: {MODELS_DIR} (cwd: {CURDIR})"

# Path of the model evaluated below; it is handed to the model constructor.
MODEL_ID = MODELS_DIR / "precompute_baseline"
assert MODEL_ID.exists(), f"Model path not found: {MODEL_ID} (cwd: {CURDIR})"

In [3]:
# Read the tab-separated test set; the first column of the file becomes the index.
test_csv_path = DATADIR / "test.csv"
df = pd.read_csv(test_csv_path, sep="\t", index_col=0)


def _first_sample_item(row):
    """Return element 0 of what get_sample_from_row_original produces for `row`."""
    return get_sample_from_row_original(row)[0]


# Row-wise application: one "sample" value per test row.
df["sample"] = df.apply(_first_sample_item, axis=1)

# Subset the rows by the value stored in the "split" column.
split_labels = df["split"]
df_holdout = df.loc[split_labels == "holdout"]
df_unknown = df.loc[split_labels == "unknown"]

# Row counts: whole test set, holdout subset, unknown subset.
len(df), len(df_holdout), len(df_unknown)

(153991, 138882, 15109)

In [4]:
# Construct the model under evaluation from the MODEL_ID path
# (models/precompute_baseline under the working directory).
# NOTE(review): presumably the constructor loads precomputed artifacts stored
# at that path — confirm in src.inference_utils.
model = PrecomputeBasedModel(MODEL_ID)

In [5]:
# Complete test set: metrics per frequency class, shown with 3 decimals.
metrics_all = lacc_cer_by_freq_classes(model.predict, df)
metrics_all.round(3)

Unnamed: 0,class,lAcc,lAcc (norm),CER (total),CER (errors)
0,1-100,0.999,0.999,0.001,0.581
1,101-1000,0.995,0.995,0.002,0.283
2,1001-10000,0.986,0.988,0.003,0.218
3,10001-n,0.983,0.985,0.008,0.457
4,all,0.991,0.992,0.004,0.383


In [6]:
# Rows whose "split" is "holdout": metrics per frequency class, shown with 3 decimals.
metrics_holdout = lacc_cer_by_freq_classes(model.predict, df_holdout)
metrics_holdout.round(3)

Unnamed: 0,class,lAcc,lAcc (norm),CER (total),CER (errors)
0,1-100,0.999,0.999,0.001,0.677
1,101-1000,0.997,0.997,0.001,0.271
2,1001-10000,0.989,0.991,0.003,0.248
3,10001-n,0.995,0.995,0.004,0.573
4,all,0.996,0.996,0.002,0.412


In [7]:
# Rows whose "split" is "unknown": metrics per frequency class, shown with 3 decimals.
metrics_unknown = lacc_cer_by_freq_classes(model.predict, df_unknown)
metrics_unknown.round(3)

Unnamed: 0,class,lAcc,lAcc (norm),CER (total),CER (errors)
0,1-100,0.877,0.884,0.043,0.332
1,101-1000,0.944,0.951,0.017,0.303
2,1001-10000,0.967,0.971,0.006,0.167
3,10001-n,0.933,0.939,0.029,0.411
4,all,0.943,0.949,0.022,0.362
