In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from pathlib import Path

from src.metrics import lacc_cer_by_freq_classes
from src.inference_utils import PrecomputeBasedModel
from src.data_utils import get_sample_from_row_original

In [2]:
# All paths are resolved relative to the current working directory of the
# kernel, so the notebook must be run from the directory that contains
# `data/` and `models/`.
CURDIR = Path.cwd()

# Directory holding the original dataset files.
DATADIR = CURDIR / "data" / "original"
assert DATADIR.exists(), f"Data directory not found: {DATADIR}"

# Directory holding the saved models.
MODELS_DIR = CURDIR / "models"
assert MODELS_DIR.exists(), f"Models directory not found: {MODELS_DIR}"

# Path of the model evaluated in this notebook.
MODEL_ID = MODELS_DIR / 'precompute_distill_4-4'
assert MODEL_ID.exists(), f"Model not found: {MODEL_ID}"

In [3]:
# Read the tab-separated test set, using its first column as the index.
df = pd.read_csv(DATADIR / "test.csv", sep="\t", index_col=0)

# Build the model input for every row: keep only the first element of
# whatever `get_sample_from_row_original` returns for that row.
df["sample"] = df.apply(
    lambda record: get_sample_from_row_original(record)[0],
    axis=1,
)

# Partition the rows by the value of their `split` column.
df_holdout = df.loc[df["split"].eq("holdout")]
df_unknown = df.loc[df["split"].eq("unknown")]

# Row counts: full test set, holdout subset, unknown subset.
len(df), len(df_holdout), len(df_unknown)

(153991, 138882, 15109)

In [4]:
# Instantiate the model under evaluation from the MODEL_ID path; its
# `predict` method is what gets passed to the metric function.
model = PrecomputeBasedModel(MODEL_ID)

In [5]:
# Full test set (every row of `df`, regardless of split): metrics per
# frequency class, displayed rounded to 3 decimals.
metrics_all = lacc_cer_by_freq_classes(model.predict, df)
metrics_all.round(3)

Unnamed: 0,class,lAcc,lAcc (norm),CER (total),CER (errors)
0,1-100,0.998,0.998,0.001,0.637
1,101-1000,0.992,0.993,0.002,0.29
2,1001-10000,0.97,0.972,0.007,0.221
3,10001-n,0.975,0.977,0.014,0.521
4,all,0.984,0.986,0.007,0.404


In [6]:
# Holdout subset only (rows whose `split` is "holdout"): metrics per
# frequency class, displayed rounded to 3 decimals.
metrics_holdout = lacc_cer_by_freq_classes(model.predict, df_holdout)
metrics_holdout.round(3)

Unnamed: 0,class,lAcc,lAcc (norm),CER (total),CER (errors)
0,1-100,0.999,0.999,0.001,0.702
1,101-1000,0.995,0.996,0.001,0.295
2,1001-10000,0.98,0.982,0.005,0.236
3,10001-n,0.991,0.992,0.006,0.668
4,all,0.993,0.993,0.003,0.433


In [7]:
# Unknown subset only (rows whose `split` is "unknown"): metrics per
# frequency class, displayed rounded to 3 decimals.
metrics_unknown = lacc_cer_by_freq_classes(model.predict, df_unknown)
metrics_unknown.round(3)

Unnamed: 0,class,lAcc,lAcc (norm),CER (total),CER (errors)
0,1-100,0.829,0.856,0.085,0.476
1,101-1000,0.903,0.914,0.027,0.282
2,1001-10000,0.918,0.924,0.017,0.203
3,10001-n,0.901,0.911,0.047,0.46
4,all,0.905,0.914,0.037,0.383
