# Dataset stats for the paper "Improving Generalization of Norwegian ASR with Limited Linguistic Resources"

## Stats for the subsets used in the experiments

In [22]:
import pandas as pd
from pathlib import Path

In [23]:
# Collect the CSV files to summarize, one list per corpus directory.
# Path.glob yields matches in arbitrary (filesystem-dependent) order, so every
# list is sorted to keep the row order of the resulting stats table identical
# across machines and re-runs.
npsc_files = sorted(Path("npsc").glob("*_nb.csv"))
rundkast_files = sorted(Path("rundkast").glob("*_nb.csv"))
nbtale_files = sorted(Path("nbtale").glob("*.csv"))
nst_files = sorted(Path("nst").glob("*.csv"))
combined_sample_files = sorted(Path("combined_datasets").glob("*sample_nb*.csv"))
combined_total_files = sorted(Path("combined_datasets").glob("*total_nb*.csv"))

# Corpus order is fixed here; within each corpus the files are in sorted order.
all_files = (
    npsc_files
    + rundkast_files
    + nbtale_files
    + nst_files
    + combined_sample_files
    + combined_total_files
)

In [24]:
def get_stats(filename):
    """Summarize a single dataset CSV file.

    Rows whose ``region`` equals ``'foreign'`` are dropped before anything is
    computed.

    Parameters
    ----------
    filename : pathlib.Path
        Path to the CSV file; its stem is used as the dataset name. The file
        must provide ``region``, ``duration`` and ``speaker_id`` columns.

    Returns
    -------
    dict
        ``dataset`` (the file stem), ``duration`` (sum of the ``duration``
        column divided by 3600, rounded to one decimal) and ``num_speakers``
        (number of distinct ``speaker_id`` values).
    """
    name = filename.stem
    kept_rows = pd.read_csv(filename).query("region != 'foreign'")
    # Presumably `duration` is stored in seconds, making this a total in hours
    # -- TODO confirm against the dataset documentation.
    scaled_total = kept_rows.duration.sum() / 3600
    return {
        "dataset": name,
        "duration": round(scaled_total, 1),
        "num_speakers": kept_rows.speaker_id.nunique(),
    }


In [25]:
# Summarize every collected file and tabulate the results, one row per file.
stats = [get_stats(csv_path) for csv_path in all_files]
stats_df = pd.DataFrame(stats)

In [26]:
# Show the per-file summary: dataset name, summed `duration` divided by 3600
# (rounded to one decimal) and the number of distinct speakers.
stats_df

Unnamed: 0,dataset,duration,num_speakers
0,npsc_test_nb,9.1,84
1,npsc_validation_nb,9.6,78
2,npsc_train_nb,70.3,234
3,rundkast_test_nb,5.9,387
4,rundkast_validation_nb,5.5,347
5,rundkast_train_nb,43.6,1032
6,nbtale_12,9.3,240
7,nbtale_3,7.4,229
8,nst_validation,25.8,58
9,nst_test,25.6,58


## Speaker count in NPSC

In [12]:
# Column names, in file order, passed as `names=` to `pd.read_csv` when the
# raw NPSC CSV files are loaded.
cols = [
    "speaker_id", "gender", "utterance_id", "language",
    "raw_text", "full_audio_file", "original_data_split", "region",
    "duration", "start", "end",
    "utterance_audio_file", "standardized_text",
]

In [14]:
# Read every raw NPSC CSV with the explicit column names in `cols` and stack
# the per-file frames into a single frame (original row indices are kept).
raw_dir = Path("raw_datasets")
npsc_raw_files = list(raw_dir.glob("npsc_*.csv"))
dfs = [pd.read_csv(raw_path, names=cols) for raw_path in npsc_raw_files]
npsc_raw_df = pd.concat(dfs)

In [16]:
# Number of distinct speaker IDs across all raw NPSC files.
npsc_raw_df["speaker_id"].nunique()

267