In [1]:
# Auto-reload imported modules before each cell execution, so that edits to the
# local `GSSP_utils` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import pandas as pd
from IPython.display import display, Audio
from tqdm.auto import tqdm

sys.path.append("..")

from GSSP_utils.path_conf import cgn_root_dir, cgn_ort_path, loc_data_dir
from GSSP_utils.cgn.ort_parser import parse_ort_file, parse_ort_file_agg
import numpy as np
import torchaudio

# Notebook display settings: show up to 80 rows and never truncate the columns.
pd.options.display.max_rows = 80
pd.options.display.max_columns = None


Text types:

* `tta`: spontaneous conversations (face-to-face)
* `ttb`: interviews with teachers of Dutch<br>
    These indeed feature one person who speaks most of the time, plus an interviewer.<br>
    => seems very well suited for `spontaneous` speech.
* `ttc`: spontaneous telephone dialogues (recorded via a switchboard)
* `ttd`:  spontaneous telephone dialogues (recorded on MD with local interface)
* `tte`: simulated business negotiations
* `ttf`: **interviews/discussions/debates (broadcast)** --> spontaneous speech (multiple speakers though; curious how many samples we will be left with from this component)
* `ttg`: **(political) discussions/debates/meetings (non-broadcast)** --> spontaneous speech
* `tth`: lessons recorded in a classroom
* `tti`: live (eg sport) commentaries (broadcast)
* `ttj`:  newsreports/reportages (broadcast)
* `ttk`: news (broadcast)
* `ttl`: commentaries/columns/reviews (broadcast) ==> sounds like semi-guided / spontaneous speech
* `ttm`: ceremonious speeches/sermons `to investigate` ==> mainly read-aloud speech here
* `ttn`: lectures/seminars `to investigate` ==> sounds good
* `tto`: **read speech**

# EDA

 ## Listening to some audio samples

In [3]:
# Pick one random recording of the selected component / language and print
# its number of samples, sample rate and duration.
component = "n"
lang = "nl"  # 'vl'

files = [*cgn_root_dir.glob(f"cdroms/comp-{component}/{lang}/*.wav")]
f = np.random.choice(files)

wav, sr = torchaudio.load(f)
wav = wav.numpy().ravel()  # flatten the loaded tensor to a 1-D numpy array
n_samples = len(wav)
print(f"{n_samples:,} \t {sr} Hz {n_samples / sr:.2f} s")


9,581,397 	 16000 Hz 598.84 s


In [4]:
rec_name = "fv400103"

# Look the recording up by name across all components / languages
files = [*cgn_root_dir.glob(f"cdroms/comp-*/*/{rec_name}.wav")]
f = np.random.choice(files)

wav, sr = torchaudio.load(f)
wav = wav.numpy().ravel()  # flatten the loaded tensor to a 1-D numpy array
n_samples = len(wav)
print(f"{n_samples:,} \t {sr} Hz {n_samples / sr:.2f} s")

# Only keep the first `preview_s` seconds for listening
preview_s = 200
wav = wav[: sr * preview_s]

# Audio(data=wav, rate=sr, autoplay=True)


34,000,414 	 16000 Hz 2125.03 s


# ORT file data parsing ([ort docs](../../docs/cgn/ort_format.htm))

Parse some `ort` files and analyze the parsed output

In [3]:
# Parse the first `.ort` file returned by the glob
# (alternative: select a specific recording by name)
# ort_file = list(cgn_ort_path.glob(f"*/*/{rec_name}.ort"))[0]
ort_file = list(cgn_ort_path.glob("*/*/*.ort"))[0]
display(ort_file)

df_ort = parse_ort_file(ort_file)
display(df_ort)

# Add the segment duration and show the ten longest segments
df_ort["duration_s"] = df_ort["t_stop"].sub(df_ort["t_start"])
df_ort.sort_values(by="duration_s", ascending=False).iloc[:10]


PosixPath('/media/SPS/cgn/ort/comp-a/vl/fv400392.ort')

Unnamed: 0,speaker_name,t_start,t_stop,transcript,rec_name
0,BACKGROUND,0.000,88.702,,fv400392
1,BACKGROUND,88.702,90.145,kopje wordt neergezet.,fv400392
2,BACKGROUND,90.145,100.912,,fv400392
3,BACKGROUND,100.912,105.761,koffie wordt ingeschonken.,fv400392
4,BACKGROUND,105.761,112.738,,fv400392
...,...,...,...,...,...
313,V40098,363.691,364.445,,fv400392
314,V40098,364.445,366.268,rond Pilsen.,fv400392
315,V40098,366.268,367.585,,fv400392
316,V40098,367.585,368.405,ja.,fv400392


Unnamed: 0,speaker_name,t_start,t_stop,transcript,rec_name,duration_s
12,COMMENT,0.0,370.237,,fv400392,370.237
8,BACKGROUND,191.788,344.713,,fv400392,152.925
0,BACKGROUND,0.0,88.702,,fv400392,88.702
6,BACKGROUND,117.632,189.863,,fv400392,72.231
87,V40097,196.069,255.178,,fv400392,59.109
10,BACKGROUND,346.219,362.375,,fv400392,16.156
55,V40097,97.628,111.229,,fv400392,13.601
134,V40097,340.376,353.175,,fv400392,12.799
167,V40098,57.379,70.035,,fv400392,12.656
65,V40097,134.792,145.918,,fv400392,11.126


In [4]:
# Parse the same file with the aggregating parser.
# NOTE: this overwrites the `df_ort` frame created in the previous cell.
# NOTE(review): judging from the output below, consecutive segments appear to be
# merged per speaker and empty segments dropped — confirm in `parse_ort_file_agg`.
df_ort = parse_ort_file_agg(ort_file)
df_ort


Unnamed: 0,speaker_name,t_start,t_stop,transcript,transcr_len,duration_s,rec_name
0,V40097,0.000,5.226,uh dat klopt. uh. je kent ook nog Paul uh Piet...,50,5.226,fv400392
1,V40097,6.295,9.626,awel 'k ga ne*d keer een typisch voorbeeld gev...,73,3.331,fv400392
2,V40097,10.000,23.832,maar wij dialectsprekers toen we klein waren w...,218,13.832,fv400392
3,V40097,24.585,43.683,ja dus uh wanneer de naam eindigt op S. wannee...,304,19.098,fv400392
4,V40097,44.123,47.014,uh dat was een typisch dialectische uhm,39,2.891,fv400392
...,...,...,...,...,...,...,...
111,BACKGROUND,100.912,105.761,koffie wordt ingeschonken.,26,4.849,fv400392
112,BACKGROUND,112.738,117.632,koffie wordt ingeschonken.,26,4.894,fv400392
113,BACKGROUND,189.863,191.788,kopje wordt neergezet.,22,1.925,fv400392
114,BACKGROUND,344.713,346.219,kopje wordt neergezet.,22,1.506,fv400392


### Parsing all the `.ort` files

In [5]:
# Parse every `.ort` file below `cgn_ort_path` (searched recursively) with the
# aggregating parser and store the concatenated result as a parquet file.
# `tqdm` comes from the notebook's import cell, so it is not re-imported here.
ort_files = list(cgn_ort_path.glob("**/*.ort"))
df_list = [parse_ort_file_agg(ort_path) for ort_path in tqdm(ort_files)]

# NOTE: the index of each parsed file is kept as-is, so index labels repeat
# across recordings in the concatenated frame.
df_ort_agg = pd.concat(df_list)
df_ort_agg.to_parquet(loc_data_dir / "df_cgn_ort_agg.parquet")

# Preview: the segments that last longer than 15 seconds
display(df_ort_agg[df_ort_agg["duration_s"] > 15])


  0%|          | 0/12780 [00:00<?, ?it/s]

Unnamed: 0,speaker_name,t_start,t_stop,transcript,transcr_len,duration_s,rec_name
3,V40097,24.585,43.683,ja dus uh wanneer de naam eindigt op S. wannee...,304,19.098,fv400392
81,V40098,189.058,207.362,doo*a uh via Eurochildren*v dat zijn dus de ki...,271,18.304,fv400392
17,V40074,101.093,125.855,'k vind dat wel leuk. da's wel goed voor jou o...,371,24.762,fv400274
30,V40074,220.402,253.263,ja. en ook hij kan op- en afkomen*d. 't is maa...,463,32.861,fv400274
37,V40074,287.181,307.906,als je bij iemand bent of... bijvoorbeeld de l...,362,20.725,fv400274
...,...,...,...,...,...,...,...
22,N00083,95.751,134.844,mijn naam is Bongers van de gemeente Arnhem en...,683,39.093,fn000076
23,N00084,243.469,326.439,Van Hout gemeente Rede. ik had een uh vraag o*...,1438,82.970,fn000076
24,N00085,447.624,475.957,mijn naam is uh De Gouw en ik werk voor het uh...,479,28.333,fn000076
25,N00085,477.026,504.535,waar ik eigenlijk benieuwd naar ben is hoe de ...,494,27.509,fn000076


# The accompanying `speaker` and `recording` metadata

More information about these components and their properties can be found in the [../docs/cgn](../docs/cgn) folder.

## `df_speak`: exploratory data analysis + parsing

In [6]:
df_speak = pd.read_csv(cgn_root_dir / "metadata" / "speakers.txt", sep="\t")
display(df_speak.shape)

# Collect (and print) the columns that hold at most one distinct value; they
# carry no information and are hidden in the previews below.
non_unique_speak_columns = []
for col_name, col_values in df_speak.items():
    if col_values.nunique() > 1:
        continue
    print(col_name, col_values.unique())
    non_unique_speak_columns.append(col_name)

# Note: dropping the columns (instead of using set operations) retains the column order
df_speak.drop(columns=non_unique_speak_columns).sample(4)


(4250, 23)

type ['PARTICIPANT']
version ['HEADER.version1.0']
update ['1/01/2004']
language ['SD']
notes ['none']


Unnamed: 0,creator,ID,sex,birthYear,birthPlace,birthRegion,firstLang,homeLang,workLang,resPlace,resRegion,eduPlace,eduRegion,eduSize,education,eduLevel,occupation,occLevel
453,CLS-KUN,N00681,sex1,1973,NL-50-,regN4a,SD,SD,SD,NL-25-,regN1a,NL-30-,regN1a,size1,hbo,edu1,unspecified,occX
826,CLS-KUN,N01255,sex2,1952,NL-473,regN4a,SD,SD,SD,NL-46-,regN4a,NL-46-,regN4a,size3,vbo,edu3,cleaner,occ4
2158,CLS-KUN,N08012,sex2,1922,NL-466,regN4a,SD,SD,SD,NL-46-,regN4a,NL-46-,regN4a,size3,lager onderwijs,edu3,housewife,occ7
2814,CLS-KUN,N09392,sex2,1951,NL-97-,regN3d,SD,SD,SD,NL-967,regN3d,NL-967,regN3d,size4,hbo,edu1,teacher primary school,occ2


### Parsing `df_speak`:

Note: these parsing rules are derived from the information in the [docs/cgn/metadata_speakers](../docs/cgn/metadata_speakers.htm) file.

In [7]:
# Replace the sex codes by readable labels; unknown codes are kept unchanged
sex_labels = {"sex1": "male", "sex2": "female", "sexX": "unknown"}
df_speak["sex"] = df_speak["sex"].map(lambda code: sex_labels.get(code, code))

# "19nn" is a non-numeric placeholder (presumably: birth year unknown); it is
# turned into a missing value so that the column can be cast to float.
birth_year = df_speak["birthYear"]
df_speak["birthYear"] = birth_year.where(birth_year != "19nn").astype(float)

# occ_level_mapping = {
#     "occ1": "occupation requiring higher level of education (doctor, lawyer, etc.)",
#     "occ2": "occupation requiring middle level of education (teacher, journalist, etc.)",
#     "occ3": "occupation requiring lower level of education (mechanic, teacher nursery school, bank employee, etc.)",
#     "occ4": "occupation not requiring any level of education (garbage collector, cleaning lady, taxi driver, etc.)",
#     "occ5": "holding no job, unemployed",
#     "occ6": "holding no job, attending school",
#     "occ7": "holding no job; housewife",
#     "occ8": "holding no job, declared unfit",
#     "occ9": "holding no job; other",
#     "occA": "occupation in higher management or government",
#     "occB": "occupation requiring higher education",
#     "occC": "employed on the teaching or research staff in a university or a college",
#     "occD": "employed in an administrative office or a service organization",
#     "occE": "occupation not requiring any level of specification",
#     "occF": "self-employed",
#     "occG": "politicians",
#     "occH": "employed with the media (journalist, reporter) or artist",
#     "occI": "student, trainee",
#     "occJ": "holding no job",
#     "occX": "unknown",
# }
# df_speak["occLevel"] = df_speak["occLevel"].map(lambda x: occ_level_mapping.get(x, x))

# region_mapping = {
#     "regN1a": "The Netherlands, central region, Zuid-Holland, excl. Goeree Overflakee",
#     "regN1b": "The Netherlands, central region, Noord-Holland, excl. West Friesland",
#     "regN1c": "The Netherlands, central region, West Utrecht, incl. the city of Utrecht",
#     "regN2a": "The Netherlands, transitional region, Zeeland, incl. Goeree Overflakee and Zeeuws-Vlaanderen",
#     "regN2b": "The Netherlands, transitional region, Oost Utrecht, excl. the city of Utrecht",
#     "regN2c": "The Netherlands, transitional region, Gelders rivierengebied, incl. Arnhem and Nijmegen",
#     "regN2d": "The Netherlands, transitional region, Veluwe up to the river IJssel",
#     "regN2e": "The Netherlands, transitional region, West Friesland",
#     "regN2f": "The Netherlands, transitional region, Polders",
#     "regN3a": "The Netherlands, peripheral region 1 (north east), 'Achterhoek'",
#     "regN3b": "The Netherlands, peripheral region 1 (north east), Overijssel",
#     "regN3c": "The Netherlands, peripheral region 1 (north east), Drenthe",
#     "regN3d": "The Netherlands, peripheral region 1 (north east), Groningen",
#     "regN3e": "The Netherlands, peripheral region 1 (north east), Friesland",
#     "regN4a": "The Netherlands, peripheral region 2 (south), Noord-Brabant",
#     "regN4b": "The Netherlands, peripheral region 2 (south), Limburg",
#     "regNx": "The Netherlands, unknown",
#     "regV1": "Flanders, central region (Antwerpen and Vlaams-Brabant)",
#     "regV2": "Flanders, transitional region (Oost-Vlaanderen)",
#     "regV3": "Flanders, peripheral region 1 (West-Vlaanderen)",
#     "regV4": "Flanders, peripheral region 2 (Limburg)",
#     "regVx": "Flanders, unknown",
#     "regW": "Wallonia",
#     "regX": "region unknown",
#     "regZ": "region known to be outside of The Netherlands and Flanders",
# }

# for c in ['birthRegion', 'resRegion', 'eduRegion']:
#     df_speak[c] = df_speak[c].map(lambda x: region_mapping.get(x, x))


In [8]:
# Preview the parsed speaker metadata without the constant-valued columns
df_speak_informative = df_speak.drop(columns=non_unique_speak_columns)
display(df_speak_informative.shape)
df_speak_informative.sample(4)


(4250, 18)

Unnamed: 0,creator,ID,sex,birthYear,birthPlace,birthRegion,firstLang,homeLang,workLang,resPlace,resRegion,eduPlace,eduRegion,eduSize,education,eduLevel,occupation,occLevel
2724,CLS-KUN,N09268,female,,xxx,regX,SD,SD,SD,xxx,regX,xxx,regX,sizeX,hbo,edu1,teacher primary school,occ2
3490,ELIS-UG,V60343,female,1947.0,B-900,regV2,unknown,unknown,unknown,B-908,regV2,B-900,regV2,sizeX,hoger onderwijs of universiteit,edu1,parlementslid,occG
4213,ELIS-UG,V90537,female,1971.0,B-300,regV1,unknown,unknown,unknown,B-320,regV1,B-320,regV1,sizeX,hoger onderwijs of universiteit,edu1,leerkracht,occC
1153,CLS-KUN,N03295,female,1954.0,NL-30-,regN1a,SD,SD,SD,NL-30-,regN1a,NL-30-,regN1a,size1,hbo,edu1,columnist,occ2


## `df_rec`: exploratory data analysis + parsing

In [9]:
# Metadata of the recordings (and the speakers that occur in them)
df_rec = pd.read_csv(cgn_root_dir / "metadata" / "recordings.txt", sep="\t")
display(df_rec.shape)

# Collect (and print) the columns that hold at most one distinct value.
# `nunique()` ignores NaN, so a column that mixes NaN with a single value is
# listed here as well.
non_unique_rec_columns = []
for col_name, col_values in df_rec.items():
    if col_values.nunique() > 1:
        continue
    print(col_name, col_values.unique())
    non_unique_rec_columns.append(col_name)


(12767, 86)

aXtype ['TEXT']
version ['HEADER.version1.0']
aXupdate ['1/01/2004']
respType ['SAMPLING']
respType.1 ['ORTHOGRAPHIC TRANSCRIPTION']
respType.2 ['PART-OF-SPEECH TAGGING']
respType.3 ['LEMMATISATION']
respType.4 ['LEXICON LINK-UP']
respType.5 [nan 'WORD SEGMENTATION']
respType.6 [nan 'PHONETIC TRANSCRIPTION']
respType.7 [nan 'SYNTACTIC ANNOTATION']
respType.8 [nan 'PROSODIC ANNOTATION']
unit ['MB']
extNote ['none']
distributor ['ELDA']
interactionXpassive ['no']
relationXactive ['not used']
relationXpassive ['not used']
aXdesc ['not used']
mutual ['not used']
micDistanceXperson ['unspecified']
dist ['unspecified']
cm ['unspecified']
noise ['unspecified']
status ['DIG2']
revDescXdate [nan]
respstmtXrespType [nan]
respstmtXrespName [nan]
revDescXdate.1 [nan]
respstmtXrespType.1 [nan]
respstmtXrespName.1 [nan]
revDescXdate.2 [nan]
respstmtXrespType.2 [nan]
respstmtXrespName.2 [nan]
revDescXdate.3 [nan]
respstmtXrespType.3 [nan]
respstmtXrespName.3 [nan]
revDescXdate.4 [nan]
respstmtXrespTy

In [10]:
# `respName`, `respName.1`, ..., `respName.8` are hidden in the previews as well
useless_rec_cols = ["respName", *(f"respName.{i}" for i in range(1, 9))]

# errors="ignore": names that are absent from `df_rec` are simply skipped
df_rec.drop(
    columns=non_unique_rec_columns + useless_rec_cols, errors="ignore"
).sample(3)


Unnamed: 0,recordingID,creator,info,wordCount,secCount,byteCount,wph,WAV-DVD,author,biblStringXtitle,pubName,pubPlace,pubDate,rexXdate,time,source,producer,target,term,speakerIDs,role,age,interactionXtype,interactionXactive,locName,locale,activity,recMediumXtype,microphoneXtype,recording,processing
194,fn000198,CLS-KUN,Dutch parliamentary debate,1845,662,20.2,10029,CGN_WAV_23,not applicable,not applicable,not applicable,not applicable,not applicable,1999,unspecified,Draadomroep,CGN,ttg prep3 mod3 dom2,unspecified,"N00181, N00313, N00368, N00369, N00370, N00371...","unspecified, unspecified, unspecified, unspeci...","age3, ageX, ageX, ageX, ageX, ageX, ageX, ageX...",it2,13,unspecified,unspecified,unspecified,DAT tape,unspecified,DIG2,DIG1
7793,fn007964,CLS-KUN,spontaneous conversation ('face-to-face'),1967,498,30.4,14234,CGN_WAV_08,not applicable,not applicable,not applicable,not applicable,not applicable,2002,unspecified,home recording,CGN,tta prep2 mod3 dom1,unspecified,"N01251, N01252, N01253",FAM: siblings SOC: friends,"age1, age1, age1",it3,3,unspecified,LOC1,unspecified,Mini Disk,SONY ECM-MS907,DIG2,DIG2
3430,fn003503,CLS-KUN,radio: (national) radio news,55,19,0.6,10403,CGN_WAV_28,not applicable,not applicable,not applicable,not applicable,not applicable,2001,unspecified,news bulletin ANP,ANP Radio,ttk prep3 mod1 dom2,IT-bedrijven vaak failliet,N02004,newsreader,age1,it4,1,unspecified,unspecified,unspecified,computer,unspecified,DIG2,DIG2


### `df_rec` EDA

**Note**: this parsing information is obtained from the [docs/cgn/metadata_recordings](../docs/cgn/metadata_recordings.htm) file.

In [11]:
# Age class to which the speaker belonged at the time of the recording
# (recordings with several speakers hold a comma-separated list):
#   age0 = under 18 years of age;
#   age1 = 18-24 years of age;
#   age2 = 25-34 years of age;
#   age3 = 35-44 years of age;
#   age4 = 45-55 years of age;
#   age5 = over 55 years of age;
#   ageX = age unknown
df_rec["age"].sample(3)


5393                age2
2060                age3
7800    age1, age1, age1
Name: age, dtype: object

In [12]:
separator = "-" * 80

# recording date: date or year
display(sorted(df_rec["rexXdate"].unique()))

print(separator)
print("recording time")
# time of the recording (optional);
# if specified: heterogeneous data (sometimes time(-ranges), sometimes "evening")
display(df_rec["time"].sample(10))

print(separator)
print("recordingID")
# the name of the recording -> <recordingID>.wav
# For all samples from the Netherlands the id starts with the letters fn;
# For all samples from Flanders the id starts with the letters fv
df_rec["recordingID"]


[1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003]

--------------------------------------------------------------------------------
recording time


346      unspecified
8490     unspecified
9566           20u00
9999     unspecified
7357     unspecified
10692    unspecified
3478     unspecified
8835     unspecified
8722     unspecified
9338           21u15
Name: time, dtype: object

--------------------------------------------------------------------------------
recordingID


0        fn000001
1        fn000002
2        fn000003
3        fn000004
4        fn000005
           ...   
12762    fv901194
12763    fv901195
12764    fv901196
12765    fv901204
12766    fv901214
Name: recordingID, Length: 12767, dtype: object

In [13]:
separator = "-" * 80

print("speaker IDs")
# The IDs of the speakers that occur within the recording
display(df_rec["speakerIDs"])

print(separator)
print("author")
# The author of the book or text that was read aloud
display(df_rec["author"].sample(10))

print(separator)
print("term")
# One or more keywords that characterize the subject matter in the sample
display(df_rec["term"].sample(10))


speaker IDs


0                N00023
1                N00023
2                N00013
3        N00059, N00060
4        N00059, N00061
              ...      
12762    V90595, V90597
12763    V90595, V90597
12764    V90595, V90597
12765    V90601, V90605
12766    V90588, V90591
Name: speakerIDs, Length: 12767, dtype: object

--------------------------------------------------------------------------------
author


3506      not applicable
11782    Sebastian Barry
69        not applicable
6078      not applicable
4122      not applicable
6725      not applicable
10052     not applicable
979       David Bakdacci
6572      not applicable
1998      not applicable
Name: author, dtype: object

--------------------------------------------------------------------------------
term


9679                                           unspecified
1533                                            NS overleg
5580     EU neemt besluit over negatief reisadvies Sri ...
7884                                           unspecified
11376                                          unspecified
5099              overmatig drankgebruik vakantie jongeren
1964     voetbalsupporters zitten nog steeds vast na we...
5041               personeel Sofiaziekenhuis niet vervolgd
5203                                   IOC beslist locatie
4143                                     sneeuw Oostenrijk
Name: term, dtype: object

In [16]:
# `target` gives information about four space-separated aspects of a recording:
#   - text type: specifies the component to which a sample belongs
#     (see the "Text types" list at the top of this notebook)
#   - degree of preparedness: prep1 = scripted, prep2 = unscripted, prep3 = more-or-less scripted
#   - mode: mod1 = broadcast, radio; mod2 = broadcast, tv; mod3 = non-broadcast
#   - domain: dom1 = private; dom2 = public
df_rec.target


0        tti prep2 mod1 dom2
1        ttl prep1 mod1 dom2
2        ttk prep3 mod1 dom2
3        ttb prep2 mod3 dom1
4        ttb prep2 mod3 dom1
                ...         
12762    ttc prep2 mod3 dom1
12763    ttc prep2 mod3 dom1
12764    ttc prep2 mod3 dom1
12765    ttc prep2 mod3 dom1
12766    ttc prep2 mod3 dom1
Name: target, Length: 12767, dtype: object

### Parsing `df_rec`

In [14]:
# Split `target` ("<text type> <preparedness> <mode> <domain>") into four columns.
# `expand=True` keeps every row aligned with its own tokens. Flattening all
# tokens and reshaping them to (-1, 4) would silently shift values across rows
# as soon as one entry does not contain exactly four tokens.
target_parts = df_rec["target"].str.split(" ", expand=True)
if target_parts.shape[1] != 4 or target_parts.isna().values.any():
    raise ValueError(
        "every `target` entry must consist of exactly 4 space-separated tokens"
    )
df_rec[["text_type", "dop", "mode", "domain"]] = target_parts.values

# The second character of the recording ID encodes the country (fn... / fv...);
# any other character yields None.
country_by_prefix = {"v": "Flanders", "n": "Netherlands"}
df_rec["country"] = df_rec["recordingID"].map(
    lambda rec_id: country_by_prefix.get(rec_id[1])
)

# Readable labels for the coded columns; codes without a label are kept as-is.
code_labels = {
    # degree of preparedness: prep1 = scripted, prep2 = unscripted, prep3 = more-or-less scripted
    "dop": {
        "prep1": "scripted",
        "prep2": "unscripted",
        "prep3": "semi-scripted",
    },
    # mode: mod1 = broadcast, radio; mod2 = broadcast, tv; mod3 = non-broadcast
    "mode": {"mod1": "radio", "mod2": "tv", "mod3": "non-broadcast"},
    # domain: dom1 = private; dom2 = public
    "domain": {"dom1": "private", "dom2": "public"},
}
for column, labels in code_labels.items():
    df_rec[column] = df_rec[column].map(
        lambda code, labels=labels: labels.get(code, code)
    )


In [15]:
# A comma in `speakerIDs` means that more than one speaker occurs in the recording
is_multi_speaker = df_rec["speakerIDs"].str.contains(",")
display(is_multi_speaker.value_counts())

# For the sake of simplicity, we only use single-speaker recordings
df_rec.loc[~is_multi_speaker, "dop"].value_counts()


False    8045
True     4722
Name: speakerIDs, dtype: int64

semi-scripted    5649
scripted         2123
unscripted        273
Name: dop, dtype: int64

In [16]:
# Alternative: statistics over all recordings
# df_rec.groupby(["text_type", "dop", "mode", "domain"]).size().rename(
#     "#recordings"
# ).to_frame()

# Statistics of all single-speaker recordings
single_speaker_recs = df_rec[~df_rec["speakerIDs"].str.contains(",")]
group_cols = ["text_type", "dop", "mode", "domain", "noise"]  # optional: "country"
single_speaker_recs.groupby(group_cols).size().to_frame("#recordings")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,#recordings
text_type,dop,mode,domain,noise,Unnamed: 5_level_1
ttf,semi-scripted,radio,public,unspecified,22
ttf,semi-scripted,tv,public,unspecified,11
ttg,semi-scripted,non-broadcast,public,unspecified,39
tth,semi-scripted,non-broadcast,public,unspecified,83
tti,unscripted,radio,public,unspecified,147
tti,unscripted,tv,public,unspecified,126
ttj,semi-scripted,radio,public,unspecified,39
ttj,semi-scripted,tv,public,unspecified,238
ttk,semi-scripted,radio,public,unspecified,5167
ttk,semi-scripted,tv,public,unspecified,50


---

## Merging the `ort_agg`, the `speaker` and recording metadata

In [20]:
# Informative metadata columns only (the column order is retained)
rec_meta = df_rec.drop(
    columns=non_unique_rec_columns + useless_rec_cols, errors="ignore"
)
speaker_meta = df_speak.drop(columns=non_unique_speak_columns)

# Left-join the recording metadata on the recording name, then the speaker
# metadata on (speaker id, creator), and drop the duplicated key columns.
df_ort_rec_speaker = (
    df_ort_agg.merge(
        rec_meta, left_on="rec_name", right_on="recordingID", how="left"
    )
    .merge(
        speaker_meta,
        left_on=["speaker_name", "creator"],
        right_on=["ID", "creator"],
        how="left",
    )
    .drop(columns=["recordingID", "ID"])
)


def _component_letter(text_type):
    """Return the last character of a text type (e.g. "tta" -> "a"), or None for non-strings."""
    return text_type[-1] if isinstance(text_type, str) else None


df_ort_rec_speaker["component"] = df_ort_rec_speaker["text_type"].map(
    _component_letter
)


In [21]:
# NOTE: we have no recording metadata for some telephone dialogues (component C),
# but we were not planning to use them anyway
ort_names = set(df_ort_agg["rec_name"])
rec_names = set(df_rec["recordingID"])

# Report, for both directions, the recordings that occur in only one of the two
# sources, together with the matching wav file(s) found below `cgn_root_dir`.
for title, no_matches in (
    ("ort - rec", ort_names - rec_names),
    ("rec - ort", rec_names - ort_names),
):
    print("-" * 80)
    print(title)
    for file in no_matches:
        print(list(cgn_root_dir.glob(f"**/{file}.wav")))

# Number of segments longer than 15 seconds per text type / preparedness / domain
# (optional extra grouping keys: "mode", "country", "sex")
long_segments = df_ort_rec_speaker[df_ort_rec_speaker["duration_s"] > 15]
long_segments.groupby(["text_type", "dop", "domain"]).size().to_frame(
    "#segments"
)  # .to_markdown()


--------------------------------------------------------------------------------
ort - rec
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701161.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701155.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701123.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701113.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701105.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701122.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701106.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701111.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701124.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701162.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701103.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701104.wav')]
[PosixPath('/media/SPS/cgn/cdroms/comp-c/vl/fv701116.wav')]
--------------------------------------------------------------------------------
rec - ort


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,#segments
text_type,dop,domain,Unnamed: 3_level_1
tta,unscripted,private,4378
ttb,unscripted,private,2376
ttc,unscripted,private,779
ttd,unscripted,private,756
tte,unscripted,private,46
ttf,semi-scripted,public,2349
ttg,semi-scripted,public,1405
ttg,unscripted,private,91
tth,semi-scripted,public,848
tti,unscripted,public,869


In [24]:
# Persist the merged segment / recording / speaker table as a parquet file
df_ort_rec_speaker.to_parquet(loc_data_dir / "df_cgn_ort_rec_speaker.parquet")
