In [1]:
from hashlib import md5
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [2]:
def file_hash(x, note=""):
    h = md5()
    h.update(Path(x).read_bytes())
    print(x, h.hexdigest()[:6], note, sep="\t")

## Data dependencies
```
./data/new_emit_all.pkl	4c57d2	
./data/tsnedf_100.pkl	fb9d93	
./data/all_data.csv	a4e413	CRLF
```

In [3]:
# all_data.csv might be different
file_hash("./data/new_emit_all.pkl")
file_hash('./data/tsnedf_100.pkl')
file_hash("./data/all_data.csv", "CRLF")

./data/new_emit_all.pkl	4c57d2	
./data/tsnedf_100.pkl	fb9d93	
./data/all_data.csv	a4e413	CRLF


In [4]:
new_emit_all = pd.read_pickle('./data/new_emit_all.pkl')
ac_data = pd.read_csv("./data/all_data.csv")
tsnedf = pd.read_pickle('./data/tsnedf_100.pkl')

ac_data = ac_data.rename({"nasality (a1-p0)": "nasality"}, axis=1)
new_emit_all = pd.concat([new_emit_all.reset_index(drop=True), tsnedf], axis=1)
new_emit_all["next_offset_s"] = new_emit_all.shift(-1, fill_value=-1).offset_s

In [5]:
new_emit_all[:2]

Unnamed: 0,offset_id,offset_s,bilstm,phone,filename,has_match,match_ho_id,ho_datalist,comp-1,comp-2,next_offset_s
0,346,10.38,"[0.008234375, 0.036981307, 0.05118245, 0.02981...",t̪ʰ,劉櫂豪-127903,no,,[],-5.941576,-25.208309,10.44
1,348,10.44,"[-0.093030915, -0.24093345, 0.037780587, 0.111...",e,劉櫂豪-127903,no,,[],5.955706,33.086273,11.64


In [6]:
len(set(new_emit_all.index.values))

11716

In [7]:
ac_data[:1]

Unnamed: 0,file,time_start,time_end,time_midpoint,speaker_gender,context_raw,position,syllable_structure,nasalized_perceived,function_1,function_2,nasality,freq_f1,freq_f2,freq_f3
0,127849,12.42569,13.07296,12.999325,m,h_oN_f,f,cv,1,b,b,2.62,581,1116,2568


In [8]:
ac_data.iloc[0]

file                      127849
time_start              12.42569
time_end                13.07296
time_midpoint          12.999325
speaker_gender                 m
context_raw               h_oN_f
position                       f
syllable_structure            cv
nasalized_perceived            1
function_1                     b
function_2                     b
nasality                    2.62
freq_f1                      581
freq_f2                     1116
freq_f3                     2568
Name: 0, dtype: object

## Check file names

In [9]:
ac_data.file.unique()

array([127849, 127889, 127903, 127909, 127940, 128068], dtype=int64)

In [10]:
new_emit_all.filename.unique()

array(['劉櫂豪-127903', '陳椒華-127940', '黃國書-127909', '許淑華-127889',
       '趙正宇-127849', '陳瑩-128068'], dtype=object)

## Align acoustic phones and allosaurus' emit frames

In [11]:
ac_aligned = []

for ac_x in tqdm(ac_data.itertuples()):            
        
    ac_fileno = str(ac_x.file)
    ac_start = ac_x.time_start
    ac_end = ac_x.time_end
    ac_mid = ac_x.time_midpoint            
    offset_s = new_emit_all.offset_s
    next_offset_s = new_emit_all.next_offset_s
    
    emit_x = new_emit_all.loc[
            (new_emit_all.filename.str.endswith(ac_fileno)) & 
            (((ac_start < offset_s) & (next_offset_s < ac_end)) | 
             ((offset_s < ac_mid) & (ac_mid < next_offset_s)))
    ]    
    
    if emit_x.shape[0] == 0:
        continue
    else:
        ac_item = {k: v for k, v in ac_x._asdict().items()
                   if k not in ("file", "Index")}
        for i in range(emit_x.shape[0]):
            emit_entry = emit_x.iloc[i]            
            ac_aligned.append({"emit_id": emit_entry.name, "ac_id": ac_x.Index,                     
                        "has_match": 'yes',
                        **ac_item})

0it [00:00, ?it/s]

In [12]:
len(ac_aligned)

863

In [13]:
aligned_ac_data = pd.DataFrame(ac_aligned).set_index("emit_id")\
            .join(new_emit_all.drop(["has_match", "match_ho_id", "ho_datalist"], axis=1), how='inner')
aligned_ac_data["emit_id"] = aligned_ac_data.index
aligned_ac_data["has_midpoint"] = aligned_ac_data.eval("offset_s < time_midpoint < next_offset_s")
aligned_ac_data["collapse_func"] = aligned_ac_data.function_1.apply(lambda x: "b" if x=='b' else 'o')
_ = aligned_ac_data.reset_index(drop=True)

In [14]:
aligned_ac_data = aligned_ac_data.drop_duplicates("emit_id")
print("Total aligned emit frames: ", aligned_ac_data.shape)
print("emit frame containing a midpoint: ", aligned_ac_data.has_midpoint.sum())

Total aligned emit frames:  (532, 27)
emit frame containing a midpoint:  194


In [15]:
# We successfully align 194 phones. Some fractions the phones occurred are entirely ignored by allosaurus
aligned_ac_data.ac_id.unique().shape

(194,)

In [16]:
# There are 27 variables
aligned_ac_data.columns

Index(['ac_id', 'has_match', 'time_start', 'time_end', 'time_midpoint',
       'speaker_gender', 'context_raw', 'position', 'syllable_structure',
       'nasalized_perceived', 'function_1', 'function_2', 'nasality',
       'freq_f1', 'freq_f2', 'freq_f3', 'offset_id', 'offset_s', 'bilstm',
       'phone', 'filename', 'comp-1', 'comp-2', 'next_offset_s', 'emit_id',
       'has_midpoint', 'collapse_func'],
      dtype='object')

In [17]:
aligned_ac_data.phone.value_counts()

a      133
n       49
x       48
i       48
o       45
ŋ       35
t       18
ɕ       17
ʂ       16
k       13
ə       12
e       12
m       11
j       10
t̪       8
tʰ       8
y        6
u        6
l        6
s        5
t̪ʰ      5
l̪       4
p        3
kʰ       3
s̪       3
ɤ        3
w        3
f        2
Name: phone, dtype: int64

## Output aligned data

```
./data/aligned_ac_data.pkl	3324ae	
./data/aligned_ac_data.csv	106e64	
```

In [18]:
aligned_ac_data.drop("bilstm", axis=1).to_csv("./data/aligned_ac_data.csv")
aligned_ac_data.to_pickle("./data/aligned_ac_data.pkl")
file_hash("./data/aligned_ac_data.pkl")
file_hash("./data/aligned_ac_data.csv")

./data/aligned_ac_data.pkl	3324ae	
./data/aligned_ac_data.csv	106e64	
