In [1]:
%load_ext watermark

## Data dependencies

```
SHA-1 of the concatenated contents of ../data/asp/*.txt: '878029d4ec735540276c27aac3b21c7221088c34'
```

In [2]:
from hashlib import sha1
from pathlib import Path
# Fingerprint the annotation directory: SHA-1 over the concatenated text of
# every *.txt file, so a change in the input data is visible at a glance.
asp_dir = "../data/asp"
h = sha1()
# Path.glob yields entries in filesystem order, which is not guaranteed to be
# the same on every machine; sort so the digest depends only on file names
# and contents, not on directory enumeration order.
for path_x in sorted(Path(asp_dir).glob("*.txt")):
    # Decode explicitly as UTF-8 instead of the locale default so the digest
    # does not vary with the platform's preferred encoding.
    h.update(path_x.read_text(encoding="utf-8").encode("utf-8"))
h.hexdigest()

'878029d4ec735540276c27aac3b21c7221088c34'

## Load data

In [3]:
from tqdm.auto import tqdm
import pandas as pd
import torch
import torchaudio
import torchaudio.functional as T
from transformers import AutoProcessor, Wav2Vec2Model

In [4]:
# Checkpoint identifiers, named so the processor/model pairing is easy to see.
# NOTE(review): the processor is loaded from a different checkpoint than the
# model -- confirm its preprocessing settings are the ones the model expects.
PROCESSOR_CKPT = "facebook/wav2vec2-base-960h"
MODEL_CKPT = "facebook/wav2vec2-large-xlsr-53"

processor = AutoProcessor.from_pretrained(PROCESSOR_CKPT)
model = Wav2Vec2Model.from_pretrained(MODEL_CKPT)

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2Model: ['quantizer.weight_proj.weight', 'project_q.bias', 'quantizer.codevectors', 'project_hid.bias', 'project_hid.weight', 'project_q.weight', 'quantizer.weight_proj.bias']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# For one annotation file, cut the matching recording into the annotated
# (start_t, end_t) segments and store two wav2vec2 embeddings per segment:
#   w2v_first -- last-layer hidden state of the first output frame
#   w2v_mean  -- last-layer hidden states averaged over all output frames
wav_dir = Path("../data/TaiwaneseMandarinCorpus/")
TARGET_SR = 16000  # sampling rate the segments are resampled to for the processor

# Sorted so that the file picked up before the `break` below is the same on
# every machine (glob order otherwise follows the filesystem).
for path_x in sorted(Path(asp_dir).glob("*.txt")):
    df = pd.read_csv(path_x)
    df_vec = pd.DataFrame(columns=["w2v_first", "w2v_mean"],
                          index=df.index)
    wav_path = wav_dir / f"{path_x.stem}.wav"
    assert wav_path.exists(), f"missing recording: {wav_path}"

    wav, sr = torchaudio.load(wav_path)
    # `wav` is indexed as (channel, frame). Average the channels into a 1-D
    # waveform: for mono input this leaves the samples unchanged, while for
    # multi-channel input it avoids handing the processor a 2-D tensor that
    # would be interpreted as a batch of separate utterances.
    wav_mono = wav.mean(dim=0)

    ## by-item loop
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        # start_t / end_t are multiplied by the sampling rate to get frame
        # offsets into the recording.
        start_frame = int(row.start_t * sr)
        end_frame = int(row.end_t * sr)
        seg = wav_mono[start_frame:end_frame]
        re_seg = T.resample(seg, sr, TARGET_SR)
        inputs = processor(re_seg, sampling_rate=TARGET_SR, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)

        # Drop the batch dimension -> (frames, hidden_size).
        last_hidden_states = outputs.last_hidden_state.squeeze(dim=0)
        df_vec.at[idx, "w2v_first"] = last_hidden_states[0, :].numpy()
        df_vec.at[idx, "w2v_mean"] = last_hidden_states.mean(dim=0).numpy()

    # Only the first annotation file is processed; the following cells read
    # `df`, `df_vec` and `path_x` as left behind by this single iteration.
    break

  0%|          | 0/2512 [00:00<?, ?it/s]

In [6]:
# Attach the embedding columns to the annotation rows (index-aligned left join).
df_merged = df.join(other=df_vec, how="left")

In [7]:
# Persist the merged table for this subject and record a checksum of the file.
sbjname = path_x.stem
out_path = f"../data/{sbjname}.w2v.parquet"
df_merged.to_parquet(out_path)
# Hash the written file with hashlib rather than shelling out to `sha1sum`,
# so the cell does not depend on an external binary being installed.
# The printed line keeps the "<digest>  <path>" layout.
out_digest = sha1(Path(out_path).read_bytes()).hexdigest()
print(f"{out_digest}  {out_path}")

53afcbbae6cbf63b8b2d4c81fecd44e8dda1f9d6  ../data/CXH2_GY.w2v.parquet


## Watermark

In [8]:
%watermark

Last updated: 2023-04-23T19:34:54.623153+02:00

Python implementation: CPython
Python version       : 3.10.10
IPython version      : 8.12.0

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 22.1.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit



In [9]:
%watermark --iversion

torch     : 2.0.0
pandas    : 1.5.2
torchaudio: 2.0.1

