In [1]:
import sys
# Extend the module search path with the parent directory and with the WavLM
# sources under ../external_libs so that `from WavLM import ...` resolves later.
sys.path.append('..')
sys.path.append('../external_libs/WavLM')


import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [2]:
from pathlib import Path
import torchaudio
import fairseq
import torch
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
def get_mos_data(split, sets_dir='../data/phase1-ood/DATA/sets'):
    """Read the ``file_id -> MOS`` mapping of one dataset split.

    The list file is ``{sets_dir}/{split}_mos_list.txt``. Each non-empty line
    is split on commas: a line with exactly two fields is read as
    ``file_id,mos``; for any other line only the first field is used as the
    file id and the score is set to the placeholder ``0``.

    Args:
        split: Prefix of the list file name (the notebook uses ``'train'``,
            ``'val'`` and ``'unlabeled'``).
        sets_dir: Directory containing the ``*_mos_list.txt`` files.

    Returns:
        dict mapping each file id to its MOS (``float``), or to ``0`` when the
        line did not have exactly two comma-separated fields.

    Raises:
        OSError: If the list file cannot be opened.
        ValueError: If the score of a two-field line is not a valid float.
    """
    mos_list_file = f'{sets_dir}/{split}_mos_list.txt'
    mos_data = {}
    # Context manager so the file handle is closed even if parsing fails.
    with open(mos_list_file) as list_file:
        for line in list_file:
            line = line.rstrip()
            if not line:
                # A blank line would otherwise register an empty file id.
                continue
            elms = line.split(',')
            if len(elms) == 2:
                file_id, mos = elms
                mos_data[file_id] = float(mos)
            else:
                # No score on this line: keep the id with a placeholder score.
                mos_data[elms[0]] = 0

    return mos_data

In [4]:
# Load the MOS list of the training split and display its number of entries.
train_mos_data = get_mos_data(split='train')
len(train_mos_data)

136

In [5]:
# Load the MOS list of the validation split and display its number of entries.
val_mos_data = get_mos_data(split='val')
len(val_mos_data)

136

In [6]:
# Load the list of the unlabeled split and display its number of entries.
unlabeled_mos_data = get_mos_data(split='unlabeled')
len(unlabeled_mos_data)

540

In [7]:
# Directory holding the audio files; the file ids from the MOS lists are
# joined onto it (`wav_dir / key`) in the extraction loops.
wav_dir = Path('../data/phase1-ood/DATA/wav/')


In [8]:
from WavLM import WavLM, WavLMConfig

# Deserialize the checkpoint onto the CPU so loading works regardless of the
# device the tensors were saved from and does not allocate a second copy of the
# weights on the GPU; the model itself is moved to `device` afterwards.
checkpoint = torch.load('../external_libs/WavLM/WavLM-Large.pt', map_location='cpu')
# The checkpoint stores both the architecture config and the weights.
cfg = WavLMConfig(checkpoint['cfg'])
model = WavLM(cfg)
model.load_state_dict(checkpoint['model'])


2022-02-16 16:40:47 | INFO | WavLM | WavLM Config: {'extractor_mode': 'layer_norm', 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': 'gelu', 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'feature_grad_mult': 1.0, 'normalize': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.0, 'dropout_input': 0.0, 'dropout_features': 0.0, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': 'static', 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': 'static', 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'relative_position_embedding': True, 'num_buckets': 320, 'max_distance': 800, 'gru_rel_pos': True}


<All keys matched successfully>

In [9]:
# Move the network to the selected device and switch it to evaluation mode;
# the trailing semicolon keeps the cell from echoing the module repr.
model.to(device).eval();

In [10]:
def extract_mean(wavpath):
    """Embed one audio file with WavLM and average the frame features over time.

    Relies on the notebook-level ``model``, ``cfg`` and ``device``.

    Args:
        wavpath: Path of the audio file, passed to ``torchaudio.load``.

    Returns:
        Tensor on ``device`` holding the mean over frames of the first element
        returned by ``model.extract_features`` (presumably a 1-D vector of size
        ``encoder_embed_dim`` — TODO confirm).
    """
    target_sr = 16000  # sample rate used in the WavLM usage example
    with torch.no_grad():
        wav, sr = torchaudio.load(wavpath)
        if wav.size(0) > 1:
            # Down-mix to mono; otherwise channels would be treated as a batch
            # and the final mean would run over channels instead of frames.
            wav = wav.mean(dim=0, keepdim=True)
        if sr != target_sr:
            wav = torchaudio.functional.resample(wav, sr, target_sr)
        wav = wav.to(device)
        if cfg.normalize:
            # The checkpoint config requests input normalisation; apply it the
            # way the WavLM usage example does (layer norm over the waveform).
            wav = torch.nn.functional.layer_norm(wav, wav.shape)
        res = model.extract_features(wav)
        # res[0] is assumed to be (1, frames, dim): drop the batch axis, then
        # average over frames.
        return res[0].squeeze(0).mean(dim=0)


In [11]:
# Destination for the per-utterance embeddings; create it together with any
# missing parent directories, and do nothing if it already exists.
import os  # kept: other cells may rely on `os` being imported here
out_dir = Path('../out/utt_data/wavlm_large')
out_dir.mkdir(parents=True, exist_ok=True)


In [12]:
# Embed every validation utterance (in file-id order), save each vector as
# `<stem>.npy` under `out_dir`, and collect vectors and scores in parallel lists.
val_vecs, val_moss = [], []

for key, mos in tqdm(sorted(val_mos_data.items())):
    wavpath = wav_dir / key
    outpath = out_dir / f'{wavpath.stem}.npy'

    # Bring the embedding to host memory as a NumPy array before saving.
    vec = extract_mean(wavpath).detach().cpu().numpy()
    np.save(outpath, vec)

    val_vecs.append(vec)
    val_moss.append(mos)

  0%|          | 0/136 [00:00<?, ?it/s]

In [13]:
# Embed every training utterance (in file-id order), save each vector as
# `<stem>.npy` under `out_dir`, and collect vectors and scores in parallel lists.
train_vecs, train_moss = [], []

for key, mos in tqdm(sorted(train_mos_data.items())):
    wavpath = wav_dir / key
    outpath = out_dir / f'{wavpath.stem}.npy'

    # Bring the embedding to host memory as a NumPy array before saving.
    vec = extract_mean(wavpath).detach().cpu().numpy()
    np.save(outpath, vec)

    train_vecs.append(vec)
    train_moss.append(mos)

  0%|          | 0/136 [00:00<?, ?it/s]

In [14]:
# Embed every unlabeled utterance (in file-id order), save each vector as
# `<stem>.npy` under `out_dir`, and collect vectors and the stored scores
# from `unlabeled_mos_data` in parallel lists.
unlabeled_vecs, unlabeled_moss = [], []

for key, mos in tqdm(sorted(unlabeled_mos_data.items())):
    wavpath = wav_dir / key
    outpath = out_dir / f'{wavpath.stem}.npy'

    # Bring the embedding to host memory as a NumPy array before saving.
    vec = extract_mean(wavpath).detach().cpu().numpy()
    np.save(outpath, vec)

    unlabeled_vecs.append(vec)
    unlabeled_moss.append(mos)

  0%|          | 0/540 [00:00<?, ?it/s]