# Reproducibility

In [1]:
import torch
import numpy as np

# One seed shared by the NumPy and PyTorch random number generators,
# so that a fresh "Restart & Run All" repeats the same random draws.
my_seed = 19951008
np.random.seed(my_seed)
torch.manual_seed(my_seed)
from tqdm import tqdm

# Import libraries

In [2]:
import json
from sklearn.preprocessing import LabelEncoder
import sys
# The two sys.path entries below should point to where the asvtorch code is located, in particular:
# - the asvtorch/asvtorch folder
# - the asvtorch/asvtorch/src folder
# (i.e. asvtorch/asvtorch)
sys.path.append("../")
sys.path.append("../..")
from src.utterances.utterance_list import UtteranceList
from src.backend.vector_processing import VectorProcessor
import wandb
from src.gender_classifiers import LogisticRegression, FC2, FC4
from torch.autograd import Variable
import sklearn.metrics
from sklearn.model_selection import StratifiedKFold
import scipy.linalg
import itertools
import pandas as pd
from tqdm import tqdm

# i-Vectors
## Load features

In [5]:
# List the i-vector output directory to confirm that the pickle loaded in the next cell is present.
! ls /media/hdd1/khaled/voxceleb_ivector_outputs-correct/ivector_400/utterances

trial_ivectors.pickle


In [6]:
%%time
# Load the utterance list saved as 'trial_ivectors' (a .pickle file) from the i-vector output directory.
# NOTE(review): the path is absolute and machine-specific, and the file is a pickle —
# only load it from a trusted location.
plda_data = UtteranceList.load(
    'trial_ivectors',
    '/media/hdd1/khaled/voxceleb_ivector_outputs-correct/ivector_400/utterances')


Loading: /media/hdd1/khaled/voxceleb_ivector_outputs-correct/ivector_400/utterances/trial_ivectors.pickle
Loaded (41.811 s): /media/hdd1/khaled/voxceleb_ivector_outputs-correct/ivector_400/utterances/trial_ivectors.pickle
CPU times: user 23.3 s, sys: 12.1 s, total: 35.4 s
Wall time: 41.8 s


In [7]:
def get_correct_recordings_index(spk_labels):
    """Return the positions of a speaker-balanced subset of ``spk_labels``.

    Every speaker is capped at the utterance count of the least-represented
    speaker: scanning in order, the first ``k`` occurrences of each speaker
    are kept, where ``k`` is the smallest per-speaker count.

    Args:
        spk_labels: sequence of hashable speaker ids, one entry per utterance.

    Returns:
        List of integer positions into ``spk_labels``, in ascending order.
        An empty input yields an empty list.

    Side effects:
        Prints ``k`` when the input is non-empty.
    """
    # Count every speaker in a single pass (calling list.count once per
    # speaker rescans the whole list each time).
    totals = {}
    for spk_id in spk_labels:
        totals[spk_id] = totals.get(spk_id, 0) + 1

    if not totals:
        # Nothing to balance; min() on an empty collection would raise.
        return []

    least_freq_spk = min(totals.values())
    print(least_freq_spk)

    speaker_indexes = []
    seen_so_far = dict.fromkeys(totals, 0)
    for index, spk_id in enumerate(spk_labels):
        seen_so_far[spk_id] += 1
        # Keep only the first `least_freq_spk` utterances of each speaker.
        if seen_so_far[spk_id] <= least_freq_spk:
            speaker_indexes.append(index)
    return speaker_indexes

In [8]:
def gender_classifier(
    train_embeddings,
    train_labels,
    test_embeddings,
    test_labels,
    model_name = 'log_reg'):
    """Train a two-class gender classifier on embeddings and score it on the test set.

    Hyper-parameters are read from the notebook-level ``config`` dict
    (keys 'batch_size', 'test_batch_size', 'dropout', 'lr', 'epochs').
    Accuracy and F1 on the test set are logged to the active wandb run.

    Args:
        train_embeddings: 2-D tensor of training vectors; the second dimension
            is used as the model input size.
        train_labels: tensor of class indices for the training vectors.
        test_embeddings: tensor of test vectors.
        test_labels: tensor of class indices for the test vectors.
        model_name: one of 'log_reg', 'fc2' or 'fc4'.

    Returns:
        Tuple ``(model, test_f1)``. The model is returned in evaluation mode.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    print("Train embeddings", train_embeddings.shape)
    n_features = train_embeddings.shape[1]

    # Train
    # NOTE(review): batches are drawn in stored order (shuffle = False); shuffling
    # is usually preferable for SGD but is left unchanged to keep results comparable.
    train_set = torch.utils.data.TensorDataset(train_embeddings, train_labels)
    train_loader = torch.utils.data.DataLoader(dataset = train_set, batch_size = config['batch_size'], shuffle = False)
    # Test
    test_set = torch.utils.data.TensorDataset(test_embeddings, test_labels)
    test_loader = torch.utils.data.DataLoader(dataset = test_set, batch_size = config['test_batch_size'], shuffle = False)

    if model_name == 'log_reg':
        model = LogisticRegression(n_features, 2)
    elif model_name == 'fc2':
        model = FC2(n_features, 2, config['dropout'])
    elif model_name == 'fc4':
        model = FC4(n_features, 2, config['dropout'])
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "Unknown model_name %r (expected 'log_reg', 'fc2' or 'fc4')" % (model_name,))

    # Place the model on the same device as the data it will be fed.
    model = model.to(train_embeddings.device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config['lr'])
    wandb.watch(model, log="all")

    for epoch in tqdm(range(config['epochs'])):
        for vectors, labels in train_loader:
            batch = vectors.view(-1, n_features)
            # Clear gradients
            optimizer.zero_grad()
            # Forward propagation
            outputs = model(batch)
            # Calculate softmax and cross entropy loss
            loss = criterion(outputs, labels)
            # Calculate gradients
            loss.backward()
            # Update parameters
            optimizer.step()

    # Get test predictions. Evaluation mode disables dropout, and no_grad
    # avoids building the autograd graph while predicting.
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for x_test, y_test in test_loader:
            outputs = model(x_test)
            y_pred += torch.max(outputs, 1)[1].cpu().numpy().tolist()
            y_true += y_test.cpu().numpy().tolist()

    # Compute F1 once and reuse it for both logging and the return value.
    test_f1 = sklearn.metrics.f1_score(y_true, y_pred)
    wandb.log({
        'Accuracy': sklearn.metrics.accuracy_score(y_true, y_pred),
        'F1': test_f1
    })
    return model, test_f1

## Load gender metadata

In [9]:
# List the gender metadata files available in dataset/.
! ls dataset/

gender-test_set.txt  gender-train_set.txt  gender-train_test.csv


In [10]:
# Speaker metadata table; the preview below shows the gender, Name and VoxCeleb_ID columns.
df = pd.read_csv("dataset/gender-train_test.csv")
df.head()

Unnamed: 0,gender,Name,VoxCeleb_ID
0,female,Adrienne Bailon,id00097
1,female,Carolina Crescentini,id01413
2,female,Preeya Kalidas,id07065
3,female,Danielle Bisutti,id01948
4,female,Michaela May,id06026


In [11]:
# The speaker split files are parsed as JSON despite their .txt extension.
# They are later used for membership tests against the VoxCeleb id prefix of each
# utterance label — presumably collections of speaker ids; verify against the files.
with open("dataset/gender-train_set.txt") as f:
    train_speakers = json.load(f)
    
with open("dataset/gender-test_set.txt") as f:
    test_speakers = json.load(f)

In [12]:
%%time
train_indexes = []
plda_vox_id = []
test_indexes = []
test_plda_vox_id = []
for i, voxID_video_id in enumerate(tqdm(plda_data.get_utt_labels())):
    # Let's now remove the "recording" info from voxID-YT id
    current_id = voxID_video_id.split("-")[0]

    if current_id in train_speakers:
        train_indexes.append(i)
        plda_vox_id.append(current_id)
    elif current_id in test_speakers:
        test_indexes.append(i)
        test_plda_vox_id.append(current_id)

100%|██████████| 1128702/1128702 [01:31<00:00, 12350.30it/s]

CPU times: user 1min 29s, sys: 2.01 s, total: 1min 31s
Wall time: 1min 32s





In [13]:
# Number of utterances assigned to the train and test speaker splits.
len(train_indexes), len(test_indexes)

(465421, 295345)

- Select the recordings to keep: every speaker is capped at the number of utterances of the least-represented speaker


In [14]:
# Balance both splits: per speaker, keep only as many utterances as the least-represented speaker has.
# The returned positions index into plda_vox_id / test_plda_vox_id, not into plda_data directly.
train_idx = get_correct_recordings_index(plda_vox_id)
test_idx = get_correct_recordings_index(test_plda_vox_id)

21
21


In [15]:
# Number of utterances kept in each split after balancing.
len(train_idx), len(test_idx)

(52878, 35238)

In [16]:
# Two-step selection: train_indexes picks the train-speaker rows of plda_data,
# then train_idx (positions within that subset) keeps the balanced utterances.
X_train = plda_data.embeddings[train_indexes][train_idx]
# Speaker id of every retained training utterance, aligned with X_train.
y_train_spk = np.array(plda_vox_id)[train_idx]

In [17]:
# Two-step selection: test_indexes picks the test-speaker rows of plda_data,
# then test_idx (positions within that subset) keeps the balanced utterances.
X_test = plda_data.embeddings[test_indexes][test_idx]
# Speaker id of every retained test utterance, aligned with X_test.
y_test_spk = np.array(test_plda_vox_id)[test_idx]

In [18]:
# Lookup table: VoxCeleb speaker id -> gender string, built from the metadata table.
id_gender_dict = df.set_index("VoxCeleb_ID")["gender"].to_dict()

In [19]:
# Replace each utterance's speaker id by that speaker's gender.
# A speaker id missing from the metadata raises KeyError.
y_train = [id_gender_dict[spk_id] for spk_id in y_train_spk]
y_test = [id_gender_dict[spk_id] for spk_id in y_test_spk]

In [20]:
# Sanity check: one gender label per retained utterance in each split.
len(y_train), len(y_test)

(52878, 35238)

In [21]:
def train_holdout(preprocessing_strategy, model_name, train_embeddings, train_labels, test_embeddings, test_labels):
    """Optionally normalise the embeddings, then train and evaluate one classifier.

    All four tensors are moved to the GPU. Depending on ``preprocessing_strategy``:

    * ``'cwl'``  - a ``VectorProcessor`` is trained on the training embeddings and
      applied to both splits;
    * ``'wccn'`` - both splits are multiplied by the matrix returned by ``wccn``;
    * any other value - the embeddings are used unchanged.

    The test F1 returned by ``gender_classifier`` is printed.

    Returns:
        The trained model.
    """
    # Move everything to the GPU up front.
    train_embeddings, train_labels = train_embeddings.cuda(), train_labels.cuda()
    test_embeddings, test_labels = test_embeddings.cuda(), test_labels.cuda()

    if preprocessing_strategy == 'wccn':
        # NOTE(review): `wccn` is not defined in the visible part of this notebook —
        # confirm it is available before using this strategy.
        projection = wccn(train_embeddings.cpu().numpy(), train_labels.cpu().numpy(), 0)
        train_embeddings = torch.matmul(train_embeddings, torch.from_numpy(projection).cuda().float())
        test_embeddings = torch.matmul(test_embeddings, torch.from_numpy(projection).cuda().float())
    elif preprocessing_strategy == 'cwl':
        # The processor is fitted on the training split only, then applied to both.
        vector_processor = VectorProcessor.train(train_embeddings, 'cwl', 'cuda:0')
        train_embeddings = vector_processor.process(train_embeddings)
        test_embeddings = vector_processor.process(test_embeddings)

    model, test_f1 = gender_classifier(
        train_embeddings,
        train_labels,
        test_embeddings,
        test_labels,
        model_name=model_name,
    )
    print(test_f1)
    return model


In [22]:
# Hyper-parameter grid for the FC2 run; each list holds the values to sweep for that setting.
epochs = [200]
models_to_evaluate = ['fc2']
norm_strat_to_evaluate = ['']  # '' matches no preprocessing branch in train_holdout
dropout = [False]
batch_size = [256]
lr = [0.001]

# Cartesian product, in the order unpacked by the training loop:
# (epochs, model, normalisation strategy, dropout, batch size, learning rate)
train_combinations = list(itertools.product(
    epochs, models_to_evaluate, norm_strat_to_evaluate, dropout, batch_size, lr))

In [23]:
# Number of hyper-parameter combinations that will be trained.
len(train_combinations)

1

Convert the gender labels to a numerical format, as required for training.

In [24]:
# Fit the encoder on the training labels only, then reuse the same mapping for the test labels.
# NOTE: this cell overwrites y_train / y_test in place, so it is not meant to be run twice.
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(y_train))
y_test = torch.tensor(label_encoder.transform(y_test))

In [25]:
# Train one model per hyper-parameter combination; each combination is its own wandb run.
# After the loop, trained_model holds the model of the last combination.
trained_model = None
for epoch, model, strategy, drop, bs, lr_now in train_combinations:
    # Run settings: read by gender_classifier through the global `config` and recorded in wandb.
    # NOTE(review): 'folder_fn' mentions log_reg whatever the model is — confirm this is intended.
    config = dict(
        batch_size=bs,
        test_batch_size=100,
        epochs=epoch,
        lr=lr_now,
        seed=my_seed,
        log_interval=1,
        model_name=model,
        feature_norm=strategy,
        dropout=drop,
        dataset='gender',
        embedding='i-vec',
        folder_fn='ivectors/log_reg/',
    )
    print(config)

    # Suffix of the run name: marks runs trained with dropout.
    drop_id = 'dropout' if drop else ''
    wandb.init(
        project='voxceleb_enrichment',
        name='_'.join([model, config['embedding'], strategy, drop_id]),
        config=config,
    )
    trained_model = train_holdout(strategy, model, X_train, y_train, X_test, y_test)
    wandb.run.finish()

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable


{'batch_size': 256, 'test_batch_size': 100, 'epochs': 200, 'lr': 0.001, 'seed': 19951008, 'log_interval': 1, 'model_name': 'fc2', 'feature_norm': '', 'dropout': False, 'dataset': 'gender', 'embedding': 'i-vec', 'folder_fn': 'ivectors/log_reg/'}


[34m[1mwandb[0m: Currently logged in as: [33mhechmik[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.24 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  0%|          | 0/200 [00:00<?, ?it/s]

Train embeddings torch.Size([52878, 400])


100%|██████████| 200/200 [03:46<00:00,  1.13s/it]


0.9823973652830617


0,1
Accuracy,0.98241
F1,0.9824
_step,0.0
_runtime,233.0
_timestamp,1617291648.0


0,1
Accuracy,▁
F1,▁
_step,▁
_runtime,▁
_timestamp,▁


In [26]:
# Persist only the weights (state_dict) of the model trained in the previous cell.
# NOTE(review): assumes the torch_models/ directory already exists — TODO confirm.
torch.save(trained_model.state_dict(), "torch_models/ivec_fc2_model")

## FC4

In [27]:
# Hyper-parameter grid for the FC4 run; each list holds the values to sweep for that setting.
epochs = [200]
models_to_evaluate = ['fc4']
norm_strat_to_evaluate = ['']  # '' matches no preprocessing branch in train_holdout
dropout = [False]
batch_size = [256]
lr = [0.001]

# Cartesian product, in the order unpacked by the training loop:
# (epochs, model, normalisation strategy, dropout, batch size, learning rate)
train_combinations = list(itertools.product(
    epochs, models_to_evaluate, norm_strat_to_evaluate, dropout, batch_size, lr))

In [28]:
# Train one model per hyper-parameter combination; each combination is its own wandb run.
# After the loop, trained_model holds the model of the last combination.
trained_model = None
for epoch, model, strategy, drop, bs, lr_now in train_combinations:
    # Run settings: read by gender_classifier through the global `config` and recorded in wandb.
    # NOTE(review): 'folder_fn' mentions log_reg whatever the model is — confirm this is intended.
    config = dict(
        batch_size=bs,
        test_batch_size=100,
        epochs=epoch,
        lr=lr_now,
        seed=my_seed,
        log_interval=1,
        model_name=model,
        feature_norm=strategy,
        dropout=drop,
        dataset='gender',
        embedding='i-vec',
        folder_fn='ivectors/log_reg/',
    )
    print(config)

    # Suffix of the run name: marks runs trained with dropout.
    drop_id = 'dropout' if drop else ''
    wandb.init(
        project='voxceleb_enrichment',
        name='_'.join([model, config['embedding'], strategy, drop_id]),
        config=config,
    )
    trained_model = train_holdout(strategy, model, X_train, y_train, X_test, y_test)
    wandb.run.finish()
# Persist only the weights (state_dict) of the last trained model.
torch.save(trained_model.state_dict(), "torch_models/ivec_fc4_model")

{'batch_size': 256, 'test_batch_size': 100, 'epochs': 200, 'lr': 0.001, 'seed': 19951008, 'log_interval': 1, 'model_name': 'fc4', 'feature_norm': '', 'dropout': False, 'dataset': 'gender', 'embedding': 'i-vec', 'folder_fn': 'ivectors/log_reg/'}


[34m[1mwandb[0m: wandb version 0.10.24 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  0%|          | 0/200 [00:00<?, ?it/s]

Train embeddings torch.Size([52878, 400])


100%|██████████| 200/200 [04:18<00:00,  1.29s/it]


0.9805189281156955


0,1
Accuracy,0.9805
F1,0.98052
_step,0.0
_runtime,267.0
_timestamp,1617291920.0


0,1
Accuracy,▁
F1,▁
_step,▁
_runtime,▁
_timestamp,▁


## Log reg

In [29]:
# Hyper-parameter grid for the logistic regression run; each list holds the values to sweep.
epochs = [200]
models_to_evaluate = ['log_reg']
norm_strat_to_evaluate = ['']  # '' matches no preprocessing branch in train_holdout
dropout = [False]
batch_size = [256]
lr = [0.001]

# Cartesian product, in the order unpacked by the training loop:
# (epochs, model, normalisation strategy, dropout, batch size, learning rate)
train_combinations = list(itertools.product(
    epochs, models_to_evaluate, norm_strat_to_evaluate, dropout, batch_size, lr))

In [30]:
# Train one model per hyper-parameter combination; each combination is its own wandb run.
# After the loop, trained_model holds the model of the last combination.
trained_model = None
for epoch, model, strategy, drop, bs, lr_now in train_combinations:
    # Run settings: read by gender_classifier through the global `config` and recorded in wandb.
    config = dict(
        batch_size=bs,
        test_batch_size=100,
        epochs=epoch,
        lr=lr_now,
        seed=my_seed,
        log_interval=1,
        model_name=model,
        feature_norm=strategy,
        dropout=drop,
        dataset='gender',
        embedding='i-vec',
        folder_fn='ivectors/log_reg/',
    )
    print(config)

    # Suffix of the run name: marks runs trained with dropout.
    drop_id = 'dropout' if drop else ''
    wandb.init(
        project='voxceleb_enrichment',
        name='_'.join([model, config['embedding'], strategy, drop_id]),
        config=config,
    )
    trained_model = train_holdout(strategy, model, X_train, y_train, X_test, y_test)
    wandb.run.finish()
# Persist only the weights (state_dict) of the last trained model.
torch.save(trained_model.state_dict(), "torch_models/ivec_log_reg_model")

{'batch_size': 256, 'test_batch_size': 100, 'epochs': 200, 'lr': 0.001, 'seed': 19951008, 'log_interval': 1, 'model_name': 'log_reg', 'feature_norm': '', 'dropout': False, 'dataset': 'gender', 'embedding': 'i-vec', 'folder_fn': 'ivectors/log_reg/'}


[34m[1mwandb[0m: wandb version 0.10.24 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  0%|          | 0/200 [00:00<?, ?it/s]

Train embeddings torch.Size([52878, 400])


100%|██████████| 200/200 [03:25<00:00,  1.03s/it]


0.982957450434585


0,1
Accuracy,0.98297
F1,0.98296
_step,0.0
_runtime,213.0
_timestamp,1617292140.0


0,1
Accuracy,▁
F1,▁
_step,▁
_runtime,▁
_timestamp,▁
