# Prediction
Load the model, intialize the speakers with 1 audio for each person, then predict the speaker of unseen audios among the previous identified speakers

In [29]:
from experiments.predict import *
from IPython.display import Audio, display

In [2]:
class args:
    model_path = "../saved_models/peach_0001.pt"
    dim = 1
    filters = 128
    num_speakers = 3699
    batch_size = 1500
    n_seconds = 3
    downsampling = 4
    spectrogram = True
    window_length = 0.02
    window_hop = 0.01
    device = 'cuda'

In [12]:
#############
# Constants #
#############
# Device must be 'cpu' if no gpu found
device = torch.device(args.device)
if args.spectrogram:
    if args.dim == 1:
        in_channels = int(args.window_length * 16000) // 2 + 1
    elif args.dim == 2:
        in_channels = 1
    else:
        raise RuntimeError
else:
    in_channels = 1
num_classes = args.num_speakers

N_SPEAKERS_EVALUATION = 5

In [38]:
def getAudioFromPredictionDataloader(predictionDataloader):
    if args.spectrogram:
        return predictionDataloader.dataset.dataset[0][0]
    else:
        return predictionDataloader.dataset[0][0]

## Load model

In [5]:
model = load_model(args.model_path, args.filters, in_channels, num_classes, args.dim, args.device)
model

Model loaded : ../saved_models/peach_0001.pt


ResidualClassifier(
  (conv1): Conv1d(161, 128, kernel_size=(7,), stride=(2,), bias=False)
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (maxpool1): MaxPool1d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): ResidualBlock1D(
      (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): ResidualBlock1D(
      (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, aff

## Initialisation phase

In [54]:
n_members = 4
auto_prepare = int(input("Do you want to auto prepare "+str(n_members)+" speakers ? (0/1) : "))
if (auto_prepare):
    audio_paths = [
        "/home/profenpoche/voicemap/data/testPrediction/activity_SId/1-10.flac", 
        "/home/profenpoche/voicemap/data/testPrediction/activity_SId/2-18.flac", 
        #"/home/profenpoche/voicemap/data/testPrediction/activity_SId/3-18.flac", 
        #"/home/profenpoche/voicemap/data/testPrediction/activity_SId/6-4.flac",
        "/home/profenpoche/voicemap/data/testPrediction/activity_SId/7-2.flac",
        "/home/profenpoche/voicemap/data/testPrediction/activity_SId/8-4.flac"
        # "data/testPrediction/activity_SId/4-1.flac", 
        # "data/testPrediction/activity_SId/5-1.flac"
        ]
else:
    n_members = int(input("Please enter the number of speakers in the activity : "))

Do you want to auto prepare 4 speakers ? (0/1) : 1


In [55]:
# Initialize the list of identified and trusted speakers
speakers = [] 
audios = []
# First step is to associate speakers with audio file foreach member of the group
for m in range(n_members):
    if auto_prepare:
        audio_path = audio_paths[m]
    else:
        audio_path = str(input("Please enter the audio path for the voice of the speaker n°"+str(m)+" : "))

    # It is necessary to put the audio in a dataloader before prediction
    predictionDataloader = buildPredictionDataloader(audio_path, args=args2)
    # predict the speaker (and validate him compared to the previous speakers)
    speaker_identified = identifySpeaker(speakers, predictionDataloader, model, N_SPEAKERS_EVALUATION, args.spectrogram)
    audio = getAudioFromPredictionDataloader(predictionDataloader)
    audios.append(audio)
    display(Audio(data=audio, rate=4000))
    speakers.append(speaker_identified)

print(bcolors.OKBLUE + "\nSpeakers are : ")
print(speakers)
print(bcolors.ENDC)


-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/1-10.flac
Predicted : 
7			3372			3529			3605			860
6.025798051600837	2.676491477241182	2.5687392727312597	2.394927350708492	2.1580507770878623
--- 0.3513193130493164 seconds ---



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/2-18.flac
Predicted : 
3530			895			3203			3155			3179
4.849674524773583	2.877641834462724	2.7785555418971573	2.7446440575510715	2.7061261524068136
--- 0.2741384506225586 seconds ---



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/7-2.flac
Predicted : 
3179			3178			3204			3203			54
3.1124235035870242	2.9243128264654397	2.5261640878721026	2.192401637843634	2.173378445170862
--- 0.2691967487335205 seconds ---



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/8-4.flac
Predicted : 
415			1023			582			1			310
3.256805430241405	2.421559899082239	2.2760939993991807	2.223511336037831	2.113146228233763
--- 0.26912975311279297 seconds ---


[94m
Speakers are : 
[7, 3530, 3179, 415]
[0m


## Prediction phase

In [58]:
# Then we can loop and ask for an unlimited number of audio file
# Foreach audio we can link the most confident speaker among the 'speakers' list
activity_can_continue = True
while activity_can_continue:
    try:
        audio_path = str(input("Please enter the audio path for the speaker to identify : "))
        predictionDataloader = buildPredictionDataloader(audio_path)
        
        audio = getAudioFromPredictionDataloader(predictionDataloader)
        print(f"{bcolors.OKBLUE} Audio you have entered : {bcolors.ENDC}")
        display(Audio(data=audio, rate=4000))
        
        values, indices = predictAudioFile(predictionDataloader, model, N_SPEAKERS_EVALUATION,args.spectrogram)
        
        print("\nConfidence for each speaker previously identified : ")
        filtered_list = [values[i] for i in speakers]
        print(filtered_list)
        speaker_index = filtered_list.index(max(filtered_list))
        
        print(bcolors.BOLD + "====== Speaker identified is : "+str(speaker_index) + bcolors.ENDC)
        display(Audio(data=audios[speaker_index], rate=4000))
        print("########################################################")
    except NameError as error:
        print(error)
        activity_can_continue = bool(input("NameError. Press Enter to leave "))
    except RuntimeError as error:
        print(error)
        activity_can_continue = bool(input("RuntimeError. Press Enter to leave "))


Please enter the audio path for the speaker to identify : /home/profenpoche/voicemap/data/testPrediction/activity_SId/1-12.flac
[94m Audio you have entered : [0m



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/1-12.flac
Predicted : 
7			3186			3364			3372			18
4.124151563048926	3.8670779322824735	3.0599254605564408	2.748529051254856	2.5387648790188204

Confidence for each speaker previously identified : 
[4.124151563048926, 0.6980314581589209, 0.7251842681616832, -0.5561072250384336]


########################################################
Please enter the audio path for the speaker to identify : /home/profenpoche/voicemap/data/testPrediction/activity_SId/2-14.flac
[94m Audio you have entered : [0m



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/2-14.flac
Predicted : 
3530			3605			860			3631			3240
3.942774001155927	3.5326082370292458	3.034905554442002	2.756642910234777	2.7125053992322457

Confidence for each speaker previously identified : 
[1.8330091591517883, 3.942774001155927, 2.339187731799916, -0.13181978420779647]


########################################################
Please enter the audio path for the speaker to identify : /home/profenpoche/voicemap/data/testPrediction/activity_SId/3-13.flac
[94m Audio you have entered : [0m



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/3-13.flac
Predicted : 
3631			3264			3203			895			619
3.8568043268363756	3.520337693794851	3.032013834848621	2.822339191142223	2.0333957125186837

Confidence for each speaker previously identified : 
[1.9451056150413388, 0.6656866078853605, 1.8522570499119793, -0.38568708729931345]


########################################################
Please enter the audio path for the speaker to identify : /home/profenpoche/voicemap/data/testPrediction/activity_SId/3-14.flac
[94m Audio you have entered : [0m



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/3-14.flac
Predicted : 
3264			3203			3364			3241			887
4.1211778929634395	3.5725915193640585	2.9895720188998696	2.5093018948868595	2.354662125758807

Confidence for each speaker previously identified : 
[1.6916735285762983, 0.635482518641536, 1.5960244645751485, -0.2731943909189947]


########################################################
Please enter the audio path for the speaker to identify : /home/profenpoche/voicemap/data/testPrediction/activity_SId/8-1.flac
[94m Audio you have entered : [0m



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/8-1.flac
Predicted : 
3240			590			3050			2844			3252
2.017499794086376	1.9952443342119	1.6552702404019604	1.5383678668684946	1.455399150965468

Confidence for each speaker previously identified : 
[-0.044562222140352964, 0.23532882272194539, -0.7498039218263381, 0.3527568242151046]


########################################################
Please enter the audio path for the speaker to identify : /home/profenpoche/voicemap/data/testPrediction/activity_SId/7-1.flac
[94m Audio you have entered : [0m



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/7-1.flac
Predicted : 
3240			213			2934			3573			2804
1.933396984903396	1.798071968463392	1.6148299225448928	1.5486549403397003	1.4703823664348308

Confidence for each speaker previously identified : 
[-0.4232337522124432, 0.24514013340398183, -0.23883149463777975, 0.6119933778973177]


########################################################


KeyboardInterrupt: 