# Prediction
Load the model, initialize the speakers with one audio recording per person, then predict which of the previously identified speakers is talking in each unseen audio recording.

In [1]:
from experiments.predict import *
from voicemap.train import TrainingArgs, calculate_in_channels
from IPython.display import Audio, display

  from tqdm.autonotebook import tqdm


In [5]:
# Prediction-time configuration: start from a default TrainingArgs and override
# the fields below before the checkpoint is loaded.
# NOTE(review): these values presumably have to match the ones the checkpoint
# at `model_path` was trained with — confirm against the training run.
args = TrainingArgs()
# Checkpoint to load, relative to the notebook's working directory.
args.model_path = "../models/best_model_without_spec.pt"
# `dim`, `filters` and `num_speakers` are forwarded to load_model() below.
args.dim = 1
args.filters = 128
args.num_speakers = 1291
args.batch_size = 1500
# NOTE(review): presumably the length in seconds of each audio fragment fed to
# the model — TODO confirm in voicemap.train.
args.n_seconds = 3
args.downsampling = 4
# When True, getAudioFromPredictionDataloader() unwraps one extra dataset level.
args.spectrogram = False
# NOTE(review): window settings look spectrogram-related and are presumably
# unused while `spectrogram` is False — TODO confirm.
args.window_length = 0.02
args.window_hop = 0.01
# NOTE(review): hard-coded to the GPU; presumably must be changed to 'cpu' on a
# machine without CUDA — TODO confirm load_model() supports it.
args.device = 'cuda'

# Number of input channels, derived from the arguments above and forwarded to load_model().
in_channels = calculate_in_channels(args)

# Forwarded to identifySpeaker() / predictAudioFile(); the recorded outputs list
# five candidates per audio, so this is presumably a top-k size — TODO confirm.
N_SPEAKERS_EVALUATION = 5

In [6]:
def getAudioFromPredictionDataloader(predictionDataloader, spectrogram=None):
    """Return the first element of the first sample held by a prediction dataloader.

    Callers in this notebook pass the result to ``IPython.display.Audio`` for
    playback.

    Args:
        predictionDataloader: object exposing a ``dataset`` attribute. In
            spectrogram mode that dataset must itself expose a ``dataset``
            attribute, which is the one that gets indexed.
        spectrogram: whether the dataloader was built in spectrogram mode.
            When left to ``None`` the notebook-level ``args.spectrogram`` is
            used, which keeps existing one-argument calls working.

    Returns:
        ``dataset[0][0]`` of the selected dataset.
    """
    if spectrogram is None:
        # Backward-compatible default: read the notebook-level configuration.
        spectrogram = args.spectrogram

    dataset = predictionDataloader.dataset
    if spectrogram:
        # Spectrogram mode adds one wrapping level; index the inner dataset.
        dataset = dataset.dataset
    return dataset[0][0]

## Load model

In [7]:
# Instantiate the model from the checkpoint at `args.model_path`, using the
# architecture settings and device chosen in the configuration cell.
# NOTE(review): whether load_model() also switches the network to eval mode is
# not visible from here — TODO confirm before trusting the predictions.
model = load_model(args.model_path, args.filters, in_channels, args.num_speakers, args.dim, args.device)
# Bare expression: lets the notebook display the module's layer summary.
model

Model loaded : ../models/best_model_without_spec.pt


ResidualClassifier(
  (conv1): Conv1d(1, 128, kernel_size=(7,), stride=(2,), bias=False)
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (maxpool1): MaxPool1d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): ResidualBlock1D(
      (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): ResidualBlock1D(
      (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affin

## Initialisation phase

In [8]:
# Reference recordings used by the automatic setup: one audio per speaker.
# NOTE(review): absolute, machine-specific location — adjust AUDIO_DIR when the
# notebook runs on another machine.
AUDIO_DIR = "/home/profenpoche/voicemap/data/testPrediction/activity_SId"
# Other recordings previously tried here: 3-18.flac and 6-4.flac from the same
# folder, plus 4-1.flac and 5-1.flac (referenced through a relative path).
DEFAULT_AUDIO_FILES = ["1-10.flac", "2-18.flac", "7-2.flac", "8-4.flac"]

# Derived from the list above so the prompt and the enrolment loop of the next
# cell can never get out of sync with the number of prepared recordings.
n_members = len(DEFAULT_AUDIO_FILES)

answer = input("Do you want to auto prepare "+str(n_members)+" speakers ? (0/1) : ").strip()
try:
    # Any non-zero number means "yes", exactly as before.
    auto_prepare = int(answer) != 0
except ValueError:
    # Non-numeric answers used to crash the cell; accept the usual yes-words
    # and treat everything else (including an empty answer) as "no".
    auto_prepare = answer.lower() in ("y", "yes", "true")

if auto_prepare:
    audio_paths = [AUDIO_DIR + "/" + file_name for file_name in DEFAULT_AUDIO_FILES]
else:
    answer = input("Please enter the number of speakers in the activity : ").strip()
    try:
        n_members = int(answer)
    except ValueError:
        raise ValueError("The number of speakers must be an integer, got " + repr(answer)) from None
    if n_members < 1:
        # With no speaker to enrol, the prediction phase has nothing to compare against.
        raise ValueError("The activity needs at least one speaker, got " + str(n_members))

Do you want to auto prepare 4 speakers ? (0/1) : 1


In [9]:
# Enrolment: each group member provides one reference recording; the speaker
# returned by identifySpeaker() for it is stored as that member's trusted identity.
speakers = []  # identified speaker per member, in enrolment order
audios = []    # matching reference audio, kept so it can be played back later

for member_idx in range(n_members):
    # Path comes from the prepared list, or is asked for interactively.
    audio_path = (
        audio_paths[member_idx]
        if auto_prepare
        else str(input(f"Please enter the audio path for the voice of the speaker n°{member_idx} : "))
    )

    # The prediction helpers consume a dataloader, not a bare file path.
    predictionDataloader = buildPredictionDataloader(audio_path, args=args)

    # Predict the speaker; the already-enrolled speakers are handed over so the
    # helper can validate the new one against them.
    speaker_identified = identifySpeaker(
        speakers,
        predictionDataloader,
        model,
        N_SPEAKERS_EVALUATION,
        args.spectrogram,
    )

    # Keep the reference audio and let the user listen to it.
    # NOTE(review): rate=4000 is presumably the sample rate after downsampling — confirm.
    audio = getAudioFromPredictionDataloader(predictionDataloader)
    audios.append(audio)
    display(Audio(data=audio, rate=4000))

    speakers.append(speaker_identified)

# Summary of the enrolled speakers, coloured through the bcolors escape codes.
print(f"{bcolors.OKBLUE}\nSpeakers are : ")
print(speakers)
print(bcolors.ENDC)


-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/1-10.flac
Predicted : 
897			962			531			189			618
6.779479249677044	6.6342716741777945	6.055346566189138	4.469009118025346	4.347099589577734
--- 0.270061731338501 seconds ---



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/2-18.flac
Predicted : 
756			5			306			796			835
9.799615542900892	4.609653814360707	4.179279045030706	4.016545358408395	3.8740242954259942
--- 0.212355375289917 seconds ---



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/7-2.flac
Predicted : 
716			870			710			403			548
5.440760488494652	3.4142579395313035	3.132485441201555	3.0195222205418895	2.9130790475510375
--- 0.17238807678222656 seconds ---



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/8-4.flac
Predicted : 
1169			1146			486			688			1128
4.447832094843351	3.795996708960798	3.790965949043307	3.7003476963891506	3.5028362054682756
--- 0.17442035675048828 seconds ---



Speakers are : 
[897, 756, 716, 1169]



## Prediction phase

In [None]:
# Prediction loop: keep asking for audio files and, for each one, report the
# most confident speaker among those enrolled in `speakers`.
# Submitting an empty path ends the activity cleanly (previously the only way
# out was an exception or a kernel interrupt).

# NOTE(review): presumably the sample rate of the downsampled audio — confirm.
PLAYBACK_RATE = 4000

if not speakers:
    # Fail before prompting rather than on max([]) after the user typed a path.
    raise ValueError("No speaker has been initialised; run the initialisation phase first.")

print("Submit an empty path to stop the activity.")
activity_can_continue = True
while activity_can_continue:
    try:
        audio_path = str(input("Please enter the audio path for the speaker to identify : ")).strip()
        if not audio_path:
            activity_can_continue = False
            break

        predictionDataloader = buildPredictionDataloader(audio_path, args)

        audio = getAudioFromPredictionDataloader(predictionDataloader)
        print(f"{bcolors.OKBLUE} Audio you have entered : {bcolors.ENDC}")
        display(Audio(data=audio, rate=PLAYBACK_RATE))

        # `values` is indexed below by speaker id; `indices` is not needed here.
        values, indices = predictAudioFile(predictionDataloader, model, N_SPEAKERS_EVALUATION,args.spectrogram)

        print("\nConfidence for each speaker previously identified : ")
        # Keep only the scores of the enrolled speakers, in enrolment order.
        filtered_list = [values[i] for i in speakers]
        print(filtered_list)
        # Position in `speakers` / `audios` of the best score (first one wins on ties).
        speaker_index = filtered_list.index(max(filtered_list))

        print(bcolors.BOLD + "====== Speaker identified is : "+str(speaker_index) + bcolors.ENDC)
        # Play the reference recording of the matched speaker for comparison.
        display(Audio(data=audios[speaker_index], rate=PLAYBACK_RATE))
        print("########################################################")
    except (NameError, RuntimeError, OSError) as error:
        # OSError covers a mistyped or unreadable path, so one bad entry does
        # not end the whole session. The error is shown and the user chooses:
        # Enter leaves, any other input continues.
        print(error)
        activity_can_continue = bool(input(f"{type(error).__name__}. Press Enter to leave "))


Please enter the audio path for the speaker to identify : /home/profenpoche/voicemap/data/testPrediction/activity_SId/8-2.flac
 Audio you have entered : 



-------------- Prediction of an audio : /home/profenpoche/voicemap/data/testPrediction/activity_SId/8-2.flac
Predicted : 
868			850			688			536			539
4.61556108144234	4.531053572675788	4.30524444657108	3.3547390969980624	3.094794671266431

Confidence for each speaker previously identified : 
[0.9247348879371919, 0.14531547633706943, 1.7234659971196944, 2.8631708010446024]


########################################################
