In [None]:
# Fetch the deep-speaker source tree. A later cell `%cd`s into it before the
# notebook imports `audio`, `batcher`, `constants`, `conv_models` and `test`.
# NOTE(review): the clone is not pinned to a commit, so upstream changes may
# alter results — consider checking out a known-good revision.
!git clone https://github.com/philipperemy/deep-speaker.git

Cloning into 'deep-speaker'...
remote: Enumerating objects: 1975, done.
remote: Counting objects: 100% (88/88), done.
remote: Compressing objects: 100% (72/72), done.
remote: Total 1975 (delta 39), reused 38 (delta 11), pack-reused 1887
Receiving objects: 100% (1975/1975), 81.47 MiB | 41.67 MiB/s, done.
Resolving deltas: 100% (1028/1028), done.


In [None]:
# Mount Google Drive at /content/drive. Later cells read the model checkpoint
# and the 'Voice' audio folder from '/content/drive/My Drive'.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when Drive is
# already mounted, making this cell safe to re-run — confirm in the Colab docs.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install pydub
!apt-get install ffmpeg

Collecting pydub
  Downloading https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 31 not upgraded.


In [None]:
%cd deep-speaker/
!pip install -r requirements.txt


/content/deep-speaker


In [None]:
import random

import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity

# Seed the stdlib and NumPy global RNGs with the same value so that repeated
# runs draw the same random numbers.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)

# Pre-trained checkpoint, expected on the mounted Google Drive.
# Download: https://drive.google.com/file/d/1F9NvdrarWZNktdX9KlRYWWHDwRkip_aP
# Mirror for Chinese users: https://share.weiyun.com/V2suEUVh
CHECKPOINT_PATH = '/content/drive/My Drive/ResCNN_triplet_training_checkpoint_265.h5'

# Build the network, then restore its weights from the checkpoint file.
model = DeepSpeakerModel()
model.m.load_weights(CHECKPOINT_PATH, by_name=True)

In [None]:
import os
# Enrollment: compute one embedding per audio file found under
#   <audio_path>/<speaker name>/<audio file>
# and remember which speaker each embedding belongs to.
audio_path = '/content/drive/My Drive/Voice'

# os.listdir() returns entries in arbitrary order. Sorting makes the order of
# `embeddings`/`names` stable between runs, and also the order in which the
# seeded RNGs are consumed (sample_from_mfcc presumably samples at random —
# confirm in batcher.py). Entries that are not directories are skipped, since
# listing them as speaker folders would fail.
speakers = sorted(
  entry for entry in os.listdir(audio_path)
  if os.path.isdir(os.path.join(audio_path, entry))
)

embeddings = []
names = []
for speaker in speakers:
  full_path = os.path.join(audio_path, speaker)
  # Named `audio_files` (not `files`) to avoid clashing with the
  # `google.colab.files` module imported further down the notebook.
  audio_files = sorted(
    os.path.join(full_path, entry) for entry in os.listdir(full_path)
  )
  for f in audio_files:
    if not os.path.isfile(f):
      continue  # ignore nested folders; only files are passed to read_mfcc
    mfcc = sample_from_mfcc(read_mfcc(f, SAMPLE_RATE), NUM_FRAMES)
    # Add a leading batch axis of size 1 before calling the model.
    embedding = model.m.predict(np.expand_dims(mfcc, axis=0))
    embeddings.append(embedding)
    names.append(speaker)

# Parallel arrays: embeddings[i] was computed from a recording of names[i].
embeddings = np.array(embeddings)
names = np.array(names)
print(f'embeddings shape: {embeddings.shape} and names shape: {names.shape}')


embeddings shape: (29, 1, 512) and names shape: (29,)


In [None]:
from collections import Counter
def predict(embedding, k=1, top_n=10, score_offset=0.1):
  """Identify the enrolled speaker closest to ``embedding`` by a k-nearest vote.

  Scores ``embedding`` against every entry of the notebook-level ``embeddings``
  array (labelled by the parallel ``names`` array) with
  ``batch_cosine_similarity``, ranks the entries from most to least similar
  (ties broken by speaker name) and returns the most common label among the
  ``k`` best entries.

  Args:
    embedding: Query embedding, in whatever form ``batch_cosine_similarity``
      accepts as its second argument. Each call must yield a single score.
    k: Number of top-ranked entries that take part in the vote (>= 1).
    top_n: Number of top-ranked entries listed in the returned report.
    score_offset: Amount subtracted from every reported score. The default of
      0.1 keeps the previously hard-coded adjustment; pass 0 to report the raw
      similarities. It has no influence on the vote.

  Returns:
    A tuple ``(label, report)``. ``label`` is the winning speaker name and
    ``report`` is an object array of shape ``(2 * m, 1)``, where ``m`` is the
    number of listed entries, alternating adjusted score and speaker name.

  Raises:
    ValueError: If ``k`` is smaller than 1 or there are no enrolled embeddings.
  """
  if k < 1:
    raise ValueError(f'k must be at least 1, got {k}')

  scored = [
    (batch_cosine_similarity(enrolled, embedding), speaker)
    for enrolled, speaker in zip(embeddings, names)
  ]
  if not scored:
    raise ValueError('no enrolled embeddings to compare against')

  # Sort on an explicit (score, name) key instead of comparing the raw
  # (ndarray, str) tuples; .item() extracts the single similarity value.
  ranked = sorted(
    scored,
    key=lambda pair: (np.asarray(pair[0]).item(), pair[1]),
    reverse=True,
  )

  # Fill an object array by hand: np.array() on these ragged (ndarray, str)
  # pairs raises ValueError on NumPy >= 1.24.
  listed = ranked[:top_n]
  report = np.empty((2 * len(listed), 1), dtype=object)
  for row, (score, speaker) in enumerate(listed):
    report[2 * row, 0] = score - score_offset
    report[2 * row + 1, 0] = speaker

  votes = Counter(speaker for _, speaker in ranked[:k])
  return votes.most_common(1)[0][0], report


In [None]:
import numpy as np
from google.colab import files
from keras.preprocessing import image
import librosa
import IPython.display as ipd

# Ask the user to upload the recording to identify.
uploaded = files.upload()
if not uploaded:
    # Fail here with a clear message instead of a NameError on `path` below.
    raise ValueError('No file was uploaded; upload an audio file and re-run this cell.')

# Only one clip is analysed: when several files are uploaded, the last one is
# used. `fn` and `path` are read again by the prediction cell.
fn = list(uploaded.keys())[-1]
# NOTE(review): assumes files.upload() saved the file into the current working
# directory, as the recorded "Saving ... to ..." output suggests — confirm.
path = os.path.join(os.getcwd(), fn)

# Load the clip and show an inline audio player as the cell's output.
x, sr = librosa.load(path)
ipd.Audio(x, rate=sr)


Saving tuan_test_01.wav to tuan_test_01.wav


In [None]:
import time
# Stage 1: uploaded audio file -> MFCC features -> embedding.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
mfcc = sample_from_mfcc(read_mfcc(path, SAMPLE_RATE), NUM_FRAMES)
embedding = model.m.predict(np.expand_dims(mfcc, axis=0))
end_time = time.perf_counter()
print(f'Time for embedding feature: {end_time - start_time}')

# Stage 2: look the embedding up among the enrolled speakers.
start_time = time.perf_counter()
className, probability = predict(embedding)
end_time = time.perf_counter()
print(f"{fn} is {className} predicted in {end_time - start_time}\n")
# NOTE: despite its name, `probability` is not a probability. It is predict()'s
# second return value: the top-ranked similarity scores (offset inside
# predict()) interleaved with the matching speaker names.
print(f"probability:\n {probability}")

Time for embedding feature: 0.1960465908050537
tuan_test_01.wav is TuanAnhTA predicted in 0.0017011165618896484

probability:
 [[array([0.89999974], dtype=float32)]
 ['TuanAnhTA']
 [array([0.89999974], dtype=float32)]
 ['TuanAnhTA']
 [array([0.8044851], dtype=float32)]
 ['TuanAnhTA']
 [array([0.8044851], dtype=float32)]
 ['TuanAnhTA']
 [array([0.6905955], dtype=float32)]
 ['TuanAnhTA']
 [array([0.6716699], dtype=float32)]
 ['TuanAnhTA']
 [array([0.5815058], dtype=float32)]
 ['TuanAnhTA']
 [array([0.52464354], dtype=float32)]
 ['TuanAnhTA']
 [array([0.41661736], dtype=float32)]
 ['Sam']
 [array([0.41661736], dtype=float32)]
 ['Sam']]


  if __name__ == '__main__':
