### Install and import required packages

**Resemblyzer** - to derive a "high-level representation" of a voice through a deep learning model (https://github.com/resemble-ai/Resemblyzer)

In [None]:
!pip install resemblyzer
#You may be prompted to restart the runtime after the installation. Look out for that in the output below.

In [None]:
from resemblyzer import VoiceEncoder, preprocess_wav
import numpy as np
from umap import UMAP
import matplotlib.pyplot as plt
import librosa
import os
import sys
import pickle as pkl
import pyaudio
import wave
import struct

### To enable microphone input
(source: https://gist.github.com/Anwarvic/e843ed55c550784aa6f78faa505ca3ff)

In [None]:
# all imports
from io import BytesIO
from base64 import b64decode
from google.colab import output
from IPython.display import Javascript

# JavaScript source that record() injects into the notebook front end via
# display(Javascript(RECORD)). It defines three helpers in the browser:
#   sleep(time)  - promise that resolves after `time` milliseconds (setTimeout).
#   b2text(blob) - reads a Blob with FileReader.readAsDataURL and resolves with
#                  the resulting data URL string.
#   record(time) - requests an audio stream with getUserMedia, records it with a
#                  MediaRecorder for `time` milliseconds, then resolves with the
#                  recorded chunks encoded as a data URL.
# The string is executed by the browser as-is, so it must stay valid JavaScript.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec=3):
  """Capture audio from the browser microphone and return it as samples.

  Injects the RECORD JavaScript into the notebook, asks the browser to record
  for `sec` + 1 seconds, saves the returned bytes to 'audio.wav' in the current
  working directory and loads that file back with librosa.

  Args:
    sec: requested recording length in seconds; one extra second is added.

  Returns:
    The audio array produced by librosa.load (its sample rate is discarded).
  """
  print("Speak Now...")
  display(Javascript(RECORD))

  # The JavaScript side expects milliseconds; pad the request by one second.
  duration_ms = (sec + 1) * 1000
  data_url = output.eval_js('record(%d)' % duration_ms)
  print("Done Recording !")

  # A data URL looks like "<header>,<base64 payload>"; keep only the payload.
  raw_bytes = b64decode(data_url.split(',')[1])
  with open('audio.wav','wb') as audio_file:
    audio_file.write(raw_bytes)

  samples, _sample_rate = librosa.load('audio.wav')
  return samples

### Clone the GitHub repository to download the required pretrained files

In [None]:
# Downloads the repository into ./sing-alike-interactive; the next cell loads
# reducer.sav, knn_model.sav, projections.npy and labels.npy from that folder.
!git clone https://github.com/rohitma38/sing-alike-interactive.git

In [None]:
# Resemblyzer voice encoder used to turn a recording into an embedding.
encoder = VoiceEncoder()

# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load these files from a repository you trust.
# The `with` blocks close each file handle as soon as its object is loaded.
with open('sing-alike-interactive/reducer.sav', 'rb') as reducer_file:
  reducer = pkl.load(reducer_file)
with open('sing-alike-interactive/knn_model.sav', 'rb') as knn_file:
  knn = pkl.load(knn_file)

# Reference projections and their labels, plotted as the background clusters.
projs = np.load('sing-alike-interactive/projections.npy')
labels = np.load('sing-alike-interactive/labels.npy')

# Singer names and the plot coordinates at which each name is annotated
# (the two lists are paired by position).
singers = ['Arijit', 'Atif', 'Lata', 'Chinmayi']
cluster_coords = [(12.3,1.3),(12.3,13),(-4,-2),(-4,16)]

def plot_projection(projs_gt, labels_gt, projs_test):
  """Scatter the reference projections and overlay the test projections.

  Args:
    projs_gt: 2-D array of reference points; column 0 is plotted as x and
      column 1 as y.
    labels_gt: one value per reference point, used to colour that point.
    projs_test: iterable of points whose first two entries are x and y; each
      is drawn as a black '+' marker. May be empty.

  Uses the module-level `singers` and `cluster_coords` lists to annotate the
  plot with one name per cluster, then shows the figure.
  """
  fig = plt.figure()
  ax = fig.add_subplot(111)
  # Colour by the labels passed in by the caller rather than the global `labels`.
  ax.scatter(projs_gt[:, 0], projs_gt[:, 1], c=labels_gt)

  for singer_name, coords in zip(singers, cluster_coords):
    ax.annotate(singer_name, coords)

  for item in projs_test:
    ax.scatter(item[0], item[1], color='black', marker='+')
  plt.show()

In [None]:
# Collects the [x, y] projection of every recording made so far, so that each
# new plot also shows the earlier attempts. Re-running this cell clears it.
prediction_history = list()

### Aaand, time to play!

Running the cell below will record about 10 seconds of audio from your mic and then compare your voice to the 4 popular singers currently stored in the system - Arijit Singh (red), Atif Aslam (blue), Lata Mangeshkar (green) and Chinmayi (cyan). Your voice sample will appear as a '+' mark on a 2-D plane containing the 4 clusters of the artists.

In [None]:
# Record a sample, embed it, project it to 2-D, report the nearest singer and
# redraw the plot with every sample recorded so far.
print('Recording')
data = record(10)
decoded = data.copy()

# The waveform is scaled by its maximum before embedding. An empty or silent
# recording has no positive peak and would turn the division into NaNs/infs,
# so stop early with a clear message instead.
peak = np.max(decoded) if decoded.size else 0
if not peak > 0:
  raise ValueError('No audio was captured - check your microphone and run this cell again.')

embed = encoder.embed_utterance(decoded / peak)
proj = reducer.transform([embed])

# Print the actual prediction (the original printed the expression as text).
# NOTE(review): `knn` is unpickled, presumably a scikit-learn classifier whose
# predict() returns one label per input row - confirm. If the labels are integer
# indices into `singers`, this could print singers[predicted_label] instead.
predicted_label = knn.predict(proj)[0]
print('You sound most like:', predicted_label)

prediction_history.append([proj[0][0], proj[0][1]])
plot_projection(projs, labels, prediction_history)

Run the cell above repeatedly to keep playing with this demo!
One fun thing to try is to check how your voice sample's embedding changes when you sound male versus when you sound female.
Have fun! :D