# Speaker recognition using singing or speaking voice

This project aims at recognizing speakers through a joint representation learning of singing and speaking voices from them. 

In [2]:
import pandas as pd
import numpy as np
import torch
import json

### JukeBox dataset analysis

[Voxceleb Link](https://mm.kaist.ac.kr/datasets/voxceleb/#downloads)

In [3]:
metadata = pd.read_csv("./jukebox_metadata.csv")

#creating artist id and name pairs
idname_df = metadata[['artist_id', 'artist_name']]
ids = list(idname_df['artist_id'])
names = list(idname_df['artist_name'])

id_name_dict = {}

for i in range(0, len(ids)):
    
    if ids[i] in id_name_dict:
        continue
    else:
        id_name_dict[int(ids[i])] = names[i]

# with open("./jukebox_id_artist.json", "w+") as data:
#     json.dump(id_name_dict, data)

### Jukebox and VoxCeleb common artists

In [4]:
#jukebox artists
with open("./jukebox_id_artist.json", 'r') as juke:
    jukebox = json.load(juke)

jukebox_arts = [j for i,j in jukebox.items()]
jukebox_arts = [i.replace(" ", "_").lower() for i in jukebox_arts]
print(jukebox_arts[:10])


#voxceleb artists
voxDf = pd.read_csv("./vox1_meta.csv", delimiter="\t")
vox_arts = list(voxDf["VGGFace1 ID"])
vox_arts = [k.lower() for k in vox_arts]
print(vox_arts[:10])

#commons
commons = list(set(jukebox_arts).intersection(set(vox_arts)))
# with open("commons_in_vox_and_jukebox.txt", 'w+') as commonData:

#     for artist in commons:
#         commonData.write(artist)
#         commonData.write("\n")



['a-lin', 'adira', 'adrian_minune', 'afgan', 'ai', 'aiko', 'aimer', 'albert_west', 'alex_mica', 'alexandra_stan']
['a.j._buckley', 'a.r._rahman', 'aamir_khan', 'aaron_tveit', 'aaron_yoo', 'abbie_cornish', 'abigail_breslin', 'abigail_spencer', 'adam_beach', 'adam_brody']


### housekeeping for arranging vox and juke data

In [5]:
jukemeta = pd.read_csv("jukebox_metadata.csv")
voxmeta = pd.read_csv("vox1_meta.csv", delimiter='\t')
voxmeta['VGGFace1 ID'] = voxmeta['VGGFace1 ID'].str.casefold()
jukemeta['artist_name'] = jukemeta['artist_name'].str.casefold()
jukemeta['artist_name'] = jukemeta['artist_name'].apply(lambda x: x.replace(" ", "_"))

In [6]:
### creating dictionary for voxid:artist pair

voxcommon = voxmeta[voxmeta['VGGFace1 ID'].isin(commons)]
voxid_celeb = {}
for i in range(0, len(voxcommon)):
    voxid_celeb[voxcommon.iloc[i]['VoxCeleb1 ID']]=voxcommon.iloc[i]['VGGFace1 ID']

# with open("./vox_id_celeb.json", 'w+') as voxidartist:
    
#     json.dump(voxid_celeb, voxidartist)

In [7]:
### creating dictionary for jukeID:artist pair
jukecommon = jukemeta[jukemeta['artist_name'].isin(commons)]
#print(list(jukecommon))

jukeid_celeb = {}
for c in commons:
    jukeid = str(jukecommon[jukecommon['artist_name'] == c]['artist_id'].unique()[0])
    jukeid_celeb[jukeid] = c

# with open("./juke_id_celeb.json", 'w+') as jukeidartist:
#     json.dump(jukeid_celeb, jukeidartist)



In [8]:
jukecommon[:10]

Unnamed: 0.1,Unnamed: 0,song_name,artist_id,artist_name,partition,language_label,gender_label,full_path
2411,2411,0_1.wav,75,charlotte_gainsbourg,AUXILIARY,French,female,AUXILIARY/75/0_1.wav
2412,2412,0_2.wav,75,charlotte_gainsbourg,AUXILIARY,French,female,AUXILIARY/75/0_2.wav
2413,2413,0_3.wav,75,charlotte_gainsbourg,AUXILIARY,French,female,AUXILIARY/75/0_3.wav
2414,2414,0_4.wav,75,charlotte_gainsbourg,AUXILIARY,French,female,AUXILIARY/75/0_4.wav
2415,2415,0_5.wav,75,charlotte_gainsbourg,AUXILIARY,French,female,AUXILIARY/75/0_5.wav
14825,14825,0_1.wav,359,amanda_seyfried,TRAIN,English,female,TRAIN/359/0_1.wav
14826,14826,0_2.wav,359,amanda_seyfried,TRAIN,English,female,TRAIN/359/0_2.wav
14827,14827,0_3.wav,359,amanda_seyfried,TRAIN,English,female,TRAIN/359/0_3.wav
14828,14828,0_4.wav,359,amanda_seyfried,TRAIN,English,female,TRAIN/359/0_4.wav
14829,14829,0_5.wav,359,amanda_seyfried,TRAIN,English,female,TRAIN/359/0_5.wav


### Metadata Creation


In [9]:
from src.utilities import Utilities

with open("juke_id_celeb_commons.json", 'r') as jk:
    juke = json.load(jk)

with open("vox_id_celeb_commons.json", 'r') as vo:
    vox = json.load(vo)

29

In [14]:
#jukemeta = full metadata for juke
#voxmeta = full metadata for voxceleb

###getting jukemeta information
for artist in commons:
    print(artist + ": "+ str(len((jukemeta[jukemeta['artist_name'] == artist]))))

marie_osmond: 7
lea_salonga: 89
bruno_mars: 213
smokey_robinson: 14
miley_cyrus: 56
amanda_seyfried: 27
anne_hathaway: 8
josh_groban: 11
nelly_furtado: 9
nicole_scherzinger: 43
rita_ora: 24
cyndi_lauper: 13
stevie_wonder: 167
troye_sivan: 26
meat_loaf: 47
chris_martin: 4
carrie_underwood: 30
janet_jackson: 8
cher: 143
lea_michele: 47
jack_black: 118
charlotte_gainsbourg: 5
kylie_minogue: 60
sammy_davis_jr.: 14
blake_shelton: 5
lorde: 119
kenny_rogers: 126
jill_scott: 10
jessie_j: 7


### TRAIN, TEST, AUXILIARY in JukeBox

In [15]:
jukemeta = pd.read_csv("./jukebox_metadata.csv")
jukemeta['artist_name'] = jukemeta['artist_name'].str.casefold()
jukemeta['artist_name'] = jukemeta['artist_name'].apply(lambda x: x.replace(" ", "_"))

In [16]:
commondf = jukemeta[jukemeta['artist_name'].isin(commons)]
auxiliarys = list(commondf[commondf['partition'] == "AUXILIARY"]["artist_name"].unique())
auxiliary_ID_artist = {}

for aux in auxiliarys:
    idNum = commondf[commondf['artist_name']==aux]['artist_id'].unique()[0]
    auxiliary_ID_artist[str(idNum)] = aux 

In [17]:
aux_ids = [auxid for auxid, name in auxiliary_ID_artist.items()]
aux_ids

['75', '372', '649', '663', '680', '914']

In [35]:
# with open("./auxiliarys.json", 'w+') as auxis:
#     json.dump(auxiliary_ID_artist, auxis)

In [18]:
for artist in auxiliarys:
    print(artist + ": "+ str(len((jukemeta[jukemeta['artist_name'] == artist]))))

charlotte_gainsbourg: 5
anne_hathaway: 8
jack_black: 118
janet_jackson: 8
jessie_j: 7
nelly_furtado: 9


### arranging the data

The data is arranged using a script in `/netscratch/rsharma/voice-recognition-speak-sing/src/utilities.py`. I know I am not an organized being in this sense OKAY! Stop looking at me like a criminal! 

## Meta data for the data arranged in the previous step.

In [28]:
##### Metadata for files and the data size per artists for voxceleb and Jukebox
def get_id_from_artistName(dic, val):
    return [k for k, v in dic.items() if v == val][0]

df_columns = ['artist_name', 'voxceleb_id', 'jukebox_id', 'singing_time_sec', 'speaking_time_sec', 'vox_path', 'juke_path']

In [46]:
import wave
import os

def getDuration(filePath):
    with wave.open(filePath) as mywav:
        duration_seconds = mywav.getnframes() / mywav.getframerate()
    return duration_seconds

speakingDir = "/netscratch/rsharma/voice-recognition-speak-sing/data/speaking/"
singingDir = "/netscratch/rsharma/voice-recognition-speak-sing/data/singing/"
metadata = []


for artist in commons:
    if artist in auxiliarys:
        continue
    voxID = get_id_from_artistName(vox, artist)
    jukeID = get_id_from_artistName(juke, artist)
    
    #singing_metadata
    print("Metadata for: " + artist)
    juke_path = singingDir + artist
    sing_artist_files = [juke_path + "/" + i for i in  os.listdir(singingDir + artist)]
    sing_duration = 0
    for singFile in sing_artist_files:
        sing_duration = duration + getDuration(singFile)
    
    print("Sing duration: " + str(sing_duration))

    
    #speaking_metadata
    vox_path = speakingDir + artist
    speak_artist_folders = [vox_path + "/" + j for j in os.listdir(speakingDir + artist)]
    speak_duration = 0
    for folder in speak_artist_folders:
        speak_artist_files = [folder + "/" + k for k in os.listdir(folder)]
        
        for speakFile in speak_artist_files:
            speak_duration += getDuration(speakFile)
    
    print("Speak duration: " + str(speak_duration))
            
    
    row = [artist, voxID, jukeID, str(int(sing_duration)), str(int(speak_duration)), vox_path, juke_path]
    metadata.append(row)

Metadata for: marie_osmond
Sing duration: 239.76
Speak duration: 690.4849375000001
Metadata for: lea_salonga
Sing duration: 239.76
Speak duration: 1957.6966874999998
Metadata for: bruno_mars
Sing duration: 239.76
Speak duration: 611.2457499999999
Metadata for: smokey_robinson
Sing duration: 239.01
Speak duration: 2447.694937500002
Metadata for: miley_cyrus
Sing duration: 239.76
Speak duration: 2945.8579375000004
Metadata for: amanda_seyfried
Sing duration: 239.76
Speak duration: 1132.1305000000002
Metadata for: josh_groban
Sing duration: 239.28
Speak duration: 2555.0171875000015
Metadata for: nicole_scherzinger
Sing duration: 239.76
Speak duration: 1164.6494374999997
Metadata for: rita_ora
Sing duration: 239.76
Speak duration: 933.5259374999999
Metadata for: cyndi_lauper
Sing duration: 239.76
Speak duration: 1311.6532499999992
Metadata for: stevie_wonder
Sing duration: 239.76
Speak duration: 525.0450000000001
Metadata for: troye_sivan
Sing duration: 239.76
Speak duration: 761.605375
Me

In [48]:
metadata_df = pd.DataFrame(data=metadata, columns=df_columns)
metadata_df.to_csv("/netscratch/rsharma/voice-recognition-speak-sing/data/metdata.csv")