Val file paths

In [1]:
#Folder to find .wav files
valdata_folder="/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/"
#Spk2utt mapping path
valspk2utt ="/mnt/datadrive/datasets/spkveri/mrt1/spk450/450-speakers/utt2spk"
#wav.scp path
valwavscp="/mnt/datadrive/datasets/spkveri/mrt1/spk450/450-speakers/wav.scp"

Unval file paths

In [2]:
#Folder to find .wav files
data_folder="/mnt/datadrive/datasets/spkveri/mrt1/btch03-no-empty/"
#Spk2utt mapping path
spk2utt ="/mnt/datadrive/datasets/spkveri/mrt1/btch03-no-empty/utt2spk"
#wav.scp path
wavscp="/mnt/datadrive/datasets/spkveri/mrt1/btch03-no-empty/wav.scp"

In [3]:
#Path to save the preprocessed data
output_folder="/mnt/training/silverbullet-testenv/test-02-bigexp/save/"

In [4]:
import pandas as pd
import torchaudio
import torch
import numpy as np 
import random

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def veri_test (test_df: pd.DataFrame, output_folder):
    
    #randomise the columns
   
    
    values = test_df['wav'].tolist()
    random.shuffle(values)
    test_df['shuffled_column'] = values

    list2=test_df["shuffled_column"].iloc[0]
    #print(str(list2.split("/")[8]))
    
    #fn to set the labels
    def get_label(list1,list2):
        uniq_id= list1
        shuf_col = list2
        label=""
        if (str(list1.split("_")[0]) == str(list2.split("/")[4])):
            label = 1
            #print(str(list1.split("_")[0]),str(list2.split("/")[8]),label)
        else:
            label = 0
            #print(str(list1.split("_")[0]),str(list2.split("/")[8]),label)
        return(int(label))


    test_df["label"] = test_df.apply(lambda x: get_label(x["ID"], x["shuffled_column"]), axis=1)
    
    test_df=test_df.drop(columns=["spk_id","ID"])
    
    veri_test = test_df.reindex(columns=['label', 'wav', 'shuffled_column'])
    
    #save the data frame as csv 
    print(len(veri_test))
    print("Saving veri_test")
    veri_test.to_csv(output_folder+"veri_test.txt", sep=" ", index=False, header=False)
    print("Saved veri_test")
    

In [6]:
def csv_prep (data_folder, df: pd.DataFrame):
    
    df["start"]=" "
    
    groups = df.groupby(df['ID'].apply(lambda x: x.split('_')[0]))
    
    output = []
    for spk, group in groups:
        for index, row in group.iterrows():
            id = row['ID'].split('_')[1]
            wav = data_folder +spk+ "/" + id +  ".wav"
            #print(wav)

    
            signal, fs = torchaudio.load(wav)
            signal = signal.squeeze(0)
            audio_duration = signal.shape[0] / 16000
            start_sample = 0
            stop_sample = signal.shape[0]
            stop_sample= int(stop_sample)
            #print(stop_sample)

            df.loc[index, 'start'] = start_sample
            df.loc[index, "stop"] = stop_sample
            df.loc[index, "duration"] = audio_duration 
 
            
    df['stop'] = df['stop'].astype(int)
    
    df_f = df.reindex(columns=['ID', 'duration', 'wav', "start","stop","spk_id"])

    #print("num_frames")
    #num_frames=df['stop']-df['start']
    #print(num_frames)
    
    return df_f

In [7]:
def split_dataframe(df):
    spk_id_groups = df.groupby('spk_id')
    final_train = pd.DataFrame(columns=df.columns)
    final_dev = pd.DataFrame(columns=df.columns)
    final_test = pd.DataFrame(columns=df.columns)
    final_enrol= pd.DataFrame(columns=df.columns)
    for name, group in spk_id_groups:
        group_size = group.shape[0]
        train_size = int(group_size * 0.5)
        dev_size = int(group_size * 0.2)
        test_size = int(group_size * 0.3)

        train_df = group.iloc[:train_size]
        test_df = group.iloc[train_size:train_size+test_size]
        dev_df = group.iloc[train_size+test_size:]

        final_train = final_train.append(train_df)
        final_dev = final_dev.append(dev_df)
        final_test = final_test.append(test_df)
        final_enrol= final_enrol.append(test_df)
    final_train.reset_index(drop=True, inplace=True)
    final_dev.reset_index(drop=True, inplace=True)
    final_test.reset_index(drop=True, inplace=True)
    final_enrol.reset_index(drop=True, inplace=True)
    return final_train,final_dev,final_test,final_enrol

Train - n*20 validated speakers and get val embeddings for the same 

In [8]:
def train(spk2utt,wavscp,data_folder, output_folder):

    #Make the main dataframe
    
    #spk2utt prep
    d1=pd.read_csv(spk2utt)
    d1.columns= ["col1"]
    d1[['ID', 'spk_id']] = d1['col1'].str.split(' ', expand=True)
    d1 = d1.drop(columns=["col1"])
    
    #wav.scp prep
    d2=pd.read_csv(wavscp)
    d2.columns= ["col1"]
    d2[['ID', 'wav']] = d2['col1'].str.split(' ', expand=True)
    d2 = d2.drop(columns=["col1"])
    
    #main df 
    d= pd.merge(d1, d2, on="ID")
    #print(d.head())
    
    #dealing with missing values/ changing path names in csv to actual path 
    d['wav'] = d['wav'].apply(lambda x: str(x).replace( '/home/sai/work/silver-bullet/batch-02/', '/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/'))
    print("Length of d",len(d))
    mask = d["wav"].isin(["/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386592.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386582.wav", "/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386570.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386581.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783114/281474985436102.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16794090/281474989055448.wav"])
    d = d[~mask]
    d['wav'] = d['wav'].apply(lambda x: str(x).replace('/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/'," /mnt/datadrive/datasets/spkveri/mrt1/btch03-no-empty/"))
    print("Length of new d",len(d)) 
    print("Made main data frame")

    # Get the unique id values
    unique_ids = d['spk_id'].unique()
    print(len(unique_ids))
    

    # Select the first n unique id values
    selected_ids = unique_ids[0:3]
    print(len(selected_ids))
    #print(selected_ids=="16783193")

    # Filter the DataFrame to select only rows with the selected id values
    train_selected_rows = d[d['spk_id'].isin(selected_ids)]

    #print(train_selected_rows)
    print(len(train_selected_rows))
    print("save the list")
    train_selected_rows.to_csv(output_folder+"train_selected_rows.csv", index= False)
    print("saved the list")
    #get train.csv now 

    #Prep TRAIN csv file - use for benchmarking
    print("start making the csv files")
    print("Saving train.csv")
    traindf=csv_prep(data_folder,train_selected_rows)
    traindf.to_csv(output_folder+"train.csv", index= False)
    print("Saved train.csv")

    #Prep VAL_EMB.CSV files (SAME THING AS TRAIN.CSV) - but use for emb generation
    print("start making the csv files")
    print("Saving val_emb.csv")
    traindf=csv_prep(data_folder,train_selected_rows)
    traindf.to_csv(output_folder+"val_emb.csv", index= False)
    print("Saved val_emb.csv")

    print("Done train/val_emb preprocessing!")
    

In [9]:
train(valspk2utt,valwavscp,valdata_folder,output_folder)

Length of d 11322
Length of new d 11316
Made main data frame
455
3
74
save the list
saved the list
start making the csv files
Saving train.csv
Saved train.csv
start making the csv files
Saving val_emb.csv
Saved val_emb.csv
Done train/val_emb preprocessing!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["start"]=" "
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[index, 'start'] = start_sample
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[index, "stop"] = stop_sample
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Test - Out of unvalidated 500 test random 10 out of 500 unval - 20val = 480 utterances per speaker 
Generate unval embeddings for n*480 speakers

In [10]:
def test(spk2utt,wavscp,data_folder, output_folder):
    #Make the main dataframe
    
    #spk2utt prep
    d1=pd.read_csv(spk2utt)
    d1.columns= ["col1"]
    d1[['ID', 'spk_id']] = d1['col1'].str.split(' ', expand=True)
    d1 = d1.drop(columns=["col1"])
    
    #wav.scp prep
    d2=pd.read_csv(wavscp)
    d2.columns= ["col1"]
    d2[['ID', 'wav']] = d2['col1'].str.split(' ', expand=True)
    d2 = d2.drop(columns=["col1"])
    
    #main df 
    d= pd.merge(d1, d2, on="ID")
    print(d.head())

    d['wav'] = d['wav'].apply(lambda x: str(x).replace( '/home/shreya/btch03-no-empty/', ' /mnt/datadrive/datasets/spkveri/mrt1/btch03-no-empty/'))
    print("Length of d",len(d))
    #mask = d["wav"].isin(["/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386592.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386582.wav", "/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386570.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386581.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783114/281474985436102.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16794090/281474989055448.wav"])
    #d = d[~mask]
    #print("Length of new d",len(d))
    #print(d)

    #test - RANDOM 10 UNVAL

    #val samples 3*20
    #Change path to wherever the train_selected_rows.csv obtained from the previous train fn is 
    # <output_folder>+train_selected_rows.csv
    filtered_df=pd.read_csv(output_folder+"train_selected_rows.csv")
    print(len(filtered_df))
    #print(filtered_df.head())

    #Unval 500-20= 480 
    #t = pd.DataFrame()t = d[~d.isin(filtered_df)]print(t.head())print(len(t))
    mask = d.index.isin(filtered_df.index)
    temp_df= d[~mask]
    temp_df= temp_df.reset_index(drop=True)
    print("mask total -20val len",len(temp_df))
    #print(temp_df["spk_id"]=="16793950")


    #selected ids array
    #selected_ids=["16793950","16801899","16794112"]
    selected_ids = filtered_df['spk_id'].unique()
    str_array = [str(x) for x in selected_ids]
    print(str_array)

    #get n from 455 - THIS DF IS USED TO GET UNVAL_EMB.CSV 
    n_df= temp_df.loc[temp_df['spk_id'].isin(str_array)]
    print("unval length of 3 speaker ids", len(n_df))
    #print(n_df)
    print("save the list")
    n_df.to_csv(output_folder+"test_selected_threeidsnoshuffling.csv", index= False)
    print("saved the list")

    #get 10 random from this list 
    # Group df by spk_id
    grouped = n_df.groupby('spk_id')

    #empty df to store the randomly selected rows
    final_df = pd.DataFrame(columns=n_df.columns)

    # Iterate over each group and select 10 random rows
    for name, group in grouped:
            final_df = final_df.append(group.sample(10))
    #print(final_df.head())
    print("len 10 from 3",len(final_df))

    #id= temp_df.loc[temp_df['spk_id']=="16794112"]
    #print(id)
    
    #Prep veri_test
    print("Start making veri_test")
    veri_test(final_df,output_folder)
    print("Saved veri_test")

    #For experiment
    print("Saving test.csv")
    testdf=csv_prep(data_folder,final_df)
    testdf.to_csv(output_folder+"test.csv",index= False)
    print("Saved test.csv") 

    print("Saving enrol.csv")
    testdf=csv_prep(data_folder,final_df)
    testdf.to_csv(output_folder+"enrol.csv",index= False)
    print("Saved enrol.csv")

    #For embeddings
    print("Saving unval_emb.csv")
    testdf=csv_prep(data_folder,n_df)
    testdf.to_csv(output_folder+"unval_emb.csv",index= False)
    print("Saved unval.csv") 

    

In [11]:
test(spk2utt,wavscp,data_folder, output_folder)

                         ID    spk_id  \
0  16793950_281474988390972  16793950   
1  16793950_281474988390973  16793950   
2  16793950_281474988390974  16793950   
3  16793950_281474988390975  16793950   
4  16793950_281474988390976  16793950   

                                                 wav  
0  /home/shreya/btch03-no-empty/16793950/28147498...  
1  /home/shreya/btch03-no-empty/16793950/28147498...  
2  /home/shreya/btch03-no-empty/16793950/28147498...  
3  /home/shreya/btch03-no-empty/16793950/28147498...  
4  /home/shreya/btch03-no-empty/16793950/28147498...  
Length of d 224000
74
mask total -20val len 223926
['16793950', '16801899', '16794112']
unval length of 3 speaker ids 1653
save the list
saved the list
len 10 from 3 30
Start making veri_test
30
Saving veri_test
Saved veri_test
Saved veri_test
Saving test.csv
Saved test.csv
Saving enrol.csv


  final_df = final_df.append(group.sample(10))
  final_df = final_df.append(group.sample(10))
  final_df = final_df.append(group.sample(10))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["start"]=" "
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[index, 'start'] = start_sample
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[index, "stop"] = stop_sample
A value is trying to be set on 

Saved enrol.csv
Saving unval_emb.csv
Saved unval.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stop'] = df['stop'].astype(int)


In [12]:
print("Done preprocessing!")

Done preprocessing!
