In [9]:
#Folder to find .wav files
data_folder="/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/"

#Spk2utt mapping path
spk2utt ="/mnt/datadrive/datasets/spkveri/mrt1/spk450/450-speakers/utt2spk"
#wav.scp path
wavscp="/mnt/datadrive/datasets/spkveri/mrt1/spk450/450-speakers/wav.scp"

#Path to save the preprocessed data
output_folder="/mnt/training/silverbullet-testenv/test-01-benchmarks/save/"

#Percentage split for train, dev, test respectively 
split=[0.1,0.0111111111, 0.8888888889]

In [10]:
import pandas as pd
import torchaudio
import torch
import numpy as np 
import random

In [11]:
def split_fn (df: pd.DataFrame, split):
    
    split_indices = [int(len(df) * percentage_split) for percentage_split in split]
    
    #Select rows
    print(split_indices)
    print("next index:", split_indices[1]+split_indices[0])
    
    train_df = df.iloc[:split_indices[0],:]
    validation_df = df.iloc[split_indices[0]:(split_indices[0]+split_indices[1])]
    test_df = df.iloc[(split_indices[0]+split_indices[1]):,:]
    enrol_df = df.iloc[(split_indices[0]+split_indices[1]):,:]

    print("Length of train_df", len(train_df))
    print("Length of dev",len(validation_df))
    print("Length of test", len(test_df))
    
    #print(train_df.head())
    #print(validation_df.head())
    
    return train_df, validation_df, test_df, enrol_df

In [12]:
def veri_test (test_df: pd.DataFrame, output_folder):

    final_df=pd.DataFrame()

    #matched samples

    matched_df = pd.DataFrame()
    
    for i in range(27):

        groups= test_df.groupby("spk_id")

        def select_random_pairs(group):
            wav_files = group['wav'].tolist()
            random_pairs = random.sample(wav_files, 2)
            return random_pairs

        # Apply the function to each group and convert the result to a list
        random_pairs_list = groups.apply(select_random_pairs).tolist()

        #print("Iteration")
        #print(len(random_pairs_list))
        #print(random_pairs_list)

        shufmatched_df = pd.DataFrame(random_pairs_list, columns = ['wav', 'shuffled_column'])
        #print(len(shuf_df))
        #print(shuf_df.head(2))

        matched_df = matched_df.append(shufmatched_df, ignore_index=True) 

    print("no of matched pairs",len(matched_df))

    # Define a function to extract the ID and spk_id
    def extract_id(wav):
        # Split the string into a list by '/'
        parts = wav.split('/')

        # Extract the speaker id and file number from the list
        spk_id = parts[-2]
        file_num = parts[-1].split('.')[0]

        # Concatenate the speaker id and file number to create the 'ID'
        ID = spk_id + '_' + file_num

        return ID, spk_id

    # Apply the function to the 'wav' column and store the results in two new columns
    matched_df['ID'], matched_df['spk_id'] = matched_df['wav'].apply(extract_id).str

    new_columns = ['ID', 'spk_id', 'wav',"shuffled_column"]
    matched_df = matched_df.reindex(columns=new_columns)

    #print(matched_df.head())


    #Unmatched sample
    unmatched_df = pd.DataFrame()

    for i in range(10):

        #sample one audio from each spk_id randomly 

        df_random = test_df.groupby('spk_id').apply(lambda x: x.sample(1)).drop_duplicates().reset_index(drop=True)
        #print(df_random)

        wav_list = df_random['wav'].tolist()
        f_list=[]

        for i in range(1104):
            l= random.sample(wav_list,2)
            #print(l)
            f_list.append(l)
        #print(f_list)
    
        shuf_df = pd.DataFrame(f_list, columns = ['wav', 'shuffled_column'])
        #print(shuf_df)

        # Define a function to extract the ID and spk_id
        def extract_id(wav):
            # Split the string into a list by '/'
            parts = wav.split('/')

            # Extract the speaker id and file number from the list
            spk_id = parts[-2]
            file_num = parts[-1].split('.')[0]

            # Concatenate the speaker id and file number to create the 'ID'
            ID = spk_id + '_' + file_num

            return ID, spk_id

        # Apply the function to the 'wav' column and store the results in two new columns
        shuf_df['ID'], shuf_df['spk_id'] = shuf_df['wav'].apply(extract_id).str

        new_columns = ['ID', 'spk_id', 'wav',"shuffled_column"]
        shuf_df = shuf_df.reindex(columns=new_columns)

        #print(shuf_df)

        unmatched_df = unmatched_df.append(shuf_df)
    print("No. of unmatched pairs",len(unmatched_df))


    
        
    #fn to set the labels
    def get_label(list1,list2):
        uniq_id= list1
        shuf_col = list2
        label=""
        if (str(list1.split("_")[0]) == str(list2.split("/")[8])):
            label = 1
            #print(str(list1.split("_")[0]),str(list2.split("/")[8]),label)
        else:
            label = 0
            #print(str(list1.split("_")[0]),str(list2.split("/")[8]),label)
        return(int(label))

    final_df = unmatched_df.append(matched_df, ignore_index=True)

    final_df["label"] = final_df.apply(lambda x: get_label(x["ID"], x["shuffled_column"]), axis=1)

    final_df=final_df.drop(columns=["spk_id","ID"])

    veri_test = final_df.reindex(columns=['label', 'wav', 'shuffled_column'])

    #save the data frame as csv 
    print(veri_test.head())
    print("length of veri test",len(veri_test))
    print("Saving veri_test")
    veri_test.to_csv(output_folder+"veri_test.txt", sep=" ", index=False, header=False)
    print("Saved veri_test")
    
    

csv prep

In [13]:
def csv_prep (data_folder, df: pd.DataFrame):
    
    df["start"]=" "
    
    groups = df.groupby(df['ID'].apply(lambda x: x.split('_')[0]))
    
    output = []
    for spk, group in groups:
        for index, row in group.iterrows():
            id = row['ID'].split('_')[1]
            wav = data_folder +spk+ "/" + id +  ".wav"
            #print(wav)

    
            signal, fs = torchaudio.load(wav)
            signal = signal.squeeze(0)
            audio_duration = signal.shape[0] / 16000
            start_sample = 0
            stop_sample = signal.shape[0]
            stop_sample= int(stop_sample)
            #print(stop_sample)

            df.loc[index, 'start'] = start_sample
            df.loc[index, "stop"] = stop_sample
            df.loc[index, "duration"] = audio_duration 
 
            
    df['stop'] = df['stop'].astype(int)
    
    df_f = df.reindex(columns=['ID', 'duration', 'wav', "start","stop","spk_id"])

    #print("num_frames")
    #num_frames=df['stop']-df['start']
    #print(num_frames)
    
    return df_f

In [14]:
def main (split,spk2utt,wavscp,data_folder, output_folder):
    
    #Make the main dataframe
    
        #spk2utt prep
        d1=pd.read_csv(spk2utt)
        d1.columns= ["col1"]
        d1[['ID', 'spk_id']] = d1['col1'].str.split(' ', expand=True)
        d1 = d1.drop(columns=["col1"])
        
        #wav.scp prep
        d2=pd.read_csv(wavscp)
        d2.columns= ["col1"]
        d2[['ID', 'wav']] = d2['col1'].str.split(' ', expand=True)
        d2 = d2.drop(columns=["col1"])
        
        #main df 
        d= pd.merge(d1, d2, on="ID")
        #print(d.head())
        d['wav'] = d['wav'].apply(lambda x: str(x).replace( '/home/sai/work/silver-bullet/batch-02/', '/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/'))
        print("Length of d",len(d))
        mask = d["wav"].isin(["/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386592.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386582.wav", "/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386570.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386581.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783114/281474985436102.wav","/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16794090/281474989055448.wav"])
        d = d[~mask]
        print("Length of new d",len(d))

        print("Made main data frame")
        
        
    #split fn 
        print("Split into train dev test enrol")
        train_df, validation_df, test_df, enrol_df = split_fn(d, split)
        print("Done")

    #Prep veri_test
        print("Start making veri_test")
        veri_test(test_df,output_folder)
        print("Saved veri_test")


    #Prep 4 csv files
        print("start making the csv files")
        print("Saving train.csv")
        traindf=csv_prep(data_folder,train_df)
        traindf.to_csv(output_folder+"train.csv", index= False)
        print("Saved train.csv")
        
        print("Saving dev.csv")
        devdf=csv_prep(data_folder,validation_df)
        devdf.to_csv(output_folder+"dev.csv",index= False)
        print("Saved dev.csv")
        
        print("Saving test.csv")
        testdf=csv_prep(data_folder,test_df)
        testdf.to_csv(output_folder+"test.csv",index= False)
        print("Saved test.csv")
        
        print("Saving enrol.csv")
        enroldf=csv_prep(data_folder,enrol_df)

        #print(enroldf.loc[enroldf['wav'] == '/mnt/datadrive/datasets/spkveri/mrt1/spk450/wav/16783193/281474985386592.wav'])

        enroldf.to_csv(output_folder+"enrol.csv",index= False)
        print("Saved enrol.csv")

        print("-----------------Done preprocessing!--------------------") 
    

In [15]:
main(split,spk2utt,wavscp,data_folder,output_folder)

Length of d 11322
Length of new d 11316
Made main data frame
Split into train dev test enrol
[1131, 125, 10058]
next index: 1256
Length of train_df 1131
Length of dev 125
Length of test 10060
Done
Start making veri_test


  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df, ignore_index=True)
  matched_df = matched_df.append(shufmatched_df,

no of matched pairs 10935


  shuf_df['ID'], shuf_df['spk_id'] = shuf_df['wav'].apply(extract_id).str
  unmatched_df = unmatched_df.append(shuf_df)
  shuf_df['ID'], shuf_df['spk_id'] = shuf_df['wav'].apply(extract_id).str
  unmatched_df = unmatched_df.append(shuf_df)
  shuf_df['ID'], shuf_df['spk_id'] = shuf_df['wav'].apply(extract_id).str
  unmatched_df = unmatched_df.append(shuf_df)
  shuf_df['ID'], shuf_df['spk_id'] = shuf_df['wav'].apply(extract_id).str
  unmatched_df = unmatched_df.append(shuf_df)
  shuf_df['ID'], shuf_df['spk_id'] = shuf_df['wav'].apply(extract_id).str
  unmatched_df = unmatched_df.append(shuf_df)
  shuf_df['ID'], shuf_df['spk_id'] = shuf_df['wav'].apply(extract_id).str
  unmatched_df = unmatched_df.append(shuf_df)
  shuf_df['ID'], shuf_df['spk_id'] = shuf_df['wav'].apply(extract_id).str
  unmatched_df = unmatched_df.append(shuf_df)
  shuf_df['ID'], shuf_df['spk_id'] = shuf_df['wav'].apply(extract_id).str
  unmatched_df = unmatched_df.append(shuf_df)
  shuf_df['ID'], shuf_df['spk_id'] = shu

No. of unmatched pairs 11040
   label                                                wav  \
0      0  /mnt/datadrive/datasets/spkveri/mrt1/spk450/wa...   
1      0  /mnt/datadrive/datasets/spkveri/mrt1/spk450/wa...   
2      0  /mnt/datadrive/datasets/spkveri/mrt1/spk450/wa...   
3      0  /mnt/datadrive/datasets/spkveri/mrt1/spk450/wa...   
4      0  /mnt/datadrive/datasets/spkveri/mrt1/spk450/wa...   

                                     shuffled_column  
0  /mnt/datadrive/datasets/spkveri/mrt1/spk450/wa...  
1  /mnt/datadrive/datasets/spkveri/mrt1/spk450/wa...  
2  /mnt/datadrive/datasets/spkveri/mrt1/spk450/wa...  
3  /mnt/datadrive/datasets/spkveri/mrt1/spk450/wa...  
4  /mnt/datadrive/datasets/spkveri/mrt1/spk450/wa...  
length of veri test 21975
Saving veri_test
Saved veri_test
Saved veri_test
start making the csv files
Saving train.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["start"]=" "
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[index, 'start'] = start_sample
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[index, "stop"] = stop_sample
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Saved train.csv
Saving dev.csv
Saved dev.csv
Saving test.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stop'] = df['stop'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["start"]=" "
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[index, 'start'] = start_sample
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa

Saved test.csv
Saving enrol.csv
Saved enrol.csv
-----------------Done preprocessing!--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stop'] = df['stop'].astype(int)


In [16]:
print("Done preprocessing!")

Done preprocessing!
