In [None]:
def audio_augment_by_label(chunked_data_df, train_concat_df, label, num_concat, st_index, random_state=None):
    """Create augmented audio files by concatenating word-level clips of one class.

    Word-level clips spoken by the training speakers of the requested class are
    grouped by age, shuffled within each age group, and concatenated
    ``num_concat`` clips at a time into new WAV files. The last group of an age
    may contain fewer clips. Call this once per class (0 and 1).

    Args:
        chunked_data_df: Word-level DataFrame. The columns ``id``, ``age``,
            ``audio`` (path of the clip) and ``disease_type`` are read.
        train_concat_df: Speaker-level DataFrame; only used to extract the
            ``id`` values of speakers whose ``disease_type`` equals ``label``.
        label: Class to augment, TD/SSD encoded as 0/1.
        num_concat: Number of word clips joined into one output audio file.
        st_index: Base value for the generated ids / directory names; the
            first generated file gets ``st_index + 1``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` keeps the shuffle random.

    Returns:
        DataFrame with the columns ``audio``, ``disease_type``, ``age`` and
        ``id``, one row per augmented file written to disk.

    Raises:
        ValueError: If ``num_concat`` is smaller than 1.
    """
    if num_concat < 1:
        raise ValueError(f"num_concat must be a positive integer, got {num_concat!r}")

    # Keep only the word clips spoken by training speakers of the requested class.
    label_speakers = train_concat_df.loc[train_concat_df['disease_type'] == label, 'id'].unique()
    train_data_df = chunked_data_df[chunked_data_df['id'].isin(label_speakers)]

    new_path_prefix = f"/home/selinawisco/whisper/data/kochild/augmented_{num_concat}"
    new_speaker_id = st_index  # base of the generated ids
    inc = 0  # running offset added to the base id; shared across all age groups

    # Column-wise accumulator for the returned metadata table.
    data_dict = {
        'audio': [],
        'disease_type': [],
        'age': [],
        'id': [],
    }

    for _, age_df in train_data_df.groupby('age'):
        # Shuffle so that each augmented file mixes clips at random within an age.
        shuffled_df = age_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

        # Metadata recorded for every file produced from this age group.
        curr_age = shuffled_df['age'].iloc[0]
        curr_label = shuffled_df['disease_type'].iloc[0]

        # Positional, end-exclusive slicing: exactly num_concat clips per group
        # and no clip shared between neighbouring groups (label-based ``.loc``
        # slicing is end-inclusive and would return num_concat + 1 rows).
        for start in range(0, len(shuffled_df), num_concat):
            clip_paths = shuffled_df['audio'].iloc[start:start + num_concat].tolist()

            # The first clip fixes the sample rate; the others are resampled to it.
            first_clip, sr = librosa.load(clip_paths[0], sr=None)
            clips = [first_clip]
            clips.extend(librosa.load(clip_path, sr=sr)[0] for clip_path in clip_paths[1:])
            full_audio_data = np.concatenate(clips)

            inc += 1
            new_id = new_speaker_id + inc

            # Write the augmented audio into its own directory named after the new id.
            new_path = "%s/%d" % (new_path_prefix, new_id)
            os.makedirs(new_path, exist_ok=True)
            wav_path = "%s/combined_speech.wav" % new_path
            soundfile.write(wav_path, full_audio_data, sr, format="wav")

            # Record the new file. The key must be one declared in data_dict
            # ('audio'); appending to an undeclared key raises KeyError.
            data_dict['audio'].append(wav_path)
            data_dict['age'].append(curr_age)
            data_dict['id'].append(new_id)
            data_dict['disease_type'].append(curr_label)

    return pd.DataFrame.from_dict(data_dict)


In [None]:
# Load the two training tables used for augmentation:
#   word_level_df    - read from the 'r08.1_train.csv' file
#   speaker_level_df - read from the 'r_012_train(combined).csv' file
word_level_df, speaker_level_df = [
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ('r08.1_train.csv', 'r_012_train(combined).csv')
]

In [None]:
# Number of word clips joined into each augmented audio file.
num_concat = 5

# Augment each class separately; the two calls use different starting ids
# (10000 for label 0, 20000 for label 1).
augmented_td = audio_augment_by_label(
    chunked_data_df=word_level_df,
    train_concat_df=speaker_level_df,
    label=0,
    num_concat=num_concat,
    st_index=10000,
)
augmented_ssd = audio_augment_by_label(
    chunked_data_df=word_level_df,
    train_concat_df=speaker_level_df,
    label=1,
    num_concat=num_concat,
    st_index=20000,
)

# Stack both result tables and save them as the augmented train csv in the data folder.
train_augmented_df = pd.concat((augmented_td, augmented_ssd), ignore_index=True)
train_augmented_df.to_csv(
    DATA_PATH + f'r_012_train(augmented_{num_concat}).csv',
    index=False,
)
