In [17]:
from pydub import AudioSegment
import pandas as pd
import numpy as np
from pathlib import Path
from sphfile import SPHFile
from scipy.io import wavfile
from towhee import pipeline
from tqdm import tqdm

In [2]:
train_data = pd.read_csv('train_features_audio_aligned.csv')
train_data

Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label
0,24551,A,otherwise theyre pretty smelly,4019.txt,220.071750,221.542875,0.058766,0.731051,0.638110,0.899390,0.884808,0.852887,sv
1,23101,B,and i think a lot of their friends feel the sa...,2709.txt,432.140875,434.443250,0.098493,0.476925,0.364297,0.851667,0.897019,0.761266,sd
2,27063,B,well now so if you were going to have a dinner...,3506.txt,3.879375,7.575625,0.119491,0.390849,0.420487,0.963859,0.968053,0.937679,qw
3,20702,A,they uh,3228.txt,55.163000,55.805875,-0.028226,0.336798,0.912177,0.927696,0.921769,0.901077,%
4,5976,B,i i dont feel like theyre a benefit to society...,3247.txt,40.875875,44.615500,0.110553,0.543148,1.124164,0.841859,0.877972,0.821876,sv
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34235,9142,B,latex paint,3431.txt,76.686625,77.410500,0.033999,0.795516,2.186543,0.933034,0.924904,0.952571,qy
34236,29158,B,where are you calling from by the way texas,2627.txt,164.720625,440.489500,0.203538,1.612669,2.738063,1.000000,0.914083,0.981782,fc
34237,8837,B,okay,2263.txt,0.764875,1.559250,0.152105,0.322380,0.581555,0.956922,0.893387,0.935446,"fo_o_fw_""_by_bc"
34238,28557,A,but i mean they do,4733.txt,93.566375,94.416375,0.130494,1.247400,1.301812,0.884195,0.929633,0.879377,sd


In [3]:
test_data = pd.read_csv('test_features_audio_aligned.csv')
test_data

Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label
0,7246,A,and one is uh four,3457.txt,5.724875,7.149125,0.079450,0.745006,0.446980,0.812936,0.790438,0.581930,sd
1,1323,B,yeah,2349.txt,18.924125,19.216375,-0.029854,0.196361,0.130172,0.704110,0.687018,0.650901,b
2,33043,A,okay,2608.txt,5.790500,6.150500,-0.012475,0.378718,2.083808,0.816360,0.851317,0.702781,aa
3,26715,A,regarding uh taxes i you know,4725.txt,0.650000,1.900000,0.033159,0.370226,0.428816,0.890855,0.886848,0.928268,"fo_o_fw_""_by_bc"
4,23466,A,and that was actually after the war was over,2253.txt,291.411000,292.961000,0.051122,0.643769,0.504700,0.912026,0.881303,0.915515,sd
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1798,20101,A,you know its not just a matter of um of you kn...,4036.txt,216.594500,221.119375,0.067425,3.005191,1.036978,0.854105,0.900739,0.863340,sd
1799,25612,A,yeah,2584.txt,46.278500,46.638500,0.062966,0.044387,0.365346,0.779663,0.767770,0.746546,b
1800,8778,B,but for for criminal suits its unanimous by fe...,4148.txt,26.458625,30.496625,0.168230,0.604286,0.494122,0.901684,0.879804,0.936716,qy^d
1801,25952,A,not that they wont appoint them,2962.txt,215.880375,220.206125,0.110140,0.524441,0.476326,0.945284,0.958228,0.935874,sd


In [4]:
# Data Cleaning

In [5]:
# cleaned_train_data = pd.DataFrame(columns=['Channel', 'Utterance', 'Label', 'file', 'start', 'end'])
# cleaned_test_data = pd.DataFrame(columns=['Channel', 'Utterance', 'Label', 'file', 'start', 'end'])

In [6]:
### Audio data preparation

In [7]:
audio_dataPath = 'Audio/swb1_LDC97S62/'
audio_data_folders = ['swb1_d1', 'swb1_d2', 'swb1_d3', 'swb1_d4']

In [8]:
def get_audio_file_loc(file):
    for audio_dir in audio_data_folders:
        dir_to_search = audio_dataPath + audio_dir + '/data'
        files = Path(dir_to_search).glob('*')
        for f in files:
            if f.name == 'sw0' + file + '.sph':
                return f
    print(f'{file} not found for wav')
    return None

In [146]:
# Add col for audio file loc -- DONE
# Convert to wav
# Channel split
# Split as per time
# Col for split file loc

train_data['raw_audio_file_loc'] = ''
train_data['split_audio_file_loc'] = ''
cleaned_train_data = pd.DataFrame(columns=train_data.columns)

print('Starting audio file processing...')
for index, row in train_data.iterrows():
    if index % 500 == 0:
        print(f'[CHECKPOINT/{index}] Processing file :: {row["file"]}')
    temp = train_data.iloc[index].to_frame().T
    # print(f'Processing file :: {row["file"]}')
    # print('Finding file in audio dir')
    file_dir = get_audio_file_loc(row["file"][:-4])
    # print(f'Audio file loc :: {file_dir}')
    if file_dir is None:
        print('Audio file not found. Skipping...')
        continue
    # train_data.iloc[index]['raw_audio_file_loc'] = file_dir
    temp['raw_audio_file_loc'] = file_dir

    # SPH to .wav
    wav_file_loc = str(file_dir) + '.wav'
    sph = SPHFile(file_dir)
    sph.write_wav(wav_file_loc)

    # Channel Splitting, assuming channel 1 is 'A' and 2 is 'B'
    wav_channel = 0 if row['Channel'] == 'A' else 1
    channel_wav_file_loc = str(file_dir) + f'_{wav_channel}_channel.wav'
    fs, data = wavfile.read(wav_file_loc)
    wavfile.write(channel_wav_file_loc, fs, data[:, 0])   # saving first column which corresponds to channel 1

    # Split file as per timestamps
    timestamp_wav_file = str(file_dir) + f'_{wav_channel}_channel_{index}.wav'
    sph.write_wav(timestamp_wav_file, row['start'], row['end'])
    # train_data.iloc[index]['split_audio_file_loc'] = timestamp_wav_file
    temp['split_audio_file_loc'] = timestamp_wav_file

    # Merge
    cleaned_train_data = pd.concat([cleaned_train_data, temp], ignore_index=True)

print('All files processed !')
cleaned_train_data

Starting audio file processing...
[CHECKPOINT/0] Processing file :: 4019.txt
[CHECKPOINT/500] Processing file :: 2407.txt
[CHECKPOINT/1000] Processing file :: 4319.txt
[CHECKPOINT/1500] Processing file :: 3813.txt
[CHECKPOINT/2000] Processing file :: 4802.txt
[CHECKPOINT/2500] Processing file :: 3971.txt
[CHECKPOINT/3000] Processing file :: 3716.txt
[CHECKPOINT/3500] Processing file :: 2018.txt
[CHECKPOINT/4000] Processing file :: 4659.txt
[CHECKPOINT/4500] Processing file :: 3383.txt
[CHECKPOINT/5000] Processing file :: 3076.txt
[CHECKPOINT/5500] Processing file :: 2622.txt
[CHECKPOINT/6000] Processing file :: 2711.txt
[CHECKPOINT/6500] Processing file :: 2692.txt
[CHECKPOINT/7000] Processing file :: 3467.txt
[CHECKPOINT/7500] Processing file :: 4362.txt
[CHECKPOINT/8000] Processing file :: 3042.txt
[CHECKPOINT/8500] Processing file :: 2102.txt
[CHECKPOINT/9000] Processing file :: 4174.txt
[CHECKPOINT/9500] Processing file :: 2237.txt
[CHECKPOINT/10000] Processing file :: 3389.txt
[CH

Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label,raw_audio_file_loc,split_audio_file_loc
0,24551,A,otherwise theyre pretty smelly,4019.txt,220.07175,221.542875,0.058766,0.731051,0.63811,0.89939,0.884808,0.852887,sv,Audio\swb1_LDC97S62\swb1_d4\data\sw04019.sph,Audio\swb1_LDC97S62\swb1_d4\data\sw04019.sph_0...
1,23101,B,and i think a lot of their friends feel the sa...,2709.txt,432.140875,434.44325,0.098493,0.476925,0.364297,0.851667,0.897019,0.761266,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw02709.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw02709.sph_1...
2,27063,B,well now so if you were going to have a dinner...,3506.txt,3.879375,7.575625,0.119491,0.390849,0.420487,0.963859,0.968053,0.937679,qw,Audio\swb1_LDC97S62\swb1_d3\data\sw03506.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw03506.sph_1...
3,20702,A,they uh,3228.txt,55.163,55.805875,-0.028226,0.336798,0.912177,0.927696,0.921769,0.901077,%,Audio\swb1_LDC97S62\swb1_d3\data\sw03228.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw03228.sph_0...
4,5976,B,i i dont feel like theyre a benefit to society...,3247.txt,40.875875,44.6155,0.110553,0.543148,1.124164,0.841859,0.877972,0.821876,sv,Audio\swb1_LDC97S62\swb1_d2\data\sw03247.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03247.sph_1...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34235,9142,B,latex paint,3431.txt,76.686625,77.4105,0.033999,0.795516,2.186543,0.933034,0.924904,0.952571,qy,Audio\swb1_LDC97S62\swb1_d2\data\sw03431.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03431.sph_1...
34236,29158,B,where are you calling from by the way texas,2627.txt,164.720625,440.4895,0.203538,1.612669,2.738063,1.0,0.914083,0.981782,fc,Audio\swb1_LDC97S62\swb1_d2\data\sw02627.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw02627.sph_1...
34237,8837,B,okay,2263.txt,0.764875,1.55925,0.152105,0.32238,0.581555,0.956922,0.893387,0.935446,"fo_o_fw_""_by_bc",Audio\swb1_LDC97S62\swb1_d1\data\sw02263.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02263.sph_1...
34238,28557,A,but i mean they do,4733.txt,93.566375,94.416375,0.130494,1.2474,1.301812,0.884195,0.929633,0.879377,sd,Audio\swb1_LDC97S62\swb1_d4\data\sw04733.sph,Audio\swb1_LDC97S62\swb1_d4\data\sw04733.sph_0...


In [147]:
cleaned_train_data.to_csv('train_features_audio_aligned__with_wav_files.csv')

In [149]:
# Add col for audio file loc -- DONE
# Convert to wav
# Channel split
# Split as per time
# Col for split file loc

test_data['raw_audio_file_loc'] = ''
test_data['split_audio_file_loc'] = ''
cleaned_test_data = pd.DataFrame(columns=test_data.columns)

print('Starting audio file processing...')
for index, row in test_data.iterrows():
    if index % 500 == 0:
        print(f'[CHECKPOINT/{index}] Processing file :: {row["file"]}')
    temp = test_data.iloc[index].to_frame().T
    # print(f'Processing file :: {row["file"]}')
    # print('Finding file in audio dir')
    file_dir = get_audio_file_loc(row["file"][:-4])
    # print(f'Audio file loc :: {file_dir}')
    if file_dir is None:
        print('Audio file not found. Skipping...')
        continue
    # test_data.iloc[index]['raw_audio_file_loc'] = file_dir
    temp['raw_audio_file_loc'] = file_dir

    # SPH to .wav
    wav_file_loc = str(file_dir) + '.wav'
    sph = SPHFile(file_dir)
    sph.write_wav(wav_file_loc)

    # Channel Splitting, assuming channel 1 is 'A' and 2 is 'B'
    wav_channel = 0 if row['Channel'] == 'A' else 1
    channel_wav_file_loc = str(file_dir) + f'_{wav_channel}_channel.wav'
    fs, data = wavfile.read(wav_file_loc)
    wavfile.write(channel_wav_file_loc, fs, data[:, 0])   # saving first column which corresponds to channel 1

    # Split file as per timestamps
    timestamp_wav_file = str(file_dir) + f'_{wav_channel}_channel_{index}.wav'
    sph.write_wav(timestamp_wav_file, row['start'], row['end'])
    # test_data.iloc[index]['split_audio_file_loc'] = timestamp_wav_file
    temp['split_audio_file_loc'] = timestamp_wav_file

    # Merge
    cleaned_test_data = pd.concat([cleaned_test_data, temp], ignore_index=True)

print('All files processed !')
cleaned_test_data

Starting audio file processing...
[CHECKPOINT/0] Processing file :: 3457.txt
[CHECKPOINT/500] Processing file :: 4633.txt
[CHECKPOINT/1000] Processing file :: 3810.txt
[CHECKPOINT/1500] Processing file :: 2546.txt
All files processed !


Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label,raw_audio_file_loc,split_audio_file_loc
0,7246,A,and one is uh four,3457.txt,5.724875,7.149125,0.07945,0.745006,0.44698,0.812936,0.790438,0.58193,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw03457.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw03457.sph_0...
1,1323,B,yeah,2349.txt,18.924125,19.216375,-0.029854,0.196361,0.130172,0.70411,0.687018,0.650901,b,Audio\swb1_LDC97S62\swb1_d2\data\sw02349.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw02349.sph_1...
2,33043,A,okay,2608.txt,5.7905,6.1505,-0.012475,0.378718,2.083808,0.81636,0.851317,0.702781,aa,Audio\swb1_LDC97S62\swb1_d3\data\sw02608.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw02608.sph_0...
3,26715,A,regarding uh taxes i you know,4725.txt,0.65,1.9,0.033159,0.370226,0.428816,0.890855,0.886848,0.928268,"fo_o_fw_""_by_bc",Audio\swb1_LDC97S62\swb1_d3\data\sw04725.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw04725.sph_0...
4,23466,A,and that was actually after the war was over,2253.txt,291.411,292.961,0.051122,0.643769,0.5047,0.912026,0.881303,0.915515,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw02253.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02253.sph_0...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1798,20101,A,you know its not just a matter of um of you kn...,4036.txt,216.5945,221.119375,0.067425,3.005191,1.036978,0.854105,0.900739,0.86334,sd,Audio\swb1_LDC97S62\swb1_d3\data\sw04036.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw04036.sph_0...
1799,25612,A,yeah,2584.txt,46.2785,46.6385,0.062966,0.044387,0.365346,0.779663,0.76777,0.746546,b,Audio\swb1_LDC97S62\swb1_d1\data\sw02584.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02584.sph_0...
1800,8778,B,but for for criminal suits its unanimous by fe...,4148.txt,26.458625,30.496625,0.16823,0.604286,0.494122,0.901684,0.879804,0.936716,qy^d,Audio\swb1_LDC97S62\swb1_d3\data\sw04148.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw04148.sph_1...
1801,25952,A,not that they wont appoint them,2962.txt,215.880375,220.206125,0.11014,0.524441,0.476326,0.945284,0.958228,0.935874,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw02962.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02962.sph_0...


In [150]:
cleaned_test_data.to_csv('test_features_audio_aligned__with_wav_files.csv')

In [9]:
cleaned_train_data = pd.read_csv('train_features_audio_aligned__with_wav_files.csv')
cleaned_test_data = pd.read_csv('test_features_audio_aligned__with_wav_files.csv')

In [10]:
### Audio Embeddings using VGGish

In [11]:
embedding_pipeline = pipeline('towhee/audio-embedding-vggish')

In [154]:
outs = embedding_pipeline('Audio/swb1_LDC97S62/swb1_d1/data/sw02005.sph_0_channel_6430.wav')
embeds = outs[0][0]
embeds

array([[-0.82036364, -0.02235112, -0.41300994, ..., -0.32624432,
         0.25843823, -0.04476282],
       [-0.8400724 , -0.73279965, -0.97257507, ..., -0.79468644,
         0.2912054 , -0.13896751],
       [-0.96856177, -0.21487424, -0.71445537, ..., -1.2906566 ,
         0.5236661 ,  0.14876935],
       ...,
       [-1.0941894 , -0.17372096, -0.5458378 , ..., -1.0862762 ,
         0.41352844, -0.30419177],
       [-0.97151434, -0.61433566, -0.5702079 , ..., -0.563406  ,
         0.84958446,  0.18776059],
       [-0.7974632 , -0.28262445, -0.80651236, ..., -0.63312006,
         0.42711574, -0.02943486]], dtype=float32)

In [155]:
### Train embeddings

In [12]:
cleaned_train_data__with_embeddings = pd.DataFrame(columns=cleaned_train_data.columns)
train_indexes_to_delete = []
cleaned_train_data['audio_embeddings'] = ''

for index, row in tqdm(cleaned_train_data.iterrows(), total=cleaned_train_data.shape[0]):
    try:
        outs = embedding_pipeline(row['split_audio_file_loc'])
        embeddings = outs[0][0]
        temp = cleaned_train_data.iloc[index].to_frame().T
        temp['audio_embeddings'] = list(embeddings)
        # cleaned_train_data.iloc[index]['audio_embeddings'] = embeddings
        cleaned_train_data__with_embeddings = pd.concat([cleaned_train_data__with_embeddings, temp], ignore_index=True)
    except Exception as e:
        train_indexes_to_delete.append(index)

cleaned_train_data__with_embeddings

2023-04-25 05:21:13,509 - 29148 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 05:21:19,873 - 29148 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 05:21:43,080 - 23740 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 05:21:43,442 - 23740 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 05:22:02,412 - 23960 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 05:22:24,516 - 23740 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 05:22:35,257 - 29076 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packe

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label,raw_audio_file_loc,split_audio_file_loc,audio_embeddings
0,0,24551,A,otherwise theyre pretty smelly,4019.txt,220.07175,221.542875,0.058766,0.731051,0.63811,0.89939,0.884808,0.852887,sv,Audio\swb1_LDC97S62\swb1_d4\data\sw04019.sph,Audio\swb1_LDC97S62\swb1_d4\data\sw04019.sph_0...,"[-0.7446629, -0.5539386, -0.5260427, -0.797536..."
1,12,1375,B,she got the treatments,3057.txt,238.462375,239.7735,0.049801,0.327706,0.288991,0.833201,0.821875,0.72311,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03057.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03057.sph_1...,"[-0.7556931, -0.18065795, -0.32506776, -0.8088..."
2,14,28359,B,this is the reality,2418.txt,207.996125,209.286125,0.09164,0.485693,0.503992,0.841941,0.762634,0.675776,^q,Audio\swb1_LDC97S62\swb1_d1\data\sw02418.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02418.sph_1...,"[-0.51421905, -0.5236217, -0.45810127, -1.1976..."
3,15,5278,A,so we kind of looked around,3252.txt,126.830125,128.693375,0.062573,0.717063,0.323278,0.864236,0.876196,0.871295,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03252.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03252.sph_0...,"[-0.82808685, -0.5073712, -0.84148455, -0.8994..."
4,20,310,A,hes in in florida jail now,3334.txt,156.163125,157.872375,0.136232,0.20211,0.324408,0.822553,0.860201,0.623045,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03334.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03334.sph_0...,"[-1.2704585, -0.5354775, -0.2047256, -0.638970..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6300,34211,15016,A,and it kind of makes you wonder,2662.txt,281.232625,282.492625,0.067311,0.295882,0.308001,0.796258,0.811982,0.837162,sv,Audio\swb1_LDC97S62\swb1_d3\data\sw02662.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw02662.sph_0...,"[-0.8243134, -0.6850996, -0.5437871, -1.115950..."
6301,34216,3144,B,then they got into lots of trouble,3041.txt,85.467875,87.121125,0.109101,0.131329,0.369177,0.821601,0.853326,0.626697,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03041.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03041.sph_1...,"[-0.7438276, -0.018036127, -0.28294587, -1.086..."
6302,34217,8357,B,well i wanted to go,2953.txt,38.73725,39.86725,0.086197,0.319149,0.401463,0.982032,0.967915,0.917494,^h,Audio\swb1_LDC97S62\swb1_d1\data\sw02953.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02953.sph_1...,"[-0.8271198, 0.18368217, -0.32657582, -1.06563..."
6303,34223,1570,B,it it depends on um,2611.txt,479.430625,480.991375,0.054801,0.424003,0.518149,0.680224,0.769932,0.614414,sv,Audio\swb1_LDC97S62\swb1_d2\data\sw02611.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw02611.sph_1...,"[-0.4560522, -0.08628321, -0.43902183, -1.2200..."


In [13]:
cleaned_train_data__with_embeddings.to_csv('train_features_audio_aligned__with_wav_files__embeddings.csv')

In [12]:
cleaned_test_data__with_embeddings = pd.DataFrame(columns=cleaned_test_data.columns)
cleaned_test_data['audio_embeddings'] = ''
test_indexes_to_delete = []

for index, row in tqdm(cleaned_test_data.iterrows(), total=cleaned_test_data.shape[0]):
    try:
        outs = embedding_pipeline(row['split_audio_file_loc'])
        embeddings = outs[0][0]
        temp = cleaned_test_data.iloc[index].to_frame().T
        temp['audio_embeddings'] = list(embeddings)
        # cleaned_test_data.iloc[index]['audio_embeddings'] = embeddings
        cleaned_test_data__with_embeddings = pd.concat([cleaned_test_data__with_embeddings, temp], ignore_index=True)
    except Exception as e:
        test_indexes_to_delete.append(index)

2023-04-25 13:13:22,063 - 22660 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 13:13:30,315 - 13356 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 13:13:46,924 - 13356 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 13:13:48,347 - 22660 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 13:13:51,873 - 13356 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 13:14:04,490 - 23088 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packet, data has size 1 but at least a size of 2 was expected
2023-04-25 13:14:36,153 - 23088 - audio_decoder.py-audio_decoder:27 - ERROR: Invalid PCM packe

In [15]:
cleaned_train_data__with_embeddings

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label,raw_audio_file_loc,split_audio_file_loc,audio_embeddings
0,0,24551,A,otherwise theyre pretty smelly,4019.txt,220.07175,221.542875,0.058766,0.731051,0.63811,0.89939,0.884808,0.852887,sv,Audio\swb1_LDC97S62\swb1_d4\data\sw04019.sph,Audio\swb1_LDC97S62\swb1_d4\data\sw04019.sph_0...,"[-0.7446629, -0.5539386, -0.5260427, -0.797536..."
1,12,1375,B,she got the treatments,3057.txt,238.462375,239.7735,0.049801,0.327706,0.288991,0.833201,0.821875,0.72311,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03057.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03057.sph_1...,"[-0.7556931, -0.18065795, -0.32506776, -0.8088..."
2,14,28359,B,this is the reality,2418.txt,207.996125,209.286125,0.09164,0.485693,0.503992,0.841941,0.762634,0.675776,^q,Audio\swb1_LDC97S62\swb1_d1\data\sw02418.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02418.sph_1...,"[-0.51421905, -0.5236217, -0.45810127, -1.1976..."
3,15,5278,A,so we kind of looked around,3252.txt,126.830125,128.693375,0.062573,0.717063,0.323278,0.864236,0.876196,0.871295,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03252.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03252.sph_0...,"[-0.82808685, -0.5073712, -0.84148455, -0.8994..."
4,20,310,A,hes in in florida jail now,3334.txt,156.163125,157.872375,0.136232,0.20211,0.324408,0.822553,0.860201,0.623045,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03334.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03334.sph_0...,"[-1.2704585, -0.5354775, -0.2047256, -0.638970..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6300,34211,15016,A,and it kind of makes you wonder,2662.txt,281.232625,282.492625,0.067311,0.295882,0.308001,0.796258,0.811982,0.837162,sv,Audio\swb1_LDC97S62\swb1_d3\data\sw02662.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw02662.sph_0...,"[-0.8243134, -0.6850996, -0.5437871, -1.115950..."
6301,34216,3144,B,then they got into lots of trouble,3041.txt,85.467875,87.121125,0.109101,0.131329,0.369177,0.821601,0.853326,0.626697,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03041.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03041.sph_1...,"[-0.7438276, -0.018036127, -0.28294587, -1.086..."
6302,34217,8357,B,well i wanted to go,2953.txt,38.73725,39.86725,0.086197,0.319149,0.401463,0.982032,0.967915,0.917494,^h,Audio\swb1_LDC97S62\swb1_d1\data\sw02953.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02953.sph_1...,"[-0.8271198, 0.18368217, -0.32657582, -1.06563..."
6303,34223,1570,B,it it depends on um,2611.txt,479.430625,480.991375,0.054801,0.424003,0.518149,0.680224,0.769932,0.614414,sv,Audio\swb1_LDC97S62\swb1_d2\data\sw02611.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw02611.sph_1...,"[-0.4560522, -0.08628321, -0.43902183, -1.2200..."


In [14]:
cleaned_test_data__with_embeddings.to_csv('test_features_audio_aligned__with_wav_files__embeddings.csv')

In [15]:
cleaned_test_data__with_embeddings

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label,raw_audio_file_loc,split_audio_file_loc,audio_embeddings
0,0,7246,A,and one is uh four,3457.txt,5.724875,7.149125,0.07945,0.745006,0.44698,0.812936,0.790438,0.58193,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw03457.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw03457.sph_0...,"[-0.8501383, -0.6682229, -0.7700393, -0.899509..."
1,3,26715,A,regarding uh taxes i you know,4725.txt,0.65,1.9,0.033159,0.370226,0.428816,0.890855,0.886848,0.928268,"fo_o_fw_""_by_bc",Audio\swb1_LDC97S62\swb1_d3\data\sw04725.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw04725.sph_0...,"[-0.29266676, -0.39997765, -0.056198478, -0.80..."
2,4,23466,A,and that was actually after the war was over,2253.txt,291.411,292.961,0.051122,0.643769,0.5047,0.912026,0.881303,0.915515,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw02253.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02253.sph_0...,"[-0.57869494, -0.021157354, -0.22905743, -1.10..."
3,7,30504,A,oh well he made it at fifty,2623.txt,577.62475,579.52525,0.117541,0.304582,0.497606,0.903913,0.885161,0.875586,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw02623.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw02623.sph_0...,"[-0.77571785, -0.3537391, -0.31975806, -1.0626..."
4,9,8670,B,and another thing is cost,4649.txt,135.940125,137.436,0.075767,0.416207,0.655162,0.90832,0.937884,0.858942,sv,Audio\swb1_LDC97S62\swb1_d4\data\sw04649.sph,Audio\swb1_LDC97S62\swb1_d4\data\sw04649.sph_1...,"[-0.6750778, -0.38885894, -0.5354514, -0.97823..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,1782,24371,B,and that takes a lot of research,3788.txt,74.265625,75.951625,0.131553,0.652201,0.63622,0.738575,0.849406,0.791737,sv,Audio\swb1_LDC97S62\swb1_d3\data\sw03788.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw03788.sph_1...,"[-1.0022984, -0.4274709, -0.63184744, -1.11947..."
330,1785,3016,B,and they come with a new car warranty,2439.txt,63.677625,65.337625,0.080437,0.417976,0.703755,0.824261,0.877894,0.823225,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw02439.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02439.sph_1...,"[-0.783738, -0.8356898, -0.9192296, -1.0297803..."
331,1786,5514,A,well this is the season for vacations,3093.txt,1.264625,2.878,0.042389,0.324073,0.369063,0.836854,0.792219,0.77149,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw03093.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw03093.sph_0...,"[-1.020629, -0.5321673, -0.50256455, -1.245020..."
332,1788,20012,B,and it rubbed a notch in that,2692.txt,180.85375,182.06375,0.100869,0.443338,0.475594,0.831022,0.833894,0.724063,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw02692.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02692.sph_1...,"[-0.83638334, -0.4051539, -0.53455937, -0.9764..."


In [18]:
### Loading text features and removing waste indexes

In [18]:
cleaned_train_data__with_embeddings = pd.read_csv('train_features_audio_aligned__with_wav_files__embeddings.csv')
cleaned_test_data__with_embeddings = pd.read_csv('test_features_audio_aligned__with_wav_files__embeddings.csv')

In [14]:
text_aligned_train_features = np.load('train_features_alignment.npy')
text_aligned_test_features = np.load('test_features_alignment.npy')

In [15]:
# cleaned_train_data__with_embeddings.drop(index=train_indexes_to_delete, axis=1, inplace=True)
# cleaned_test_data__with_embeddings.drop(index=test_indexes_to_delete, axis=1, inplace=True)

In [None]:
# text_aligned_train_features__cleaned = np.delete(text_aligned_train_features, train_indexes_to_delete)
# text_aligned_test_features__cleaned = np.delete(text_aligned_test_features, test_indexes_to_delete)

In [None]:
### Save

In [None]:
# cleaned_train_data.to_csv('train_features_audio_aligned__with_wav_files__embeddings__cleaned.csv')
# cleaned_test_data.to_csv('test_features_audio_aligned__with_wav_files__embeddings__cleaned.csv')

In [None]:
# text_aligned_train_features__cleaned.save('text_aligned_train_features__cleaned.npy')
# text_aligned_test_features__cleaned.save('text_aligned_test_features__cleaned.npy')

In [20]:
cleaned_test_data__with_embeddings

Unnamed: 0.3,Unnamed: 0.2,0,Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label,raw_audio_file_loc,split_audio_file_loc,audio_embeddings
0,0,Unnamed: 0.1,,,,,,,,,,,,,,,,,
1,1,Unnamed: 0,,,,,,,,,,,,,,,,,
2,2,Channel,,,,,,,,,,,,,,,,,
3,3,Utterance,,,,,,,,,,,,,,,,,
4,4,file,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,345,,1782.0,24371.0,B,and that takes a lot of research,3788.txt,74.265625,75.951625,0.131553,0.652201,0.636220,0.738575,0.849406,0.791737,sv,Audio\swb1_LDC97S62\swb1_d3\data\sw03788.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw03788.sph_1...,[-1.0022984 -0.4274709 -0.63184744 -1.119475...
346,346,,1785.0,3016.0,B,and they come with a new car warranty,2439.txt,63.677625,65.337625,0.080437,0.417976,0.703755,0.824261,0.877894,0.823225,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw02439.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02439.sph_1...,[-0.783738 -0.8356898 -0.9192296 -1.029780...
347,347,,1786.0,5514.0,A,well this is the season for vacations,3093.txt,1.264625,2.878000,0.042389,0.324073,0.369063,0.836854,0.792219,0.771490,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw03093.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw03093.sph_0...,[-1.020629 -0.5321673 -0.50256455 -1.245020...
348,348,,1788.0,20012.0,B,and it rubbed a notch in that,2692.txt,180.853750,182.063750,0.100869,0.443338,0.475594,0.831022,0.833894,0.724063,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw02692.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02692.sph_1...,[-0.83638334 -0.4051539 -0.53455937 -0.976427...
