In [90]:
# import pandas, os, and librosa

import pandas as pd
import os
import librosa

In [262]:
# read in coswara data as a pandas dataframe and filter by id, age, gender, covid status, and record date

data = pd.read_csv("coswara_data.csv")
data = data[["id", "a", "g", "covid_status", "record_date"]]
data

Unnamed: 0,id,a,g,covid_status,record_date
0,iV3Db6t1T8b7c5HQY2TwxIhjbzD3,28,male,healthy,2020-04-23
1,AxuYWBN0jFVLINCBqIW5aZmGCdu1,25,male,healthy,2020-04-20
2,C5eIsssb9GSkaAgIfsHMHeR6fSh1,28,female,healthy,2020-04-24
3,YjbEAECMBIaZKyfqOvWy5DDImUb2,26,male,healthy,2020-04-23
4,aGOvk4ji0cVqIzCs1jHnzlw2UEy2,32,male,healthy,2020-04-22
...,...,...,...,...,...
2741,333NjqA1TfZJuICEdXSkPhVz0LA3,48,male,positive_asymp,2021-07-13
2742,dpA0EeRrtJUeKJjEuf7BL0AeTJZ2,36,male,positive_mild,2021-07-06
2743,UXhG3vgoxkWtd4Meky1nm0sRgMV2,76,male,positive_mild,2021-07-13
2744,A5KW9PXraNgckln1gnNaJnX6DvB2,25,male,positive_moderate,2021-07-01


In [263]:
# find the unique values for covid status

data.covid_status.unique()

array(['healthy', 'no_resp_illness_exposed',
       'resp_illness_not_identified', 'positive_moderate',
       'recovered_full', 'positive_mild', 'positive_asymp',
       'under_validation'], dtype=object)

In [264]:
# keep only the covid status values of healthy, positive moderate, positive mild, and positive asymp

data = data.loc[(data["covid_status"] == "healthy") | (data["covid_status"] == "positive_moderate") | 
               (data["covid_status"] == "positive_mild") | (data["covid_status"] == "positive_asymp")]
data

Unnamed: 0,id,a,g,covid_status,record_date
0,iV3Db6t1T8b7c5HQY2TwxIhjbzD3,28,male,healthy,2020-04-23
1,AxuYWBN0jFVLINCBqIW5aZmGCdu1,25,male,healthy,2020-04-20
2,C5eIsssb9GSkaAgIfsHMHeR6fSh1,28,female,healthy,2020-04-24
3,YjbEAECMBIaZKyfqOvWy5DDImUb2,26,male,healthy,2020-04-23
4,aGOvk4ji0cVqIzCs1jHnzlw2UEy2,32,male,healthy,2020-04-22
...,...,...,...,...,...
2741,333NjqA1TfZJuICEdXSkPhVz0LA3,48,male,positive_asymp,2021-07-13
2742,dpA0EeRrtJUeKJjEuf7BL0AeTJZ2,36,male,positive_mild,2021-07-06
2743,UXhG3vgoxkWtd4Meky1nm0sRgMV2,76,male,positive_mild,2021-07-13
2744,A5KW9PXraNgckln1gnNaJnX6DvB2,25,male,positive_moderate,2021-07-01


In [265]:
# only focus on male and female for the analysis

data = data.loc[data['g'] != "other"]
data

Unnamed: 0,id,a,g,covid_status,record_date
0,iV3Db6t1T8b7c5HQY2TwxIhjbzD3,28,male,healthy,2020-04-23
1,AxuYWBN0jFVLINCBqIW5aZmGCdu1,25,male,healthy,2020-04-20
2,C5eIsssb9GSkaAgIfsHMHeR6fSh1,28,female,healthy,2020-04-24
3,YjbEAECMBIaZKyfqOvWy5DDImUb2,26,male,healthy,2020-04-23
4,aGOvk4ji0cVqIzCs1jHnzlw2UEy2,32,male,healthy,2020-04-22
...,...,...,...,...,...
2741,333NjqA1TfZJuICEdXSkPhVz0LA3,48,male,positive_asymp,2021-07-13
2742,dpA0EeRrtJUeKJjEuf7BL0AeTJZ2,36,male,positive_mild,2021-07-06
2743,UXhG3vgoxkWtd4Meky1nm0sRgMV2,76,male,positive_mild,2021-07-13
2744,A5KW9PXraNgckln1gnNaJnX6DvB2,25,male,positive_moderate,2021-07-01


In [266]:
# rename healthy covid status as negative
# rename positive asymp, positive mild, and positive mild covid statuses as positive

data.loc[data["covid_status"] == "healthy", "covid_status"] = "negative"
data.loc[data["covid_status"] == "positive_asymp", "covid_status"] = "positive"
data.loc[data["covid_status"] == "positive_mild", "covid_status"] = "positive"
data.loc[data["covid_status"] == "positive_moderate", "covid_status"] = "positive"

In [267]:
# find unique values for age

data.a.unique()

array([28, 25, 26, 32, 23, 33, 27, 35, 36, 39, 51, 21, 30, 37, 34, 40, 67,
       24, 57, 47, 63, 42, 12, 46, 61, 60, 56, 62, 65, 18, 58, 22, 43, 64,
       29, 55, 68, 31, 13, 75, 76, 38, 44, 48, 59, 70, 20, 45, 16, 52, 19,
       41, 50, 80, 17, 54, 72, 71, 49, 81, 66,  1, 77, 83, 15,  2, 53,  8,
       14, 87, 79, 69,  7, 86, 74, 73, 78, 10, 85, 11])

In [268]:
# change age to binary labels of 1 and 0
# 45 was selected because that was the median and mean age of the dataset
# 1 is old (age > 45) and 0 is young (age <= 45)

data.loc[data["a"] <= 45, "a"] = 0
data.loc[data["a"] > 45, "a"] = 1

In [269]:
# relabel 0 as young and 1 as male

data.loc[data["a"] == 0, "a"] = "young"
data.loc[data["a"] == 1, "a"] = "old"

In [270]:
# store the values from the id column in a list

patient_list = list(data["id"])

In [122]:
# loop through coswara data audio files (folder, patient id, and audio file) and append them to a list

coughs = []

for folder in os.listdir("Extracted_data"):
    if folder != ".DS_Store":
        for patient in os.listdir("Extracted_data/" + folder):
            if patient != ".DS_Store" and patient in patient_list:
                for audio in os.listdir("Extracted_data/" + folder + "/" + patient + "/"):
                    coughs.append([folder, patient, audio])

In [126]:
# append the path to each cough-heavy.wav file to a list

audio = []

for cough in coughs:
    if cough[2] == "cough-heavy.wav":
        audio.append("Extracted_data/" + cough[0] + "/" + cough[1] + "/" + cough[2])

In [132]:
# check size of list - should be 2112 files

len(audio)

2112

In [242]:
# ts for time series, sr for sampling rate
# extract mfcc for each file and append to empty list

mfccs = []

for file in audio:
    ts, sr = librosa.load(file)
    mfccs.append(librosa.feature.mfcc(y = ts, sr = sr, n_mfcc = 40, hop_length = 662325))

In [243]:
# checking that mfcc all have shape of (40, 1)

for mfcc in mfccs:
    if mfcc.shape != (40, 1):
        print(mfcc.shape)

In [246]:
# make pandas dataframe from mfcc

mfcc_data = pd.DataFrame(columns = range(40))

for mfcc in mfccs:
    mfcc_data = pd.concat([mfcc_data, pd.DataFrame(mfcc.T)], ignore_index = True)
    
mfcc_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-1131.370850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,-1131.370850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,-1131.370850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,-1131.370850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,-707.025940,56.426231,13.907639,17.147783,-2.012528,5.152593,15.529266,-3.269644,5.154553,9.935441,...,-0.237741,3.750417,10.081942,-9.412180,-3.947216,8.084291,-2.917649,3.018960,-1.533405,4.374303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2107,-504.757263,55.995552,-4.536501,7.020762,-5.924695,2.774076,-7.127914,-12.114542,-3.710113,-5.438988,...,-5.155150,-7.399287,-7.925261,-5.619906,-5.644806,1.242933,-2.810987,-0.122058,-2.671896,0.472195
2108,-723.345093,-5.390590,83.830673,-18.267603,23.992085,-17.434067,9.397023,-25.693218,5.133441,-13.971018,...,-7.775133,16.577866,-2.542984,3.071694,-4.513388,-5.473712,0.011834,-0.797911,-1.411863,7.378503
2109,-724.274902,96.186737,-65.017212,62.636177,-31.711601,48.414162,-7.423612,6.581656,-14.663187,-3.385683,...,-4.680919,-4.848273,3.366529,-8.074186,-0.762944,1.346204,0.544383,-5.299528,0.331113,-0.893786
2110,-525.034973,55.996941,-10.076442,20.016684,0.919657,-5.279638,10.089838,23.879969,2.035708,-17.630470,...,-1.043215,-2.091019,-5.030453,3.318240,-3.117649,-1.775076,-6.552867,-3.958253,-0.738340,1.371519


In [247]:
# reset index for mfcc dataframe to start from 1
# rename columns from 1 to 40

mfcc_data = mfcc_data.reset_index(drop = True)
mfcc_data.index += 1
mfcc_data.columns = range(1, 41)

In [248]:
# check that mfcc dataframe is reindexed correctly

mfcc_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,31,32,33,34,35,36,37,38,39,40
1,-1131.370850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,-1131.370850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,-1131.370850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,-1131.370850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,-707.025940,56.426231,13.907639,17.147783,-2.012528,5.152593,15.529266,-3.269644,5.154553,9.935441,...,-0.237741,3.750417,10.081942,-9.412180,-3.947216,8.084291,-2.917649,3.018960,-1.533405,4.374303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2108,-504.757263,55.995552,-4.536501,7.020762,-5.924695,2.774076,-7.127914,-12.114542,-3.710113,-5.438988,...,-5.155150,-7.399287,-7.925261,-5.619906,-5.644806,1.242933,-2.810987,-0.122058,-2.671896,0.472195
2109,-723.345093,-5.390590,83.830673,-18.267603,23.992085,-17.434067,9.397023,-25.693218,5.133441,-13.971018,...,-7.775133,16.577866,-2.542984,3.071694,-4.513388,-5.473712,0.011834,-0.797911,-1.411863,7.378503
2110,-724.274902,96.186737,-65.017212,62.636177,-31.711601,48.414162,-7.423612,6.581656,-14.663187,-3.385683,...,-4.680919,-4.848273,3.366529,-8.074186,-0.762944,1.346204,0.544383,-5.299528,0.331113,-0.893786
2111,-525.034973,55.996941,-10.076442,20.016684,0.919657,-5.279638,10.089838,23.879969,2.035708,-17.630470,...,-1.043215,-2.091019,-5.030453,3.318240,-3.117649,-1.775076,-6.552867,-3.958253,-0.738340,1.371519


In [249]:
# rename columns from 1 to 40

column_names = {name: str(name) for name in range(1, 41)}

mfcc_data.rename(columns = column_names, inplace = True)

In [250]:
# filter out columns that have 0s in them

mfcc_data = mfcc_data[mfcc_data["3"] != 0]

In [251]:
# mfcc_data dataframe

mfcc_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,31,32,33,34,35,36,37,38,39,40
5,-707.025940,56.426231,13.907639,17.147783,-2.012528,5.152593,15.529266,-3.269644,5.154553,9.935441,...,-0.237741,3.750417,10.081942,-9.412180,-3.947216,8.084291,-2.917649,3.018960,-1.533405,4.374303
7,-427.065979,24.380165,53.189838,-27.081852,-17.267035,22.450130,-9.862848,30.509949,-14.036346,8.524446,...,0.229314,-5.317071,7.775287,5.483598,-1.584106,0.640845,-8.218983,8.017080,3.545668,9.140753
10,-576.463379,189.395813,-47.168961,54.421181,-34.666393,48.699585,-40.861412,-12.097855,-46.044930,-11.362576,...,-1.983973,4.916521,5.208999,-3.601717,-7.056323,-7.671651,1.058864,1.054138,3.543286,-0.483198
12,-660.168518,-41.418015,47.211380,-25.102112,-4.699510,19.003639,11.318865,10.720022,-12.694391,-5.186853,...,-10.161366,2.952621,-4.387035,8.939890,3.173635,2.953260,4.881428,8.365202,0.733032,4.489803
24,-832.438171,97.745850,-13.485252,11.668908,2.601781,8.706614,-4.443015,-1.067009,-1.965252,-2.138626,...,12.727421,6.819866,-5.170315,-6.187433,-8.128935,1.426312,-1.929544,6.119107,1.827243,-0.272180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2108,-504.757263,55.995552,-4.536501,7.020762,-5.924695,2.774076,-7.127914,-12.114542,-3.710113,-5.438988,...,-5.155150,-7.399287,-7.925261,-5.619906,-5.644806,1.242933,-2.810987,-0.122058,-2.671896,0.472195
2109,-723.345093,-5.390590,83.830673,-18.267603,23.992085,-17.434067,9.397023,-25.693218,5.133441,-13.971018,...,-7.775133,16.577866,-2.542984,3.071694,-4.513388,-5.473712,0.011834,-0.797911,-1.411863,7.378503
2110,-724.274902,96.186737,-65.017212,62.636177,-31.711601,48.414162,-7.423612,6.581656,-14.663187,-3.385683,...,-4.680919,-4.848273,3.366529,-8.074186,-0.762944,1.346204,0.544383,-5.299528,0.331113,-0.893786
2111,-525.034973,55.996941,-10.076442,20.016684,0.919657,-5.279638,10.089838,23.879969,2.035708,-17.630470,...,-1.043215,-2.091019,-5.030453,3.318240,-3.117649,-1.775076,-6.552867,-3.958253,-0.738340,1.371519


In [252]:
# list that stores indices of rows in mfcc_data

data_indices = []

for val in mfcc_data.index:
    data_indices.append(val)

In [257]:
# check length of data_indices - should be 1081 values

len(data_indices)

1081

In [258]:
# reset index for mfcc_data dataframe to start from 1

mfcc_data = mfcc_data.reset_index(drop = True)
mfcc_data.index += 1

In [271]:
# check that indices were reset to start from 1

mfcc_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,31,32,33,34,35,36,37,38,39,40
1,-707.025940,56.426231,13.907639,17.147783,-2.012528,5.152593,15.529266,-3.269644,5.154553,9.935441,...,-0.237741,3.750417,10.081942,-9.412180,-3.947216,8.084291,-2.917649,3.018960,-1.533405,4.374303
2,-427.065979,24.380165,53.189838,-27.081852,-17.267035,22.450130,-9.862848,30.509949,-14.036346,8.524446,...,0.229314,-5.317071,7.775287,5.483598,-1.584106,0.640845,-8.218983,8.017080,3.545668,9.140753
3,-576.463379,189.395813,-47.168961,54.421181,-34.666393,48.699585,-40.861412,-12.097855,-46.044930,-11.362576,...,-1.983973,4.916521,5.208999,-3.601717,-7.056323,-7.671651,1.058864,1.054138,3.543286,-0.483198
4,-660.168518,-41.418015,47.211380,-25.102112,-4.699510,19.003639,11.318865,10.720022,-12.694391,-5.186853,...,-10.161366,2.952621,-4.387035,8.939890,3.173635,2.953260,4.881428,8.365202,0.733032,4.489803
5,-832.438171,97.745850,-13.485252,11.668908,2.601781,8.706614,-4.443015,-1.067009,-1.965252,-2.138626,...,12.727421,6.819866,-5.170315,-6.187433,-8.128935,1.426312,-1.929544,6.119107,1.827243,-0.272180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1077,-504.757263,55.995552,-4.536501,7.020762,-5.924695,2.774076,-7.127914,-12.114542,-3.710113,-5.438988,...,-5.155150,-7.399287,-7.925261,-5.619906,-5.644806,1.242933,-2.810987,-0.122058,-2.671896,0.472195
1078,-723.345093,-5.390590,83.830673,-18.267603,23.992085,-17.434067,9.397023,-25.693218,5.133441,-13.971018,...,-7.775133,16.577866,-2.542984,3.071694,-4.513388,-5.473712,0.011834,-0.797911,-1.411863,7.378503
1079,-724.274902,96.186737,-65.017212,62.636177,-31.711601,48.414162,-7.423612,6.581656,-14.663187,-3.385683,...,-4.680919,-4.848273,3.366529,-8.074186,-0.762944,1.346204,0.544383,-5.299528,0.331113,-0.893786
1080,-525.034973,55.996941,-10.076442,20.016684,0.919657,-5.279638,10.089838,23.879969,2.035708,-17.630470,...,-1.043215,-2.091019,-5.030453,3.318240,-3.117649,-1.775076,-6.552867,-3.958253,-0.738340,1.371519


In [274]:
# reset indices of data dataframe to start from 1

data = data.reset_index(drop = True)
data.index += 1
data

Unnamed: 0,id,a,g,covid_status,record_date
1,iV3Db6t1T8b7c5HQY2TwxIhjbzD3,young,male,negative,2020-04-23
2,AxuYWBN0jFVLINCBqIW5aZmGCdu1,young,male,negative,2020-04-20
3,C5eIsssb9GSkaAgIfsHMHeR6fSh1,young,female,negative,2020-04-24
4,YjbEAECMBIaZKyfqOvWy5DDImUb2,young,male,negative,2020-04-23
5,aGOvk4ji0cVqIzCs1jHnzlw2UEy2,young,male,negative,2020-04-22
...,...,...,...,...,...
2108,333NjqA1TfZJuICEdXSkPhVz0LA3,old,male,positive,2021-07-13
2109,dpA0EeRrtJUeKJjEuf7BL0AeTJZ2,young,male,positive,2021-07-06
2110,UXhG3vgoxkWtd4Meky1nm0sRgMV2,old,male,positive,2021-07-13
2111,A5KW9PXraNgckln1gnNaJnX6DvB2,young,male,positive,2021-07-01


In [275]:
# extracting indices so the corresponding rows from the data dataframe can be obtained

data = data.query('index in @data_indices')

In [277]:
# checking that the correct indices were selected from the data dataframe

data

Unnamed: 0,id,a,g,covid_status,record_date
5,aGOvk4ji0cVqIzCs1jHnzlw2UEy2,young,male,negative,2020-04-22
7,OW5RTM4WXPawz0QLpsfjsl4FqM22,young,male,negative,2020-04-20
10,pOZwqBg4NsVYWASmwwhXFq4UlpC2,young,male,negative,2020-04-24
12,GzhrTQhWHSTwvweRKx2x1Uh4wx52,young,male,negative,2020-04-24
24,zga52aKD4sQJbsqwZg1RrLKjkus2,young,female,negative,2020-04-20
...,...,...,...,...,...
2108,333NjqA1TfZJuICEdXSkPhVz0LA3,old,male,positive,2021-07-13
2109,dpA0EeRrtJUeKJjEuf7BL0AeTJZ2,young,male,positive,2021-07-06
2110,UXhG3vgoxkWtd4Meky1nm0sRgMV2,old,male,positive,2021-07-13
2111,A5KW9PXraNgckln1gnNaJnX6DvB2,young,male,positive,2021-07-01


In [278]:
# reset index for the data dataframe to start from 1

data = data.reset_index(drop = True)
data.index += 1

In [280]:
# remove id and record_date columns from the data dataframe

data = data[["a", "g", "covid_status"]]

In [282]:
# rename a to Age, g to Gender, and covid_status to Covid Status

data.columns = ["Age", "Gender", "Covid Status"]

In [286]:
# add data and mfcc_data to one dataframe

final_data = pd.concat([data, mfcc_data], axis = 1)

In [287]:
# reset index for final_data dataframe to start from 1

final_data = final_data.reset_index(drop = True)
final_data.index += 1

In [289]:
# save preprocessed dataframe as a csv file

final_data.to_csv("preprocessed_coswara_data.csv", index = False)