In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import json
import pandas
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import sklearn

# Read the input data and group the files by type

In [None]:
# Folder that is listed and read by the cells below (metadata and audio files).
directory = "../input/covid19-cough-audio-classification/"

def retrive_data(path=None):
    """List a folder and split its entries by file extension.

    Args:
        path: Folder to scan. When omitted, the module-level ``directory``
            is used, which keeps the original zero-argument call working.

    Returns:
        A 3-tuple ``(json_names, webm_names, ogg_names)`` of bare file names
        (no folder prefix). Each list is sorted so that positional picks such
        as ``audio_files[0]`` are reproducible from run to run;
        ``os.listdir`` itself returns entries in arbitrary order.
    """
    if path is None:
        path = directory

    names = sorted(os.listdir(path))

    # Match on the real suffix: a substring test such as `".json" in name`
    # would also accept names like "x.json.bak".
    json_file = [name for name in names if name.endswith(".json")]
    audio_file = [name for name in names if name.endswith(".webm")]
    audio_file2 = [name for name in names if name.endswith(".ogg")]
    return json_file, audio_file, audio_file2
    
# Module-level file-name lists (JSON metadata, .webm audio, .ogg audio) used by the cells below.
json_files, audio_files, audio_files2 = retrive_data()

# Print total input data count

In [None]:
# Report the overall number of directory entries, then the per-type counts.
total_entries = len(os.listdir(directory))
print("total files in directory: ", total_entries)
print("json file count: ", len(json_files))
print("webm file count: ", len(audio_files))
print("ogg file count: ", len(audio_files2))

# Process the JSON metadata and build a pandas DataFrame for easier analysis

In [None]:
def load_json_info(json_dir=None, json_names=None, audio_names=None):
    """Load the per-recording JSON metadata files into a single DataFrame.

    Args:
        json_dir: Folder containing the JSON files. Defaults to the
            module-level ``directory``.
        json_names: JSON file names to load, in row order. Defaults to the
            module-level ``json_files``.
        audio_names: Audio file names used to fill ``audio_clip_present``.
            Defaults to ``audio_files`` plus ``audio_files2`` so that both
            ``.webm`` and ``.ogg`` clips count as present.

    Returns:
        ``(dataframe, columns)`` where ``dataframe`` has one row per JSON
        file, one column per key seen in any file (``None`` where a file
        lacks the key; columns sorted by name) plus an ``audio_clip_present``
        0/1 column, and ``columns`` is the set of JSON keys.
    """
    if json_dir is None:
        json_dir = directory
    if json_names is None:
        json_names = json_files
    if audio_names is None:
        audio_names = list(audio_files) + list(audio_files2)

    # Audio stems kept in a set: membership tests against a list would make
    # the whole load quadratic in the number of recordings.
    audio_stems = {
        os.path.splitext(name)[0]
        for name in audio_names
        if name.endswith((".webm", ".ogg"))
    }

    # Parse every file exactly once and keep the decoded dicts.
    records = []
    clip_flags = []
    for filename in json_names:
        json_path = os.path.join(json_dir, filename)
        with open(json_path, mode='r', encoding='utf-8') as handle:
            records.append(json.load(handle))
        stem = os.path.splitext(filename)[0]
        clip_flags.append(1 if stem in audio_stems else 0)

    # A sorted list gives a stable column order; a raw set has arbitrary
    # order and is not accepted as `columns=` by newer pandas releases.
    ordered_columns = sorted({key for record in records for key in record})
    rows = [[record.get(key) for key in ordered_columns] for record in records]

    dataframe = pd.DataFrame(rows, columns=ordered_columns)
    dataframe['audio_clip_present'] = clip_flags
    # The second value stays a set, as before, for existing callers.
    return dataframe, set(ordered_columns)
        
# Build the metadata frame once; `columns` receives the JSON field names returned alongside it.
dataframe, columns = load_json_info()

# List the missing fields in the input dataframe

In [None]:
# First six rows (labels 0 through 5 on the frame's default integer index).
dataframe.head(6)

In [None]:
# Number of missing values in each column.
dataframe.isna().sum()

# Dataframe formatting

In [None]:
# Missing ages become 0 and the column is cast to float. The result is
# assigned back explicitly: `dataframe.age.fillna(0, inplace=True)` mutates a
# temporary Series and is not guaranteed to write through to the frame (it is
# a no-op under pandas Copy-on-Write), and re-running this cell stays safe.
dataframe['age'] = dataframe['age'].fillna(0).astype(float)

# Parsed timestamps go into a new `date_time` column; the raw `datetime`
# column is left untouched.
dataframe['date_time'] = pd.to_datetime(dataframe['datetime'])

# Listen to a sample audio clip and visualize it

In [None]:
# Decode the first .webm clip (x: samples, sr: sampling rate) and embed a player for it.
sample_clip = os.path.join(directory, audio_files[0])
x, sr = librosa.load(sample_clip)
ipd.Audio(sample_clip)

In [None]:
%matplotlib inline
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x, sr=sr)

In [None]:
# Short-time Fourier transform of the clip, drawn as a dB-scaled spectrogram.
X = librosa.stft(x)
magnitude = np.abs(X)
Xdb = librosa.amplitude_to_db(magnitude)

plt.figure(figsize=(14, 5))
spec_img = librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
plt.colorbar(spec_img)

In [None]:
# Spectral centroid of the clip, one value per analysis frame.
# The signal is passed as `y=`: librosa >= 0.10 only accepts it by keyword.
spectral_centroids = librosa.feature.spectral_centroid(y=x, sr=sr)[0]

# Computing the time variable for visualization
plt.figure(figsize=(14, 5))
frames = range(len(spectral_centroids))
# Pass the clip's own sampling rate instead of relying on the library default.
t = librosa.frames_to_time(frames, sr=sr)

# Normalising the spectral centroid for visualisation
def normalize(x, axis=0):
    """Min-max scale ``x`` along ``axis`` with scikit-learn's ``minmax_scale``.

    Args:
        x: Array-like of values to rescale.
        axis: Axis along which the scaling is applied (default 0).

    Returns:
        The rescaled values as returned by ``minmax_scale``.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not
    # guarantee that `sklearn.preprocessing` is available as an attribute.
    from sklearn.preprocessing import minmax_scale
    return minmax_scale(x, axis=axis)

# Plotting the Spectral Centroid along the waveform
# `librosa.display.waveplot` was removed in librosa 0.10 in favour of
# `waveshow`; prefer the new name and fall back for older installations.
_draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
_draw_wave(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_centroids), color='r')