## Covid Project

In this data science project we want to use data from the COWAS data base (uploaded at Kaggle: https://www.kaggle.com/praveengovi/coronahack-respiratory-sound-dataset) to make a 


### Data Structure

There are 1397 cases of which 56 are positive ones. Each case is composed of 9 independing recordings 
['counting-normal','counting-fast','breathing-deep','breathing-shallow','cough-heavy','cough-shallow','vowel-a','vowel-e','vowel-o']

### Potential Solution

Using an auto-encoder approach (out of distribution), training on "healthy" cases.
Proposed solution (https://github.com/moiseshorta/MelSpecVAE)

## #Chunk 1
### Libraries

In [37]:

#Data visualization

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Audio Analysis
import glob
import IPython
import librosa
import librosa.display
import tensorflow as tf
from tensorflow import keras

#path
import os

#

## #Chunk 2
### Import Meta data (file path information)

In [3]:
# import meta data
# Meta data csv contain different additional information about each case.
# One column contains the path to the .wav files of each case
df_meta = pd.read_csv('./CoronaHack-Respiratory-Sound-Dataset/Corona-Hack-Respiratory-Sound-Metadata.csv')
df_meta.info(), df_meta.shape


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1397 entries, 0 to 1396
Data columns (total 37 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   USER_ID                 1397 non-null   object 
 1   COUNTRY                 1397 non-null   object 
 2   AGE                     1397 non-null   int64  
 3   COVID_STATUS            1396 non-null   object 
 4   ENGLISH_PROFICIENCY     1397 non-null   object 
 5   GENDER                  1397 non-null   object 
 6   COUNTY_RO_STATE         1397 non-null   object 
 7   CITY_LOCALITY           1228 non-null   object 
 8   Diabetes                1397 non-null   int64  
 9   Asthma                  1397 non-null   int64  
 10  Smoker                  1397 non-null   int64  
 11  Hypertension            1397 non-null   int64  
 12  Fever                   1397 non-null   int64  
 13  Returning_User          1397 non-null   int64  
 14  Using_Mask              1397 non-null   

(None, (1397, 37))

## #Chunk 3
### Get the label for each case

In [5]:
#Get the label (healthy / COVID) 

#split COVID STATUS column to get labels in column 'split'
df_meta['split'] = df_meta['COVID_STATUS'].str.split('_').str.get(0)
#Check for NA
df_meta.loc[:,'counting-normal'].isna().sum()
df_meta.loc[:,'split'].value_counts()

#Generate a dict to re-categorize the split column
cat_dict = {'healthy':0,'no':0,'resp':0,'recovered':0,'positive':1}

#map cat_dict to split column 
df_meta.loc[:,'split'] =  df_meta.loc[:,'split'].map(cat_dict)
df_meta2 = df_meta.dropna(subset=['split'])
df_meta2.loc[:,'split'] = df_meta2.loc[:,'split'].astype('int32')


#Extract positive USER ID
df_meta_positives = df_meta[df_meta['split'] == 1]
df_meta_negatives = df_meta[df_meta['split'] == 0]

positives = list(df_meta_positives['USER_ID'])
negatives = list(df_meta_negatives['USER_ID'])
len(positives),len(negatives)
#positives

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


(56, 1340)

## #Chunk 4
### Define Function for .wav import and preprocessing 

In [15]:
# Write function for import and preprocessing of all 9 .wav files per case (code adapted from Tristan classes) 

import cv2
def preprocess_other(sample):
  image_target_height, image_target_width = 64, 64 #setting up the shape of sample
  audio_binary = tf.io.read_file(sample) #read-in the sample as tensor
  audio, rate = tf.audio.decode_wav(audio_binary, desired_channels=1) #getting the audio and rate
  #label = sample['label']

  def py_preprocess_audio(audio):
      audio = audio.numpy().astype('float32') #set audio file as float
      #generate the mel spectrogram
      spectrogram = librosa.feature.melspectrogram(
        y=audio, n_fft=1024,  n_mels=64, hop_length=64, sr=8000, fmax=2000 #n_fft = window size, n_mels = frequency bins, hop_lenghth =jump to the right , sr = sound rate, fmax = 
      ) 

      spectrogram /= np.max(spectrogram) #devide by np.max(audio)
      spectrogram = cv2.resize(spectrogram, dsize=(image_target_height, image_target_width)) #resize the spectrogram
      spectrogram = np.expand_dims(spectrogram, axis=-1) #expand the dimension ? -Why ?
      return spectrogram

  spectrogram = tf.py_function(py_preprocess_audio, [audio], tf.float32) #apply py_process_audio function 
  spectrogram.set_shape((image_target_height, image_target_width, 1)) #set shape, include channel dimension

  return spectrogram#, label

## #Chunk 5
### generate Function to create the input data for auto-encoder

In [42]:
# Create function to load and prepare data for input 
# here we want to use the 9 recordings as separate features but grouped per case as input to the auto-encoder 

#names of 9 recordings per each case (extracted from the csv meta data file from )
names_input = ['counting-normal','counting-fast','breathing-deep','breathing-shallow','cough-heavy','cough-shallow','vowel-a','vowel-e','vowel-o']
#label column from the meta data csv (#Chunk 3)
name_label = 'split'

def create_input_label(df=df_meta2,names=names_input,name_label=name_label):
    input_dic = {} #Use a dictionnary to put in the 9 records per case
    for index,name in enumerate(names):
        #print(index,name)
        path_list = df[name].tolist()
        #print(path_list[:10])
        path_name = ['./CoronaHack-Respiratory-Sound-Dataset'  + str(dir_name for dir_name in path_list)]
        sound_paths_tensor = tf.convert_to_tensor(path_name, dtype=tf.string) #convert to tensor
        sound = tf.data.Dataset.from_tensor_slices(sound_paths_tensor)
        input_dic['x_{}'.format(index)] = sound.map(lambda sample: preprocess_other(sample)).batch(32) #generating the names of recordings(features x_0 till x_8) in batch mode


    path_label = df[name_label]
    #print(path_label)
    y = tf.convert_to_tensor(path_label, dtype=tf.int16)

    return input_dic,y

    

## #Chunk 6
### test the output from function

In [43]:
x,y = create_input_label()
x

{'x_0': <BatchDataset shapes: (None, 64, 64, 1), types: tf.float32>,
 'x_1': <BatchDataset shapes: (None, 64, 64, 1), types: tf.float32>,
 'x_2': <BatchDataset shapes: (None, 64, 64, 1), types: tf.float32>,
 'x_3': <BatchDataset shapes: (None, 64, 64, 1), types: tf.float32>,
 'x_4': <BatchDataset shapes: (None, 64, 64, 1), types: tf.float32>,
 'x_5': <BatchDataset shapes: (None, 64, 64, 1), types: tf.float32>,
 'x_6': <BatchDataset shapes: (None, 64, 64, 1), types: tf.float32>,
 'x_7': <BatchDataset shapes: (None, 64, 64, 1), types: tf.float32>,
 'x_8': <BatchDataset shapes: (None, 64, 64, 1), types: tf.float32>}

In [44]:
x.get('x_0')

<BatchDataset shapes: (None, 64, 64, 1), types: tf.float32>

## #Chunk 7
### Built the auto-encoder architecture (code adapted from Tristan Class)

In [45]:
from tensorflow.keras import models, layers

class AutoEncoder(tf.keras.Model):
    
    def __init__(self, latent_dim):
        super().__init__()

        self.latent_dim = latent_dim

        # Encoder
        self.encoder_reshape = layers.Reshape((64,64,1)) #Shape as 64,64,1
        self.encoder_fc1 = layers.Dense(256, activation="relu")
        self.encoder_fc2 = layers.Dense(latent_dim, activation="relu")

        # Decoder
        self.decoder_fc1 = layers.Dense(256, activation='relu')
        self.decoder_fc2 = layers.Dense(1, activation='sigmoid')
        self.decoder_reshape = layers.Reshape((64,64,1))

        self._build_graph()

    def _build_graph(self):
        input_shape = (64,64,1)
        self.build((None,)+ input_shape)
        inputs = tf.keras.Input(shape=input_shape)
        _= self.call(inputs)

    def call(self, x):
        z = self.encode(x)
        x_new = self.decode(z)
        return x_new

    def encode(self, x):
        x = self.encoder_reshape(x)
        x = self.encoder_fc1(x)
        z = self.encoder_fc2(x)
        return z
   

    def decode(self, z):
        z = self.decoder_fc1(z)
        z = self.decoder_fc2(z)
        x = self.decoder_reshape(z)
        return x

autoencoder = AutoEncoder(32)
autoencoder.summary()

autoencoder.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy'
)

Model: "auto_encoder_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_2 (Reshape)          (None, 64, 64, 1)         0         
_________________________________________________________________
dense_4 (Dense)              (None, 64, 64, 256)       512       
_________________________________________________________________
dense_5 (Dense)              (None, 64, 64, 32)        8224      
_________________________________________________________________
dense_6 (Dense)              (None, 64, 64, 256)       8448      
_________________________________________________________________
dense_7 (Dense)              (None, 64, 64, 1)         257       
_________________________________________________________________
reshape_3 (Reshape)          (None, 64, 64, 1)         0         
Total params: 17,441
Trainable params: 17,441
Non-trainable params: 0
________________________________________________

## #Chunk 8
### Train the model

Here we try to input the 9 features (recordings per case) into the model architecture

In [52]:
history_list = {}

history = autoencoder.fit(
    x,
    epochs = 20,
    batch_size=32

)

history_list['base'] = history

ValueError: Failed to find data adapter that can handle input: (<class 'dict'> containing {"<class 'str'>"} keys and {"<class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>"} values), <class 'NoneType'>

## #Chunk 9
### Test with one feature

In [51]:
history_list = {}

history = autoencoder.fit(
    x.get('x_0'),
    epochs = 20,
    batch_size=32

)

history_list['base'] = history

Epoch 1/20


ValueError: in user code:

    C:\Users\paulg\.conda\envs\corona\lib\site-packages\keras\engine\training.py:853 train_function  *
        return step_function(self, iterator)
    C:\Users\paulg\.conda\envs\corona\lib\site-packages\keras\engine\training.py:842 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\paulg\.conda\envs\corona\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\paulg\.conda\envs\corona\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\paulg\.conda\envs\corona\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\paulg\.conda\envs\corona\lib\site-packages\keras\engine\training.py:835 run_step  **
        outputs = model.train_step(data)
    C:\Users\paulg\.conda\envs\corona\lib\site-packages\keras\engine\training.py:791 train_step
        self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    C:\Users\paulg\.conda\envs\corona\lib\site-packages\keras\optimizer_v2\optimizer_v2.py:522 minimize
        return self.apply_gradients(grads_and_vars, name=name)
    C:\Users\paulg\.conda\envs\corona\lib\site-packages\keras\optimizer_v2\optimizer_v2.py:622 apply_gradients
        grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
    C:\Users\paulg\.conda\envs\corona\lib\site-packages\keras\optimizer_v2\utils.py:73 filter_empty_gradients
        ([v.name for _, v in grads_and_vars],))

    ValueError: No gradients provided for any variable: ['dense_4/kernel:0', 'dense_4/bias:0', 'dense_5/kernel:0', 'dense_5/bias:0', 'dense_6/kernel:0', 'dense_6/bias:0', 'dense_7/kernel:0', 'dense_7/bias:0'].
