In [None]:
# general purpose libraries
import numpy as np
import datetime as dt
import pandas as pd
import os
import pickle
from timeit import default_timer as timer
from collections import OrderedDict

pd.set_option("display.max_columns", None)

# List every CSV file found (recursively) under the local dataset directory.
# NOTE(review): the directory below is an absolute, machine-specific path; it has to be
# edited before this cell can find anything on another machine.

import os
for dirname, _, filenames in os.walk('/Users/preetham7/Downloads/archive'):
    for filename in filenames:
        # only CSV files are reported; every other file in the tree is ignored
        if filename.endswith(".csv"):
            print(os.path.join(dirname, filename))

In [None]:
# plots and visualisation
import matplotlib.pyplot as plt
import plotly.graph_objects as ply_go
import plotly.figure_factory as ply_ff
import plotly.colors as ply_colors #.sequential.Oranges as orange_palette
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
# DSP libraries
from scipy import signal
import librosa
import librosa.display as librosa_display

In [None]:
# ML and data modelling libraries
from sklearn.preprocessing   import MinMaxScaler, OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score,roc_curve, precision_recall_curve,confusion_matrix,precision_score, recall_score,average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression

import xgboost as xgb

In [None]:
# Dataset directory and the name of the metadata CSV stored inside it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_dir = '/Users/preetham7/Downloads/archive/'
metadata_file = 'metadata_compiled.csv'
# data_dir ends with '/', so plain string concatenation yields the full file path
metadata=pd.read_csv(data_dir+metadata_file,sep=",")


In [None]:
# Normalise the symptom/flag columns to genuine booleans; missing entries stay NaN.
cols_to_boolean = (['respiratory_condition', 'fever_muscle_pain',
                     'dyspnea_1', 'wheezing_1', 'stridor_1','choking_1', 'congestion_1', 'nothing_1',
                     'dyspnea_2', 'wheezing_2', 'stridor_2','choking_2', 'congestion_2', 'nothing_2',
                     'dyspnea_3', 'wheezing_3', 'stridor_3','choking_3', 'congestion_3', 'nothing_3',
                     'dyspnea_4', 'wheezing_4', 'stridor_4','choking_4', 'congestion_4', 'nothing_4'])

# The flags may arrive as real booleans or as the strings 'True'/'False' depending on how
# the CSV was parsed. astype(bool) cannot be used for the string case because every
# non-empty string (including 'False') is truthy, so the accepted spellings are mapped
# explicitly. Any value that is not listed here (NaN included) comes out as NaN.
bool_lookup = {True: True, False: False, 'True': True, 'False': False}
for c in cols_to_boolean:
    metadata[c] = metadata[c].map(bool_lookup)

print("NULL or NA records for each column:")
print( metadata.isnull().sum() )

# Categorical text columns get an explicit 'n/a' label instead of NaN so that they can be
# used directly as grouping keys and plot categories further down.
cols_to_fillna = ['gender', 'status','diagnosis_1','diagnosis_2','diagnosis_3','diagnosis_4']
metadata[cols_to_fillna]=metadata[cols_to_fillna].fillna('n/a')

#print(metadata.dtypes)
#print(metadata.shape)
metadata.head(5)


In [None]:
# Shared plotly layout pieces: a centred title and an x-axis title. The placeholder
# texts are overwritten for every figure through fig.update_layout below.
my_title_layout = dict({"text":"my distribution", 'xanchor':'center', 'x':0.5, 'y':0.9, 'font':{'size':24}})
my_xaxis_layout = dict(title=dict(text="my x axis", font={'size':16}))
my_layout = dict(title=my_title_layout,
                xaxis= my_xaxis_layout)
# Histogram bin width and x-axis label for each plotted metadata column.
bin_size_dict = dict(cough_detected=0.001,SNR=0.5, age=1, gender=1, respiratory_condition=1, fever_muscle_pain=1, status=1 )
xaxis_title_dict = dict(cough_detected="Cough Detection Score",SNR="Signal-to-Noise Ratio" , age="Age", 
                        gender="Gender", respiratory_condition="Resp. Condition", fever_muscle_pain="Fever", status="Status" )

# One histogram figure per metadata column.
for c in ['cough_detected','SNR', 'age', 'gender','respiratory_condition','fever_muscle_pain', 'status' ]:
    hist_data = ply_go.Histogram(x=metadata[c], name=c, showlegend=False, xbins={'size':bin_size_dict[c]})
    fig = ply_go.Figure(data=[hist_data], layout=my_layout)
    fig.update_layout(title={'text': c+" distribution"}, xaxis={"title":{"text":xaxis_title_dict[c]}})
    fig.show()
# end of the histogram loop


# Violin plot of AGE, one violin per value of the 'status' column.
fig = ply_go.Figure( layout=my_layout)
for tmp_diag in metadata['status'].unique():
    violin_data = ply_go.Violin(x=metadata.loc[metadata['status']==tmp_diag, 'status'],
                                y=metadata.loc[metadata['status']==tmp_diag, 'age'],
                                name=tmp_diag,
                                box_visible=True,
                                meanline_visible=True)
    fig.add_trace(violin_data)
    # end of the loop over unique statuses
fig.update_layout(title={'text': "Distribution of AGE by type of DIAGNOSYS"}, xaxis={"title":{"text":None}}, 
                  yaxis={"title":{"text":"AGE [years]"}})
fig.show()


# Violin plot of the cough-detection score, one violin per status.
fig = ply_go.Figure( layout=my_layout)
for tmp_diag in metadata['status'].unique():
    violin_data = ply_go.Violin(x=metadata.loc[metadata['status']==tmp_diag, 'status'],
                                y=metadata.loc[metadata['status']==tmp_diag, 'cough_detected'],
                                name=tmp_diag,
                                box_visible=True,
                                meanline_visible=True)
    fig.add_trace(violin_data)
    # end of the loop over unique statuses

    
fig.update_layout(title={'text': "Distribution of cough detection classifier by type of DIAGNOSYS"}, 
                  xaxis={"title":{"text":None}}, 
                  yaxis={"title":{"text":"Cough Detection Score"}})
fig.show()



# Violin plot of the SNR, one violin per status. Only rows with SNR < 100 are drawn;
# rows with a larger (or missing) SNR are left out of this figure.
fig = ply_go.Figure( layout=my_layout)
for tmp_diag in metadata['status'].unique():
    violin_data = ply_go.Violin(x=metadata.loc[(metadata['status']==tmp_diag)&(metadata['SNR']<100), 'status'],
                                y=metadata.loc[(metadata['status']==tmp_diag)&(metadata['SNR']<100), 'SNR'],
                                name=tmp_diag,
                                box_visible=True,
                                meanline_visible=True)
    fig.add_trace(violin_data)
    # end of the loop over unique statuses

    
fig.update_layout(title={'text': "Distribution of SNR by type of DIAGNOSYS"}, 
                  xaxis={"title":{"text":None}}, 
                  yaxis={"title":{"text":"Signal-to-Noise Ratio"}})
fig.show()


In [None]:
def summarise_pivot_df(df, xcols, ycols, valcol):
    """Count records per (xcols, ycols) combination and return them as a 2-D table.

    Parameters
    ----------
    df : pandas.DataFrame holding at least the columns named in the other arguments.
        It is not modified.
    xcols : list of column names that become the rows of the result.
    ycols : list of column names that become the columns of the result. The column
        labels are flattened by keeping the second level only, which is the group
        value when a single ``ycols`` entry and a single ``valcol`` entry are given.
    valcol : list with the column whose non-null entries are counted.

    Returns
    -------
    pandas.DataFrame with one row per ``xcols`` value and one column per ``ycols``
    value; combinations that never occur are NaN.

    Missing values are labelled 'n/a' before grouping: wherever the first entry of
    ``xcols`` (resp. ``ycols``) is null, all ``xcols`` (resp. ``ycols``) are set to 'n/a'.
    The intermediate long-format counts are printed.
    """
    # Work on an explicit copy: assigning through .loc on a bare column selection
    # raises SettingWithCopyWarning and leaves it unclear which object is modified.
    summary_df = df[xcols+ycols+valcol].copy()
    summary_df.loc[summary_df[xcols[0]].isnull(),xcols] = 'n/a' #replace NA with a default string
    summary_df.loc[summary_df[ycols[0]].isnull(),ycols] = 'n/a' #replace NA with a default string
    summary_df = summary_df.groupby(xcols+ycols).count().reset_index()
    print(summary_df)
    # each (x, y) pair occurs once after the groupby, so the default aggregation of
    # pivot_table just carries the count over
    pivot_df = pd.pivot_table(data=summary_df,values=valcol, index=xcols,columns=ycols)
    pivot_df.columns = [ c[1] for c in pivot_df.columns ] # get rid of multiindex
    return pivot_df

def pandas_to_plotly_heatdata(df):
    """Turn a 2-D table into the x/y/z dictionary used to build a heatmap trace.

    'x' holds the column labels, 'y' the index labels and 'z' the cell values as a
    list of rows; all three are plain Python lists.
    """
    column_labels = df.columns.tolist()
    row_labels = df.index.tolist()
    cell_rows = df.values.tolist()
    return dict(x=column_labels, y=row_labels, z=cell_rows)

def _annotated_status_heatmap(pivot_df, colorbar_title, cell_format, yaxis_title):
    """Build an annotated heatmap figure from a pivot table of counts or percentages.

    pivot_df: table as returned by summarise_pivot_df (rows = flag value, columns = status).
    colorbar_title: text shown on top of the colour bar.
    cell_format: format string applied to every non-null cell, e.g. "{:.0f}".
    yaxis_title: title of the y axis.

    Returns the plotly figure; the caller decides when to show it.
    """
    # The colour-bar title is given in its nested form (title.text / title.side);
    # the flat 'titleside' key is deprecated and rejected by newer plotly releases.
    heat_data = ply_go.Heatmap(pandas_to_plotly_heatdata(pivot_df),
                               colorscale=ply_colors.sequential.Oranges,
                               colorbar={'title': {'text': colorbar_title, 'side': "top"}})
    rounded_annotation = [ ["NA" if pd.isnull(c) else cell_format.format(c) for c in r] for r in heat_data['z']]
    # The rows are placed at integer positions 0..n-1 and relabelled through the tick
    # settings below, because the raw index mixes booleans with the 'n/a' string.
    heat_fig = ply_ff.create_annotated_heatmap(z=heat_data['z'],
                                               x=heat_data['x'],
                                               y=list(range(len(heat_data['y']))),
                                               annotation_text=rounded_annotation,
                                               colorscale=heat_data['colorscale'],
                                               showscale=True,
                                               colorbar=heat_data['colorbar'])
    # NOTE(review): the tick labels assume the rows are ordered False, True, 'n/a'
    # (positions 0, 1, 2) - confirm against the printed pivot table.
    heat_fig.update_layout( yaxis={"title":{"text":yaxis_title},
                                   "tickmode":'array',"tickvals":[2,1,0],"ticktext":['n/a','Yes','No']})
    return heat_fig


# Heatmap Fever vs status
meta_summary_df = summarise_pivot_df(metadata, ['fever_muscle_pain'], ['status'], ['uuid'])
meta_summary_df = meta_summary_df[['healthy','symptomatic','COVID-19','n/a']]
n = meta_summary_df.sum().sum()
print(meta_summary_df.head(5) )

# absolute number of entries
fig = _annotated_status_heatmap(meta_summary_df, "Entries", "{:.0f}", "Muscle Pain")
fig.show()

# same table as a percentage of all counted entries
fig = _annotated_status_heatmap(100.0*meta_summary_df/n, "Percentage", "{:.2f}%", "Muscle Pain")
fig.show()

# Heatmap RespCond vs status
meta_summary_df = summarise_pivot_df(metadata, ['respiratory_condition'], ['status'], ['uuid'])
meta_summary_df = meta_summary_df[['healthy','symptomatic','COVID-19','n/a']]
n = meta_summary_df.sum().sum()
fig = _annotated_status_heatmap(meta_summary_df, "Entries", "{:.0f}", "REspiratory Condition")
fig.show()


In [None]:
import numpy as np
import datetime as dt
import pandas as pd
import os
import warnings
import pickle
from timeit import default_timer as timer
from collections import OrderedDict
from itertools import chain

pd.set_option("display.max_columns", None)


import warnings
warnings.filterwarnings('ignore', '.*PySoundFile failed. Trying audioread instead*.', )
from scipy import signal
import librosa
import librosa.display as librosa_display

In [None]:
# Split the recordings into six classes from two criteria: cough-detection score
# (>= 0.80 or below) and age band (>= 60, 40-59, < 40). Rows that satisfy none of the
# rules - e.g. a missing age or score makes every comparison False - keep the default 'X'.
cough_high = metadata['cough_detected'] >= 0.80
cough_low = metadata['cough_detected'] < 0.80
age_60_plus = metadata['age']>=60
age_40_to_59 = (metadata['age']>=40) & (metadata['age']<60)
age_under_40 = metadata['age']< 40

class_rules = [
    ('A', cough_high & age_60_plus),
    ('B', cough_high & age_40_to_59),
    ('C', cough_high & age_under_40),
    ('D', cough_low & age_60_plus),
    ('E', cough_low & age_40_to_59),
    ('F', cough_low & age_under_40),
]

metadata['audio_class'] = 'X' # default, we should have none by the end of this classification process
for class_label, class_mask in class_rules:
    metadata.loc[class_mask, 'audio_class'] = class_label

print("Entries subdivided in classes. Printing the number of entries for each class:")
class_counts = metadata[['audio_class','uuid']].groupby(['audio_class']).count()
print(class_counts.rename(columns={'uuid':'N_entries'}) )

print("\n\n\nSplitting count by class and status:")
class_status_counts = metadata[['audio_class','status','uuid']].groupby(['audio_class','status']).count()
print(class_status_counts.rename(columns={'uuid':'N_entries'}) )


In [None]:
def import_raw_audio(filename,indir, sr=None ):
    """Load one audio file as a mono signal and describe it in a one-row DataFrame.

    Parameters
    ----------
    filename : name of the audio file, extension included.
    indir : directory that contains the file.
    sr : sampling rate passed on to ``librosa.load``; None is forwarded unchanged.

    Returns
    -------
    audio_df : one-row DataFrame with columns AUDIO_FILE (file name without its
        extension), SAMPLING_RATE (int), DURATION (seconds), MEAN_SIG, MAX_SIG and
        MIN_SIG (floats).
    t : the audio samples returned by ``librosa.load``.
    sr : the sampling rate returned by ``librosa.load``.
    """
    t, sr = librosa.load(os.path.join(indir, filename), sr=sr, mono=True)
    duration = t.shape[0]/sr #in seconds

    # splitext copes with extensions of any length; slicing off four characters left
    # a trailing dot for '.webm' files
    audio_name = os.path.splitext(filename)[0]

    # Build the row directly with typed values instead of stacking everything into a
    # string array and converting the numbers back afterwards.
    audio_df = pd.DataFrame({'AUDIO_FILE': [audio_name],
                             'SAMPLING_RATE': [int(sr)],
                             'DURATION': [float(duration)],
                             'MEAN_SIG': [float(t.mean())],
                             'MAX_SIG': [float(t.max())],
                             'MIN_SIG': [float(t.min())]},
                            columns=['AUDIO_FILE', 'SAMPLING_RATE','DURATION', 'MEAN_SIG', 'MAX_SIG', 'MIN_SIG' ])

    return audio_df, t, sr
    
def zero_padding(t, sr, target_duration):
    """Force an audio signal to a fixed duration.

    Signals longer than ``target_duration`` seconds are cut, shorter ones get zeros
    appended, so that all spectrograms computed afterwards have the same size.

    Parameters
    ----------
    t : 1-D numpy array of audio samples.
    sr : sampling rate of ``t`` in Hz.
    target_duration : wanted duration in seconds (int or float).

    Returns
    -------
    numpy array with ``round(target_duration * sr)`` samples.
    """
    # The sample count must be an integer: a float duration would otherwise break
    # both the slice and the padding length.
    target_len = int(round(target_duration * sr))
    n_samples = t.shape[0]
    if n_samples > target_len:
        return t[0:target_len]
    if n_samples < target_len:
        # integer zeros, as before, so the dtype of the padded result does not change
        return np.append(t, np.zeros(target_len - n_samples, dtype=int))
    return t


def calc_stft_power_spectrum(stft, sr, n_fft):
    """Collapse an STFT over time into one amplitude value per frequency.

    Parameters
    ----------
    stft : complex array whose last axis is averaged out (the time frames of an STFT).
    sr : sampling rate in Hz.
    n_fft : FFT size used for the STFT.

    Returns
    -------
    frequencies : the values returned by ``librosa.fft_frequencies`` for ``sr``/``n_fft``.
    amplitude : square root of the squared magnitude averaged over the last axis,
        i.e. an RMS amplitude rather than a power.
    """
    amplitudes = np.abs(stft)**2
    # keyword arguments: newer librosa releases no longer accept these positionally
    frequencies = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    psx = amplitudes.mean(axis=-1)
    return frequencies, np.sqrt(psx)


def calc_power_spectrum_welch(t, sr, n_fft):
    """Welch spectrum of a signal, returned as an amplitude.

    Uses a Hann window, non-overlapping segments, ``nfft=n_fft`` and 'spectrum'
    scaling. Returns the frequency axis and the square root of the Welch estimate.
    """
    welch_options = dict(window='hann', nfft=n_fft, noverlap=0, axis=-1, scaling='spectrum')
    freqs, power = signal.welch(t, sr, **welch_options)
    amplitude = np.sqrt(power)
    return freqs, amplitude

def calc_spectral_features(t, sr, n_fft = 512, win_length = None, win_overlap=0.0, n_mfcc=None, rec_width=0):
    """Compute the two spectrograms used downstream: the STFT and the MFCC.

    Parameters
    ----------
    t : 1-D array of audio samples.
    sr : sampling rate in Hz.
    n_fft : FFT size of the STFT.
    win_length : STFT window length; defaults to ``n_fft``.
    win_overlap : float in [0.0, 1.0); 0.0 gives non-overlapping STFT windows, values
        close to 1 give almost completely overlapping windows.
    n_mfcc : number of mel-frequency cepstral coefficients; defaults to ``n_fft``.
    rec_width : unused, kept for backward compatibility.

    Returns
    -------
    stft : complex array from ``librosa.stft``; rows are frequencies, columns are the
        time windows, and each value is the complex amplitude at that frequency for
        that window.
    mfcc : array from ``librosa.feature.mfcc`` with ``n_mfcc`` rows; columns are time
        frames. Only ``sr``, ``n_mfcc`` and ``dct_type`` are forwarded to this call,
        so its framing follows librosa's own defaults, not ``n_fft``/``win_length``.

    Raises
    ------
    AssertionError if ``win_overlap`` is outside [0.0, 1.0).
    """
    if win_length is None:
        win_length = n_fft

    if n_mfcc is None:
        n_mfcc = n_fft

    assert 0 <= win_overlap < 1.0, "Invalid value of win_overlap {} - it must be in range [0.0, 1.0) ".format(win_overlap)
    # never let the hop collapse to zero samples for overlaps very close to 1
    hop_length = max(1, int(win_length*(1.0-win_overlap)))

    stft = librosa.stft(t, n_fft=n_fft, hop_length=hop_length, win_length=win_length )
    # the signal goes in as y=...: newer librosa releases accept it only as a keyword
    mfcc = librosa.feature.mfcc(y=t, sr=sr, n_mfcc=n_mfcc, dct_type=2)

    return stft,mfcc


def stack_rows_with_pad(list_of_arrays):
    """Stack 2-D arrays on top of each other, padding narrower ones with NaN.

    Every array is widened on the right with NaN columns up to the width of the
    widest input, then all of them are concatenated along the row axis.
    """
    widest = max(block.shape[1] for block in list_of_arrays)
    padded_blocks = []
    for block in list_of_arrays:
        n_rows, n_cols = block.shape[0], block.shape[1]
        nan_filler = np.full([n_rows, widest - n_cols], np.nan)
        padded_blocks.append(np.append(block, nan_filler, axis=1))
    return np.concatenate(padded_blocks, axis=0)


def calc_spectral_properties_welch(t, sr, n_fft, time_window_ms, freq_bins):
    """Compute per-frequency-band spectral descriptors of an audio signal.

    Follows section III.A of
    https://myresearchspace.uws.ac.uk/ws/files/10993506/2018_12_15_Monge_Alvarez_et_al_Cough.pdf

    The signal is cut into consecutive chunks of ``time_window_ms`` milliseconds (a
    single chunk when ``time_window_ms`` is None). For every chunk the spectrum is
    obtained from ``calc_power_spectrum_welch`` and its values are grouped into the
    bands delimited by ``freq_bins``. The values of all chunks are laid side by side,
    one row per band, and every statistic is taken along each row.

    Parameters
    ----------
    t : 1-D sequence of audio samples.
    sr : sampling rate of ``t`` in Hz.
    n_fft : FFT length forwarded to ``calc_power_spectrum_welch``.
    time_window_ms : chunk length in milliseconds, or None to use the whole signal.
    freq_bins : band edges in Hz; band j covers [freq_bins[j], freq_bins[j+1]).

    Returns
    -------
    Tuple ``(zcr, centroid, bandwidth, crest, mean, sd, skew)``. ``zcr`` is the array
    returned by ``librosa.feature.zero_crossing_rate``; every other element is
    reshaped to one row with one value per frequency band.
    """
    
    #sanity checks
    assert len(freq_bins)>1,"Error, input freq_bins must be a list with the boundaries of the frequency bins"
    
    
    # work out how many audio samples go in each chunk and how many chunks there are
    n_samples_tot = len(t)
    if( time_window_ms is None ):
        time_window_ms = 1000*n_samples_tot/sr
    chunk_length = min(n_samples_tot, round(time_window_ms*sr/1000) )# how many audio samples fit in time_window_ms
    n_chunks = int(np.ceil(n_samples_tot / chunk_length))
    n_freq_bins = len(freq_bins)-1
    # accumulators: one row per frequency band, columns are appended chunk after chunk
    out_all_freq = np.empty((n_freq_bins,0),float)
    out_all_psx = np.empty((n_freq_bins,0),float)
    #print("*"*30+"\nLooping over {} chunks (tot t samples:{}, chunk l = {})".format(n_chunks,n_samples_tot,chunk_length) )
    for k in range(0,n_chunks,1):
        tmin = k*chunk_length
        tmax = min((k+1)*chunk_length, n_samples_tot)
        tmp_segment = t[tmin:tmax]
        freqs_welch, psx_welch = calc_power_spectrum_welch(tmp_segment,sr, n_fft)
        #print("k={} n_freq_bins={} --> {} {}".format(k,n_freq_bins,freqs_welch.shape, psx_welch.shape))
        chunk_freq = np.empty((1,0),float)
        chunk_psx = np.empty((1,0),float)
    
        for j in range(0, n_freq_bins,1):
            # select the spectrum values that fall inside band j (upper edge excluded)
            freqmin = freq_bins[j]
            freqmax = freq_bins[j+1]
            freq_mask = (freqs_welch>=freqmin)&(freqs_welch<freqmax)
            selfreqs = freqs_welch[freq_mask]
            selpsx = psx_welch[freq_mask]
            #print("{} {} |||  {} {} {}".format(k,j,chunk_freq.shape,selfreqs.shape, selfreqs.reshape(1,-1).shape,selpsx.shape))
            if(j==0):
                chunk_freq = selfreqs.reshape(1,-1)
                chunk_psx  = selpsx.reshape(1,-1)
            else:
                # bands hold different numbers of values, so rows are NaN-padded to a common width
                chunk_freq = stack_rows_with_pad([chunk_freq,selfreqs.reshape(1,-1)])  
                chunk_psx = stack_rows_with_pad([chunk_psx,selpsx.reshape(1,-1)])  
            #print( "max chunk {} {} : {}".format(k,j,np.nanmax(chunk_psx, axis=1) ) )
        ### end for loop on j
        
        # append the columns of this chunk to the right of the previous chunks
        out_all_freq = np.append(out_all_freq,chunk_freq, axis=1)
        out_all_psx = np.append(out_all_psx,chunk_psx, axis=1)
    ####end for loop on k   
    
    #print("SHAPES:")
    #print(out_all_freq.shape)
    #print(out_all_psx.shape)
    
    #Zero crossing rate
    # NOTE(review): hop_length is one sample longer than the frame, presumably to keep
    # the number of frames minimal; the caller expects zcr[0,0] - confirm the frame count.
    zcr = librosa.feature.zero_crossing_rate(t, frame_length=chunk_length, hop_length=chunk_length+1)
    
    # spectral centroid: spectrum-weighted mean frequency of each band (NaN padding ignored)
    psx_sum = np.nansum(out_all_psx, axis=1)
    spec_centroid = (np.nansum(out_all_freq*out_all_psx, axis=1)/ psx_sum).reshape(-1,1)
        
    #spectral bandwidth: spectrum-weighted squared distance from the centroid
    spec_bw = np.nansum( ((out_all_freq-spec_centroid)**2)*out_all_psx,axis=1)/psx_sum

    #spectral crest factor
    # The commented-out formula below is not used; the crest is computed as
    # (max - median) / interquartile range of the values in each band.
    #C = 1.0 / (np.nanmax(out_all_freq) - np.nanmin(out_all_freq) +1)
    #spec_crest = (np.nanmax(out_all_psx)/(C*psx_sum) ).reshape(-1,1)
    psx_25 = np.nanquantile(out_all_psx,.25, axis=1)
    psx_50 =np.nanquantile(out_all_psx,.50, axis=1)
    psx_75 = np.nanquantile(out_all_psx,.75, axis=1)  
    psx_max = np.nanmax(out_all_psx, axis=1)
    #print("MAX: {} ; P25: {} ; P50: {} ; P75: {}".format(psx_max, psx_25, psx_50, psx_75))
    spec_crest = (psx_max-psx_50) / (psx_75 - psx_25)
    
    # spectral standard deviation
    spec_sd = np.nanstd( out_all_psx,axis=1)
    
    #spectral skewness
    # n_entries counts the non-NaN values of each band.
    # NOTE(review): the scale factor e*sqrt(e-1)/(e-2) multiplies a *sum* of cubed
    # deviations divided by sd**3; verify this normalisation against the intended
    # skewness definition. It also divides by zero for a band with exactly 2 values.
    n_entries = np.array([ len(row[~np.isnan(row)]) for row in out_all_psx])#.reshape(-1,)
    skew_factors = [ e*np.sqrt(e-1)/(e-2) for e in n_entries]
    spec_mean = np.nanmean(out_all_psx,axis=1).reshape(-1,1)
    spec_skew = skew_factors*np.nansum((out_all_psx-spec_mean)**3, axis=1)  / spec_sd**3
    
    return zcr, spec_centroid.reshape(1,-1), spec_bw.reshape(1,-1), spec_crest.reshape(1,-1), spec_mean.reshape(1,-1),spec_sd.reshape(1,-1),spec_skew.reshape(1,-1)
    

In [None]:
my_sampling_rate = int(4096*2)  # Sampling rate: how frequently we take a value of the audio curve.
                                 # The max frequency in the STFT will be (approximately) half of this.
                               
my_n_fft = 512 # number of frequency bins to be calculated in the STFT; 
               # if my_window_size is None, this also drives the time-sampling window
    
my_n_mfcc = 26 # number of mel-frequencies used for the MFCC calculation. The original article used
               # 13 MFCC frequencies; 26 are used here to add some extra information
    
my_window_size = None # should not be greater than n_fft
target_duration = 10 # seconds; shorter audios will be zero padded; longer audios will be cut.
                     # Obtained from an earlier dry run over all data, charting the distribution 
                     # of the duration of the raw sound samples: 10 sec corresponds to the 97th percentile and 
                     # represents a significant improvement in computing time (x5 faster) 
                     # with respect to more conservative choices like 70 seconds (99th percentile)


iclass = "A"   # the class of audio records to be processed (see previous cells)

### These are the frequency bins used to compute the short-term features as per the original article (used as inputs to the cough detection classifier).
### Note that the code here computes features for all bins, but the original paper uses only non-contiguous ones.
### For example, we keep the bin [0, 200] Hz but not the [200, 300] Hz one; the lowest bin used in the rest of this analysis is [0, 200] Hz,
### the highest one is [3800, 3900] Hz.
my_psd_freqs = [0.0, 200.0, 300.0, 425, 500.0, 650.0, 950.0, 1150.0, 1400.0, 1800.0, 2300.0, 2400.0, 2850.0, 2950.0, 3800.0, 3900, 4000]
# base names of the per-band spectral features; a band index is appended when the features are extracted
psd_feature_names =['SPEC_CENTROID', 'SPEC_WIDTH', 'SPEC_CREST', 'SPEC_MEAN', 'SPEC_SD', 'SPEC_SKEW']


# MFCC feature names: first the per-coefficient means, then the per-coefficient standard deviations
mfcc_feature_names = [ "MFCC_MEAN_{:02}".format(i) for i in range(0,my_n_mfcc,1)]
mfcc_feature_names = mfcc_feature_names + [ "MFCC_SD_{:02}".format(i) for i in range(0,my_n_mfcc,1)]


In [None]:
def prepare_data(input_data, audio_datadir, sr, target_duration, 
                 n_fft, n_mfcc, fft_window_size, psd_freq_bins,
                 mfcc_feature_names, psd_feature_names, 
                 max_audio_samples=None, print_every_n=10):
    """Build the feature table used later by the ML classification step.

    For every UUID the matching audio file ('<uuid>.webm', falling back to
    '<uuid>.ogg') is loaded, forced to ``target_duration`` seconds and turned into
    MFCC statistics, per-band spectral features and a zero-crossing rate.

    Parameters
    ----------
    input_data : pandas.DataFrame; an extract of the dataset metadata. It must contain
        the columns uuid, audio_class, cough_detected, SNR, age, gender,
        respiratory_condition, fever_muscle_pain and status.
    audio_datadir : str; path to the directory where the audio files are stored.
    sr : int or None; sampling rate requested when loading every file.
    target_duration : final length of each audio sample in seconds; longer audios are
        cut, shorter ones are padded with zeros.
    n_fft : int; FFT size used for the STFT and for the Welch spectra.
    n_mfcc : int; number of mel-frequency cepstral coefficients.
    fft_window_size : STFT window length (None means ``n_fft``).
    psd_freq_bins : edges of the frequency bands for the spectral features.
    mfcc_feature_names : names for the 2*n_mfcc MFCC features (means, then std devs).
    psd_feature_names : base names of the six per-band spectral features.
    max_audio_samples : int or None; when given, only the first ``max_audio_samples``
        UUIDs are processed.
    print_every_n : print a progress line every this many files (0/None disables it).

    Returns
    -------
    all_data : DataFrame with one row per processed audio file, holding the upper-cased
        metadata columns followed by all audio features; indexed by the position of
        the UUID in the input.
    audio_metadata : DataFrame with the per-file output of ``import_raw_audio`` plus
        a UUID column.

    UUIDs whose audio file cannot be found are skipped with a warning. Both frames
    are empty when nothing could be processed.
    """
    metadata_cols = ['uuid','audio_class','cough_detected','SNR','age','gender','respiratory_condition','fever_muscle_pain','status']

    # get the full list of uuid to be processed
    all_uuids = input_data['uuid'].values
    if max_audio_samples is not None:
        all_uuids = all_uuids[0:max_audio_samples]

    # Rows are collected in lists and concatenated once at the end; DataFrame.append
    # no longer exists in current pandas and growing a frame row by row is quadratic.
    feature_frames = []
    audio_frames = []
    skipped_uuids = []
    t_start = timer()

    for idx, uuid in enumerate(all_uuids):

        # look for a .webm file first, then for a .ogg one
        tmp_audiofilename = None
        for extension in (".webm", ".ogg"):
            if os.path.exists(os.path.join(audio_datadir, uuid+extension)):
                tmp_audiofilename = uuid+extension
                break
        if tmp_audiofilename is None:
            warnings.warn("WARNING! Could not find audio file for UUID: {}  . Skipping.".format(uuid))
            skipped_uuids.append(uuid)
            continue

        if print_every_n and idx % print_every_n ==0:
            print("Processing file #{}: {}".format(idx,tmp_audiofilename))

        try:
            # The rate reported for this file is kept in its own variable so that the
            # requested ``sr`` is what every following file is loaded with.
            tmp_df, tmp_audio, file_sr = import_raw_audio(tmp_audiofilename, indir=audio_datadir, sr=sr)
        except FileNotFoundError:
            print("Could not find audio file {}.\n\n\n".format(tmp_audiofilename))
            skipped_uuids.append(uuid)
            continue #move to next file

        tmp_audio = zero_padding(tmp_audio, sr=file_sr, target_duration=target_duration)
        tmp_df['UUID'] = uuid
        audio_frames.append(tmp_df)

        # only the MFCC is used below; the STFT is computed by the same helper
        _, mfcc = calc_spectral_features(tmp_audio, file_sr, n_fft=n_fft,n_mfcc=n_mfcc, win_length=fft_window_size, win_overlap=0.0)

        ### extract mean and std dev for each mel-frequency in the mfcc
        mfcc_features = np.append(np.mean(mfcc, axis=1), np.std(mfcc,axis=1), axis=0)
        assert len(mfcc_feature_names)==len(mfcc_features), "Mismatch between number of MFCC features ({nf}) and vector with their names ({nn})".format(nf=len(mfcc_features), nn=len(mfcc_feature_names))
        mfcc_feat_dict = {name:val for name,val in zip(mfcc_feature_names,mfcc_features)}

        ### Power Spectrum Density based short-term features
        zcr,sc,sb,scf,ssmean, ssd, ssk = calc_spectral_properties_welch(tmp_audio,file_sr, n_fft,None, psd_freq_bins)
        # one row per spectral feature, one column per frequency band; keep only every
        # second band to reduce features, following the original article freq bins
        psd_features = np.vstack([sc, sb, scf, ssmean, ssd, ssk])[:, ::2]

        # now name each element of the PSD features (a unique combination of spectral feature and freq bin)
        n_freq_bins = psd_features.shape[1]
        psd_features = psd_features.ravel()
        psd_feature_names_expanded = [ "{f}_{b:02}".format(f=f,b=b) for f in psd_feature_names for b in range(0,n_freq_bins,1) ]
        assert len(zcr)==1, "Zero-Crossing Rate vector has length different from 1: {}".format(len(zcr))
        assert len(psd_feature_names_expanded)==len(psd_features), "Mismatch between number of spectral features ({nf}) and vector with their names ({nn})".format(nf=len(psd_features) , nn=len(psd_feature_names_expanded)) 
        psd_feat_dict = { name:val for name,val in zip(psd_feature_names_expanded,psd_features)}
        psd_feat_dict['ZCR'] = zcr[0,0]

        # store all features in a pandas dataframe; the metadata row is looked up in
        # the frame that was passed in, not in a notebook-level variable
        tmp_df = input_data.loc[ input_data['uuid']==uuid, metadata_cols ]
        tmp_df.columns = [c.upper() for c in tmp_df.columns]
        # when a UUID appears more than once (e.g. after up-sampling) the first record is used
        tmp_dict = OrderedDict(tmp_df.to_dict(orient='records')[0] )
        tmp_dict.update(mfcc_feat_dict)
        tmp_dict.update( psd_feat_dict)
        feature_frames.append(pd.DataFrame(tmp_dict,index=[idx]))

    ### end for loop over raw audio files
    all_data = pd.concat(feature_frames) if feature_frames else pd.DataFrame()
    audio_metadata = pd.concat(audio_frames) if audio_frames else pd.DataFrame()

    print("\n{} files processed in {:.1f} seconds\n".format(len(feature_frames), timer()-t_start ))
    if skipped_uuids:
        print("{} UUIDs skipped because their audio file was not found".format(len(skipped_uuids)))
    return all_data,audio_metadata
####
#### end prepare_data
#### 

In [None]:
# the following looks more complicated than needed because I want first to keep all entries of the smallest group
# then upsample the difference between that and the target number n_resampling. No pandas function is allowing me to do this.
def sample_df_balanced(df, group_col, n, random=42):
    """Resample a frame so that every group of ``group_col`` has exactly ``n`` rows.

    Groups with at least ``n`` rows are down-sampled without replacement. Smaller
    groups keep all of their rows and receive ``n - size`` extra rows drawn from the
    group itself; the extra rows are drawn without replacement when the group is large
    enough and with replacement otherwise.

    Parameters
    ----------
    df : pandas.DataFrame to resample. It is not modified.
    group_col : str; name of the column that defines the groups. Rows whose group
        value is missing are left out of the result.
    n : int; number of rows wanted for every group.
    random : seed passed as ``random_state`` to every sampling call.

    Returns
    -------
    pandas.DataFrame with the resampled rows, groups in order of first appearance.
    The original index labels are kept, so up-sampled rows repeat a label.
    """
    assert isinstance(group_col,str), "Input group_col must be a plain string with the column name: {}".format(type(group_col))

    sampled_parts = []
    for igroup in df[group_col].dropna().unique():
        group_df = df.loc[df[group_col]==igroup]
        # the group size is read from the selection itself, so no helper column has
        # to be written into the caller's frame
        n_orig = len(group_df)
        if n_orig < n: # need to upsample
            delta = n - n_orig
            # sampling without replacement is impossible when more extra rows are
            # needed than the group contains
            delta_df = group_df.sample(n=delta,random_state=random,replace=delta > n_orig)
            sampled_parts.extend([group_df, delta_df])
        else: #downsample
            sampled_parts.append(group_df.sample(n=n,random_state=random,replace=False))
    ### end for loop over groups

    if not sampled_parts:
        return df.iloc[0:0].copy()
    return pd.concat(sampled_parts)
### end sample_df_balanced

   
# filter UUID, keeping only those in the desired audio class (iclass)
tmp_metadata = metadata.loc[metadata['audio_class']==iclass,]

# remove entries where the SNR is low (hence the cough audio sound is of poor quality)
# This cut should be optimised; at this stage the worst 10% is simply cut off.
# The 10% quantile is computed on the class selected above, not on the full dataset.
tmp_metadata = tmp_metadata.loc[tmp_metadata['SNR']>=tmp_metadata['SNR'].quantile(0.10),]
print("Before resampling, count of entries by STATUS class in the full data.frame:")
tmp_metadata_count = tmp_metadata[['uuid','status']].groupby(['status']).count()
print(tmp_metadata_count)
# number of distinct COVID-19 recordings before resampling
print(len(tmp_metadata.loc[tmp_metadata['status']=='COVID-19', 'uuid'].unique()))

# every group will have a number of entries equal to the number of records 
# in the smallest group, rounded up to the next multiple of ten
n_resampling = int(np.ceil(tmp_metadata_count['uuid'].min()/10)*10)

  
    
# groupby(...).sample(...) is not used because the smallest group must keep all of its
# entries and only be topped up; sample_df_balanced does that
#tmp_metadata = tmp_metadata.groupby(['status']).sample(n=n_resampling,random_state=42,replace=False)
tmp_metadata = sample_df_balanced(tmp_metadata, 'status', n_resampling) #tmp_metadata.groupby(['status']).sample(n=n_resampling,random_state=42,replace=False)
print("\nAfter resampling, count of entries by STATUS class in the full data.frame:")
print(tmp_metadata[['uuid','status']].groupby(['status']).count())
# distinct COVID-19 UUIDs after resampling; rows added by up-sampling repeat a UUID
print(len(tmp_metadata.loc[tmp_metadata['status']=='COVID-19', 'uuid'].unique()))

In [None]:
# all_data, all_audio_metadata = prepare_data(input_data=tmp_metadata, audio_datadir=data_dir, sr=my_sampling_rate, 
#                                             target_duration=target_duration, n_fft=my_n_fft, n_mfcc=my_n_mfcc, 
#                                             fft_window_size=my_window_size, psd_freq_bins=my_psd_freqs,
#                                             mfcc_feature_names=mfcc_feature_names, psd_feature_names=psd_feature_names,
#                                             max_audio_samples=None, print_every_n=20) 

# all_audio_metadata = all_audio_metadata.drop_duplicates(subset='UUID', keep='first') # this avoids spurious duplicates at the following merge

# print("Merging dataframe with audio features and df with audio metadata")
# all_data = pd.merge(all_data,all_audio_metadata,on=['UUID'],how='inner')
# all_data.drop(['AUDIO_FILE'],axis=1,inplace=True)

# #print(all_data[['UUID','STATUS']].groupby(['STATUS']).count())
# print("Shape of full dataframe with features and labels: {}".format(all_data.shape))
# all_data.head(10)

In [None]:
#out_data_filename = audio_outdir+"/cough-classification-data_Class{}.pkl".format(iclass )
# pd.to_pickle(all_data, out_data_filename)  # save df to file
# all_data.to_csv('coughvid_full.csv')
#
#all_data = pd.read_csv(data_dir_file+input_file,index_col=None,) # load df from file
# Load the pre-computed feature table and discard the unnamed column
# that holds the index written out when the CSV was saved.
all_data = pd.read_csv('coughvid_full.csv').drop(columns='Unnamed: 0')
### sanity check: number of rows per covid status
all_data.groupby('STATUS')[['UUID']].count().rename(columns={'UUID':'N_UUID'})


In [None]:
# keep an unfiltered copy of the feature table
whole = all_data.copy()

# `age` is a column of `metadata`, not of `all_data`, and the two frames were
# read from different CSV files, so their row indices are unrelated: a mask
# built from metadata['age'] cannot be combined row-by-row with one built from
# all_data. Look the age up through the recording identifier instead.
# NOTE(review): assumes all_data['UUID'] holds the same identifiers as
# metadata['uuid'] -- TODO confirm.
age_by_uuid = metadata.drop_duplicates(subset='uuid').set_index('uuid')['age']
age_of_row = all_data['UUID'].map(age_by_uuid)

# rows with an unknown age compare False and are dropped, as before
all_data = all_data.loc[(all_data['COUGH_DETECTED'] >= 0.80) & (age_of_row < 70)]
# NOTE(review): this tags age >= 60 while the filter above keeps age < 70 -- confirm the intended thresholds
metadata.loc[ (metadata['cough_detected'] >= 0.80) & (metadata['age']>=60) ,'audio_class'] = 'A'

#print(metadata[['audio_class','status','uuid']].groupby(['audio_class','status']).count().rename(columns={'uuid':'N_entries'}) )
all_data[['UUID','STATUS']].groupby(['STATUS']).count().rename(columns={'UUID':'N_UUID'})

### Visualise sound and spectrograms for a given UUID

In [None]:
# Pick one recording and show its waveform, STFT, MFCC and power spectrum.
all_uuids = all_data['UUID'].unique()
uuid_tmp = all_uuids[10] #"0379c586-c500-483c-83a6-95b63afe6931"#all_uuids[10]
tmp_audiofilename = uuid_tmp+".webm"

# load the audio, pass it through zero_padding, then compute the spectral representations
tmp_df, tmp_audio, sr = import_raw_audio(tmp_audiofilename, indir=data_dir, sr=my_sampling_rate)
tmp_audio = zero_padding(tmp_audio, sr=sr, target_duration=target_duration)
stft , mfcc = calc_spectral_features(tmp_audio, sr, n_fft=my_n_fft,n_mfcc=my_n_mfcc,win_overlap=0.0)
freqs_welch, psx_welch = calc_power_spectrum_welch(tmp_audio,sr, my_n_fft)

print(mfcc.shape)
# one time stamp per audio sample, so the x and y arrays of the plot below
# always have the same length (a float-step arange over the target duration
# does not guarantee that)
time_stamps = np.arange(len(tmp_audio)) / my_sampling_rate

# plot raw signal
line_data = ply_go.Scatter(x=time_stamps,
                           y=tmp_audio,
                           name="Audio signal", showlegend=False)
fig = ply_go.Figure(data=[line_data])#, layout=my_layout)
fig.update_layout(title={'text': "Raw audio (UUID:{})".format(uuid_tmp)},
                  xaxis={"title":{"text":"Time [s]"}}, yaxis={"title":{"text":"Amplitude"}})
fig.show()

# plot STFT
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18,18)) # tight_layout=False,constrained_layout=True
fig.tight_layout()
img0 = librosa.display.specshow(np.abs(stft), sr=sr, y_axis='log', x_axis='time', ax=ax)
ax.set_title('Log-Frequency power spectrogram', size=18)
fig.colorbar(img0, format="%+2.f")
# plt.show() rather than fig.show(): the latter only warns under a non-GUI (inline) backend
plt.show()


# plot MFCC
# NOTE(review): y_axis='log' labels the rows as frequencies and np.abs() drops
# the sign of the coefficients -- confirm this is the intended MFCC display.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18,18)) # tight_layout=False,constrained_layout=True
fig.tight_layout()
img0 = librosa.display.specshow(np.abs(mfcc), sr=sr, y_axis='log', x_axis='time', ax=ax)
ax.set_title('MFCC spectrogram', size=18)
fig.colorbar(img0, format="%+2.f")
plt.show()

# plot power spectrum
line_data = ply_go.Scatter(x=freqs_welch,#np.arange(0,target_duration,my_n_fft/(my_sampling_rate)),
                           y=psx_welch,
                           name="Power Spectrum density", showlegend=False)
fig = ply_go.Figure(data=[line_data])#, layout=my_layout)
fig.update_layout(title={'text': "Power Spectrum Density (UUID:{})".format(uuid_tmp)},
                  xaxis={"title":{"text":"Frequency [Hz]"}}, yaxis={"title":{"text":"Average Power"}})
fig.show()




In [None]:
# Work on a copy so that optional relabelling below never touches all_data
sampled_data = all_data.copy()
all_uuids = sampled_data['UUID'].values

# select X features to be used in ML classification: the clinical flags, the
# signal summary statistics, and the MFCC_/SPEC_ columns whose two-character
# numeric suffix is below max_freq_features
# NOTE(review): int(name[-2:]) presumes a two-digit suffix -- confirm against the feature names
max_freq_features = 99
train_features = (['RESPIRATORY_CONDITION', 'FEVER_MUSCLE_PAIN','MEAN_SIG','MAX_SIG','MIN_SIG','ZCR']+
                  [f for f in sampled_data.columns.values if f.startswith('MFCC_') and int(f[-2:])<max_freq_features]+
                  [f for f in sampled_data.columns.values if f.startswith('SPEC_') and int(f[-2:])<max_freq_features])

y_label = 'STATUS'

### this is used if one wants to reduce the number of classes/status;
#sampled_data.loc[sampled_data['STATUS']=='symptomatic','STATUS'] = 'NOCOVID'
#sampled_data.loc[sampled_data['STATUS']=='healthy','STATUS']     = 'NOCOVID'


print("Number of training X features: {}".format(len(train_features)) )
# X, y, the UUIDs and the stratification labels are all taken from the SAME
# frame (sampled_data), so they stay aligned if sampled_data is edited above.
# NOTE(review): if the table contains repeated UUIDs (e.g. from upsampling),
# the same recording can land in both train and test -- TODO confirm.
X_train, X_test, y_train, y_test, uuid_train, uuid_test = train_test_split(sampled_data[train_features].values, sampled_data[[y_label]].values, all_uuids,
                                                                            test_size=0.2,random_state=612, stratify=sampled_data[y_label])
print("Shapes of train X and y datasets: X->{}    y->{}".format(X_train.shape ,y_train.shape))
print("Shapes of test  X and y datasets: X->{}    y->{}".format(X_test.shape, y_test.shape))


print("\n\nTRAIN DATASET - Count of entries by STATUS:\nhealthy={} \tsymptomatic={} \tcovid={}\n\n".format(y_train[y_train=="healthy"].shape[0], y_train[y_train=="symptomatic"].shape[0], y_train[y_train=="COVID-19"].shape[0]))

In [None]:
# Rescale every feature to [-1, 1]; the scaler is fitted on the training
# split only and then applied to both splits.
scaler = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

# Map the status strings to integer codes, learning the mapping from the
# training labels.
labenc = LabelEncoder().fit(y_train.ravel())
y_train_enc = labenc.transform(y_train.ravel())
y_enc_labels = labenc.classes_
print(y_enc_labels)
y_test_enc = labenc.transform(y_test.ravel())

In [None]:
def roc_df(ytrue, ypred):
    """Compute the ROC curve and return it as a DataFrame.

    Parameters
    ----------
    ytrue : array-like
        True labels, forwarded to ``roc_curve``.
    ypred : array-like
        Predicted scores, forwarded to ``roc_curve``.

    Returns
    -------
    pandas.DataFrame
        Columns ``FalsePosRate``, ``TruePosRate`` and ``Thresholds``, one row
        per point returned by ``roc_curve``.
    """
    # roc_curve is imported by name at the top of the notebook; the
    # `metrics` module itself is not imported there.
    falseposrate, trueposrate, thresholds = roc_curve(ytrue, ypred)
    return pd.DataFrame({'FalsePosRate': falseposrate,
                         'TruePosRate': trueposrate,
                         'Thresholds': thresholds})

def prc_df(ytrue, ypred):
    """Compute the precision-recall curve and return it as a DataFrame.

    Parameters
    ----------
    ytrue : array-like
        True labels, forwarded to ``precision_recall_curve``.
    ypred : array-like
        Predicted scores, forwarded to ``precision_recall_curve``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Precision``, ``Recall`` and ``Thresholds``. The last
        precision/recall pair is dropped so that all three columns have the
        length of the thresholds array.
    """
    # precision_recall_curve is imported by name at the top of the notebook;
    # the `metrics` module itself is not imported there.
    precision, recall, thresholds = precision_recall_curve(ytrue, ypred)
    return pd.DataFrame({'Precision': precision[:-1],
                         'Recall': recall[:-1],
                         'Thresholds': thresholds})

def plot_prc(recall, precision, ax,
             prc_score=None, xrange=(-0.05,1.05),yrange=(-0.05,1.05)):
    """Draw a precision-recall curve on an existing matplotlib axes.

    Parameters
    ----------
    recall, precision : array-like
        x and y coordinates of the curve.
    ax : matplotlib axes
        Axes to draw on.
    prc_score : float, optional
        If given, it is shown in the legend as the curve label.
    xrange, yrange : pair of floats
        Axis limits passed to ``set_xlim`` / ``set_ylim``.

    Returns
    -------
    The same ``ax`` that was passed in.
    """
    if prc_score is not None:
        prc_label='PRC Avg Score = {:.4}'.format(prc_score)
    else:
        prc_label=None
    ax.plot(recall,precision,  'b', label=prc_label)
    # red dashed reference line from (0, 1) to (1, 0)
    ax.plot([0,1],[1,0],'r--')
    ax.set_title('Precision-Recall curve', fontsize=28)
    ax.set_xlabel('Recall', fontsize=24)
    ax.set_ylabel('Precision', fontsize=24)
    ax.tick_params(axis='both', which='major', labelsize=18)
    ax.set_xlim(xrange)
    ax.set_ylim(yrange)
    # Only draw a legend when some artist on the axes carries a label;
    # asking for a legend with nothing to show just produces a warning.
    _, legend_labels = ax.get_legend_handles_labels()
    if legend_labels:
        ax.legend(loc='lower left',fontsize=24)
    ax.grid()
    return ax

def score_eval(ytrue, ypreds, model_name="", ylabels=None):
    """Print and return the headline classification scores.

    Accuracy, macro-averaged precision, macro-averaged recall and the
    confusion matrix are computed from ``ytrue`` and ``ypreds``. A one-line
    summary (prefixed with ``model_name``) and the classification report
    (using ``ylabels`` as target names) are printed.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, confusion_matrix)``.
    """
    macro = dict(average='macro')
    accuracy = accuracy_score(ytrue, ypreds)
    precision = precision_score(ytrue, ypreds, **macro)
    recall = recall_score(ytrue, ypreds, **macro)
    confusion = confusion_matrix(ytrue, ypreds)

    summary = "{mn} accuracy / precision / recall: {a:.3f} / {p:.3f} / {r:.3f}".format(
        mn=model_name, a=accuracy, p=precision, r=recall)
    print(summary)
    print("\n\n")
    print(classification_report(ytrue, ypreds, target_names=ylabels))
    return accuracy, precision, recall, confusion

In [None]:
### SETUP LOGISTIC REGRESSION (MULTICLASS)
from sklearn.linear_model import LogisticRegression
logit_params = {'multi_class': 'multinomial',
                'penalty': 'l2',
                'C': 0.20,
                'solver': 'newton-cg',
                'random_state': 991}

# fit on the scaled training split, predict the scaled test split
logit_class = LogisticRegression(**logit_params)
logit_model = logit_class.fit(X_train_norm, y_train_enc)
logit_test = logit_model.predict(X_test_norm)
print(y_enc_labels)
logit_acc, logit_precision, logit_recall, logit_cm = score_eval(y_test_enc, logit_test, "Logit_multi", ylabels=y_enc_labels)

print(logit_cm)

# one row per input feature, one coefficient column per class label
logit_coeffs = pd.DataFrame(logit_model.coef_.T, columns=y_enc_labels)
logit_coeffs.insert(0, 'XVAR', train_features)
logit_coeffs.sort_values('COVID-19')
#logit_coeffs.sort_values('healthy')

In [None]:
#%% SETUP XGBOOST

# cast train and test sample to a XGBoost DMatrix data container -- NOT NEEDED if using sklearn API !
#dtrain = xgb.DMatrix(data=X_train_norm, label=y_train,feature_names=train_features)
#dtest = xgb.DMatrix(data=X_test_norm, label=y_test,feature_names=train_features)

#
# define XGBoost classification model
# xgb_params = {'max_depth': 3,   # max depth of a tree
#               'n_estimators': 250,
#               'learning_rate': 0.1,   # learning rate; smaller eta make convergence more accurate but slower
#               'min_split_loss': 0.05, #gamma parameter in xgboost; the larger gamma, the more conservative the algo is in adding one extra leaf to the tree
#               'reg_lambda':5.0,   # disable L2 reg only if features are all reasonably independent
#               'reg_alpha':5.0,    #  L1 reg,tring to prune unnecessary features
#               'objective': 'multi:softmax',
#               'num_class': 3,     # number of classes to classify in the dataset
#               'use_label_encoder':False,
#               'subsample': 0.5,  #use only a fraction of the training set to grow the tree; if =1.0, subsampling is disabled
#               #'colsample_bytree':0.50,
#               'random_state':9443,
#               'verbosity':0  #0: silent --> 3: very verbose
#               }

# define XGBoost classification model
xgb_params = {'max_depth': 3,   # max depth of a tree
              'n_estimators': 20,
              'learning_rate': 0.2,   # learning rate; smaller eta make convergence more accurate but slower
              #'min_split_loss': 0, #gamma parameter in xgboost; the larger gamma, the more conservative the algo is in adding one extra leaf to the tree
              'reg_lambda':10.0,   # disable L2 reg only if features are all reasonably independent
              'reg_alpha':0.0,    #  L1 reg, trying to prune unnecessary features
              'objective': 'multi:softmax',
              'num_class': 3,     # number of classes to classify in the dataset
              'use_label_encoder':False,
              'subsample': 1,  #use only a fraction of the training set to grow the tree; if =1.0, subsampling is disabled
              #'colsample_bytree':0.50,
              'random_state':9443,
              'verbosity':0,  #0: silent --> 3: very verbose
              # The evaluation metric is a model parameter: passing it to fit()
              # is deprecated in recent xgboost releases and no longer accepted
              # by the newest ones, so it is set on the estimator instead.
              'eval_metric': 'mlogloss'
              }

#evallist = [(dtest, 'eval'), (dtrain, 'train')]
xgb_class = xgb.XGBClassifier(**xgb_params)

# fit the model; the loss is tracked on the train split (validation_0)
# and on the test split (validation_1) at every boosting round
xgb_model = xgb_class.fit(X_train_norm, y_train_enc,
                          eval_set=[(X_train_norm, y_train_enc), (X_test_norm, y_test_enc)],
                          verbose=False)


#%% run evaluation
xgb_train = xgb_model.predict(X_train_norm )
xgb_test = xgb_model.predict(X_test_norm )
#xgb_test= labenc.inverse_transform(xgb_test)


In [None]:
### Global feature importance as reported by xgboost: ten largest values
sorted_indices = xgb_class.feature_importances_.argsort()[::-1]   # descending order
xgb_importance = xgb_class.feature_importances_[sorted_indices]
importance_labels = np.array(train_features)[sorted_indices]
plt.barh(importance_labels[:10], xgb_importance[:10])


### In-sample scores (predictions on the training split)
xgb_acc, xgb_precision, xgb_recall , xgb_cm = score_eval(y_train_enc, xgb_train, "XGBoost TRAIN", ylabels=y_enc_labels)

print("\n\nConfusion matrix - in-sample training dataset:")
print(xgb_cm)

# k-fold cross validation on the training split
# NOTE(review): this passes the unscaled X_train while the model above was
# fitted on X_train_norm -- confirm which input is intended.
from sklearn.model_selection import cross_val_score
n_folds = 5
accuracies = cross_val_score(estimator=xgb_model, X=X_train, y=y_train_enc, cv=n_folds)
print("\n\nEvaluating XGBoost model using training set and {}-fold cross validation: \nAverage Accuracy {:.3f} +/- {:.3f}".format(n_folds,accuracies.mean(), accuracies.std() ) )
print("Fold accuracies: {}\n\n".format(accuracies))


### Loss per boosting iteration, for the two evaluation sets given at fit time
xgb_train_results = xgb_model.evals_result()
xgb_train_loss = xgb_train_results['validation_0']['mlogloss']
xgb_test_loss  = xgb_train_results['validation_1']['mlogloss']
iters = len(xgb_train_loss)
x_iters = list(range(iters))

line_data = [ply_go.Scatter(x=x_iters, y=loss_values, name=trace_name)
             for trace_name, loss_values in (("TRAIN", xgb_train_loss), ("TEST", xgb_test_loss))]
fig = ply_go.Figure(data=line_data)
fig.update_layout(title={'text': "Evaluation Log-Loss of XGBoost classifier"},
                  xaxis={"title":{"text":"Iteration"}}, yaxis={"title":{"text":"Multiclass Log-Loss"}})
fig.show()

In [None]:

# Out-of-sample scores: compare the XGBoost predictions on the test split
# (xgb_test) with the encoded test labels. This overwrites xgb_acc,
# xgb_precision, xgb_recall and xgb_cm, which previously held the
# in-sample (training) values.
print("\n\n\nVALIDATION USING TEST SAMPLE:")
xgb_acc, xgb_precision, xgb_recall , xgb_cm = score_eval(y_test_enc, xgb_test, "XGBoost", ylabels=y_enc_labels)

print("\n\nConfusion matrix:")
print(xgb_cm)


In [None]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa


In [None]:
pip install --upgrade tensorflow


In [None]:
# Wrap the scaled training features and their integer labels in a tf.data
# Dataset of (features, label) pairs.
# NOTE(review): `train` is not used by any of the cells visible below
# (model.fit is given the arrays directly) -- confirm whether it is needed.
train = tf.data.Dataset.from_tensor_slices((X_train_norm, y_train_enc)) 

In [None]:
# One-hot encode the integer class codes of the train and test splits.
# depth=3 is hard-coded; it matches the 3 output units of the network
# defined below and the num_class=3 used for XGBoost.
ys = tf.one_hot(y_train_enc, depth=3)
y_test_oh = tf.one_hot(y_test_enc,depth = 3)


In [None]:
# The input width follows the feature matrix instead of being hard-coded, so
# the network keeps matching the data when the feature selection changes.
n_input_features = X_train_norm.shape[1]

# Fully connected classifier: three hidden ReLU layers and a 3-unit softmax
# output (i.e. the model outputs class probabilities, not logits).
model = keras.Sequential([
    keras.Input(shape=(n_input_features,)),
    keras.layers.Dense(units=256, activation='relu'),
    #keras.layers.Dense(units=192, activation='relu'),
    #keras.layers.Dense(units=192, activation='relu'),
    keras.layers.Dense(units=192, activation='relu'),

    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dense(units=3, activation='softmax')
])

In [None]:
# The last layer of `model` applies a softmax, so the loss receives
# probabilities, not logits: from_logits must be False. (With from_logits=True
# the loss would apply a second softmax on top of the model's own.)
model.compile(optimizer='adam',
              loss=tf.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

# Train on the scaled features against the one-hot labels.
# NOTE(review): steps_per_epoch=500 is independent of the number of training
# rows; if the data holds fewer batches than that, Keras may run out of data
# before the requested epochs complete -- confirm this value is intended.
history = model.fit(
    X_train_norm,ys,
    epochs=10,
    steps_per_epoch=500
    )

In [None]:
# Class probabilities predicted by the network for the test split. Computed
# here so the cell does not depend on a variable left over from an earlier
# kernel session.
nn_test = model.predict(X_test_norm)

# Per-class F1 score; a class counts as predicted when its probability
# exceeds the 0.5 threshold.
metric = tfa.metrics.F1Score(num_classes=3, threshold=0.5)
metric.update_state(y_test_oh,nn_test)
result = metric.result()
result.numpy()

In [None]:
# Turn the integer predictions of both models back into status strings
logit_test_dec = labenc.inverse_transform(logit_test)
xgb_test_dec = labenc.inverse_transform(xgb_test)

# One small table per model, keyed by the UUID of each test recording
logit_data = pd.DataFrame(dict(UUID=uuid_test, LOGIT_STATUS=logit_test_dec))
xgb_data = pd.DataFrame(dict(UUID=uuid_test, XGB_STATUS=xgb_test_dec))

# Attach both predictions to the feature table (rows outside the test split
# get missing values)
pred_data = sampled_data.merge(logit_data, on='UUID', how='left')
pred_data = pred_data.merge(xgb_data, on='UUID', how='left')
#pred_data[['UUID','STATUS','LOGIT_STATUS','XGB_STATUS']].head()

# Test rows that the logistic regression gets right and XGBoost gets wrong
pred_data[
    pred_data['STATUS'].eq(pred_data['LOGIT_STATUS'])
    & pred_data['STATUS'].ne(pred_data['XGB_STATUS'])
    & pred_data['XGB_STATUS'].notna()
]