# RESPIRATORY ANALYSIS - DIAGNOSE CONDITIONS

Main reference: Kaggle dataset "Respiratory Sound Database"

https://www.kaggle.com/vbookshelf/respiratory-sound-database
    
The aim is to identify the condition of a patient from the audio recordings of their breathing.

High-level overview of what happens in this notebook:
* Import data
   * Read in patient metadata (age, sex, BMI, etc.)
   * Impute missing values of patient metadata
   * Visualise distributions of patient metadata
   * Import annotations of audio files (contain info about number of crackles and wheezes detected, as annotated by humans)
   * Read in audio data (.wav files) and calculate time-frequency analyses (STFT and IIRT). Because this is a slow process, we will save the spectrograms to pickle files for faster retrieval later.
   
   
* Combine data: 
   * patient metadata and audio annotations are merged in one big dataframe (one record per audio file)
   * split the datasets into train/test sets. Do this for both the combined metadata+annotation and the STFT matrices, keeping track of the audio file and keeping the two inputs aligned (if a record in the metadata+annotations data is used for training, the corresponding STFT is used for training too)
   
   
* Run classification algorithms
   * Logistic regression  (inputs are the combined metadata+annotations)
   * Boosted Decision Tree with XGBoost (inputs are the combined metadata+annotations)
   * CNN using the STFT  (inputs are the STFT)
   * Neural Network concatenating a CNN (inputs are the STFT) and a deep Neural Network (inputs are the combined metadata+annotations)
   * For each test run a standard set of evaluation metrics (focus on precision/recall rather than accuracy given the strong imbalance of the data set)

In [None]:
# general purpose libraries
import numpy as np
import datetime as dt
import pandas as pd
import os
import pickle
from timeit import default_timer as timer
from collections import OrderedDict

In [None]:
# plots and visualisation
import matplotlib.pyplot as plt
import plotly.graph_objects as ply_go

In [None]:
# DSP libraries
import librosa
import librosa.display as librosa_display

In [None]:
# ML and data modelling libraries
from sklearn.preprocessing   import MinMaxScaler, OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score,roc_curve, precision_recall_curve,confusion_matrix,precision_score, recall_score,average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression

import xgboost as xgb

In [None]:
### Setup paths and directories
# All directory strings end with "/" because the rest of the notebook builds
# file paths by plain string concatenation (directory + file name).
work_dir = "/kaggle/working/"
data_dir = "/kaggle/input/respiratory-sound-database/"
# folder holding both the .wav recordings and their .txt annotation files
audio_indir = data_dir + "Respiratory_Sound_Database/Respiratory_Sound_Database/audio_and_txt_files/"
# output folders, created further down in this cell
audio_outdir = work_dir + "audio/"
text_outdir = work_dir + "annotations/"

# file names of the patient-level inputs (demographics and diagnosis)
demo_file = 'demographic_info.txt'
diagnosis_file = 'patient_diagnosis.csv'

def create_dir_if_not_exists(mydir):
    """Make sure the directory ``mydir`` exists.

    Returns -1 when the directory (and any missing parent) had to be created,
    0 when the path already existed.
    """
    if os.path.exists(mydir):
        return 0
    os.makedirs(mydir)
    return -1

# make sure the output folders exist before anything is written to them
create_dir_if_not_exists(audio_outdir)
create_dir_if_not_exists(text_outdir)

In [None]:
### To be used only the first time: move audio files to different subfolder, 
### just to organise files in a way that I think is cleaner
# source_files = os.listdir(audio_indir)
# for f in source_files:
#     if f.endswith(".wav"):
#         os.rename(audio_indir+f, audio_outdir+f)
#     elif f.endswith(".txt"):
#         os.rename(audio_indir+f, text_outdir+f)
#     else:
#         pass
###

## Collect patient metadata and organise them in a convenient format

In [None]:
# Demographics: space-separated file without header row, one line per patient
patient_data=pd.read_csv(data_dir+demo_file,sep=" ",header=None,names=['PATIENT_ID', 'AGE', 'SEX', 'BMI', 'WEIGHT', 'HEIGHT'])
# Diagnosis: csv without header row, one line per patient
diagnosis_data = pd.read_csv(data_dir+"Respiratory_Sound_Database/Respiratory_Sound_Database/"+diagnosis_file,header=None, names=['PATIENT_ID','DIAGNOSIS'])
# default (inner) merge: only patients present in both files are kept
patient_data = patient_data.merge(diagnosis_data, on='PATIENT_ID')
#print(patient_data.head())
print(patient_data.shape)

### Impute missing values

In [None]:
print(patient_data.isna().sum() )

# AGE: only one entry without age, use median of relevant population (COPD patients)
patient_data.loc[patient_data['AGE'].isnull(), 'AGE'] = patient_data.loc[patient_data['DIAGNOSIS'] == 'COPD', 'AGE'].median()

# SEX: the only row with nan for SEX has diagnosis COPD; replace with most common outcome.
# To count how many M/F there are if diagnosis is COPD:
#patient_data.loc[patient_data['DIAGNOSIS']=='COPD', ['SEX','PATIENT_ID'] ].groupby(['SEX']).count()
patient_data.loc[patient_data['SEX'].isnull(), 'SEX'] = 'M'


# Impute missing BMI values
# start from the easy case: no BMI but weight and height available
# (the division by 100 treats HEIGHT as centimetres, so BMI = weight / height_in_metres**2)
null_bmi = patient_data['BMI'].isnull()
patient_data.loc[null_bmi, 'BMI'] = patient_data.loc[null_bmi, 'WEIGHT'] / (patient_data.loc[null_bmi, 'HEIGHT'] / 100) ** 2

# for the remaining cases, we use the median BMI for the appropriate stratified group.
# Age bands are the quintiles of AGE: 'A' = below the 20th percentile ... 'E' = at or above the 80th.
age_quantiles = patient_data['AGE'].quantile([0.2, 0.4, 0.6, 0.8]).values
print(age_quantiles)
patient_data['AGE_CAT'] = 'E'
# walk the thresholds from the highest to the lowest so that each pass overwrites the younger rows
for age_label, upper_bound in zip('DCBA', age_quantiles[::-1]):
    patient_data.loc[patient_data['AGE'] < upper_bound, 'AGE_CAT'] = age_label

# Stage 1: median BMI of patients sharing diagnosis, age band and sex
tmp_data = (patient_data.loc[patient_data['BMI'].notnull(), ['DIAGNOSIS', 'AGE_CAT', 'SEX', 'BMI']]
            .groupby(['DIAGNOSIS', 'AGE_CAT', 'SEX']).agg('median').reset_index()
            .rename(columns={'BMI': 'BMI_imputed'}))
patient_data_imputed = patient_data.loc[patient_data['BMI'].isnull()].merge(
    tmp_data, on=['DIAGNOSIS', 'AGE_CAT', 'SEX'], how='left')
patient_data = patient_data.merge(patient_data_imputed[['PATIENT_ID', 'BMI_imputed']], on=['PATIENT_ID'], how='left')
missing_bmi = patient_data['BMI'].isnull()
patient_data.loc[missing_bmi, 'BMI'] = patient_data.loc[missing_bmi, 'BMI_imputed']

# Stage 2: these are the last cases with very little information available; fall back on
# the median BMI of the age band alone. Only AGE_CAT and BMI are selected before the
# group-by: taking the median of the text columns (SEX, DIAGNOSIS) raises on recent pandas.
tmp_data = (patient_data.loc[patient_data['BMI'].notnull(), ['AGE_CAT', 'BMI']]
            .groupby(['AGE_CAT']).agg('median').reset_index()
            .rename(columns={'BMI': 'BMI_imputed2'}))
patient_data_imputed = patient_data.loc[patient_data['BMI'].isnull()].merge(
    tmp_data[['AGE_CAT', 'BMI_imputed2']], on=['AGE_CAT'], how='left')
patient_data = patient_data.merge(patient_data_imputed[['PATIENT_ID', 'BMI_imputed2']], on=['PATIENT_ID'], how='left')
missing_stage1 = patient_data['BMI_imputed'].isnull()
patient_data.loc[missing_stage1, 'BMI_imputed'] = patient_data.loc[missing_stage1, 'BMI_imputed2']
missing_bmi = patient_data['BMI'].isnull()
patient_data.loc[missing_bmi, 'BMI'] = patient_data.loc[missing_bmi, 'BMI_imputed']
# keyword form: the positional `axis` argument of drop() is not accepted by pandas 2.x
patient_data.drop(columns=['BMI_imputed2'], inplace=True)

# turn the helper column into a flag: True when BMI came from one of the two median stages
# (BMI values computed from weight and height above are not flagged)
patient_data['BMI_imputed'] = ~patient_data['BMI_imputed'].isnull()

print("\n\nSummary of missing values after imputation procedure:")
print(patient_data.isna().sum() )
patient_data.head()

### Visualise distributions

In [None]:
# Base plotly layout shared by every figure in this section; the placeholder texts
# ("my distribution", "my x axis") are overwritten per figure through update_layout.
my_title_layout = dict(text="my distribution", xanchor='center', x=0.5, y=0.9, font={'size': 24})
my_xaxis_layout = dict(title=dict(text="my x axis", font={'size': 16}))
my_layout = dict(title=my_title_layout,
                 xaxis=my_xaxis_layout)
# histogram bin width and x-axis label for each plotted column
bin_size_dict = dict(AGE=1, BMI=5.0, DIAGNOSIS=1, SEX=1)
# fixed: the SEX label used to carry a stray closing bracket ("Male/Female]")
xaxis_title_dict = dict(AGE='AGE [years]', BMI="BMI", DIAGNOSIS="Condition", SEX="Male/Female")

# one histogram per patient attribute
for c in ['AGE', 'SEX', 'BMI', 'DIAGNOSIS']:
    hist_data = ply_go.Histogram(x=patient_data[c], name=c, showlegend=False, xbins={'size': bin_size_dict[c]})
    fig = ply_go.Figure(data=[hist_data], layout=my_layout)
    fig.update_layout(title={'text': c + " distribution"}, xaxis={"title": {"text": xaxis_title_dict[c]}})
    fig.show()
###


def _violin_by_diagnosis(column, yaxis_title):
    """Return a figure with one violin of ``patient_data[column]`` per DIAGNOSIS value.

    column      -- name of the numeric patient_data column drawn on the y axis
    yaxis_title -- label of the y axis
    """
    violin_fig = ply_go.Figure(layout=my_layout)
    for tmp_diag in patient_data['DIAGNOSIS'].unique():
        in_diag = patient_data['DIAGNOSIS'] == tmp_diag
        violin_fig.add_trace(ply_go.Violin(x=patient_data.loc[in_diag, 'DIAGNOSIS'],
                                           y=patient_data.loc[in_diag, column],
                                           name=tmp_diag,
                                           box_visible=True,
                                           meanline_visible=True))
    # title typo fixed: "DIAGNOSYS" -> "DIAGNOSIS"
    violin_fig.update_layout(title={'text': "Distribution of {} by type of DIAGNOSIS".format(column)},
                             xaxis={"title": {"text": None}},
                             yaxis={"title": {"text": yaxis_title}})
    return violin_fig


fig = _violin_by_diagnosis('AGE', "AGE [years]")
fig.show()

fig = _violin_by_diagnosis('BMI', "BMI")
fig.show()

## Import and process audio data

1. Import audio annotation data and store them in a dataframe
1. Import raw audio data: 
  1. extract audio features (signal amplitude) and store them in a dataframe with audio features; 
  1. calculate time-frequency representations (spectrograms, IIRT, recurrence charts) and store them as matrixes (to be used later for image recognition) 
1. merge dataframes with patient data, annotations and audio features


In [None]:
def import_annotation(filename, indir):
    """Summarise one annotation file as a single-row DataFrame.

    The annotation file is tab-separated without header, one line per
    respiratory cycle: begin time, end time, crackle flag, wheeze flag.
    The file name (without its 4-character extension) must consist of five
    '_'-separated tokens: patient id, recording index, chest location,
    acquisition mode and device.

    Parameters:
        filename -- annotation file name, e.g. "101_1b1_Al_sc_Meditron.txt"
        indir    -- directory containing the file; it is concatenated with
                    ``filename`` as-is, so it must end with a path separator

    Returns a DataFrame with one row and the columns ANNOTATION_FILE,
    PATIENT_ID, REC_IDX, CHEST_LOC, ACQ, DEVICE, TOT_CRACKLE, TOT_WHEEZE
    (PATIENT_ID and the two totals are integers, the rest strings).

    Raises ValueError when the file name does not split into five tokens.
    """
    cycles = pd.read_csv(indir + filename, sep="\t", header=None,
                         names=["BEGIN_CYCLE", "END_CYCLE", "CRACKLE", "WHEEZE"])

    # The tokens come from the `filename` argument; the previous version read
    # a global variable `f` here and therefore only worked when called from a
    # loop that happened to use that name.
    stem = filename[:-4]
    token_columns = ['PATIENT_ID', 'REC_IDX', 'CHEST_LOC', 'ACQ', 'DEVICE']
    tokens = stem.split("_")
    if len(tokens) != len(token_columns):
        raise ValueError("Cannot parse annotation file name {!r}: expected {} '_'-separated tokens, found {}".format(
            filename, len(token_columns), len(tokens)))

    record = {'ANNOTATION_FILE': stem}
    record.update(zip(token_columns, tokens))
    record['PATIENT_ID'] = int(record['PATIENT_ID'])
    record['TOT_CRACKLE'] = int(cycles['CRACKLE'].astype(int).sum())
    record['TOT_WHEEZE'] = int(cycles['WHEEZE'].astype(int).sum())

    return pd.DataFrame([record],
                        columns=['ANNOTATION_FILE'] + token_columns + ['TOT_CRACKLE', 'TOT_WHEEZE'])



In [None]:
# Collect one summary row per annotation file, then concatenate once at the end:
# DataFrame.append copied the whole frame on every call and is no longer
# available in pandas 2.x.
annotation_frames = []
for f in os.listdir(audio_indir):
    if f.endswith(".txt"):
        tmp_df = import_annotation(f, audio_indir)
        annotation_frames.append(tmp_df)
# pd.concat refuses an empty list, so fall back on an empty frame in that case
record_data = pd.concat(annotation_frames) if annotation_frames else pd.DataFrame()


print(record_data.shape)
# number of recordings per patient, in ascending order
print(record_data[['PATIENT_ID', 'REC_IDX']].groupby(['PATIENT_ID']).count().sort_values(by='REC_IDX') )
# total number of annotated crackles and wheezes over all recordings
print(record_data[['TOT_CRACKLE', 'TOT_WHEEZE']].sum() )
record_data.head()

### Import audio data

In [None]:

def import_raw_audio(filename,indir, sr=None,ret_timefreqrep=False ):
    """Load one audio file as a mono signal and summarise its amplitude.

    Parameters:
        filename        -- audio file name; its last four characters (the
                           extension) are stripped to build ANNOTATION_FILE
        indir           -- directory holding the file, concatenated as-is
        sr              -- sampling rate forwarded to librosa.load
        ret_timefreqrep -- not used by this function

    Returns (audio_df, signal, sr): a one-row DataFrame with the columns
    ANNOTATION_FILE, SAMPLING_RATE, DURATION (seconds), MEAN_SIG, MAX_SIG and
    MIN_SIG; the loaded signal; and the sampling rate reported by librosa.
    """
    signal, sr = librosa.load(indir+filename, sr=sr, mono=True)
    length_sec = signal.shape[0]/sr #in seconds

    name_cell = np.array([filename[:-4]]).reshape(1, -1)
    stat_cells = np.array([sr, length_sec, signal.mean(), signal.max(), signal.min()]).reshape(1,-1)
    # stacking the name with the numbers makes every cell a string; the numeric
    # columns are converted back right below
    audio_df = pd.DataFrame(data=np.hstack((name_cell, stat_cells)),
                            columns=['ANNOTATION_FILE', 'SAMPLING_RATE','DURATION', 'MEAN_SIG', 'MAX_SIG', 'MIN_SIG' ])

    # the sampling rate is stored as a float-formatted string, hence string -> float -> int
    audio_df['SAMPLING_RATE'] = audio_df['SAMPLING_RATE'].astype(float).astype(int)
    float_columns = ['DURATION', 'MEAN_SIG', 'MAX_SIG', 'MIN_SIG' ]
    audio_df = audio_df.astype({column: float for column in float_columns})

    return audio_df, signal, sr
    
def zero_padding(t, sr, target_duration):
    """Truncate or zero-pad an audio signal to a fixed duration.

    Having all audio files at the same duration gives spectrograms that all
    have the same size.

    Parameters:
        t               -- numpy array with the audio samples
        sr              -- sampling rate, in samples per second
        target_duration -- wanted duration in seconds; may be fractional, the
                           sample count is rounded to the nearest integer

    Returns an array with round(target_duration * sr) samples: ``t`` itself
    when it already has that length, its leading samples when it is longer,
    or ``t`` followed by zeros when it is shorter. The padding uses the dtype
    of ``t``, so padded and truncated signals come back with the same dtype.

    Raises ValueError for a negative target duration.
    """
    # an explicit integer is needed: a float length cannot be used as a slice
    # bound or as a repeat count
    target_len = int(round(target_duration * sr))
    if target_len < 0:
        raise ValueError("target_duration must not be negative, got {}".format(target_duration))

    n_samples = t.shape[0]
    if n_samples > target_len:
        return t[0:target_len]
    if n_samples < target_len:
        return np.append(t, np.zeros(target_len - n_samples, dtype=t.dtype))
    return t


def calc_spectral_features(t, n_fft = 512, win_length = None, win_overlap=0.0, rec_width=0):
    """Compute time-frequency representations of the audio signal ``t``.

    Parameters:
        t           -- audio samples
        n_fft       -- FFT size of the STFT
        win_length  -- analysis window length; defaults to n_fft
        win_overlap -- fraction of overlap between consecutive windows, in [0.0, 1.0)
        rec_width   -- ``width`` argument of librosa's recurrence_matrix

    Returns (stft_db, iirt_db, R_stft, R_iirt): the STFT and IIRT magnitudes
    converted with librosa.amplitude_to_db, and the affinity recurrence matrix
    of each of them.
    """
    win_length = n_fft if win_length is None else win_length

    assert 0 <= win_overlap < 1.0, "Invalid value of win_overlap {} - it must be in range [0.0, 1.0) ".format(win_overlap)
    # hop between consecutive windows, in samples
    hop_length = int(win_length*(1.0-win_overlap))

    # calculate spectrograms
    stft_magnitude = np.abs(librosa.stft(t, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
    t_stft_db = librosa.amplitude_to_db(stft_magnitude)
    # NOTE(review): no sampling rate is passed to iirt, so librosa falls back on its
    # own default - confirm it matches the rate the audio was loaded with
    iirt_magnitude = np.abs(librosa.iirt(t, hop_length=hop_length, win_length=win_length))
    t_iirt_db = librosa.amplitude_to_db(iirt_magnitude)
    #t_mfcc_db = librosa.feature.mfcc(t, n_mfcc=40)

    # recurrence (self-similarity) matrices of the two representations
    recurrence_options = dict(mode='affinity', self=False, width=rec_width)
    R_stft = librosa.segment.recurrence_matrix(t_stft_db, **recurrence_options)
    R_iirt = librosa.segment.recurrence_matrix(t_iirt_db, **recurrence_options)

    return t_stft_db,t_iirt_db,R_stft,R_iirt

def append_array(idx,old_a,new_a, axis=0):
    """Accumulate arrays one at a time.

    For the first item (idx == 0) ``new_a`` is returned unchanged and ``old_a``
    is ignored; for any other idx the result is ``np.append(old_a, new_a, axis=axis)``.
    """
    if idx == 0:
        return new_a
    return np.append(old_a, new_a, axis=axis)

    

In [None]:
# sorted names of all .wav recordings found in the input folder
wav_file_list = np.sort( [f for f in os.listdir(audio_indir) if f.endswith(".wav") ] )

# Set to True after the first run to load the pickled results instead of recomputing them,
# unless you want to change something basic in how the spectrograms are calculated
# (for example duration of padded audio, or window size of STFT)
read_from_file = False 
my_sampling_rate = int(4096*2) # the max frequency in the STFT will be half of this
                               # looking at the STFT spectrograms, there is very little above 4096 Hz
my_n_fft = 512 # FFT size of the STFT (it yields n_fft/2 + 1 frequency bins);
               # if my_window_size is None, this drives also the time-sampling window
my_window_size = None # should not be greater than n_fft
target_duration = 30 # seconds; obtained from an earlier dry run over all data and charting the distribution 
                     # of the duration of the raw sound samples; 30 sec corresponds to the 97th percentile and 
                     # represents a significant improvement in terms of computing time (x5 faster) 
                     # with respect to more conservative choices like 70 seconds (99th percentile)

# pickle files where the audio metadata and the time-frequency data are stored
audio_metadata_file = audio_outdir+"RESP_METADATA_ALL.pkl"
stft_data_file =  audio_outdir+"RESP_STFT.pkl"
iirt_data_file =  audio_outdir+"RESP_IIRT.pkl"
r_stft_data_file =  audio_outdir+"RESP_RECSTFT.pkl"
r_iirt_data_file =  audio_outdir+"RESP_RECIIRT.pkl"

In [None]:
def _load_pickle(path):
    """Return the object pickled at ``path``; the file handle is closed on exit."""
    # NOTE: only meant for the cache files written by this notebook - pickle must
    # never be used on files coming from an untrusted source
    with open(path, "rb") as fh:
        return pickle.load(fh)


def _save_audio_cache(metadata, containers):
    """Pickle the audio metadata frame and every (path, dict) pair of ``containers``."""
    pd.to_pickle(metadata, audio_metadata_file)
    for path, payload in containers:
        with open(path, "wb") as fh:
            pickle.dump(payload, fh)


if read_from_file:
    print("Loading audio metadata from {}".format(audio_metadata_file))
    try:
        audio_metadata = pd.read_pickle(audio_metadata_file)
    except FileNotFoundError:
        print("Could not find metadata file {}. Please rerun this cell after setting the variable read_from_file to False\n\n\n".format(audio_metadata_file))
        raise
    print("Loading time-frequency representations")
    try:
        # All four containers are loaded because the cells below read every one of
        # them (summary prints, train/test split). The recurrence matrices are large:
        # if memory is short, drop their use in the following cells before skipping
        # them here.
        stft_data = _load_pickle(stft_data_file)
        iirt_data = _load_pickle(iirt_data_file)
        r_stft_data = _load_pickle(r_stft_data_file)
        r_iirt_data = _load_pickle(r_iirt_data_file)
    except FileNotFoundError as e_fnf:
        print("Could not find spectrogram file {}. Please rerun this cell after setting the variable read_from_file to False\n\n\n".format(e_fnf.filename))
        raise

else:
    # one metadata row per file, concatenated once at the end
    # (DataFrame.append is not available in pandas 2.x)
    metadata_frames = []
    # choose to use a dict: in this way we are sure that we keep track of the image-label association,
    # without relying on the order of the files; keys are the file names without extension
    stft_data = OrderedDict((key[:-4], None) for key in wav_file_list)
    iirt_data = OrderedDict((key[:-4], None) for key in wav_file_list)
    r_stft_data = OrderedDict((key[:-4], None) for key in wav_file_list)
    r_iirt_data = OrderedDict((key[:-4], None) for key in wav_file_list)
    cache_containers = [(stft_data_file, stft_data), (iirt_data_file, iirt_data),
                        (r_stft_data_file, r_stft_data), (r_iirt_data_file, r_iirt_data)]
    t_start = timer()
    for idxf, f in enumerate(wav_file_list):
        # progress report, more frequent at the very beginning
        if idxf % 50 == 0 or idxf in (2, 10):
            print("File #{}: {} ({:.1f} seconds elapsed)".format(idxf, f, timer() - t_start))

        tmp_df, tmp_audio, sr = import_raw_audio(f, indir=audio_indir, sr=my_sampling_rate)
        tmp_audio = zero_padding(tmp_audio, sr=sr, target_duration=target_duration)
        metadata_frames.append(tmp_df)
        if idxf % 300 == 0:
            print("File #{}: starting STFT ({:.1f} seconds elapsed)".format(idxf, timer() - t_start))
        stft, iirt, r_stft, r_iirt = calc_spectral_features(tmp_audio, n_fft=my_n_fft, win_length=my_window_size,
                                                            win_overlap=0.5, rec_width=16)
        # store as float32 to halve the memory footprint
        file_key = f[:-4]
        stft_data[file_key] = stft.astype(np.float32)
        iirt_data[file_key] = iirt.astype(np.float32)
        r_stft_data[file_key] = r_stft.astype(np.float32)
        r_iirt_data[file_key] = r_iirt.astype(np.float32)

        if idxf > 0 and idxf % 400 == 0:
            print("File #{}: ended STFT ({:.1f} seconds elapsed)".format(idxf, timer() - t_start))
            # save metadata and spectrograms at intermediate steps, just as a safety:
            # in case of issues we will not lose it all
            print("Saving data at file #{}".format(idxf))
            _save_audio_cache(pd.concat(metadata_frames), cache_containers)

    ### end for loop over raw audio files
    # pd.concat refuses an empty list, so fall back on an empty frame when no wav file was found
    audio_metadata = pd.concat(metadata_frames) if metadata_frames else pd.DataFrame()
    print("\n{} files processed in {:.1f} seconds\n".format(len(wav_file_list), timer() - t_start))

    # save metadata and spectrograms
    print("Saving data")
    _save_audio_cache(audio_metadata, cache_containers)
### end else do not read_from_file
    


Print the shape of the containers (should match the number of wav files) and the shape of each 2D array with spectrogram.
Take note of the shape because we will need to know it when we create the CNN later

In [None]:
# number of entries in each container (one entry per wav file)
print("Size of STFT {}: {}".format(type(stft_data), len(stft_data) ))
print("Size of IIRT {}: {}".format(type(iirt_data), len(iirt_data) ))
print("Size of Recurrence STFT {}: {}".format(type(r_stft_data), len(r_stft_data) ))
print("Size of Recurrence IIRT {}: {}".format(type(r_iirt_data), len(r_iirt_data) ))

print("\n\n")
# type and shape of the first 2D array stored in each container
first_stft = next(iter(stft_data.values()))
first_iirt = next(iter(iirt_data.values()))
first_r_stft = next(iter(r_stft_data.values()))
first_r_iirt = next(iter(r_iirt_data.values()))
print("Shape of first STFT element {}: {}".format(type(first_stft), first_stft.shape ))
print("Shape of first IIRT element {}: {}".format(type(first_iirt), first_iirt.shape ))
print("Shape of first Recurrence STFT element {}: {}".format(type(first_r_stft), first_r_stft.shape ))
print("Shape of first Recurrence IIRT element {}: {}".format(type(first_r_iirt), first_r_iirt.shape ))



In [None]:
# STFT (in dB) of the first recording; rows are frequency bins, columns are time frames
x = stft_data[wav_file_list[0][:-4]]
print("Max of first 20 rows: {}".format(np.max(x[0:20]) ))
# x[-20:] really covers the last 20 rows (x[-20:-1] left out the very last one)
print("Max of last 20 rows: {}".format(np.max(x[-20:]) ))

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18,18)) # tight_layout=False,constrained_layout=True
fig.tight_layout()
img0 = librosa_display.specshow(x, y_axis='log', x_axis='time',
                               sr=my_sampling_rate, ax=ax)
ax.set_title('Log-Frequency power spectrogram', size=18)
# unit spelled "dB" (decibel)
fig.colorbar(img0, format="%+2.f dB")

### Merge all patient data and record data to a big dataframe

Perform feature extraction and data preparation

In [None]:
# model inputs and target column
x_features = [ 'CHEST_LOC', 'NORM_CRACKLE', 'NORM_WHEEZE',  'MEAN_SIG','MAX_SIG', 'MIN_SIG', 'AGE', 'SEX', 'BMI']
y_label = 'DIAGNOSIS'

# one row per recording: annotation summary + audio statistics + patient attributes
patient_columns = ['PATIENT_ID', 'AGE', 'SEX', 'BMI', 'DIAGNOSIS']
full_data = (record_data
             .merge(audio_metadata, on='ANNOTATION_FILE', how='inner')
             .merge(patient_data[patient_columns], on='PATIENT_ID', how='inner'))

# crackles / wheezes per second of recording
recording_length = full_data['DURATION']
full_data['NORM_CRACKLE'] = full_data['TOT_CRACKLE'] / recording_length
full_data['NORM_WHEEZE']  = full_data['TOT_WHEEZE']  / recording_length

# Asthma has only one entry, the train/test split procedure will fail. Aggregate it in the COPD class 
# (could also be removed but from quick search online asthma and COPD sound similar to my medically uneducated brain)
full_data['DIAGNOSIS'] = full_data['DIAGNOSIS'].replace({'Asthma': 'COPD'})
print(full_data.shape)
full_data.head()

In [None]:
data = full_data[['PATIENT_ID'] + x_features + [y_label]]
print(data.shape)

# One-hot encode categorical labels.
# pd.get_dummies yields one 0/1 column per category, in sorted category order, and
# keeps the row index of `data`, so the concat below stays aligned. It also avoids
# OneHotEncoder's `sparse` argument, which recent scikit-learn releases reject.
enc_inputs = ['CHEST_LOC', 'SEX']
enc1_df = pd.get_dummies(data['CHEST_LOC']).astype(int)
enc1_df.columns = [str(x).upper() for x in enc1_df.columns]
enc1_cat = list(enc1_df.columns)
enc2_df = pd.get_dummies(data['SEX']).astype(int)
enc2_df.columns = [str(x).upper() for x in enc2_df.columns]
enc2_cat = list(enc2_df.columns)

data = pd.concat([data, enc1_df, enc2_df], axis=1)
# the raw categorical columns are replaced by their one-hot versions
# (the previous drop() call discarded its result, so they were never removed)
data = data.drop(columns=enc_inputs)
x_features = [x for x in x_features if x not in enc_inputs] + enc1_cat + enc2_cat


# exclude some features that are either redundant or should not be predictive of the disease 
# (like the part of the lungs that has been recorded)
excluded_x_features = ['AL', 'AR', 'LL', 'LR', 'PL', 'PR', 'TC', 'M']
train_features = [xf for xf in x_features if xf not in excluded_x_features]

print("Training features: {}".format(train_features) )
data.head()

### Split train-test

Use argument 'stratify' to preserve small classes, due to imbalance in dataset

In [None]:

files = np.array(full_data["ANNOTATION_FILE"].values)
indices = np.array(data.index)
# NOTE(review): the split is made per recording and stratified on the diagnosis only,
# so recordings of the same patient can end up in both train and test - confirm this is intended
idx_train, idx_test = train_test_split(indices, test_size=0.3, random_state=612, stratify=data['DIAGNOSIS'])

# train_test_split shuffles the order; re-order train and test inputs as per original filename list
idx_train = np.sort(idx_train)
idx_test = np.sort(idx_test)
X_train = data.loc[idx_train, train_features]
y_train = data.loc[idx_train, y_label]
X_test = data.loc[idx_test, train_features]
y_test = data.loc[idx_test, y_label]
# the index labels are used as positions here, which relies on `data` having a default 0..n-1 index
files_train = files[idx_train]
files_test = files[idx_test]

# remember that stft_data and the other time-freq matrices are stored as dictionaries, keys are the file names
# (stripped of file extension). When we split the dataset, we have done it in a way to keep track of which file
# names go to the train set and which go to the test set; that extra complication pays back now
stft_train = np.array([stft_data[f] for f in files_train])
stft_test = np.array([stft_data[f] for f in files_test])
print(len(stft_train) )
print(len(stft_test))
iirt_train = np.array([iirt_data[f] for f in files_train])
iirt_test = np.array([iirt_data[f] for f in files_test])
del stft_data, iirt_data # free up some memory

# this is needed later for training the CNN, keras wants the labelling one-hot encoded rather than multiclass
classes = np.unique(y_train)
out_le = LabelEncoder()
out_le.fit(y_train)
print(out_le.classes_)
n_classes = len(out_le.classes_)
# the keyword asking for a dense output was renamed from `sparse` to `sparse_output`
# across scikit-learn releases; try the new name first, fall back on the old one
try:
    out_enc = OneHotEncoder(handle_unknown='error', sparse_output=False)
except TypeError:
    out_enc = OneHotEncoder(handle_unknown='error', sparse=False)

y_train_enc = out_le.transform(y_train)
y_train_enc = np.array(out_enc.fit_transform(y_train_enc.reshape(-1, 1)))
y_test_enc = np.array(out_enc.transform(out_le.transform(y_test).reshape(-1, 1)))

Some functions to help with evaluating models

In [None]:
def score_eval(ytrue, ypreds, model_name=""):
    """Print and return the standard evaluation metrics of a classifier.

    Parameters:
        ytrue      -- true class labels
        ypreds     -- predicted class labels
        model_name -- label prepended to the printed summary line

    Prints accuracy, macro-averaged precision and recall, and the
    scikit-learn classification report.

    Returns (accuracy, macro precision, macro recall, confusion matrix).
    """
    acc = accuracy_score(ytrue, ypreds)
    macro_precision = precision_score(ytrue, ypreds, average='macro')
    macro_recall = recall_score(ytrue, ypreds, average='macro')
    conf_mat = confusion_matrix(ytrue, ypreds)

    summary = "{mn} accuracy / precision / recall: {a:.3f} / {p:.3f} / {r:.3f}".format(a=acc, p=macro_precision, r=macro_recall, mn=model_name)
    print(summary)
    print("\n\n")
    print(classification_report(ytrue, ypreds) )
    return acc, macro_precision, macro_recall, conf_mat

### TEST #1: Simple take: do not use the audio trace, use just patient and record data. Feed train test to a logit 

In [None]:
### SETUP LOGISTIC REGRESSION (MULTICLASS)
from sklearn.linear_model import LogisticRegression
# multinomial logistic regression with L2 penalty; C is the inverse regularisation strength
logit_params = dict(multi_class='multinomial', penalty='l2', C=0.20, solver='newton-cg', random_state=991)

# fit on the tabular features (patient data + annotations), predict the diagnosis of the test recordings
logit_class = LogisticRegression(**logit_params)
logit_model = logit_class.fit(X_train, y_train)
logit_test  = logit_model.predict(X_test )

# print the evaluation report and keep the metrics for later comparison
logit_acc, logit_precision, logit_recall, logit_cm = score_eval(y_test, logit_test, "Logit_multi")

### TEST #2: do not use the audio trace, use just patient and record data. Feed train test to a BDT 

In [None]:
#
# define XGBoost classification model
xgb_params = {'max_depth': 5,
              'n_estimators': 100,
              'learning_rate': 0.2,   # learning rate; smaller eta make convergence more accurate but slower
              'reg_lambda': 0.0,   # disable L2 reg only if features are all reasonably independent
              'reg_alpha': 0.0,    #  L1 reg, trying to prune unnecessary features
              'objective': 'multi:softmax',
              'random_state': 9443,
              }

# The evaluation metric is given to the constructor: recent xgboost releases no
# longer accept it as an argument of fit().
xgb_class = xgb.XGBClassifier(eval_metric='mlogloss', **xgb_params)

# Recent xgboost releases require the class labels to be the integers 0..n_classes-1,
# so the diagnosis strings are encoded with the LabelEncoder fitted on y_train (out_le).
y_train_xgb = out_le.transform(y_train)
y_test_xgb = out_le.transform(y_test)

xgb_model = xgb_class.fit(X_train, y_train_xgb,
                          eval_set=[(X_train, y_train_xgb), (X_test, y_test_xgb)])


# run scoring; predictions are mapped back to the diagnosis names so that they can be
# compared with y_test like the predictions of the other models
xgb_test = out_le.inverse_transform(np.asarray(xgb_model.predict(X_test)).astype(int))

In [None]:
# print the evaluation report of the XGBoost predictions and keep the metrics for later comparison
xgb_acc, xgb_precision, xgb_recall , xgb_cm = score_eval(y_test, xgb_test, "XGBoost")

print("\n\nConfusion matrix:")
print(xgb_cm)

plot global feature importance as calculated by xgboost

In [None]:
# horizontal bar chart with one bar per training feature
xgb_importance = xgb_class.feature_importances_
plt.barh(train_features, xgb_importance)

## TEST #3: Conv2D on spectrogram images

Run a Convolutional NN on the STFT files; use the same train/test split defined above

In [None]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Concatenate, Conv2D, Input, MaxPooling2D, Activation, BatchNormalization, Dropout, GlobalAveragePooling2D, GlobalMaxPooling2D
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.activations import relu
from tensorflow.keras.initializers import GlorotNormal
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import backend as K
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
from tensorflow.config import list_physical_devices as tf_config_list_physical_devices
tf_config_list_physical_devices() 

### CNN for the STFT data

Use only the STFT. Cascade three Conv2D stages, each followed by dropout (to regularise) and max pooling. After the third stage, apply global average pooling and feed the result into a classification head made of two dense hidden layers, each followed by a dropout layer for further regularisation. Finally, a dense layer with a softmax activation produces the score for each of the seven classes of respiratory conditions. 

**Note: this specific solution was chosen after few trials of different architectures.**

In [None]:
def _stft_conv_stage(tensor, stage, filters, size, padding, seed):
    """Append one Conv2D -> ReLU -> Dropout(0.25) -> MaxPooling2D stage to ``tensor``.

    stage   -- stage number, used as two-digit suffix of the layer names
    filters -- number of convolution filters
    size    -- side of the square convolution kernel; the pooling window has the same size
    padding -- padding mode of the convolution ("same" or "valid")
    seed    -- seed of the Glorot-normal kernel initialiser
    """
    tag = "{:02d}".format(stage)
    tensor = Conv2D(filters=filters, kernel_size=(size, size),
                    padding=padding,
                    activation=None,
                    kernel_initializer=GlorotNormal(seed=seed),
                    name="stft_conv2d_" + tag)(tensor)
    tensor = Activation(relu, name="stft_relu_" + tag)(tensor)
    tensor = Dropout(rate=0.25, name="stft_convdropout_" + tag)(tensor)
    return MaxPooling2D(pool_size=(size, size), padding='same', name="stft_maxpool_" + tag)(tensor)


# Shape of one spectrogram plus a trailing axis of size 1 (single channel). It is read
# from the training data, so it follows any change in the STFT settings; with the
# settings used above it is (257, 961, 1).
stft_input_shape = tuple(stft_train.shape[1:]) + (1,)
_stft_input = Input(shape=stft_input_shape, name="stft_input")

# Three convolutional stages
_stft = _stft_conv_stage(_stft_input, stage=1, filters=32, size=7, padding="same", seed=41)
_stft = _stft_conv_stage(_stft, stage=2, filters=64, size=5, padding="valid", seed=42)
_stft = _stft_conv_stage(_stft, stage=3, filters=128, size=3, padding="valid", seed=42)

_stft = GlobalAveragePooling2D()(_stft)

# Classification layer
_stftclass = Dense(96, activation="relu", name="stft_hiddendense_01")(_stft)
_stftclass = Dropout(rate=0.25, name="stft_hiddendroput_01")(_stftclass)
_stftclass = Dense(48, activation="relu", name="stft_hiddendense_02")(_stftclass)
_stftclass = Dropout(rate=0.25, name="stft_hiddendroput_02")(_stftclass)
# one output per diagnosis class; n_classes comes from the label encoder fitted on y_train,
# so the output always matches the width of the one-hot encoded labels
_stftclass = Dense(n_classes, activation="softmax", name="stft_outputdense")(_stftclass)

stft_model = Model(_stft_input, _stftclass, name="stft_model")
stft_model.summary()

In [None]:
# categorical cross-entropy matches the one-hot encoded labels (y_train_enc / y_test_enc)
stft_model.compile(loss='categorical_crossentropy', 
                   optimizer='nadam', 
                   metrics=['accuracy'])
#keras.metrics.CategoricalCrossentropy()
#K.set_value(stft_model.optimizer.learning_rate, 0.001)


In [None]:
# Callbacks handed to fit():
# * EarlyStopping halts training once val_loss has failed to improve for
#   5 consecutive epochs. restore_best_weights=True rolls the model back to
#   the epoch with the lowest val_loss when that happens; without it the
#   model keeps the weights of the last epoch run, i.e. `patience` epochs
#   past its best validation loss, and those are the weights that get
#   evaluated afterwards.
# * ReduceLROnPlateau multiplies the learning rate by 0.1 after 3 epochs
#   without val_loss improvement, never going below 1e-4.
my_callbacks = [
    EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=5,
        restore_best_weights=True,
    ),
    ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.1,
        patience=3,
        min_lr=0.0001,
        mode="min",
    ),
]

In [None]:
# Train the STFT-only CNN for at most 20 epochs; the callbacks may stop it
# earlier or lower the learning rate. Inputs are passed as a dict keyed by
# input-layer name. verbose=2 prints one summary line per epoch.
# NOTE(review): the test split is also used as validation data here, so
# early stopping and LR scheduling are driven by the same samples that are
# scored afterwards -- consider a separate validation split.
history = stft_model.fit(
    x={"stft_input": stft_train},
    y=y_train_enc,
    validation_data=({"stft_input": stft_test}, y_test_enc),
    epochs=20,
    callbacks=my_callbacks,
    verbose=2,
)

In [None]:
# Learning curves: one line per quantity recorded by fit(), indexed by epoch.
history_df = pd.DataFrame(history.history)
history_df.plot()

history_ax = plt.gca()
history_ax.set_xlabel("Epoch")
history_ax.set_xlim(-0.5, 20.5)  # wide enough for the 20 epochs requested
history_ax.set_ylim(-0.1, 1.1)   # fixed window; anything above 1.1 is cut off
history_ax.grid(True)
plt.show()

# Loss and accuracy on the test split (shown as the cell's output value).
stft_model.evaluate({"stft_input": stft_test}, y_test_enc)

Compute predictions. For each sample, select the class with the highest score as the predicted class. Then convert the class index back to the original labelling and use it to evaluate the model as in the previous tests.

In [None]:
# Softmax scores from the CNN for every test sample.
cnnstft01_test_probs = stft_model.predict(stft_test)

# Take the highest-scoring class index per sample and map it back to the
# original label via the label encoder.
cnnstft01_test = out_le.inverse_transform(
    np.argmax(cnnstft01_test_probs, axis=1)
)

# Same evaluation routine as the previous tests; it returns accuracy,
# precision, recall and the confusion matrix.
(
    cnnstft01_acc,
    cnnstft01_precision,
    cnnstft01_recall,
    cnnstft01_cm,
) = score_eval(y_test, cnnstft01_test, "CNN-STFT")

print("\n\nConfusion matrix:")
print(cnnstft01_cm)

## TEST #4: Conv2D combined with patient data and annotations

Combine the CNN developed above for the STFT input with the patient metadata and annotation features used in the earlier tests.

In [None]:
# Shape of a single STFT "image" as printed when the STFT data were loaded.
# The trailing 1 is the channel axis (single-channel input); the batch
# dimension is not part of this shape.
stft_input_shape = (257, 961, 1)
_stft_input = Input(shape=stft_input_shape, name="stft_input")

# First conv block: 32 filters, 7x7 kernel, "same" padding keeps the size.
_stft = Conv2D(
    filters=32,
    kernel_size=(7, 7),
    padding="same",
    activation=None,  # the ReLU is applied as its own named layer below
    kernel_initializer=GlorotNormal(seed=41),
    name="stft_conv2d_01",
)(_stft_input)
_stft = Activation(relu, name="stft_relu_01")(_stft)
_stft = Dropout(rate=0.25, name="stft_convdropout_01")(_stft)
_stft = MaxPooling2D(
    pool_size=(7, 7), padding="same", name="stft_maxpool_01"
)(_stft)

# Second conv block: 64 filters, 5x5 kernel, no padding.
_stft = Conv2D(
    filters=64,
    kernel_size=(5, 5),
    padding="valid",
    activation=None,
    kernel_initializer=GlorotNormal(seed=42),
    name="stft_conv2d_02",
)(_stft)
_stft = Activation(relu, name="stft_relu_02")(_stft)
_stft = Dropout(rate=0.25, name="stft_convdropout_02")(_stft)
_stft = MaxPooling2D(
    pool_size=(5, 5), padding="same", name="stft_maxpool_02"
)(_stft)

# Third conv block: 128 filters, 3x3 kernel, no padding. This is the only
# block with batch normalisation (applied after the ReLU).
_stft = Conv2D(
    filters=128,
    kernel_size=(3, 3),
    padding="valid",
    activation=None,
    kernel_initializer=GlorotNormal(seed=42),
    name="stft_conv2d_03",
)(_stft)
_stft = Activation(relu, name="stft_relu_03")(_stft)
_stft = BatchNormalization(name="stft_batchnorm_02")(_stft)
_stft = Dropout(rate=0.25, name="stft_convdropout_03")(_stft)
_stft = MaxPooling2D(
    pool_size=(3, 3), padding="same", name="stft_maxpool_03"
)(_stft)

# Average every feature map down to a single value, giving one flat vector
# per sample (one entry per filter of the last conv layer).
_stft = GlobalAveragePooling2D()(_stft)

# Feature-extractor branch only: no classification head is attached here.
stft_model = Model(inputs=_stft_input, outputs=_stft, name="stft_model")



In [None]:
# Dense branch for the tabular features: 8 values per sample.
# NOTE(review): presumably 8 is the number of columns in X_train / X_test,
# which are fed to "feat_input" at fit time -- confirm they stay in sync.
feat_input_shape = (8,)
_feat_input = Input(shape=feat_input_shape, name="feat_input")

# Two 48-unit ReLU layers with dropout in between.
_feat = Dense(units=48, activation="relu", name="feat_hiddendense_01")(_feat_input)
_feat = Dropout(rate=0.25, name="feat_hiddendroput_01")(_feat)
_feat = Dense(units=48, activation="relu", name="feat_hiddendense_02")(_feat)

# Feature-extractor branch only: no classification head is attached here.
feat_model = Model(inputs=_feat_input, outputs=_feat, name="feat_model")



In [None]:
# Sanity check: show the output shapes of the two branches before joining.
print(_stft.shape)
print(_feat.shape)

# Join the CNN embedding and the tabular embedding into one feature vector.
concat = Concatenate()([_stft, _feat])
print(concat.shape)

# Shared classification head: two dropout-regularised hidden layers followed
# by a 7-way softmax (one probability per class).
_out_class = Dense(units=96, activation="relu", name="out_hiddendense_01")(concat)
_out_class = Dropout(rate=0.25, name="out_hiddendroput_01")(_out_class)
_out_class = Dense(units=48, activation="relu", name="out_hiddendense_02")(_out_class)
_out_class = Dropout(rate=0.25, name="out_hiddendroput_02")(_out_class)
_out_class = Dense(units=7, activation="softmax", name="out_outputdense")(_out_class)

# Two-input model; the input order here is [STFT image, tabular features].
combined_model = Model(
    inputs=[_stft_input, _feat_input],
    outputs=_out_class,
    name="Combined_CNN_Metadata",
)
combined_model.summary()

In [None]:
# Multi-class training setup: categorical cross-entropy (expects one-hot
# encoded targets), the Nadam optimizer with its default settings, and
# accuracy reported alongside the loss.
combined_model.compile(
    loss="categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"],
)
# Experiments left disabled:
#   keras.metrics.CategoricalCrossentropy()
#   K.set_value(combined_model.optimizer.learning_rate, 0.001)

# Callbacks handed to fit():
# * EarlyStopping halts training once val_loss has failed to improve for
#   8 consecutive epochs. restore_best_weights=True rolls the model back to
#   the epoch with the lowest val_loss when that happens; without it the
#   model keeps the weights of the last epoch run, i.e. `patience` epochs
#   past its best validation loss, and those are the weights that get
#   evaluated afterwards.
# * ReduceLROnPlateau multiplies the learning rate by 0.1 after 3 epochs
#   without val_loss improvement, never going below 1e-4.
my_callbacks = [
    EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=8,
        restore_best_weights=True,
    ),
    ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.1,
        patience=3,
        min_lr=0.0001,
        mode="min",
    ),
]



In [None]:
# Train the two-input model for at most 30 epochs; the callbacks may stop it
# earlier or lower the learning rate. Each array is routed to its branch by
# input-layer name. verbose=2 prints one summary line per epoch.
# NOTE(review): the test split is also used as validation data here, so
# early stopping and LR scheduling are driven by the same samples that are
# scored afterwards -- consider a separate validation split.
combined_model_history = combined_model.fit(
    x={"stft_input": stft_train, "feat_input": X_train},
    y=y_train_enc,
    validation_data=(
        {"stft_input": stft_test, "feat_input": X_test},
        y_test_enc,
    ),
    epochs=30,
    callbacks=my_callbacks,
    verbose=2,
)

In [None]:
# Learning curves: one line per quantity recorded by fit(), indexed by epoch.
history_df = pd.DataFrame(combined_model_history.history)
history_df.plot()

history_ax = plt.gca()
history_ax.set_xlabel("Epoch")
history_ax.set_xlim(-0.5, 30.5)  # wide enough for the 30 epochs requested
history_ax.set_ylim(-0.1, 1.1)   # fixed window; anything above 1.1 is cut off
history_ax.grid(True)
plt.show()

# Direct evaluation on the test split, currently disabled:
#combined_model.evaluate({"stft_input":stft_test,"feat_input":X_test},y_test_enc)

In [None]:
# Softmax scores from the combined model; the list order matches the model's
# inputs ([STFT image, tabular features]).
combined_test_probs = combined_model.predict([stft_test, X_test])

# Take the highest-scoring class index per sample and map it back to the
# original label via the label encoder.
combined_test = out_le.inverse_transform(
    np.argmax(combined_test_probs, axis=1)
)

# Same evaluation routine as the previous tests; it returns accuracy,
# precision, recall and the confusion matrix.
(
    combined_acc,
    combined_precision,
    combined_recall,
    combined_cm,
) = score_eval(y_test, combined_test, "CNN-COMBINED")

print("\n\nConfusion matrix:")
print(combined_cm)