# Notebook de recherche pour le preprocessing des données

In [1]:
import pandas as pd
import numpy as np
import librosa

In [2]:
# Load the dataset index from its CSV file and preview the first rows
df = pd.read_csv('Path_DF.csv')
df.head()

Unnamed: 0,Path,Machine_Type,Dataset,Status,Machine_ID,Sample_ID,nb_echantillon,freq_echantillonnage
0,Data\fan\test\anomaly_id_00_00000000.wav,fan,test,anomaly,0,0,160000,16000
1,Data\fan\test\anomaly_id_00_00000001.wav,fan,test,anomaly,0,1,160000,16000
2,Data\fan\test\anomaly_id_00_00000002.wav,fan,test,anomaly,0,2,160000,16000
3,Data\fan\test\anomaly_id_00_00000003.wav,fan,test,anomaly,0,3,160000,16000
4,Data\fan\test\anomaly_id_00_00000004.wav,fan,test,anomaly,0,4,160000,16000


In [3]:
# Choice of the machine type, among the values present in the index
Machine_Types=df.Machine_Type.unique()
question = "Choisisir le type de machine parmi :\n"+str(Machine_Types)
# Surrounding whitespace is dropped so that " slider" still matches "slider"
machine_type = input(question).strip()
# Fail right away on an unknown value: it would otherwise select zero rows and
# only surface later as an out-of-bounds error when the first file is read.
if machine_type not in set(Machine_Types):
    raise ValueError(
        f"Unknown machine type {machine_type!r}; expected one of {list(Machine_Types)}"
    )

Choisisir le type de machine parmi :
['fan' 'pump' 'slider' 'ToyCar' 'ToyConveyor' 'valve'] slider


In [4]:
# Choice of the machine ID, among the IDs that exist for the selected machine type
machine_IDs = df[(df.Machine_Type == machine_type)].Machine_ID.unique()
question2 = "Choisisir l'ID de la machine :"+str(machine_IDs)
# int() raises ValueError by itself when the answer is not an integer
machine_ID = int(input(question2))
# Reject IDs that do not exist for this machine type instead of silently
# building an empty working frame in the next cell.
if machine_ID not in set(machine_IDs):
    raise ValueError(
        f"Unknown machine ID {machine_ID!r}; expected one of {list(machine_IDs)}"
    )

Choisisir l'ID de la machine :[0 2 4 6] 6


In [5]:
# Restrict the index to the rows of the selected machine type and machine ID.
# .copy() gives an independent frame, so the column assignment below never
# writes into a slice of df.
df_work = df[(df.Machine_Type == machine_type) & (df.Machine_ID == machine_ID)].copy()

# Encode the target column "Status" (normal -> 0, anomaly -> 1).
# Only that column is touched: a frame-wide replace would also rewrite any other
# column holding the same strings and depends on implicit dtype downcasting.
status_codes = {'normal': 0, 'anomaly': 1}
unexpected = set(df_work['Status'].unique()) - set(status_codes)
if unexpected:
    raise ValueError(f"Unexpected Status values: {unexpected}")
df_work['Status'] = df_work['Status'].map(status_codes).astype('int64')
df_work.shape

(1246, 8)

In [6]:
# Helper that reads one wav file from disk
def load_audio(audio_path):
    """Read an audio file through ``librosa.load`` without forcing a sampling rate.

    Parameters
    ----------
    audio_path :
        Path of the audio file, forwarded unchanged to ``librosa.load``.

    Returns
    -------
    tuple
        The ``(signal, sampling_rate)`` pair produced by ``librosa.load`` when
        called with ``sr=None``.
    """
    signal, sampling_rate = librosa.load(audio_path, sr=None)
    return signal, sampling_rate

# Read the sampling rate (second element returned by load_audio) from the first
# file of the selection. The path is looked up in the 'Path' column by name, so
# the cell does not depend on the column order of the CSV.
freq_echan = load_audio(df_work['Path'].iloc[0])[1]
freq_echan

16000

In [7]:
# Function building a DataFrame that holds the raw audio samples
def audio_raw(df):
    """Load the waveform of every file listed in ``df`` into one wide table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Path', 'Status' and 'Dataset'.

    Returns
    -------
    pandas.DataFrame
        'Status' and 'Dataset' followed by one column per audio sample,
        indexed like ``df``.
    """
    # First element returned by load_audio is the signal; the rate is ignored here
    signals = [load_audio(path)[0] for path in df['Path']]
    samples = pd.DataFrame(signals, index=df.index)
    labels = df[['Status', 'Dataset']]
    return pd.concat([labels, samples], axis=1)

In [8]:
# Build the raw-waveform table for the selected rows (reads every file listed in df_work)
data_raw = audio_raw(df_work)

# Preview: labels first, then one column per audio sample
data_raw.head()

Unnamed: 0,Status,Dataset,0,1,2,3,4,5,6,7,...,159990,159991,159992,159993,159994,159995,159996,159997,159998,159999
10556,1,test,0.002441,-0.001801,-0.002838,-0.000977,-0.002686,0.003326,0.005096,-0.00235,...,0.007111,0.005524,0.007111,0.011719,0.005554,-0.004669,-0.004608,-0.00592,-0.008881,-0.009033
10557,1,test,0.008789,0.006104,0.013184,0.005341,0.015839,0.014832,0.010773,0.018341,...,-0.011932,-0.01004,-0.008636,-0.00943,-0.007629,-0.007172,-0.005249,-0.003448,-0.004547,-0.002533
10558,1,test,-0.012451,-0.009705,-0.009369,0.001892,0.004089,0.000702,-0.017029,-0.012238,...,-0.011505,-0.006866,-0.001709,-0.007538,-0.006378,0.000977,0.00058,-0.00415,-0.00769,-0.009521
10559,1,test,0.008972,0.007141,0.000671,-0.004059,0.008698,0.000549,0.000214,-0.010895,...,0.008728,0.004974,0.007538,0.013214,0.014709,0.016266,0.01712,0.013763,0.005554,0.00473
10560,1,test,0.016724,0.021179,0.020264,0.018066,0.021027,0.020294,0.020325,0.016571,...,0.004608,0.008087,0.011017,0.000153,-0.000885,0.00592,0.008789,0.012604,0.009583,0.009644


In [9]:
# Split the raw samples and the labels according to the 'Dataset' column.
# The label columns are removed with the `columns=` keyword: passing the axis
# positionally (drop(labels, 1)) is rejected by recent pandas versions.
X_train = data_raw[data_raw.Dataset == 'train'].drop(columns=['Status', 'Dataset']).to_numpy()
X_test = data_raw[data_raw.Dataset == 'test'].drop(columns=['Status', 'Dataset']).to_numpy()
y_train = data_raw[data_raw.Dataset == 'train']['Status'].to_numpy()
y_test = data_raw[data_raw.Dataset == 'test']['Status'].to_numpy()

# Check the dimensions
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((868, 160000), (378, 160000), (868,), (378,))

## Obtention des features

On utilise la librairie *python_speech_features* pour extraire les Mel Frequency Cepstral Coefficients (MFCC), les Log Filterbank Energies et les Spectral Subband Centroids (SSC).

In [10]:
from python_speech_features import mfcc, logfbank, ssc

In [11]:
# Candidate window functions, to be picked by position for the `winfunc`
# argument of the MFCC and SSC extractions.
# NOTE(review): np.kaiser takes a mandatory `beta` argument in addition to the
# window length; presumably it cannot be used as-is if the extractor calls
# winfunc with the frame length only — confirm before selecting it.
winfuncs = [
    np.bartlett,
    np.blackman,
    np.hamming,
    np.hanning,
    np.kaiser,
]

### Preprocessing pour obtenir les MFCC

In [12]:
def load_mfcc(audio_path):
    """Compute the MFCC features of one audio file and return them flattened.

    The file is read with ``load_audio``; the signal and the sampling rate it
    returns are handed to ``mfcc`` together with fixed extraction settings.

    Parameters
    ----------
    audio_path :
        Path of the audio file, forwarded unchanged to ``load_audio``.

    Returns
    -------
    The array returned by ``mfcc``, reshaped to a single dimension.
    """
    signal, sample_rate = load_audio(audio_path)
    # Fixed extraction settings; the window function is entry 2 of `winfuncs`
    settings = dict(
        winlen=0.025,
        winstep=0.01,
        numcep=13,
        nfilt=26,
        nfft=512,
        preemph=0.97,
        ceplifter=22,
        appendEnergy=True,
        winfunc=winfuncs[2],
    )
    coefficients = mfcc(signal, sample_rate, **settings)
    # Flatten so that one file fits in one DataFrame row
    return coefficients.reshape(-1)


def audio_mfcc(df):
    """Compute the flattened MFCC features of every file listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Path', 'Status' and 'Dataset'.

    Returns
    -------
    pandas.DataFrame
        'Status' and 'Dataset' followed by one column per flattened feature
        value, indexed like ``df``.
    """
    flat_features = [load_mfcc(path) for path in df['Path']]
    features = pd.DataFrame(flat_features, index=df.index)
    labels = df[['Status', 'Dataset']]
    return pd.concat([labels, features], axis=1)

In [13]:
# Shape of the un-flattened MFCC array, measured on the first file of df_work.
# The settings must stay identical to those of load_mfcc, because this shape is
# used later to reshape the flattened features back.
# The path is read from the 'Path' column by name rather than by position, so
# the cell does not depend on the column order of the CSV.
audio, freq = load_audio(df_work['Path'].iloc[0])
dimensions_mfcc = mfcc(audio,
                 freq,
                 winlen=0.025,
                 winstep=0.01,
                 numcep=13,
                 nfilt=26,
                 nfft=512,
                 preemph=0.97,
                 ceplifter=22,
                 appendEnergy=True,
                 winfunc=winfuncs[2]).shape
dimensions_mfcc

(999, 13)

In [14]:
# Build the flattened-MFCC table for the selected rows (reads every file listed in df_work)
data_mfcc = audio_mfcc(df_work)

# Preview: labels first, then one column per flattened MFCC value
data_mfcc.head()

Unnamed: 0,Status,Dataset,0,1,2,3,4,5,6,7,...,12977,12978,12979,12980,12981,12982,12983,12984,12985,12986
10556,1,test,-6.892782,-10.447458,-6.373519,7.061976,-6.092884,-2.379424,-15.685438,-3.936012,...,0.345188,-6.487016,5.616015,-6.82439,-7.631253,-16.130216,-7.157194,-1.94656,3.135495,0.703283
10557,1,test,-5.98865,-10.560449,5.075734,-9.560812,6.625802,-10.212314,8.906673,7.953229,...,-17.8079,-4.501981,-22.053058,-3.660524,-11.815765,-26.36025,-0.191433,-17.587042,34.371157,-17.191189
10558,1,test,-5.325704,-17.615533,-11.337076,-6.092055,-3.396523,-6.781889,-2.953769,-1.411025,...,-1.489798,-9.473008,-2.365234,-8.762227,-5.017166,-10.939408,-2.328268,-1.229858,-6.203556,-10.850072
10559,1,test,-5.840031,-18.057632,-6.491007,-7.091389,-5.539886,-12.068388,2.457035,15.091515,...,-4.652506,0.097891,-4.946733,1.436967,-6.136934,-17.893105,-13.566289,2.362489,4.898441,8.980998
10560,1,test,-6.620098,-8.506822,-6.283439,2.526479,2.760103,2.420846,-5.319186,-15.129823,...,-3.071477,-0.79344,13.087013,4.403166,-3.983191,-11.145248,-10.706247,-14.095883,1.635834,-18.717234


In [15]:
# Split by 'Dataset', remove the label columns and restore the per-file MFCC
# shape measured in dimensions_mfcc.
# `columns=` is used because recent pandas versions reject a positional axis
# in drop(labels, 1).
X_mfcc_train = (
    data_mfcc[data_mfcc.Dataset == 'train']
    .drop(columns=['Status', 'Dataset'])
    .to_numpy()
    .reshape(-1, dimensions_mfcc[0], dimensions_mfcc[1])
)
X_mfcc_test = (
    data_mfcc[data_mfcc.Dataset == 'test']
    .drop(columns=['Status', 'Dataset'])
    .to_numpy()
    .reshape(-1, dimensions_mfcc[0], dimensions_mfcc[1])
)

# Check the dimensions
X_mfcc_train.shape,X_mfcc_test.shape

((868, 999, 13), (378, 999, 13))

### Preprocessing pour obtenir les Log Filterbank Energies

In [16]:
def load_fbank(audio_path):
    """Compute the log filterbank energies of one audio file, flattened.

    The file is read with ``load_audio``; the signal and the sampling rate it
    returns are handed to ``logfbank`` together with fixed extraction settings.

    Parameters
    ----------
    audio_path :
        Path of the audio file, forwarded unchanged to ``load_audio``.

    Returns
    -------
    The array returned by ``logfbank``, reshaped to a single dimension.
    """
    signal, sample_rate = load_audio(audio_path)
    settings = dict(
        winlen=0.025,
        winstep=0.01,
        nfilt=26,
        nfft=512,
        preemph=0.97,
    )
    energies = logfbank(signal, sample_rate, **settings)
    # Flatten so that one file fits in one DataFrame row
    return energies.reshape(-1)


def audio_fbank(df):
    """Compute the flattened log filterbank energies of every file listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Path', 'Status' and 'Dataset'.

    Returns
    -------
    pandas.DataFrame
        'Status' and 'Dataset' followed by one column per flattened feature
        value, indexed like ``df``.
    """
    flat_features = [load_fbank(path) for path in df['Path']]
    features = pd.DataFrame(flat_features, index=df.index)
    labels = df[['Status', 'Dataset']]
    return pd.concat([labels, features], axis=1)

In [17]:
# Shape of the un-flattened log filterbank energies array, measured on the
# first file of df_work. The settings must stay identical to those of
# load_fbank, because this shape is used later to reshape the flattened
# features back.
# The path is read from the 'Path' column by name rather than by position, so
# the cell does not depend on the column order of the CSV.
audio, freq = load_audio(df_work['Path'].iloc[0])
dimensions_fbank = logfbank(audio,
                          freq,
                          winlen=0.025,
                          winstep=0.01,
                          nfilt=26,
                          nfft=512,
                          preemph=0.97).shape
dimensions_fbank

(999, 26)

In [18]:
# Build the flattened log-filterbank table for the selected rows (reads every file listed in df_work)
data_fbank = audio_fbank(df_work)

# Preview: labels first, then one column per flattened feature value
data_fbank.head()

Unnamed: 0,Status,Dataset,0,1,2,3,4,5,6,7,...,25964,25965,25966,25967,25968,25969,25970,25971,25972,25973
10556,1,test,-11.359203,-10.308199,-10.464999,-10.088588,-10.386878,-9.553594,-9.775522,-10.992853,...,-9.229557,-9.193597,-8.69984,-8.801199,-8.882726,-8.765566,-8.975627,-9.262138,-9.804495,-9.982168
10557,1,test,-10.757312,-11.705314,-11.325358,-11.286967,-10.37781,-11.276854,-10.74955,-10.492274,...,-11.378645,-11.336512,-10.91848,-11.271658,-11.412401,-11.577599,-10.966083,-6.699265,-7.044201,-10.791879
10558,1,test,-12.557997,-11.738645,-11.722371,-10.509357,-10.700518,-9.920965,-10.017091,-9.280278,...,-9.061353,-8.723323,-8.944794,-8.731433,-9.035763,-8.551071,-9.197085,-9.297146,-9.579584,-9.759756
10559,1,test,-13.175701,-11.638316,-11.935652,-12.009189,-9.436666,-8.795837,-11.268739,-10.073571,...,-8.247985,-8.485483,-8.938056,-8.862233,-7.787864,-7.711837,-8.136772,-7.683774,-7.671882,-7.59233
10560,1,test,-11.405228,-11.12158,-10.165499,-9.673208,-9.311401,-10.787918,-10.689945,-10.073581,...,-9.72847,-8.916641,-8.717782,-8.572509,-9.304456,-9.42443,-8.89998,-9.176286,-9.485502,-9.71847


In [19]:
# Split by 'Dataset', remove the label columns and restore the per-file shape
# measured in dimensions_fbank.
# `columns=` is used because recent pandas versions reject a positional axis
# in drop(labels, 1).
X_fbank_train = (
    data_fbank[data_fbank.Dataset == 'train']
    .drop(columns=['Status', 'Dataset'])
    .to_numpy()
    .reshape(-1, dimensions_fbank[0], dimensions_fbank[1])
)
X_fbank_test = (
    data_fbank[data_fbank.Dataset == 'test']
    .drop(columns=['Status', 'Dataset'])
    .to_numpy()
    .reshape(-1, dimensions_fbank[0], dimensions_fbank[1])
)

# Check the dimensions
X_fbank_train.shape,X_fbank_test.shape

((868, 999, 26), (378, 999, 26))

### Preprocessing pour obtenir les Spectral Subband Centroids

In [20]:
def load_ssc(audio_path):
    """Compute the spectral subband centroids of one audio file, flattened.

    The file is read with ``load_audio``; the signal and the sampling rate it
    returns are handed to ``ssc`` together with fixed extraction settings.

    Parameters
    ----------
    audio_path :
        Path of the audio file, forwarded unchanged to ``load_audio``.

    Returns
    -------
    The array returned by ``ssc``, reshaped to a single dimension.
    """
    signal, sample_rate = load_audio(audio_path)
    # Fixed extraction settings; the window function is entry 2 of `winfuncs`
    settings = dict(
        winlen=0.025,
        winstep=0.01,
        nfilt=26,
        nfft=512,
        lowfreq=0,
        highfreq=None,
        preemph=0.97,
        winfunc=winfuncs[2],
    )
    centroids = ssc(signal, sample_rate, **settings)
    # Flatten so that one file fits in one DataFrame row
    return centroids.reshape(-1)


def audio_ssc(df):
    """Compute the flattened spectral subband centroids of every file listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Path', 'Status' and 'Dataset'.

    Returns
    -------
    pandas.DataFrame
        'Status' and 'Dataset' followed by one column per flattened feature
        value, indexed like ``df``.
    """
    flat_features = [load_ssc(path) for path in df['Path']]
    features = pd.DataFrame(flat_features, index=df.index)
    labels = df[['Status', 'Dataset']]
    return pd.concat([labels, features], axis=1)

In [21]:
# Shape of the un-flattened spectral subband centroids array, measured on the
# first file of df_work. The settings must stay identical to those of load_ssc,
# because this shape is used later to reshape the flattened features back.
# The path is read from the 'Path' column by name rather than by position, so
# the cell does not depend on the column order of the CSV.
audio, freq = load_audio(df_work['Path'].iloc[0])
dimensions_ssc = ssc(audio,
               freq,
               winlen=0.025,
               winstep=0.01,
               nfilt=26,
               nfft=512,
               lowfreq=0,
               highfreq=None,
               preemph=0.97,
               winfunc=winfuncs[2]).shape
dimensions_ssc

(999, 26)

In [22]:
# Build the flattened SSC table for the selected rows (reads every file listed in df_work)
data_ssc = audio_ssc(df_work)

# Preview: labels first, then one column per flattened feature value
data_ssc.head()

Unnamed: 0,Status,Dataset,0,1,2,3,4,5,6,7,...,25964,25965,25966,25967,25968,25969,25970,25971,25972,25973
10556,1,test,56.052518,156.939685,192.754398,317.382989,412.545029,527.601317,601.804611,718.634959,...,2702.000483,3113.063466,3418.669103,3759.319663,4323.070069,4759.634774,5258.719511,5804.515681,6490.640261,7240.355075
10557,1,test,68.216235,132.759431,203.978973,336.508154,401.457483,527.672156,607.447444,763.268081,...,2735.083821,3072.393532,3383.888768,3814.854351,4239.113239,4765.476729,5283.930901,6143.514987,6156.548391,7295.170645
10558,1,test,66.264974,147.867073,192.52202,325.487553,411.572851,517.5413,616.525201,771.097408,...,2760.742039,3019.022158,3400.424721,3839.232457,4239.52242,4766.157596,5293.588753,5834.5406,6455.184257,7198.311023
10559,1,test,74.300081,141.437279,193.994045,340.203689,433.042192,494.395596,619.092906,807.317776,...,2701.213664,2983.39648,3393.43895,3879.418893,4316.15502,4685.693898,5362.83112,5909.40292,6534.320224,7227.067766
10560,1,test,72.959531,146.052977,202.998956,350.145163,382.212557,488.152323,653.774184,758.375821,...,2669.447171,3125.504969,3432.500935,3784.253003,4198.635144,4787.69278,5346.243934,5836.956184,6395.376096,7293.588778


In [23]:
# Split by 'Dataset', remove the label columns and restore the per-file shape
# measured in dimensions_ssc.
# `columns=` is used because recent pandas versions reject a positional axis
# in drop(labels, 1).
X_ssc_train = (
    data_ssc[data_ssc.Dataset == 'train']
    .drop(columns=['Status', 'Dataset'])
    .to_numpy()
    .reshape(-1, dimensions_ssc[0], dimensions_ssc[1])
)
X_ssc_test = (
    data_ssc[data_ssc.Dataset == 'test']
    .drop(columns=['Status', 'Dataset'])
    .to_numpy()
    .reshape(-1, dimensions_ssc[0], dimensions_ssc[1])
)

# Check the dimensions
X_ssc_train.shape,X_ssc_test.shape

((868, 999, 26), (378, 999, 26))