### importação de bibliotecas

In [1]:
import pandas as pd
import numpy as np
import glob 
import matplotlib.pyplot as plt
import urllib
import zipfile
import os
from scipy.fft import rfft, rfftfreq

### Obtenção dos dados

In [2]:
# Download dos arquivos
# urllib.request.urlretrieve('https://www02.smt.ufrj.br/~offshore/mfs/database/mafaulda/imbalance.zip', 'imbalance.zip')
# urllib.request.urlretrieve('https://www02.smt.ufrj.br/~offshore/mfs/database/mafaulda/normal.zip', 'normal.zip')

# # extração dos arquivos
# with zipfile.ZipFile('imbalance.zip', 'r') as zip_ref:
#     zip_ref.extractall('data')
# with zipfile.ZipFile('normal.zip', 'r') as zip_ref:
#     zip_ref.extractall('data')

# # excluir arquivos .zip
# os.remove('normal.zip')
# os.remove('imbalance.zip')

### Verificação de dados inválidos

In [2]:
def dataReader(path_names):
    """Read a collection of header-less CSV files into a single NumPy array.

    Parameters
    ----------
    path_names : iterable of str
        Paths of the CSV files to load. Each one is parsed with
        ``pd.read_csv(path, header=None)``.

    Returns
    -------
    numpy.ndarray
        The result of ``np.array`` applied to the list of loaded frames,
        kept in the same order as ``path_names``.
    """
    frames = [pd.read_csv(csv_path, header=None) for csv_path in path_names]
    return np.array(frames)

In [4]:
def checkNullValues(path_names):
    """Report every CSV file that contains at least one missing value.

    Parameters
    ----------
    path_names : iterable of str
        Paths of the CSV files to inspect. Each one is parsed with
        ``pd.read_csv(path, header=None)``.

    Returns
    -------
    None
        Nothing is returned; one line is printed per file in which any
        column has a null count greater than zero.
    """
    for csv_path in path_names:
        frame = pd.read_csv(csv_path, header=None)
        null_counts_per_column = frame.isnull().sum()
        has_nulls = (null_counts_per_column > 0).any()
        if has_nulls:
            print('CSV file {} has null values'.format(csv_path))

In [5]:
# normal_file_names = glob.glob('./data/normal/*.csv')
# imbalance6g_file_names = glob.glob('./data/imbalance/6g/*.csv')
# imbalance10g_file_names = glob.glob('./data/imbalance/10g/*.csv')
# imbalance15g_file_names = glob.glob('./data/imbalance/15g/*.csv')
# imbalance20g_file_names = glob.glob('./data/imbalance/20g/*.csv')
# imbalance25g_file_names = glob.glob('./data/imbalance/25g/*.csv')
# imbalance30g_file_names = glob.glob('./data/imbalance/30g/*.csv')
# imbalance35g_file_names = glob.glob('./data/imbalance/35g/*.csv')

# checkNullValues(normal_file_names)
# checkNullValues(imbalance6g_file_names)
# checkNullValues(imbalance10g_file_names)
# checkNullValues(imbalance15g_file_names)
# checkNullValues(imbalance20g_file_names)
# checkNullValues(imbalance25g_file_names)
# checkNullValues(imbalance30g_file_names)
# checkNullValues(imbalance35g_file_names)

In [6]:
def plotSignalSample(dataSample):
    """Plot eight signals from ``dataSample`` in a column of stacked subplots.

    Parameters
    ----------
    dataSample : indexable
        Object for which ``dataSample[0]`` ... ``dataSample[7]`` each yield a
        sequence with as many points as the time axis built below
        (``np.arange(0, 5, 1/50000)``).

    Notes
    -----
    The figure carries the shared axis labels 'Seconds' (x) and 'Voltage' (y).
    Nothing is returned; the figure is left to the active matplotlib backend.
    """
    time_axis = np.arange(0, 5, 1/50000)
    fig, axes = plt.subplots(8, 1, figsize=(12, 15), constrained_layout=True)
    # One subplot per signal, in index order 0..7.
    for channel, ax in enumerate(axes):
        ax.plot(time_axis, dataSample[channel])
    fig.supxlabel('Seconds')
    fig.supylabel('Voltage')

In [None]:
# Load one recording from the "normal" folder (no header row) and plot its eight signals.
normalSample = pd.read_csv('./data/normal/61.44.csv', header=None)
plotSignalSample(normalSample)


In [None]:
# Load one recording from the "imbalance/6g" folder (no header row) and plot its eight signals.
imbalance6gSample = pd.read_csv('./data/imbalance/6g/13.9264.csv', header=None)
plotSignalSample(imbalance6gSample)

Deste gráfico chegamos a 2 conclusões:
- Os dados não estão na mesma escala, o que pode enviesar o algoritmo de machine learning.
- A taxa de amostragem é alta (50 kHz), o que tornará o processo de treinamento muito lento.

In [40]:
def resample(data, sample_target_rate, period=5):
    """Downsample a signal by averaging consecutive, non-overlapping row chunks.

    The chunk length is ``int(len(data) / period / sample_target_rate)`` rows.
    Every chunk is reduced to its mean along the first axis, so a 2-D input
    keeps its columns and each column is averaged independently. When
    ``len(data)`` is not a multiple of the chunk length, the final chunk is
    shorter and is averaged over the rows it actually contains.

    Parameters
    ----------
    data : numpy.ndarray or pandas.DataFrame
        Samples laid out along the first axis (rows).
    sample_target_rate : int or float
        Desired number of output rows per second.
    period : int or float, default 5
        Duration, in seconds, covered by ``data``. The default matches the
        5-second recordings used in this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per chunk, with a fresh 0..n-1 index. Column labels of a
        DataFrame input are preserved.

    Raises
    ------
    ValueError
        If the computed chunk length is smaller than one row, i.e. the
        requested rate exceeds the rate of ``data``.
    """
    n_rows = len(data)
    step = int(n_rows / period / sample_target_rate)
    if step < 1:
        raise ValueError(
            'sample_target_rate={} is too high for {} rows over {} s '
            '(chunk length would be {})'.format(sample_target_rate, n_rows, period, step)
        )

    column_labels = data.columns if isinstance(data, pd.DataFrame) else None
    values = np.asarray(data)

    # Collect the chunk means in a list and build the frame once; concatenating
    # inside the loop copies the whole accumulated frame on every iteration.
    chunk_means = [
        values[start:start + step].mean(axis=0)
        for start in range(0, n_rows, step)
    ]
    return pd.DataFrame(chunk_means, columns=column_labels)


In [5]:
target_rate = 500  # sampling rate (Hz) requested from resample() and passed to apply_fft() for resampled data
ORIGINAL_SAMPLE_RATE = 50000 # original sampling rate of the data

In [41]:
# normalSample = pd.read_csv('./data/normal/61.44.csv', header=None)
# Convert the normal sample to a NumPy array and downsample it to target_rate.
a = normalSample.to_numpy() 
normalSampleResampled = resample(a, target_rate)
# imbalance6gSampleResampled = resample(imbalance6gSample, target_rate)
# Displayed as the cell output: shape of the resampled frame.
normalSampleResampled.shape

(2500, 8)

In [None]:
# Plot 0.2 s of the original and of the resampled data for comparison.
# A 0.2 s window is 1/5 of a second, so it spans ORIGINAL_SAMPLE_RATE / 5 original
# samples and target_rate / 5 resampled samples (10000 and 100 with the values set above).
time = 0.2 # window length in seconds
qty_original_samples =  int(ORIGINAL_SAMPLE_RATE * time)
qty_resampled_sample = int(target_rate * time)
# NOTE(review): np.linspace includes the end point by default, so the spacing of
# t1/t2 is time / (n - 1) rather than exactly 1 / rate — confirm this is acceptable.
t1 = np.linspace(0, time, qty_original_samples)
t2 = np.linspace(0, time, qty_resampled_sample)
plt.figure(figsize=(10, 6))
# Only index 0 (the first column) of each frame is drawn.
plt.plot(t1,  normalSample[0][0:qty_original_samples], label='Sinal original', color='blue')
plt.plot(t2,  normalSampleResampled[0][0:qty_resampled_sample], label='Sinal reamostrado', color='orange')
plt.xlabel('Seconds')
plt.ylabel('Voltage')
plt.legend() 


Podemos observar que há perda de informação nos dados reamostrados. Serão testadas diferentes frequências de reamostragem para encontrar o melhor trade-off entre a precisão dos dados e o custo de treinamento.

Modelos que podem ser usados:
- LSTM
- DNN
- KNN
- SVM


In [6]:
def apply_fft(signal, sample_rate):
    """Compute the one-sided amplitude spectrum of a signal along its first axis.

    The transform is taken along axis 0 only, so for a 2-D input each column
    is transformed independently. (An N-dimensional transform such as
    ``np.fft.fftn`` would also transform across the columns and mix them.)
    For a 1-D input the result is the same as with ``fftn``.

    Parameters
    ----------
    signal : array-like or pandas.DataFrame
        Samples along the first axis; optional further axes hold independent
        signals (e.g. one column per sensor).
    sample_rate : int or float
        Sampling rate of ``signal`` in samples per second.

    Returns
    -------
    spectrum : numpy.ndarray
        ``2/N * |FFT|`` for the first ``N // 2`` bins, where ``N`` is the
        number of rows. Same trailing shape as ``signal``.
    xf : numpy.ndarray
        The ``N // 2`` non-negative frequencies matching the rows of
        ``spectrum``.
    """
    samples = np.asarray(signal)
    N = samples.shape[0]
    T = 1 / sample_rate
    # 1-D FFT over the time axis only; columns stay independent.
    yf = np.fft.fft(samples, axis=0)
    # Keep only the non-negative frequencies: the spectrum of a real signal is symmetric.
    xf = np.fft.fftfreq(N, T)[:N//2]
    # Normalise so that a unit-amplitude sinusoid on a bin centre yields 1.0.
    spectrum = 2.0/N * np.abs(yf)[:N//2]
    return spectrum, xf

In [13]:
# def apply_rfft(data, sample_rate):
#     # Calcular a Transformada de Fourier usando rfftn
#     spectrum = np.fft.rfftn(data)
    
#     # Calcular as frequências correspondentes
#     freqs = np.fft.rfftfreq(len(data), d=1/sample_rate)
#     # freqs = [np.fft.fftfreq(n, d=1/sample_rate) for n in data.shape]
#     return spectrum, freqs

In [None]:
# Spectrum of the full-rate normal sample, plotted against frequency.
normal_sample_spectrum, xf = apply_fft(normalSample, ORIGINAL_SAMPLE_RATE)   
plt.plot(xf,normal_sample_spectrum)


In [None]:
# Spectrum of the full-rate 6 g imbalance sample (despite the "normal_" prefix in the variable name).
normal_sample_spectrum_imbalance, xf_imbalance = apply_fft(imbalance6gSample, ORIGINAL_SAMPLE_RATE)   
plt.plot(xf_imbalance, normal_sample_spectrum_imbalance)

In [None]:
# Spectrum of the resampled normal sample, computed with target_rate as its sampling rate.
normal_sample_spectrum_resampled, xf_resampled = apply_fft(normalSampleResampled, target_rate)   
plt.plot(xf_resampled,normal_sample_spectrum_resampled)


In [None]:
# The resampled imbalance sample is never assigned anywhere else in the notebook
# (its only assignment is commented out), so build it here before taking its
# spectrum; otherwise this cell raises NameError on a fresh kernel.
imbalance6gSampleResampled = resample(imbalance6gSample.to_numpy(), target_rate)
imbalance6g_resample_spectrum, xf_imbalance6g_resampled = apply_fft(imbalance6gSampleResampled, target_rate)
plt.plot(xf_imbalance6g_resampled, imbalance6g_resample_spectrum)

## processamento de dados
Aqui os dados serão tratados para uso nos modelos de machine learning.

In [42]:
from sklearn.model_selection import train_test_split

def process_data(files_folder):
    """Turn a list of CSV recordings into an array of resampled amplitude spectra.

    Each file is read with ``dataReader``, downsampled to the module-level
    ``target_rate`` with ``resample`` and transformed with ``apply_fft``.

    Parameters
    ----------
    files_folder : iterable of str
        Paths of the CSV files to process (despite the name, this is the list
        of file paths, e.g. the result of ``glob.glob``), not a directory.

    Returns
    -------
    numpy.ndarray
        The per-file spectra stacked along a new first axis, in the order of
        ``files_folder``.
    """
    processed_data = []
    # Handle one file at a time so only a single full-rate recording is held in
    # memory, instead of loading every file before processing starts.
    for path in files_folder:
        raw = dataReader([path])[0]
        data_resampled = resample(raw, target_rate)
        spectrum, _ = apply_fft(data_resampled, target_rate)
        processed_data.append(spectrum)

    # Return the accumulated spectra, not the function object itself.
    return np.array(processed_data)


In [43]:
data_normal = process_data(glob.glob('./data/normal/*.csv'))
data_imbalance35g = process_data(glob.glob('./data/imbalance/35g/*.csv'))

# Stack both classes along the sample axis so that every recording is one
# entry of X with a matching label in y. Appending the two arrays to a list
# would hand train_test_split only two "samples" (one per class).
X = np.concatenate([data_normal, data_imbalance35g])
y = np.concatenate([
    np.full(data_normal.shape[0], 'normal'),
    np.full(data_imbalance35g.shape[0], 'imbalance35g'),
])

# Fixed seed so the split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [47]:
from sklearn.model_selection import train_test_split
# Split only the normal-class data into train and test parts (default split size, no fixed seed).
train_data, test_data = train_test_split(data_normal)