In [1]:
from pyprojroot import here as get_project_root
import os
os.chdir(get_project_root()) # hack for notebook development

import pandas as pd
import numpy as np
import logging
import data_acquisition.data_acquisition_config as config
from data_acquisition.custom_types import Dataset

logging.basicConfig(level=logging.INFO)

def _dataReader(path_names:list) -> list:
    '''
    Reads in raw data from .csv files and returns a list
    
    params:
    ---
    path_names (list): list of all the data files to read in
    
    returns:
    ---
    sequences (list): raw dataset from data directory
    '''
    
    sequences = list()
    
    for name in path_names:
        data = pd.read_csv(name, header=None)
        sequences.append(data.values)
          
    return sequences


def get_data() -> Dataset:
    '''
    Loads the raw recordings for every file list declared in the acquisition config.

    Each file list is read with `_dataReader` and stacked into one array; the six
    resulting arrays are wrapped in a `Dataset` named tuple.
    
    returns:
    ---
    dataset (Dataset): named tuple built positionally from the stacked arrays in the order
        normal, horizontal misalignment, vertical misalignment, imbalance, overhang, underhang
    '''
    def _load(file_names):
        # np.stack needs every file of one group to have the same shape
        return np.stack(_dataReader(file_names))

    logging.info(f"Loading raw data.")

    # Evaluated left to right, so the groups are read in the order listed here.
    stacked_groups = [
        _load(config.NORMAL_FILE_NAMES),
        _load(config.HORI_MIS_FILE_NAMES),
        _load(config.VERT_MIS_FILE_NAMES),
        _load(config.IMBALANCE_FILE_NAMES),
        _load(config.OVERHANG_FILE_NAMES),
        _load(config.UNDERHANG_FILE_NAMES),
    ]

    logging.info(f"Load complete.")

    return Dataset(*stacked_groups)


In [2]:
"""
Any utility function that is required for data processing goes here
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.fft import rfft


def _dataScaler(data:list) -> list:
    '''
    Min-max scales a 3-D data set, one scaling per entry of the last axis.

    The leading axes are flattened so a single `MinMaxScaler` is fitted over all rows,
    then the scaled values are reshaped back to the original layout.
    
    params:
    ---
    data (np.ndarray): data to scale; `.shape` is read, so a plain list is not accepted.
        Presumably laid out as (sequences, time steps, channels) - TODO confirm with caller
    
    returns:
    ---
    data_final (np.ndarray): scaled data with shape (-1, data.shape[1], data.shape[2])
    '''
    n_steps, n_features = data.shape[1], data.shape[2]

    # Collapse to 2-D because the scaler works on (rows, features) input.
    flattened = np.reshape(data, (-1, n_features))
    scaled = MinMaxScaler().fit(flattened).transform(flattened)

    return np.reshape(scaled, (-1, n_steps, n_features))


def _downSampler(data:list, start_index:int, sample_rate:int) -> list:
    '''
    Reads in raw data from .csv files and returns a resampled list
    
    params:
    ---
    data (list): data to down sample
    start_index (int): starting index
    sample_rate (int): sampling rate
    
    returns:
    ---
    final_sequence (list): resampled data
    '''    
    final_sequence = list()
    for dataset in data:
        data_resampled = []
        start = start_index
        stop = sample_rate
        for i in range(int(len(dataset)/sample_rate)):
            data_resampled.append(dataset[start:stop, :].mean(axis=0))
            start += sample_rate
            stop += sample_rate
        final_sequence.append(np.stack(data_resampled))    
        
    return np.stack(final_sequence)


def _FFT(data:list) -> list:
    '''
    Reads in resampled data and peforms a Fast Fourier Transform with DC offset removal
    
    params:
    ---
    data (pd.DataFrame): data to perform Fast Fourier Transform
    
    returns:
    ---
    data_fft (list): FFT data
    '''
    data_fft = list()
    for dataset in data:
        data_fft.append(np.stack(np.abs(rfft(dataset, axis=0))[1:,:]))
    
    return np.stack(data_fft)


FileNotFoundError: Could not find module 'C:\ProgramData\Anaconda3\lib\site-packages\scipy\.libs\libbanded5x.EHDKC2XVYTQQ5MALRS6XN2CUSS6SRL6P.gfortran-win_amd64.dll' (or one of its dependencies). Try using the full path with constructor syntax.

In [None]:
# +
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import logging
import collections
import os
import re

# Move the working directory one level up from the current one (notebook import hack).
# NOTE(review): this is relative to the current cwd, so every re-run of this cell climbs
# one more directory - confirm it is only ever executed once per kernel.
path_parent = os.path.dirname(os.getcwd())
os.chdir(path_parent)

logging.basicConfig(level=logging.INFO)
# -

from data_acquisition.main import get_data
from data_processing.utils import _dataScaler, _downSampler, _FFT
import data_processing.data_processing_config as config

# Train/test container returned by `get_train_test_data`; fields are, in order,
# X_train, X_test, y_train, y_test.
# NOTE(review): this rebinds the name `Dataset`, which the first cell imports from
# data_acquisition.custom_types for the raw data - confirm the shadowing is intended.
Dataset = collections.namedtuple('Dataset', 'X_train X_test y_train y_test')


def get_train_test_data(raw_data:Dataset) -> Dataset:
    '''
    Builds the train/test split from the normal, horizontal and imbalance recordings.

    Each of the three classes is down-sampled (`_downSampler`), scaled (`_dataScaler`) and
    transformed (`_FFT`), one stage at a time. The classes are labelled 0 (normal),
    1 (horizontal) and 2 (imbalance), concatenated and split with `train_test_split`.
    
    params:
    ---
    raw_data (Dataset): raw data set exposing `.normal`, `.horizontal` and `.imbalance`
    
    returns:
    ---
    train_test_data (Dataset): named tuple of (X_train, X_test, y_train, y_test)
    '''
    logging.info(f"Data is being resampled at a sample rate of: {config.RESAMPLE_RATE}")
    # List position doubles as the class label further down.
    per_class = [
        _downSampler(raw_data.normal, 0, config.RESAMPLE_RATE),
        _downSampler(raw_data.horizontal, 0, config.RESAMPLE_RATE),
        _downSampler(raw_data.imbalance, 0, config.RESAMPLE_RATE),
    ]

    logging.info(f"Scaling the data.")
    # Each class is scaled with its own scaler, independently of the others.
    per_class = [_dataScaler(class_data) for class_data in per_class]

    logging.info(f"Performing FFT.")
    per_class = [_FFT(class_data) for class_data in per_class]

    # One integer label per sequence, in the same order as the concatenated features.
    y = np.concatenate([
        np.full(len(class_data), class_label)
        for class_label, class_data in enumerate(per_class)
    ])
    X = np.concatenate(per_class)

    logging.info(f"Spliting data to a test size of: {config.DATA_TEST_SIZE}")

    # Fixed random_state keeps the split reproducible between runs.
    split = train_test_split(X, y, test_size=config.DATA_TEST_SIZE, random_state=42)
    X_train, X_test, y_train, y_test = split

    logging.info(f"Complete. Happy modelling :).")

    return Dataset(X_train, X_test, y_train, y_test)


In [None]:
"""
Utility functions required for exploratory data analysis go here
"""

import numpy as np
import matplotlib.pyplot as plt
from scipy.fft import rfftfreq
import data_processing.data_processing_config as config


def time_plot(yf:np.ndarray, start:int, stop:int):
    '''
    Plots a slice of a time-series signal against time.

    The time axis spreads `config.DURATION` seconds evenly over all samples of `yf`;
    only the samples in `[start, stop)` are drawn.
    
    params:
    ---
    yf (np.ndarray): signal samples to plot
    start (int): index of the first sample to draw
    stop (int): index one past the last sample to draw
    '''
    time = np.linspace(0, config.DURATION, len(yf), endpoint=False)

    fig, axs = plt.subplots(nrows=1, figsize=(11, 9))
    # Global setting: also affects figures created after this call.
    plt.rcParams['font.size'] = '14'

    for tick_label in [*axs.get_xticklabels(), *axs.get_yticklabels()]:
        tick_label.set_fontsize(14)

    window = slice(start, stop)
    axs.plot(time[window], yf[window])
    axs.set_title('Time-series signal')
    axs.set_ylabel('Voltage (V)', fontsize=14)
    axs.set_xlabel('Time (s)', fontsize=14)
    plt.show()


def fft_plot(yf:np.ndarray):
    '''
    Plots a single-sided magnitude spectrum against frequency.

    The frequency axis is derived from the processing config: the resampled signal has
    `int(SAMPLE_RATE / RESAMPLE_RATE * DURATION)` samples spaced `RESAMPLE_RATE / SAMPLE_RATE`
    seconds apart. The 0 Hz entry of the axis is dropped, so `yf` is expected to be a
    spectrum whose DC bin was removed (as `_FFT` produces) - confirm against the caller.
    
    params:
    ---
    yf (np.ndarray): magnitude spectrum to plot, without the DC bin
    '''
    n_samples = int((config.SAMPLE_RATE / config.RESAMPLE_RATE) * config.DURATION)
    # Block-averaging RESAMPLE_RATE raw samples yields one sample every
    # RESAMPLE_RATE / SAMPLE_RATE seconds.
    sample_spacing = config.RESAMPLE_RATE / config.SAMPLE_RATE
    # Bins of an n_samples-point real FFT, minus the 0 Hz bin that the spectrum lacks.
    # (Using n_samples - 1 points instead would start the axis at 0 Hz and shift and
    # rescale every frequency label.)
    xf = rfftfreq(n_samples, sample_spacing)[1:]

    fig, axs = plt.subplots(nrows=1, figsize=(11, 9))
    # Global setting: also affects figures created after this call.
    plt.rcParams['font.size'] = '14'

    for label in (axs.get_xticklabels() + axs.get_yticklabels()):
        label.set_fontsize(14)

    axs.plot(xf, yf)
    axs.set_title('Frequency spectra')
    axs.set_ylabel('Signal strength', fontsize=14)
    axs.set_xlabel('Frequency (Hz)', fontsize=14)
    plt.show()


In [None]:
from pyprojroot import here as get_project_root
import os
os.chdir(get_project_root()) # hack for notebook development

# +
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import logging
import collections
import os
import re
from data_acquisition.main import get_data
from data_processing.main import get_train_test_data
from model_training import model_training_config as config

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

from keras.models import load_model
from keras.callbacks import ModelCheckpoint

# NOTE(review): the working directory was already set with `os.chdir(get_project_root())`
# at the top of this cell; these two lines move it one level further up before
# `get_data()` runs. Any relative path resolved afterwards (e.g. 'best_model.pkl') is
# therefore relative to that parent directory - confirm this is intended.
# Re-running the cell climbs from the project root again, since the first chdir resets it.
path_parent = os.path.dirname(os.getcwd())
os.chdir(path_parent)

logging.basicConfig(level=logging.INFO)
# -

# ## Model Development

# Load the raw recordings, then resample / scale / FFT them and split into train and test
dataset = get_data()
train_test_data = get_train_test_data(dataset)

# +
# Materialise every part of the split as its own numpy array
X_train, y_train = np.array(train_test_data.X_train), np.array(train_test_data.y_train)
X_test, y_test = np.array(train_test_data.X_test), np.array(train_test_data.y_test)
# -
# Generating the model: one LSTM layer feeding a dense output layer.
# The LSTM input shape is taken from axes 1 and 2 of the training features.
model = Sequential([
    LSTM(config.LSTM_UNITS, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(config.OUTPUT_SIZE, activation=config.ACTIVATION),
])
model.summary()


# Training the model
model.compile(
    loss=config.LOSS_FUNCTION,
    optimizer=config.OPTIMIZER,
    metrics=['accuracy'],
)

# Keep only the best model seen so far, judged by the monitored metric.
# NOTE(review): the '.pkl' suffix is misleading (Keras writes its own format, not a pickle)
# and newer Keras releases may reject it - confirm against the installed version.
chk = ModelCheckpoint(
    'best_model.pkl',
    monitor=config.MONITOR,
    save_best_only=True,
    mode='auto',
    verbose=1,
)

# Full-batch training: the batch size equals the number of training sequences.
# NOTE(review): `VAL_SPLOT` looks like a typo of `VAL_SPLIT` - confirm against model_training_config.
hist = model.fit(
    X_train,
    y_train,
    epochs=config.EPOCHS,
    batch_size=len(X_train),
    callbacks=[chk],
    validation_split=config.VAL_SPLOT,
)

# ## Model Validation

# +
# Plotting training and validation accuracy per epoch
fig, axs = plt.subplots(nrows=1, figsize=(11, 9))
plt.rcParams['font.size'] = '14'  # global: also applies to later figures

for tick_label in [*axs.get_xticklabels(), *axs.get_yticklabels()]:
    tick_label.set_fontsize(14)

# One curve per recorded metric, labelled for the legend
for history_key, legend_name in (('accuracy', 'train'), ('val_accuracy', 'val')):
    axs.plot(hist.history[history_key], label=legend_name)

axs.set_title('Model Accuracy')
axs.set_ylabel('Accuracy', fontsize=14)
axs.set_xlabel('Epoch', fontsize=14)
axs.legend(loc='upper left')
plt.show()

# +
# Loading the best checkpoint written during training and checking accuracy on the test data
model = load_model('best_model.pkl')

from sklearn.metrics import accuracy_score

# `Sequential.predict_classes` no longer exists in current Keras/TensorFlow releases, so the
# class ids are derived from `predict` the same way that helper did: argmax over a
# multi-unit output, 0.5 threshold for a single-unit output.
test_probs = model.predict(X_test)
if test_probs.shape[-1] > 1:
    test_preds = test_probs.argmax(axis=-1)
else:
    test_preds = (test_probs > 0.5).astype('int32')

accuracy_score(y_test, test_preds)
# -

# Comparing the actual values versus the predicted values
data_dict = {0:'normal', 1:'horizontal misalignment', 2:'imbalance'}
results = pd.DataFrame([y_test, test_preds]).T
results.columns = ['Actual', 'Prediction']
# Element-wise label lookup through Series.map, because `DataFrame.applymap` is deprecated
# in recent pandas; an unknown class id still raises KeyError as before.
results.apply(lambda col: col.map(lambda x: data_dict[x]))


