In [85]:
import os
import warnings

# Installed before the plotting imports so UserWarnings raised while importing them are silenced too.
warnings.filterwarnings(action="ignore",category=UserWarning)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.io  # a bare `import scipy` is not guaranteed to load the io submodule
import seaborn as sns

In [102]:
# Folder that read_files() scans for .mat files; a relative path, so it is
# resolved against the notebook's current working directory.
path = "Bearing Data"

In [87]:
#helper functions
def read_files(path):
    """Load one signal per ``.mat`` file found directly inside ``path``.

    Every file whose name ends with ``.mat`` is read with
    ``scipy.io.loadmat``. From each file the variable whose name ends with
    ``"DE_time"`` is wrapped in a ``pandas.DataFrame``.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).

    Returns
    -------
    dict
        Maps file name (including the ``.mat`` extension) to the DataFrame
        built from its ``*DE_time`` variable. Files without such a variable
        are left out. Entries are inserted in sorted file-name order.
    """
    data = {}
    # os.listdir returns names in arbitrary order; sorting makes the result
    # (and every table derived from it) reproducible between runs/machines.
    for file_name in sorted(os.listdir(path)):
        if not file_name.endswith(".mat"):
            continue
        contents = scipy.io.loadmat(os.path.join(path, file_name))
        for key, values in contents.items():
            if key.endswith("DE_time"):
                # If several variables match, the last one wins.
                data[file_name] = pd.DataFrame(values)
    return data

def split_data(dfs,size):
    """Cut every signal into consecutive, non-overlapping rows of ``size`` samples.

    Parameters
    ----------
    dfs : dict
        Maps a name to an object exposing ``to_numpy()`` (e.g. a DataFrame).
    size : int
        Number of samples per segment.

    Returns
    -------
    dict
        Same keys; each value is a 2-D array with ``size`` columns. Samples
        left over after the last complete segment are discarded.
    """
    chunked = {}
    for name, frame in dfs.items():
        samples = frame.to_numpy()
        # Only whole segments are kept, so trim to a multiple of `size` first.
        usable_rows = (len(samples) // size) * size
        # NOTE(review): reshape flattens row-major, so this presumably expects
        # a single-column signal -- confirm before feeding wider frames.
        chunked[name] = samples[:usable_rows].reshape(-1, size)
    return chunked

def extract_features(signal):
    """Compute time-domain summary statistics for one signal segment.

    Parameters
    ----------
    signal : array-like
        Numeric samples. Lists, 1-D arrays and column/row vectors are
        accepted; the input is converted to a flat float array.

    Returns
    -------
    dict
        Keys, in order: ``mean``, ``std_dev`` (population, ddof=0),
        ``skewness`` and ``kurtosis`` (pandas sample estimators),
        ``peak_to_peak``, ``rms``, ``crest_factor`` (peak absolute value
        divided by rms; NaN when rms is 0), ``mad`` (mean absolute deviation
        from the mean), ``sma`` (sum of absolute values), ``zcr`` (number of
        adjacent sample pairs with opposite signs), ``min``, ``max``.

    Raises
    ------
    ValueError
        If ``signal`` contains no samples.
    """
    # Flatten to float so list input works and integer products cannot overflow.
    values = np.asarray(signal, dtype=float).ravel()
    if values.size == 0:
        raise ValueError("extract_features requires a non-empty signal")

    series = pd.Series(values)
    mean = np.mean(values)
    rms = np.sqrt(np.mean(np.square(values)))
    peak = np.max(np.abs(values))
    # An all-zero segment has rms == 0; report NaN explicitly instead of
    # triggering a 0/0 RuntimeWarning.
    crest_factor = peak / rms if rms > 0 else np.nan
    # A sign change between neighbours makes their product negative; samples
    # that are exactly zero therefore do not count as crossings.
    zcr = ((values[:-1] * values[1:]) < 0).sum()

    features = {
        "mean": mean,
        'std_dev': np.std(values),
        'skewness': series.skew(),
        'kurtosis': series.kurtosis(),
        'peak_to_peak': np.ptp(values),
        'rms': rms,
        'crest_factor': crest_factor,
        'mad': np.mean(np.abs(values - mean)),
        'sma': np.sum(np.abs(values)),
        'zcr': zcr,
        'min': np.min(values),
        'max': np.max(values),
    }

    return features

def stats_params(dfs):
    """Build a per-segment feature table for every entry of ``dfs``.

    Parameters
    ----------
    dfs : dict
        Maps a name to an indexable collection of segments (e.g. a 2-D array
        whose rows are segments). Each segment is passed to
        ``extract_features``.

    Returns
    -------
    dict
        Same keys; each value is a DataFrame with one row per segment (in
        segment order, default integer index) and one column per feature.
        An entry with zero segments yields an empty DataFrame.
    """
    dfs_stats = {}
    for file, segments in dfs.items():
        # Positional indexing (not iteration) is used so the i-th segment is
        # always ``segments[i]``, whatever container holds the segments.
        records = [extract_features(segments[i]) for i in range(len(segments))]
        # One constructor call over all records replaces building and
        # concatenating a single-row DataFrame per segment.
        dfs_stats[file] = pd.DataFrame(records)
    return dfs_stats

def label_data(dfs):
    """Attach a class label to each table and stack them into one DataFrame.

    The label is the part of the dictionary key before its first underscore
    (the whole key when it contains none).

    Parameters
    ----------
    dfs : dict
        Maps a name (e.g. a file name) to a DataFrame. The input frames are
        not modified.

    Returns
    -------
    pandas.DataFrame
        All rows of all inputs, in dictionary order, with a ``label`` column
        and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``dfs`` is empty (nothing to concatenate).
    """
    # assign() returns a labelled copy, so the caller's frames stay untouched
    # and re-running the labelling step cannot compound earlier changes.
    labelled = [
        df.assign(label=file.split("_")[0])
        for file, df in dfs.items()
    ]
    return pd.concat(labelled, axis=0, ignore_index=True)

In [103]:
#reading multiple .mat files
data = read_files(path)

In [89]:
#splitting data to small chunks
data = split_data(data,2048)

In [90]:
#calculating statistical and time domain features
data = stats_params(data)

In [91]:
#labeling data
data = label_data(data)

In [92]:
# preview the first five rows of the labeled feature table
data.head(n=5)

Unnamed: 0,mean,std_dev,skewness,kurtosis,peak_to_peak,rms,crest_factor,mad,sma,zcr,min,max,label
0,0.01784,0.122716,-0.118746,-0.036491,0.778761,0.124006,3.378048,0.097425,202.450174,248,-0.4189,0.359862,Ball
1,0.022255,0.132456,0.174953,-0.075956,0.828829,0.134312,3.482303,0.106591,220.724673,242,-0.361113,0.467716,Ball
2,0.02047,0.149614,0.040399,-0.269128,0.906642,0.151008,3.10282,0.119333,247.139762,248,-0.438092,0.46855,Ball
3,0.02096,0.157029,-0.0233,0.141028,1.127775,0.158422,3.69109,0.124934,257.875318,251,-0.543026,0.584749,Ball
4,0.022167,0.138155,-0.081652,0.41003,1.025762,0.139922,4.137354,0.108189,224.647686,253,-0.578908,0.446854,Ball


In [93]:
# (rows, columns) of the labeled feature table
data.shape

(2317, 13)

In [94]:
# number of rows per class label
data["label"].value_counts()

label
Ball      711
OR        711
IR        659
Normal    236
Name: count, dtype: int64

In [95]:
#saving data
# Writes to the current working directory and overwrites an existing file.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; pass index=False if downstream readers do not expect
# it -- confirm against whatever consumes labeled_data.csv.
data.to_csv("labeled_data.csv")