In [None]:
from google.colab import drive
import os
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import json

In [None]:
# Show every column when displaying wide frames instead of truncating them.
pd.set_option('display.max_columns', None)

## 0. Load Data From Drive

In [None]:
# Mount Google Drive into the Colab filesystem so the project files are reachable.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Folder on the mounted Drive that holds the project data, and the full path of
# the preprocessed CSV that this notebook reads.
DATA_DIR = '/content/drive/My Drive/CS6140_final_project/Data/'
PREPROCESSED_DATA_F = 'preprocessed_data_variable_hour_window_cols.csv'
PREPROCESSED_DATA_LOC = os.path.join(DATA_DIR, PREPROCESSED_DATA_F)

In [None]:
# Read the preprocessed CSV into memory and report how many rows it has.
df = pd.read_csv(PREPROCESSED_DATA_LOC)
print(f"- {len(df):,} total rows in df.")

- 876,100 total rows in df.


In [None]:
# Preview the first rows of the frame as loaded from the CSV.
df.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,12_machineID_time_window,12_order_in_time_window,12_machine_example_ID,12_step_ID,24_machineID_time_window,24_order_in_time_window,24_machine_example_ID,24_step_ID,48_machineID_time_window,48_order_in_time_window,48_machine_example_ID,48_step_ID,72_machineID_time_window,72_order_in_time_window,72_machine_example_ID,72_step_ID,comp1_maint,comp2_maint,comp3_maint,comp4_maint,error1,error2,error3,error4,error5,comp1_failure,comp2_failure,comp3_failure,comp4_failure,age,model2,model3,model4
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,1,1,"(001, 001)","(001, 001, 001)",1,1,"(001, 001)","(001, 001, 001)",1,1,"(001, 001)","(001, 001, 001)",1,1,"(001, 001)","(001, 001, 001)",0,0,0,0,0,0,0,0,0,0,0,0,0,18,False,True,False
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973,1,2,"(001, 001)","(001, 001, 002)",1,2,"(001, 001)","(001, 001, 002)",1,2,"(001, 001)","(001, 001, 002)",1,2,"(001, 001)","(001, 001, 002)",0,0,0,0,0,0,0,0,0,0,0,0,0,18,False,True,False
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847,1,3,"(001, 001)","(001, 001, 003)",1,3,"(001, 001)","(001, 001, 003)",1,3,"(001, 001)","(001, 001, 003)",1,3,"(001, 001)","(001, 001, 003)",0,0,0,0,0,0,0,0,0,0,0,0,0,18,False,True,False
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,1,4,"(001, 001)","(001, 001, 004)",1,4,"(001, 001)","(001, 001, 004)",1,4,"(001, 001)","(001, 001, 004)",1,4,"(001, 001)","(001, 001, 004)",0,0,0,0,0,0,0,0,0,0,0,0,0,18,False,True,False
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,1,5,"(001, 001)","(001, 001, 005)",1,5,"(001, 001)","(001, 001, 005)",1,5,"(001, 001)","(001, 001, 005)",1,5,"(001, 001)","(001, 001, 005)",0,0,0,0,0,0,0,0,0,0,0,0,0,18,False,True,False


## 1. Postprocess Data

In [None]:
# Normalise dtypes: parse the timestamps, force every window identifier column
# to str, and store the model indicator columns as integers.
df['datetime'] = pd.to_datetime(df['datetime'])
for id_kind in ('machine_example_ID', 'step_ID'):
    for hours in (12, 24, 48, 72):
        id_col = f'{hours}_{id_kind}'
        df[id_col] = df[id_col].astype(str)
for model_col in ('model2', 'model3', 'model4'):
    df[model_col] = df[model_col].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 876100 entries, 0 to 876099
Data columns (total 39 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   datetime                  876100 non-null  datetime64[ns]
 1   machineID                 876100 non-null  int64         
 2   volt                      876100 non-null  float64       
 3   rotate                    876100 non-null  float64       
 4   pressure                  876100 non-null  float64       
 5   vibration                 876100 non-null  float64       
 6   12_machineID_time_window  876100 non-null  int64         
 7   12_order_in_time_window   876100 non-null  int64         
 8   12_machine_example_ID     876100 non-null  object        
 9   12_step_ID                876100 non-null  object        
 10  24_machineID_time_window  876100 non-null  int64         
 11  24_order_in_time_window   876100 non-null  int64         
 12  24

In [None]:
# Re-inspect the first rows after the dtype conversions above.
df.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,12_machineID_time_window,12_order_in_time_window,12_machine_example_ID,12_step_ID,24_machineID_time_window,24_order_in_time_window,24_machine_example_ID,24_step_ID,48_machineID_time_window,48_order_in_time_window,48_machine_example_ID,48_step_ID,72_machineID_time_window,72_order_in_time_window,72_machine_example_ID,72_step_ID,comp1_maint,comp2_maint,comp3_maint,comp4_maint,error1,error2,error3,error4,error5,comp1_failure,comp2_failure,comp3_failure,comp4_failure,age,model2,model3,model4
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,1,1,"(001, 001)","(001, 001, 001)",1,1,"(001, 001)","(001, 001, 001)",1,1,"(001, 001)","(001, 001, 001)",1,1,"(001, 001)","(001, 001, 001)",0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973,1,2,"(001, 001)","(001, 001, 002)",1,2,"(001, 001)","(001, 001, 002)",1,2,"(001, 001)","(001, 001, 002)",1,2,"(001, 001)","(001, 001, 002)",0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847,1,3,"(001, 001)","(001, 001, 003)",1,3,"(001, 001)","(001, 001, 003)",1,3,"(001, 001)","(001, 001, 003)",1,3,"(001, 001)","(001, 001, 003)",0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,1,4,"(001, 001)","(001, 001, 004)",1,4,"(001, 001)","(001, 001, 004)",1,4,"(001, 001)","(001, 001, 004)",1,4,"(001, 001)","(001, 001, 004)",0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,1,5,"(001, 001)","(001, 001, 005)",1,5,"(001, 001)","(001, 001, 005)",1,5,"(001, 001)","(001, 001, 005)",1,5,"(001, 001)","(001, 001, 005)",0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0


In [None]:
def aggregate_by_time_window(df: pd.DataFrame, time_window: int) -> pd.DataFrame:
    """
    Purpose: Collapses the rows of every '<time_window>_machine_example_ID' group
        into a single summary row.
    :param df: DataFrame containing machine data; must hold the
        '<time_window>_machine_example_ID' column plus every column aggregated below.
    :param time_window: int number of hours per group; one of 12, 24, 48, 72.
    :return: DataFrame with one row per group. Columns form a two-level
        (column, statistic) header; the group key is restored as the first column.
    :raises ValueError: if time_window is not one of 12, 24, 48, 72.
    """

    if time_window not in (12, 24, 48, 72):
        raise ValueError("Invalid time window. Please choose one of 12, 24, 48, 72.")

    def value_spread(values):
        return np.max(values) - np.min(values)

    # pandas labels the output column after the callable's __name__; keep the
    # label 'range' without shadowing the builtin inside this function.
    value_spread.__name__ = 'range'

    sensor_stats = ['mean', 'std', skew, kurtosis, 'min', 'max', value_spread]

    # Insertion order of this dict fixes the column order of the result.
    agg_plan = {'datetime': 'first', 'machineID': 'first'}
    for sensor in ('volt', 'rotate', 'pressure', 'vibration'):
        agg_plan[sensor] = list(sensor_stats)

    counted = (
        [f'comp{n}_maint' for n in range(1, 5)]
        + [f'error{n}' for n in range(1, 6)]
        + [f'comp{n}_failure' for n in range(1, 5)]
    )
    for counter in counted:
        agg_plan[counter] = 'sum'

    agg_plan['age'] = 'first'
    for model_flag in ('model2', 'model3', 'model4'):
        agg_plan[model_flag] = 'max'

    group_key = f'{time_window}_machine_example_ID'
    grouped = df.groupby(group_key)
    return grouped.agg(agg_plan).reset_index()

In [None]:
# Aggregate the hourly rows into one summary row per machine_example_ID.
# NOTE(review): this call uses a 24-hour window, but the recorded outputs further
# down (12,200 rows and a '72_machine_example_ID' column) look like they came from
# a 72-hour run -- confirm the intended window and re-run from a fresh kernel.
aggregated_df = aggregate_by_time_window(df, 24)

In [None]:
# Check row count, column labels and dtypes of the aggregated frame.
aggregated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12200 entries, 0 to 12199
Data columns (total 48 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   (72_machine_example_ID, )  12200 non-null  object        
 1   (datetime, first)          12200 non-null  datetime64[ns]
 2   (machineID, first)         12200 non-null  int64         
 3   (volt, mean)               12200 non-null  float64       
 4   (volt, std)                12200 non-null  float64       
 5   (volt, skew)               12200 non-null  float64       
 6   (volt, kurtosis)           12200 non-null  float64       
 7   (volt, min)                12200 non-null  float64       
 8   (volt, max)                12200 non-null  float64       
 9   (volt, range)              12200 non-null  float64       
 10  (rotate, mean)             12200 non-null  float64       
 11  (rotate, std)              12200 non-null  float64       
 12  (rot

In [None]:
# Preview the aggregated rows; columns still carry a two-level header here.
aggregated_df.head()

Unnamed: 0_level_0,24_machine_example_ID,datetime,machineID,volt,volt,volt,volt,volt,volt,volt,rotate,rotate,rotate,rotate,rotate,rotate,rotate,pressure,pressure,pressure,pressure,pressure,pressure,pressure,vibration,vibration,vibration,vibration,vibration,vibration,vibration,comp1_maint,comp2_maint,comp3_maint,comp4_maint,error1,error2,error3,error4,error5,comp1_failure,comp2_failure,comp3_failure,comp4_failure,age,model2,model3,model4
Unnamed: 0_level_1,Unnamed: 1_level_1,first,first,mean,std,skew,kurtosis,min,max,range,mean,std,skew,kurtosis,min,max,range,mean,std,skew,kurtosis,min,max,range,mean,std,skew,kurtosis,min,max,range,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,first,max,max,max
0,"(001, 001)",2015-01-01 06:00:00,1,169.733809,11.23312,0.621126,0.692484,151.335682,200.87243,49.536748,445.179865,48.717395,0.095361,-0.925833,346.149335,527.349825,181.20049,96.797113,10.07988,-0.098792,-0.409428,75.237905,113.077935,37.840031,40.38516,5.853209,-0.251997,0.436215,25.990511,52.355876,26.365365,0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0
1,"(001, 002)",2015-01-02 06:00:00,1,170.356866,15.033766,-0.153168,-0.55091,140.776309,197.363125,56.586815,454.242875,40.858613,-0.004614,-1.2496,384.645962,521.837936,137.191975,100.787669,11.38492,0.263643,-0.45787,80.668287,127.014498,46.346211,38.471831,5.12186,0.087263,-0.880854,29.527665,47.862484,18.33482,0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0
2,"(001, 003)",2015-01-03 06:00:00,1,172.56676,12.805887,0.584383,-0.745664,154.199258,195.564941,41.365683,450.897424,45.129934,0.494953,0.442788,374.127148,568.97231,194.845162,100.2404,9.861049,0.067344,-1.053918,85.24661,117.148227,31.901618,52.351479,5.830561,0.158237,0.358548,39.273567,66.764515,27.490947,0,0,0,0,1,0,1,0,0,0,0,0,0,18,0,1,0
3,"(001, 004)",2015-01-04 06:00:00,1,171.333562,21.282989,-0.186068,0.013508,127.16362,215.656488,88.492868,441.310894,34.033248,-0.317283,0.381821,365.213804,517.348533,152.134729,98.087115,9.480559,0.422419,-0.487941,82.400818,118.853452,36.452635,51.607772,5.294177,0.174354,-0.648749,41.674887,62.464103,20.789216,0,0,0,0,0,0,0,0,1,0,0,0,0,18,0,1,0
4,"(001, 005)",2015-01-05 06:00:00,1,171.393482,13.858179,-0.474058,0.071414,138.826437,197.636954,58.810518,468.359778,42.645922,0.215679,0.165255,392.124959,575.505189,183.380231,104.452155,10.968382,-0.433933,0.008458,78.721961,126.46458,47.742619,39.999324,5.506359,0.683191,0.074323,32.813967,52.383097,19.56913,1,0,0,0,0,0,0,0,0,0,0,0,1,18,0,1,0


In [None]:
# Flatten the two-level (column, statistic) header into single names such as
# 'volt_mean'. Empty levels are skipped, so the group key (whose second level is
# '') keeps its plain name. The MultiIndex guard makes the cell safe to re-run:
# joining an already-flat string name would put '_' between its characters.
if isinstance(aggregated_df.columns, pd.MultiIndex):
    aggregated_df.columns = [
        '_'.join(level for level in col if level).strip()
        for col in aggregated_df.columns
    ]
aggregated_df.head(1)

Unnamed: 0,72_machine_example_ID,datetime_first,machineID_first,volt_mean,volt_std,volt_skew,volt_kurtosis,volt_min,volt_max,volt_range,rotate_mean,rotate_std,rotate_skew,rotate_kurtosis,rotate_min,rotate_max,rotate_range,pressure_mean,pressure_std,pressure_skew,pressure_kurtosis,pressure_min,pressure_max,pressure_range,vibration_mean,vibration_std,vibration_skew,vibration_kurtosis,vibration_min,vibration_max,vibration_range,comp1_maint_sum,comp2_maint_sum,comp3_maint_sum,comp4_maint_sum,error1_sum,error2_sum,error3_sum,error4_sum,error5_sum,comp1_failure_sum,comp2_failure_sum,comp3_failure_sum,comp4_failure_sum,age_first,model2_max,model3_max,model4_max
0,"(001, 001)",2015-01-01 06:00:00,1,170.885811,12.988999,0.236573,-0.187545,140.776309,200.87243,60.096121,450.106721,44.537872,0.16778,-0.494777,346.149335,568.97231,222.822975,99.275061,10.467413,0.12748,-0.432249,75.237905,127.014498,51.776593,43.736157,8.298338,0.335899,-0.301468,25.990511,66.764515,40.774004,0,0,0,0,1,0,1,0,0,0,0,0,0,18,0,1,0


In [None]:
def rename_first_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Purpose: Renames the first column of the input dataframe by removing its first
        3 characters (e.g. '24_machine_example_ID' -> 'machine_example_ID').
    :param df: DataFrame containing machine data; it is renamed in place.
    :return: The same DataFrame object, with the first column renamed.
    """

    first_column = df.columns[0]

    # Slice the prefix off instead of str.replace()-ing it: replace() would also
    # delete any later occurrence of the same 3 characters inside the name.
    new_column_name = first_column[3:]

    # Rename in place so callers holding a reference to df see the new name.
    df.rename(columns={first_column: new_column_name}, inplace=True)

    return df

In [None]:
# Drop the 3-character window prefix from the key column's name, then show the
# last row of the frame.
aggregated_df = rename_first_column(aggregated_df)
aggregated_df.tail(1)

Unnamed: 0,machine_example_ID,datetime_first,machineID_first,volt_mean,volt_std,volt_skew,volt_kurtosis,volt_min,volt_max,volt_range,rotate_mean,rotate_std,rotate_skew,rotate_kurtosis,rotate_min,rotate_max,rotate_range,pressure_mean,pressure_std,pressure_skew,pressure_kurtosis,pressure_min,pressure_max,pressure_range,vibration_mean,vibration_std,vibration_skew,vibration_kurtosis,vibration_min,vibration_max,vibration_range,comp1_maint_sum,comp2_maint_sum,comp3_maint_sum,comp4_maint_sum,error1_sum,error2_sum,error3_sum,error4_sum,error5_sum,comp1_failure_sum,comp2_failure_sum,comp3_failure_sum,comp4_failure_sum,age_first,model2_max,model3_max,model4_max
12199,"(100, 122)",2015-12-30 06:00:00,100,170.15341,14.952094,-0.061634,-0.334672,136.363542,204.190937,67.827395,458.133285,59.734066,0.407187,-0.664071,356.662576,578.429999,221.767423,101.209942,8.435513,-0.64779,0.674782,79.095538,117.42634,38.330802,39.712244,5.706438,0.078506,-0.70524,27.457001,50.83221,23.375209,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,1


In [None]:
def remove_max_time_window(df: pd.DataFrame) -> pd.DataFrame:
    """
    Purpose: Removes the last time window from the input dataframe, i.e. for every
        machine the row(s) whose time-window number equals that machine's maximum.
    :param df: DataFrame containing machine data, with a 'machine_example_ID'
        column formatted like '(001, 002)' (machine ID, time-window number).
    :return: Copy of df (original index and columns kept) with the last time window
        of every machine removed. The input frame itself is left unmodified.
    """

    print(f"- {format(len(df), ',')} total entries.")

    # Parse machine ID and time window into local Series rather than writing
    # helper columns onto the caller's frame.
    id_parts = df['machine_example_ID'].str.extract(r'\((\d+),\s*(\d+)\)')
    machine_ids = id_parts[0].astype(int)
    time_windows = id_parts[1].astype(int)

    # Broadcast each machine's maximum window back onto its own rows; this is the
    # vectorised equivalent of a per-row lookup.
    last_windows = time_windows.groupby(machine_ids).transform('max')

    # Keep every row that is not in its machine's last window.
    filtered_df = df[time_windows != last_windows].copy()

    print(f"- {format(len(filtered_df), ',')} total entries after removal.")

    return filtered_df

In [None]:
# Drop, for every machine, the rows belonging to its highest-numbered time
# window, then check the resulting row count and dtypes.
aggregated_df = remove_max_time_window(aggregated_df)
aggregated_df.info()

- 12,200 total entries.
- 12,100 total entries after removal.
<class 'pandas.core.frame.DataFrame'>
Index: 12100 entries, 0 to 12198
Data columns (total 48 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   machine_example_ID  12100 non-null  object        
 1   datetime_first      12100 non-null  datetime64[ns]
 2   machineID_first     12100 non-null  int64         
 3   volt_mean           12100 non-null  float64       
 4   volt_std            12100 non-null  float64       
 5   volt_skew           12100 non-null  float64       
 6   volt_kurtosis       12100 non-null  float64       
 7   volt_min            12100 non-null  float64       
 8   volt_max            12100 non-null  float64       
 9   volt_range          12100 non-null  float64       
 10  rotate_mean         12100 non-null  float64       
 11  rotate_std          12100 non-null  float64       
 12  rotate_skew         12100 non-null  float64  

## 2. Create Features

### 2a. Logistic Regression

#### Create Logistic Regression Data Matrix From Postprocessed Data

In [None]:
# Build the logistic-regression feature frame: every aggregated column except
# the timestamp, the machine identifier and the window key.
X = aggregated_df.drop(
    ['datetime_first', 'machineID_first', 'machine_example_ID'],
    axis=1,
)
X.head()

Unnamed: 0,volt_mean,volt_std,volt_skew,volt_kurtosis,volt_min,volt_max,volt_range,rotate_mean,rotate_std,rotate_skew,rotate_kurtosis,rotate_min,rotate_max,rotate_range,pressure_mean,pressure_std,pressure_skew,pressure_kurtosis,pressure_min,pressure_max,pressure_range,vibration_mean,vibration_std,vibration_skew,vibration_kurtosis,vibration_min,vibration_max,vibration_range,comp1_maint_sum,comp2_maint_sum,comp3_maint_sum,comp4_maint_sum,error1_sum,error2_sum,error3_sum,error4_sum,error5_sum,comp1_failure_sum,comp2_failure_sum,comp3_failure_sum,comp4_failure_sum,age_first,model2_max,model3_max,model4_max
0,170.885811,12.988999,0.236573,-0.187545,140.776309,200.87243,60.096121,450.106721,44.537872,0.16778,-0.494777,346.149335,568.97231,222.822975,99.275061,10.467413,0.12748,-0.432249,75.237905,127.014498,51.776593,43.736157,8.298338,0.335899,-0.301468,25.990511,66.764515,40.774004,0,0,0,0,1,0,1,0,0,0,0,0,0,18,0,1,0
1,171.487472,17.667202,-0.256072,0.97957,121.502718,215.656488,94.15377,450.703413,41.818685,-0.127176,0.406995,345.418513,575.505189,230.086676,101.323552,10.21323,0.023665,-0.433038,78.721961,126.46458,47.742619,44.487873,7.103904,0.401587,-0.431127,32.813967,62.464103,29.650136,1,0,0,0,0,0,0,0,1,0,0,0,1,18,0,1,0
2,172.183333,13.290595,-0.031461,-0.494151,143.506562,200.741045,57.234483,451.975443,39.772529,0.226721,-0.114202,361.562141,552.372925,190.810784,99.888028,9.198322,-0.29156,-0.8923,81.565288,117.164152,35.598864,39.924214,4.944806,0.14151,0.069886,29.031392,54.919547,25.888155,0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0
3,169.639459,15.727822,-0.083465,-0.091825,130.223881,203.777627,73.553746,447.914585,48.995525,0.056837,0.721875,307.175549,586.945963,279.770414,98.173025,11.176246,-0.537925,1.053841,58.733724,119.749926,61.016203,40.227433,5.115975,0.194594,-0.757725,30.127114,51.786687,21.659573,0,0,0,0,0,0,0,1,0,0,0,0,0,18,0,1,0
4,170.27197,16.233572,-0.1129,0.767195,119.059047,216.955936,97.896889,460.005194,51.218599,-0.527761,1.135857,283.244579,568.837979,285.593399,101.344157,10.064452,-0.316597,0.350864,68.760655,123.952205,55.19155,40.225275,5.192884,-0.189996,-0.01627,27.105512,52.884798,25.779285,0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0


In [None]:
# Confirm the columns and dtypes that remain in the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12100 entries, 0 to 12198
Data columns (total 45 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   volt_mean           12100 non-null  float64
 1   volt_std            12100 non-null  float64
 2   volt_skew           12100 non-null  float64
 3   volt_kurtosis       12100 non-null  float64
 4   volt_min            12100 non-null  float64
 5   volt_max            12100 non-null  float64
 6   volt_range          12100 non-null  float64
 7   rotate_mean         12100 non-null  float64
 8   rotate_std          12100 non-null  float64
 9   rotate_skew         12100 non-null  float64
 10  rotate_kurtosis     12100 non-null  float64
 11  rotate_min          12100 non-null  float64
 12  rotate_max          12100 non-null  float64
 13  rotate_range        12100 non-null  float64
 14  pressure_mean       12100 non-null  float64
 15  pressure_std        12100 non-null  float64
 16  pressure_

#### Save Logistic Regression Data Matrix as JSON

In [None]:
# Output path (inside DATA_DIR) for the logistic-regression feature matrix.
SAVE_FILENAME = 'LogReg_features_matrix.json'
SAVE_LOC = os.path.join(DATA_DIR, SAVE_FILENAME)

In [None]:
# Turn the feature frame into a nested list of rows, which json can serialise.
# Note that X is rebound from a DataFrame to a plain list here.
X = np.asarray(X).tolist()

# Persist the matrix as pretty-printed JSON.
with open(SAVE_LOC, 'w') as f:
    json.dump(X, f, indent=4)

### 2b. MLP

#### Create MLP Data Dict From Postprocessed Data

In [None]:
# Create the feature frame for the MLP: drop the timestamp and machine identifier
# but keep 'machine_example_ID', which is used as the dictionary key further down.
X = aggregated_df.drop(
    columns=[
        'datetime_first',
        'machineID_first',
    ]
)
X.head()

Unnamed: 0,machine_example_ID,volt_mean,volt_std,volt_skew,volt_kurtosis,volt_min,volt_max,volt_range,rotate_mean,rotate_std,rotate_skew,rotate_kurtosis,rotate_min,rotate_max,rotate_range,pressure_mean,pressure_std,pressure_skew,pressure_kurtosis,pressure_min,pressure_max,pressure_range,vibration_mean,vibration_std,vibration_skew,vibration_kurtosis,vibration_min,vibration_max,vibration_range,comp1_maint_sum,comp2_maint_sum,comp3_maint_sum,comp4_maint_sum,error1_sum,error2_sum,error3_sum,error4_sum,error5_sum,comp1_failure_sum,comp2_failure_sum,comp3_failure_sum,comp4_failure_sum,age_first,model2_max,model3_max,model4_max
0,"(001, 001)",170.885811,12.988999,0.236573,-0.187545,140.776309,200.87243,60.096121,450.106721,44.537872,0.16778,-0.494777,346.149335,568.97231,222.822975,99.275061,10.467413,0.12748,-0.432249,75.237905,127.014498,51.776593,43.736157,8.298338,0.335899,-0.301468,25.990511,66.764515,40.774004,0,0,0,0,1,0,1,0,0,0,0,0,0,18,0,1,0
1,"(001, 002)",171.487472,17.667202,-0.256072,0.97957,121.502718,215.656488,94.15377,450.703413,41.818685,-0.127176,0.406995,345.418513,575.505189,230.086676,101.323552,10.21323,0.023665,-0.433038,78.721961,126.46458,47.742619,44.487873,7.103904,0.401587,-0.431127,32.813967,62.464103,29.650136,1,0,0,0,0,0,0,0,1,0,0,0,1,18,0,1,0
2,"(001, 003)",172.183333,13.290595,-0.031461,-0.494151,143.506562,200.741045,57.234483,451.975443,39.772529,0.226721,-0.114202,361.562141,552.372925,190.810784,99.888028,9.198322,-0.29156,-0.8923,81.565288,117.164152,35.598864,39.924214,4.944806,0.14151,0.069886,29.031392,54.919547,25.888155,0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0
3,"(001, 004)",169.639459,15.727822,-0.083465,-0.091825,130.223881,203.777627,73.553746,447.914585,48.995525,0.056837,0.721875,307.175549,586.945963,279.770414,98.173025,11.176246,-0.537925,1.053841,58.733724,119.749926,61.016203,40.227433,5.115975,0.194594,-0.757725,30.127114,51.786687,21.659573,0,0,0,0,0,0,0,1,0,0,0,0,0,18,0,1,0
4,"(001, 005)",170.27197,16.233572,-0.1129,0.767195,119.059047,216.955936,97.896889,460.005194,51.218599,-0.527761,1.135857,283.244579,568.837979,285.593399,101.344157,10.064452,-0.316597,0.350864,68.760655,123.952205,55.19155,40.225275,5.192884,-0.189996,-0.01627,27.105512,52.884798,25.779285,0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,1,0


In [None]:
# Confirm the columns and dtypes of the MLP feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12100 entries, 0 to 12198
Data columns (total 46 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   machine_example_ID  12100 non-null  object 
 1   volt_mean           12100 non-null  float64
 2   volt_std            12100 non-null  float64
 3   volt_skew           12100 non-null  float64
 4   volt_kurtosis       12100 non-null  float64
 5   volt_min            12100 non-null  float64
 6   volt_max            12100 non-null  float64
 7   volt_range          12100 non-null  float64
 8   rotate_mean         12100 non-null  float64
 9   rotate_std          12100 non-null  float64
 10  rotate_skew         12100 non-null  float64
 11  rotate_kurtosis     12100 non-null  float64
 12  rotate_min          12100 non-null  float64
 13  rotate_max          12100 non-null  float64
 14  rotate_range        12100 non-null  float64
 15  pressure_mean       12100 non-null  float64
 16  pressure_

In [None]:
# Map every machine_example_ID to the list of its feature values, in column order.
features_by_example = X.set_index('machine_example_ID')
machine_summary_dict = features_by_example.T.to_dict(orient='list')

In [None]:
# Print the first five entries of the dict as a sanity check.
for position, (example_id, feature_values) in enumerate(machine_summary_dict.items()):
    if position >= 5:
        break
    print(f"{example_id}: {feature_values}")

(001, 001): [170.8858114624484, 12.988998585540942, 0.23657261238213764, -0.18754454523144837, 140.776309290179, 200.872429816452, 60.09612052627301, 450.1067211349322, 44.5378720679287, 0.16777975702039052, -0.49477721853209644, 346.149335043074, 568.972310325907, 222.82297528283294, 99.27506051721805, 10.467412730403368, 0.127479588845299, -0.43224858709881886, 75.2379048586662, 127.014497747491, 51.77659288882481, 43.73615663190936, 8.298337892989313, 0.33589869397807204, -0.3014679076312867, 25.9905109982024, 66.7645147253914, 40.774003727189, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 18.0, 0.0, 1.0, 0.0]
(001, 002): [171.4874722590641, 17.667201768926702, -0.25607222161856363, 0.9795697255810554, 121.502718006153, 215.656488291429, 94.15377028527601, 450.7034130376246, 41.81868548546561, -0.1271760080238314, 0.40699488567150555, 345.418513385006, 575.50518940115, 230.086676016144, 101.32355213528129, 10.213230127528549, 0.02366549919009111, -0.43303795549459

#### Save MLP Data Dict as JSON

In [None]:
# Output path (inside DATA_DIR) for the MLP feature dict; this rebinds the
# SAVE_FILENAME / SAVE_LOC names used earlier for the logistic-regression file.
SAVE_FILENAME = 'MLP_features_dict.json'
SAVE_LOC = os.path.join(DATA_DIR, SAVE_FILENAME)

In [None]:
# Write the per-example feature dict to SAVE_LOC as pretty-printed JSON.
with open(SAVE_LOC, 'w') as json_f:
    json.dump(machine_summary_dict, json_f, indent=4)