# Feature Engineering

* Reference: https://colab.research.google.com/drive/1KRAI3-Y4fhefvMVWURjQF5GmGN98YQTN#scrollTo=abeaebd1

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns


!pip install plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

Defaulting to user installation because normal site-packages is not writeable


In [None]:
%run "variables.py"

In [None]:
%run "functions.ipynb" 

# Data loading

Load the two CSV files, pred_maint_timeseries_with_failures.csv into pandas DataFrame.

In [5]:
# df = pd.read_csv(f'{DATA_DIR}/pred_maint_timeseries_with_failures.csv')
df = pd.read_csv(f'{DATA_DIR}/pred_maint_yearly_timeseries.csv')
display(df.head())

Unnamed: 0,machineid,datetimestamp,enginegastemperature1,enginegastemperature2,enginegastemperature3,enginegastemperature4,enginegastemperature5,enginegastemperature6,pressure
0,1,2025-01-01 00:00:00,302.483571,385.408617,303.238443,307.615149,298.829233,298.829315,103.158426
1,1,2025-01-01 00:10:00,303.837174,300.111109,297.861035,297.340913,299.412622,301.110395,98.464047
2,1,2025-01-01 00:20:00,299.877724,301.777757,302.085056,304.162309,298.533004,299.850807,100.190252
3,1,2025-01-01 00:30:00,303.323272,303.895963,294.494511,305.651141,301.865595,298.067635,97.68246
4,1,2025-01-01 00:40:00,297.766293,307.621208,301.614999,293.032915,289.108329,294.780518,100.345387


In [None]:
df['datetimestamp'] = pd.to_datetime(df['datetimestamp'])
df.set_index('datetimestamp', inplace=True)


In [7]:
sensor_columns = [f'enginegastemperature{i}' for i in range(1, 7)] + ['pressure']
correlation_matrix = df[sensor_columns].corr()
display(correlation_matrix)

Unnamed: 0,enginegastemperature1,enginegastemperature2,enginegastemperature3,enginegastemperature4,enginegastemperature5,enginegastemperature6,pressure
enginegastemperature1,1.0,-0.010021,-0.011338,-0.009848,-0.0091,-0.010515,-0.002591
enginegastemperature2,-0.010021,1.0,-0.00851,-0.013216,-0.009915,-0.010837,0.001089
enginegastemperature3,-0.011338,-0.00851,1.0,-0.012364,-0.010852,-0.01115,-0.00238
enginegastemperature4,-0.009848,-0.013216,-0.012364,1.0,-0.009182,-0.010195,-0.001853
enginegastemperature5,-0.0091,-0.009915,-0.010852,-0.009182,1.0,-0.009174,-0.000512
enginegastemperature6,-0.010515,-0.010837,-0.01115,-0.010195,-0.009174,1.0,-0.001037
pressure,-0.002591,0.001089,-0.00238,-0.001853,-0.000512,-0.001037,1.0


## Temporal patterns
Create new features that capture temporal patterns (e.g., rolling averages, time since last failure, time of day, day of week).

**Reasoning**:
* Convert the 'datetimestamp' column to datetime objects, extract time of day and day of week, and calculate rolling averages and standard deviations for the sensor columns.

In [8]:
# prompt: Reasoning: Convert the 'datetimestamp' column to datetime objects, extract time of day and day of week, and calculate rolling averages and standard deviations for the sensor columns.

# 'datetimestamp' is already the index and is of datetime type
# df['datetimestamp'] = pd.to_datetime(df['datetimestamp']) # This line is not needed
# df.set_index('datetimestamp', inplace=True) # This line is not needed

# Extract time of day and day of week
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek

# Calculate rolling statistics for sensor columns
sensor_columns = [f'enginegastemperature{i}' for i in range(1, 7)] + ['pressure']
window_size = 24 # Example window size

for col in sensor_columns:
    df[f'{col}_rolling_mean'] = df[col].rolling(window=window_size).mean()
    df[f'{col}_rolling_std'] = df[col].rolling(window=window_size).std()

display(df.head())


Unnamed: 0_level_0,machineid,enginegastemperature1,enginegastemperature2,enginegastemperature3,enginegastemperature4,enginegastemperature5,enginegastemperature6,pressure,hour,dayofweek,...,enginegastemperature3_rolling_mean,enginegastemperature3_rolling_std,enginegastemperature4_rolling_mean,enginegastemperature4_rolling_std,enginegastemperature5_rolling_mean,enginegastemperature5_rolling_std,enginegastemperature6_rolling_mean,enginegastemperature6_rolling_std,pressure_rolling_mean,pressure_rolling_std
datetimestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-01-01 00:00:00,1,302.483571,385.408617,303.238443,307.615149,298.829233,298.829315,103.158426,0,2,...,,,,,,,,,,
2025-01-01 00:10:00,1,303.837174,300.111109,297.861035,297.340913,299.412622,301.110395,98.464047,0,2,...,,,,,,,,,,
2025-01-01 00:20:00,1,299.877724,301.777757,302.085056,304.162309,298.533004,299.850807,100.190252,0,2,...,,,,,,,,,,
2025-01-01 00:30:00,1,303.323272,303.895963,294.494511,305.651141,301.865595,298.067635,97.68246,0,2,...,,,,,,,,,,
2025-01-01 00:40:00,1,297.766293,307.621208,301.614999,293.032915,289.108329,294.780518,100.345387,0,2,...,,,,,,,,,,


**Reasoning**:
The error indicates that the 'datetimestamp' column does not exist in the DataFrame. Looking at the `df.head()` from previous steps, 'datetimestamp' is the index. I will reset the index to make 'datetimestamp' a column and then proceed with the feature engineering steps.


In [9]:
df = df.reset_index()
df['datetimestamp'] = pd.to_datetime(df['datetimestamp'])
df['time_of_day'] = df['datetimestamp'].dt.hour
df['day_of_week'] = df['datetimestamp'].dt.dayofweek

sensor_columns = [f'enginegastemperature{i}' for i in range(1, 7)] + ['pressure']
window_size = 12

for col in sensor_columns:
    df[f'{col}_rolling_mean_{window_size}'] = df[col].rolling(window=window_size).mean()
    df[f'{col}_rolling_std_{window_size}'] = df[col].rolling(window=window_size).std()

display(df.head())

Unnamed: 0,datetimestamp,machineid,enginegastemperature1,enginegastemperature2,enginegastemperature3,enginegastemperature4,enginegastemperature5,enginegastemperature6,pressure,hour,...,enginegastemperature3_rolling_mean_12,enginegastemperature3_rolling_std_12,enginegastemperature4_rolling_mean_12,enginegastemperature4_rolling_std_12,enginegastemperature5_rolling_mean_12,enginegastemperature5_rolling_std_12,enginegastemperature6_rolling_mean_12,enginegastemperature6_rolling_std_12,pressure_rolling_mean_12,pressure_rolling_std_12
0,2025-01-01 00:00:00,1,302.483571,385.408617,303.238443,307.615149,298.829233,298.829315,103.158426,0,...,,,,,,,,,,
1,2025-01-01 00:10:00,1,303.837174,300.111109,297.861035,297.340913,299.412622,301.110395,98.464047,0,...,,,,,,,,,,
2,2025-01-01 00:20:00,1,299.877724,301.777757,302.085056,304.162309,298.533004,299.850807,100.190252,0,...,,,,,,,,,,
3,2025-01-01 00:30:00,1,303.323272,303.895963,294.494511,305.651141,301.865595,298.067635,97.68246,0,...,,,,,,,,,,
4,2025-01-01 00:40:00,1,297.766293,307.621208,301.614999,293.032915,289.108329,294.780518,100.345387,0,...,,,,,,,,,,


## Consider features related to the rate of change of temperature and pressure.
Calculate the difference between consecutive readings for each sensor column and store them in new columns.

In [10]:
sensor_columns = [f'enginegastemperature{i}' for i in range(1, 7)] + ['pressure']

for col in sensor_columns:
    df[f'{col}_diff'] = df[col].diff()

display(df.head())

Unnamed: 0,datetimestamp,machineid,enginegastemperature1,enginegastemperature2,enginegastemperature3,enginegastemperature4,enginegastemperature5,enginegastemperature6,pressure,hour,...,enginegastemperature6_rolling_std_12,pressure_rolling_mean_12,pressure_rolling_std_12,enginegastemperature1_diff,enginegastemperature2_diff,enginegastemperature3_diff,enginegastemperature4_diff,enginegastemperature5_diff,enginegastemperature6_diff,pressure_diff
0,2025-01-01 00:00:00,1,302.483571,385.408617,303.238443,307.615149,298.829233,298.829315,103.158426,0,...,,,,,,,,,,
1,2025-01-01 00:10:00,1,303.837174,300.111109,297.861035,297.340913,299.412622,301.110395,98.464047,0,...,,,,1.353603,-85.297508,-5.377407,-10.274236,0.583389,2.281079,-4.694379
2,2025-01-01 00:20:00,1,299.877724,301.777757,302.085056,304.162309,298.533004,299.850807,100.190252,0,...,,,,-3.95945,1.666647,4.22402,6.821396,-0.879618,-1.259587,1.726205
3,2025-01-01 00:30:00,1,303.323272,303.895963,294.494511,305.651141,301.865595,298.067635,97.68246,0,...,,,,3.445548,2.118207,-7.590544,1.488832,3.33259,-1.783172,-2.507792
4,2025-01-01 00:40:00,1,297.766293,307.621208,301.614999,293.032915,289.108329,294.780518,100.345387,0,...,,,,-5.556979,3.725245,7.120488,-12.618226,-12.757266,-3.287117,2.662928


## Identify target variable

Define what constitutes a "temperature port failure" based on the available data and domain knowledge. This might involve identifying thresholds or specific patterns in the temperature readings.

**Reasoning**:
Based on domain knowledge of temperature sensors, a failure could be indicated by a sudden, significant deviation from the expected range or a rapid change in temperature. I will define a failure as a temperature reading that is more than 3 standard deviations away from the rolling mean of the past 12 readings.


In [11]:
# Define failure criteria
# A temperature port failure is defined as a temperature reading that is more than 3 standard deviations
# away from the rolling mean of the past 12 readings for that specific temperature sensor.

# Iterate through each temperature sensor column and apply the failure criteria
temperature_columns = [f'enginegastemperature{i}' for i in range(1, 7)]

for col in temperature_columns:
    # Calculate the difference between the current reading and the rolling mean
    deviation = df[col] - df[f'{col}_rolling_mean_{window_size}']

    # Identify potential failures based on the 3-sigma rule (3 standard deviations)
    # A failure is flagged if the absolute deviation is greater than 3 times the rolling standard deviation
    df[f'{col}_failure_flag'] = (abs(deviation) > (3 * df[f'{col}_rolling_std_{window_size}'])).astype(int)

# Display the first few rows with the new failure flags
display(df.head(window_size + 5))

Unnamed: 0,datetimestamp,machineid,enginegastemperature1,enginegastemperature2,enginegastemperature3,enginegastemperature4,enginegastemperature5,enginegastemperature6,pressure,hour,...,enginegastemperature4_diff,enginegastemperature5_diff,enginegastemperature6_diff,pressure_diff,enginegastemperature1_failure_flag,enginegastemperature2_failure_flag,enginegastemperature3_failure_flag,enginegastemperature4_failure_flag,enginegastemperature5_failure_flag,enginegastemperature6_failure_flag
0,2025-01-01 00:00:00,1,302.483571,385.408617,303.238443,307.615149,298.829233,298.829315,103.158426,0,...,,,,,0,0,0,0,0,0
1,2025-01-01 00:10:00,1,303.837174,300.111109,297.861035,297.340913,299.412622,301.110395,98.464047,0,...,-10.274236,0.583389,2.281079,-4.694379,0,0,0,0,0,0
2,2025-01-01 00:20:00,1,299.877724,301.777757,302.085056,304.162309,298.533004,299.850807,100.190252,0,...,6.821396,-0.879618,-1.259587,1.726205,0,0,0,0,0,0
3,2025-01-01 00:30:00,1,303.323272,303.895963,294.494511,305.651141,301.865595,298.067635,97.68246,0,...,1.488832,3.33259,-1.783172,-2.507792,0,0,0,0,0,0
4,2025-01-01 00:40:00,1,297.766293,307.621208,301.614999,293.032915,289.108329,294.780518,100.345387,0,...,-12.618226,-12.757266,-3.287117,2.662928,0,0,0,0,0,0
5,2025-01-01 00:50:00,1,301.620994,303.927661,313.28005,301.31743,308.208856,302.304082,100.171847,0,...,8.284515,19.100527,7.523564,-0.17354,0,0,0,0,0,0
6,2025-01-01 01:00:00,1,295.234861,294.760647,290.621616,293.166089,303.181526,295.466397,100.952085,1,...,-8.151341,-5.02733,-6.837685,0.780238,0,0,0,0,0,0
7,2025-01-01 01:10:00,1,306.518306,303.311423,301.205611,299.158975,305.823843,298.771262,98.460664,1,...,5.992885,2.642318,3.304865,-2.491421,0,0,0,0,0,0
8,2025-01-01 01:20:00,1,304.750027,308.632582,302.289425,291.578563,301.634226,299.594405,100.93559,1,...,-7.580411,-4.189617,0.823144,2.474925,0,0,0,0,0,0
9,2025-01-01 01:30:00,1,303.680612,296.925621,301.664462,293.096415,298.595969,299.700914,101.922348,1,...,1.517852,-3.038258,0.106508,0.986758,0,0,0,0,0,0


**Reasoning**:
The previous command successfully defined the failure criteria and added the failure flag columns. To verify the defined criteria, I will inspect the counts of the failure flags for each temperature sensor.

In [12]:
# Check the counts of failures for each temperature sensor
for col in temperature_columns:
    failure_counts = df[f'{col}_failure_flag'].value_counts()
    print(f'Failure flag counts for {col}:')
    print(failure_counts)
    print('\n')

# The subtask is completed, report the finish status and the dataframes.

Failure flag counts for enginegastemperature1:
enginegastemperature1_failure_flag
0    259889
1      2911
Name: count, dtype: int64


Failure flag counts for enginegastemperature2:
enginegastemperature2_failure_flag
0    259942
1      2858
Name: count, dtype: int64


Failure flag counts for enginegastemperature3:
enginegastemperature3_failure_flag
0    259959
1      2841
Name: count, dtype: int64


Failure flag counts for enginegastemperature4:
enginegastemperature4_failure_flag
0    259957
1      2843
Name: count, dtype: int64


Failure flag counts for enginegastemperature5:
enginegastemperature5_failure_flag
0    259921
1      2879
Name: count, dtype: int64


Failure flag counts for enginegastemperature6:
enginegastemperature6_failure_flag
0    260026
1      2774
Name: count, dtype: int64


