# Goal: Correlate Master Fault Data with Critical Fault Data

In [68]:
# import dependencies
import pandas as pd

In [69]:
# read in the master fault record
master_fault_df = pd.read_csv("Resources/EDPR_Master_Fault_Record.csv")
master_fault_df.head()

Unnamed: 0,TimeDetected,Turbine_ID,Remark
0,4/23/2016 19:01,T11,External power ref.:2000kW
1,5/31/2016 11:24,T01,Generator 1 in
2,8/7/2016 15:29,T01,Hot HV trafo 270°C 0kW
3,4/20/2016 8:38,T01,External power ref.:2000kW
4,4/20/2016 8:42,T06,External power ref.:2000kW


In [70]:
# describe data to determine how to clean it
print((master_fault_df).describe())

           TimeDetected Turbine_ID                      Remark
count            256390     256401                      252618
unique           241902          4                        7071
top     11/4/2016 10:16        T01  External power ref.:2000kW
freq                 13      78639                      111136


In [71]:
# drop the duplicates seen above. Now the counts match
master_fault_df.drop_duplicates('TimeDetected', inplace=True)
master_fault_df.dropna(inplace=True)
master_fault_df.describe()

Unnamed: 0,TimeDetected,Turbine_ID,Remark
count,238153,238153,238153
unique,238153,4,3429
top,4/23/2016 19:01,T01,External power ref.:2000kW
freq,1,71297,109889


In [76]:
# change the time to be datetime format
master_fault_df['TimeDetected'] = pd.to_datetime(master_fault_df['TimeDetected'], utc= True)
master_fault_df.dtypes

TimeDetected        datetime64[ns, UTC]
Turbine_ID                       object
Remark                           object
Remark_Condensed                 object
dtype: object

In [77]:
# bucket similar faults
# Extra info, Nac.vent, Rotor:,TimeSpentIn, E.Wind
rem_cond = []
for value in master_fault_df["Remark"]:
    if "Extra info" in value:
        err_num = value[16:20]
        rem_cond.append(err_num)
    elif "Nac.vent" in value:
        rem_cond.append('Nac.vent')
    elif "Rotor:" in value:
        rem_cond.append('Rotor RPM')
    elif "TimeSpentIn" in value:
        rem_cond.append('TimeSpentIn')
    elif "E.Wind" in value:
        rem_cond.append('E.Wind')
    else:
        rem_cond.append(value.split()[0])

master_fault_df["Remark_Condensed"] = rem_cond  
print(master_fault_df)

                    TimeDetected Turbine_ID                       Remark  \
0      2016-04-23 19:01:00+00:00        T11   External power ref.:2000kW   
1      2016-05-31 11:24:00+00:00        T01               Generator 1 in   
2      2016-08-07 15:29:00+00:00        T01  Hot HV trafo 270°C      0kW   
3      2016-04-20 08:38:00+00:00        T01   External power ref.:2000kW   
4      2016-04-20 08:42:00+00:00        T06   External power ref.:2000kW   
...                          ...        ...                          ...   
256392 2017-12-25 04:22:13+00:00        T01     Prepare Reconnecttimeout   
256393 2017-12-25 04:23:40+00:00        T01            Gen. int. vent. 1   
256394 2017-12-25 04:22:23+00:00        T11   External power ref.:2000kW   
256397 2017-12-25 04:35:07+00:00        T06   External power ref.:2000kW   
256400 2017-12-25 05:04:54+00:00        T01              Generator 0 out   

       Remark_Condensed  
0              External  
1             Generator  
2        

In [78]:
# Confirm reduction of unique values
master_fault_df['Remark_Condensed'].nunique()

161

In [79]:
# Review which need to be further cleaned: 
cond_uniq_list = master_fault_df['Remark_Condensed'].unique()
for i in cond_uniq_list:
    print(i)

External
Generator
Hot
Nac.vent
Yaw
Running
Yawcontr.
GearoilCooler
Accumulator
HV
276 
Gen.
Stop
Main
Hub
Gen
Feedback=0
Heating
Run
New
Too
IGBT
900 
User
Key
Pause
YawSpeedFault:
Ch
309 
Nacelle
149 
275 
Dip
B
EMF
Extreme
Power
Yawing
Start
Emergency
Low
Pitch
Feedback
High
Frequency
Event
356 
ExEx
EmcPitchAvel:
151 
155 
Nac
Error
Slip:
 83 
168 
Parameter
176 
Ambient
324 
Feedback=1
EMCV.
Gearoilheater
Trip
Circuit
A
148 
WS1
SignalError.
326 
170 
171 
Rotor RPM
Accu
336 
338 
158 
929 
Thermoerr.
 85 
 81 
No
315 
Supply
Thermo
Grid
202 
WATCHDOG
313 
Q8
C
RT
Q7
OVP
192 
Internal
353 
Shutdown
Peak
899 
Remote
Encoder
190 
182 
Ext.
160 
Write
TimeSpentIn
Pitch:
E.Wind
Wind:
Prepare
Production
Tow.
169 
144 
Load/PowerMode
891 
Session
NacReset
Hydr
CableTwistCodeShift:New
273 
Tower
KeySwitchCommRights
217 
Thermoerror
CableUntwistPerf:CableTw
CableUntwistCCW:CableTw
Puls
 86 
 37 
OVPHwErr
Extr.
CableTwistReset:Err
Inv.
186 
Cable
 87 
216 
Cannot
Oil
181 
Press.drop
328 
S

In [80]:
# store long remarks in new dataframe for re-join later if needed
long_remark_df = master_fault_df['Remark']
long_remark_df

0          External power ref.:2000kW
1                      Generator 1 in
2         Hot HV trafo 270°C      0kW
3          External power ref.:2000kW
4          External power ref.:2000kW
                     ...             
256392       Prepare Reconnecttimeout
256393              Gen. int. vent. 1
256394     External power ref.:2000kW
256397     External power ref.:2000kW
256400                Generator 0 out
Name: Remark, Length: 238153, dtype: object

In [81]:
# Drop the long remarks from master df
master_fault_df.drop(['Remark'], axis = 1, inplace = True)
master_fault_df.head()

Unnamed: 0,TimeDetected,Turbine_ID,Remark_Condensed
0,2016-04-23 19:01:00+00:00,T11,External
1,2016-05-31 11:24:00+00:00,T01,Generator
2,2016-08-07 15:29:00+00:00,T01,Hot
3,2016-04-20 08:38:00+00:00,T01,External
4,2016-04-20 08:42:00+00:00,T06,External


In [82]:
# Read in all significant failure data
sig_fault_df = pd.read_csv("Resources/EDPR_Significant_Fault.csv")
sig_fault_df.columns = ["Turbine_ID", "Failure_group", "timestamp", "description"]
t11_sig_failures_df = sig_fault_df[sig_fault_df['Turbine_ID'] == 'T11']

In [240]:
sig_fault_df.head(21)

Unnamed: 0,Turbine_ID,Failure_group,timestamp,description
19,T11,GENERATOR,2016-03-03 19:00:00+00:00,Electric circuit error in generator
15,T06,HYDRAULIC_GROUP,2016-04-04 18:53:00+00:00,Error in pitch regulation
16,T07,GENERATOR_BEARING,2016-04-30 12:40:00+00:00,High temperature in generator bearing (replace...
17,T07,TRANSFORMER,2016-07-10 03:46:00+00:00,High temperature transformer
10,T06,GENERATOR,2016-07-11 19:48:00+00:00,Generator replaced
9,T01,GEARBOX,2016-07-18 02:10:00+00:00,Gearbox pump damaged
11,T06,GENERATOR,2016-07-24 17:01:00+00:00,Generator temperature sensor failure
18,T07,TRANSFORMER,2016-08-23 02:21:00+00:00,High temperature transformer. Transformer refr...
12,T06,GENERATOR,2016-09-04 08:08:00+00:00,High temperature generator error
14,T06,GENERATOR,2016-10-02 17:08:00+00:00,Refrigeration system and temperature sensors i...


In [177]:
t11_sig_fault_sorted_df = t11_sig_failures_df.sort_values(by="timestamp")
t11_sig_fault_sorted_df

Unnamed: 0,Turbine_ID,Failure_group,timestamp,description
19,T11,GENERATOR,2016-03-03T19:00:00+00:00,Electric circuit error in generator
20,T11,HYDRAULIC_GROUP,2016-10-17T17:44:00+00:00,Hydraulic group error in the brake circuit
7,T11,HYDRAULIC_GROUP,2017-04-26T18:06:00+00:00,Hydraulic group error in the brake circuit
8,T11,HYDRAULIC_GROUP,2017-09-12T15:30:00+00:00,Hydraulic group error in the brake circuit


In [133]:
# Convert time to datetime format
sig_fault_df['timestamp'] = pd.to_datetime(sig_fault_df['timestamp'], utc=True)
sig_fault_df.dtypes

Turbine_ID                    object
Failure_group                 object
timestamp        datetime64[ns, UTC]
description                   object
dtype: object

In [168]:
# Sort by timestamp 
sig_fault_df = sig_fault_df.sort_values(by="timestamp")
sig_fault_df.head(21)

Unnamed: 0,Turbine_ID,Failure_group,timestamp,description
19,T11,GENERATOR,2016-03-03 19:00:00+00:00,Electric circuit error in generator
15,T06,HYDRAULIC_GROUP,2016-04-04 18:53:00+00:00,Error in pitch regulation
16,T07,GENERATOR_BEARING,2016-04-30 12:40:00+00:00,High temperature in generator bearing (replace...
17,T07,TRANSFORMER,2016-07-10 03:46:00+00:00,High temperature transformer
10,T06,GENERATOR,2016-07-11 19:48:00+00:00,Generator replaced
9,T01,GEARBOX,2016-07-18 02:10:00+00:00,Gearbox pump damaged
11,T06,GENERATOR,2016-07-24 17:01:00+00:00,Generator temperature sensor failure
18,T07,TRANSFORMER,2016-08-23 02:21:00+00:00,High temperature transformer. Transformer refr...
12,T06,GENERATOR,2016-09-04 08:08:00+00:00,High temperature generator error
14,T06,GENERATOR,2016-10-02 17:08:00+00:00,Refrigeration system and temperature sensors i...


<class 'pandas._libs.tslibs.timestamps.Timestamp'>

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [136]:
# Create the dummy frame
fault_dummies_df = pd.get_dummies(master_fault_df, columns = ['Remark_Condensed'], prefix_sep="")
fault_dummies_df.head(20)

Unnamed: 0,TimeDetected,Turbine_ID,Remark_Condensed 37,Remark_Condensed 81,Remark_Condensed 82,Remark_Condensed 83,Remark_Condensed 85,Remark_Condensed 86,Remark_Condensed 87,Remark_Condensed102,...,Remark_CondensedUser,Remark_CondensedWATCHDOG,Remark_CondensedWS1,Remark_CondensedWind:,Remark_CondensedWrite,Remark_CondensedYaw,Remark_CondensedYawSpeedFault:,Remark_CondensedYawUntwistCCW:,Remark_CondensedYawcontr.,Remark_CondensedYawing
0,2016-04-23 19:01:00+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2016-05-31 11:24:00+00:00,T01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2016-08-07 15:29:00+00:00,T01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2016-04-20 08:38:00+00:00,T01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2016-04-20 08:42:00+00:00,T06,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2016-06-03 19:08:00+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2016-04-20 09:01:00+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2016-06-13 10:56:00+00:00,T01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2016-04-20 09:12:00+00:00,T06,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2016-04-15 16:29:00+00:00,T07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
#Create the dummies table for T11
t11_fault_dummies_df = fault_dummies_df[fault_dummies_df['Turbine_ID'] == 'T11']

# clean up duplicates
t11_fault_dummies_df.drop_duplicates(subset=['TimeDetected'], inplace=True)

# Sort and print to confirm sort
sorted_t11_fault_dummies_df = t11_fault_dummies_df.sort_values(by=["TimeDetected"])
sorted_t11_fault_dummies_df.head(100)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,TimeDetected,Turbine_ID,Remark_Condensed 37,Remark_Condensed 81,Remark_Condensed 82,Remark_Condensed 83,Remark_Condensed 85,Remark_Condensed 86,Remark_Condensed 87,Remark_Condensed102,...,Remark_CondensedUser,Remark_CondensedWATCHDOG,Remark_CondensedWS1,Remark_CondensedWind:,Remark_CondensedWrite,Remark_CondensedYaw,Remark_CondensedYawSpeedFault:,Remark_CondensedYawUntwistCCW:,Remark_CondensedYawcontr.,Remark_CondensedYawing
68215,2016-01-01 00:02:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40222,2016-01-01 00:32:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47031,2016-01-01 01:02:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4748,2016-01-01 01:32:00+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40227,2016-01-01 02:02:17+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46997,2016-01-02 12:30:29+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33909,2016-01-02 12:38:22+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4699,2016-01-02 12:39:00+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64724,2016-01-02 12:39:59+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
#empty columns exist
pd.set_option('display.max_rows', 500)
sorted_t11_fault_dummies_df.sum()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



Turbine_ID                                     T11T11T11T11T11T11T11T11T11T11T11T11T11T11T11T...
Remark_Condensed 37                                                                            1
Remark_Condensed 81                                                                            7
Remark_Condensed 82                                                                            1
Remark_Condensed 83                                                                           10
Remark_Condensed 85                                                                            1
Remark_Condensed 86                                                                            4
Remark_Condensed 87                                                                            1
Remark_Condensed102                                                                            0
Remark_Condensed144                                                                            0
Remark_Condensed148           

In [110]:
# drop empty columns
cleaned_t11_master_fault_df = sorted_t11_fault_dummies_df.loc[(sorted_t11_fault_dummies_df!=0).any(1), (sorted_t11_fault_dummies_df!=0).any(0)]
cleaned_t11_master_fault_df.sum()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



Turbine_ID                        T11T11T11T11T11T11T11T11T11T11T11T11T11T11T11T...
Remark_Condensed 37                                                               1
Remark_Condensed 81                                                               7
Remark_Condensed 82                                                               1
Remark_Condensed 83                                                              10
Remark_Condensed 85                                                               1
Remark_Condensed 86                                                               4
Remark_Condensed 87                                                               1
Remark_Condensed160                                                               1
Remark_Condensed168                                                               2
Remark_Condensed171                                                               1
Remark_Condensed176                                                         

# Need to transpose data into new columns (get dummies?). Also need to choose a turbine number to work with

In [230]:
import plotly.express as px

# Visualize the data with one variable to check for accuracy

fig = px.line(cleaned_t11_master_fault_df.iloc[::100, :], x='TimeDetected', y="Remark_CondensedNac.vent")
for time in t11_sig_failures_df['timestamp']: 
    fig.add_vline(x=time)

fig.show()


In [114]:
# Binning

import numpy as np
cleaned_t11_master_fault_df['Time Bin'] = pd.cut(cleaned_t11_master_fault_df.TimeDetected, bins=29, labels=np.arange(0,29))
cleaned_t11_master_fault_df.head(29)

Unnamed: 0,TimeDetected,Turbine_ID,Remark_Condensed 37,Remark_Condensed 81,Remark_Condensed 82,Remark_Condensed 83,Remark_Condensed 85,Remark_Condensed 86,Remark_Condensed 87,Remark_Condensed160,...,Remark_CondensedToo,Remark_CondensedTrip,Remark_CondensedUser,Remark_CondensedWATCHDOG,Remark_CondensedWS1,Remark_CondensedYaw,Remark_CondensedYawSpeedFault:,Remark_CondensedYawcontr.,Remark_CondensedYawing,Time Bin
68215,2016-01-01 00:02:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40222,2016-01-01 00:32:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47031,2016-01-01 01:02:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4748,2016-01-01 01:32:00+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40227,2016-01-01 02:02:17+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40228,2016-01-01 02:05:36+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82087,2016-01-01 02:43:41+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47040,2016-01-01 03:28:25+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43485,2016-01-01 03:28:41+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97453,2016-01-01 03:58:41+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [115]:
cleaned_t11_master_fault_df[cleaned_t11_master_fault_df["Time Bin"] == 0].head(25)

Unnamed: 0,TimeDetected,Turbine_ID,Remark_Condensed 37,Remark_Condensed 81,Remark_Condensed 82,Remark_Condensed 83,Remark_Condensed 85,Remark_Condensed 86,Remark_Condensed 87,Remark_Condensed160,...,Remark_CondensedToo,Remark_CondensedTrip,Remark_CondensedUser,Remark_CondensedWATCHDOG,Remark_CondensedWS1,Remark_CondensedYaw,Remark_CondensedYawSpeedFault:,Remark_CondensedYawcontr.,Remark_CondensedYawing,Time Bin
68215,2016-01-01 00:02:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40222,2016-01-01 00:32:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47031,2016-01-01 01:02:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4748,2016-01-01 01:32:00+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40227,2016-01-01 02:02:17+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40228,2016-01-01 02:05:36+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82087,2016-01-01 02:43:41+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47040,2016-01-01 03:28:25+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43485,2016-01-01 03:28:41+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97453,2016-01-01 03:58:41+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [116]:
# Confirming number of bins
cleaned_t11_master_fault_df["Time Bin"].unique()

[0, 1, 2, 3, 4, ..., 24, 25, 26, 27, 28]
Length: 29
Categories (29, int64): [0 < 1 < 2 < 3 ... 25 < 26 < 27 < 28]

In [181]:
print(t11_sig_fault_sorted_df)
failure_dates = t11_sig_fault_sorted_df.timestamp
print(failure_dates)

   Turbine_ID    Failure_group                  timestamp  \
19        T11        GENERATOR  2016-03-03T19:00:00+00:00   
20        T11  HYDRAULIC_GROUP  2016-10-17T17:44:00+00:00   
7         T11  HYDRAULIC_GROUP  2017-04-26T18:06:00+00:00   
8         T11  HYDRAULIC_GROUP  2017-09-12T15:30:00+00:00   

                                   description  
19         Electric circuit error in generator  
20  Hydraulic group error in the brake circuit  
7   Hydraulic group error in the brake circuit  
8   Hydraulic group error in the brake circuit  
19    2016-03-03T19:00:00+00:00
20    2016-10-17T17:44:00+00:00
7     2017-04-26T18:06:00+00:00
8     2017-09-12T15:30:00+00:00
Name: timestamp, dtype: object


In [193]:
# Check for failures in current bin 
failure_in_bin = {}
failure_in_next_bin = {}
failure_dates = t11_sig_fault_sorted_df.timestamp
print(failure_dates)
# if start <= date <= end:
#     print("in between")
# else:
#     print("No!")

# time_bin = cleaned_t11_master_fault_df[cleaned_t11_master_fault_df["Time Bin"] == bin]

# start = time_bin.TimeDetected.iloc[1]
# end = time_bin.TimeDetected.iloc[-1]

for bin in cleaned_t11_master_fault_df["Time Bin"].unique():

    time_bin = cleaned_t11_master_fault_df[cleaned_t11_master_fault_df["Time Bin"] == bin]
    
    start = time_bin.TimeDetected.iloc[1]
    end = time_bin.TimeDetected.iloc[-1]
    
    for date in failure_dates:
        dt_date = pd.to_datetime(date)
        if start <= dt_date <= end:
            failure_in_bin[bin] = 1
            print("failure observed in bin: " + str(bin))
            break
        else:
            failure_in_bin[bin] = 0

19    2016-03-03T19:00:00+00:00
20    2016-10-17T17:44:00+00:00
7     2017-04-26T18:06:00+00:00
8     2017-09-12T15:30:00+00:00
Name: timestamp, dtype: object
failure observed in bin: 2
failure observed in bin: 11
failure observed in bin: 19
failure observed in bin: 24


In [194]:
failure_in_bin

{0: 0,
 1: 0,
 2: 1,
 3: 0,
 4: 0,
 5: 0,
 6: 0,
 7: 0,
 8: 0,
 9: 0,
 10: 0,
 11: 1,
 12: 0,
 13: 0,
 14: 0,
 15: 0,
 16: 0,
 17: 0,
 18: 0,
 19: 1,
 20: 0,
 21: 0,
 22: 0,
 23: 0,
 24: 1,
 25: 0,
 26: 0,
 27: 0,
 28: 0}

In [195]:
# Build failure in Next Bin by shifting failure in bin up one. 
failure_in_next_bin = np.int_(pd.Series(failure_in_bin).shift(-1).fillna(0))
failure_in_next_bin = dict(zip(failure_in_bin.keys(), failure_in_next_bin))

In [196]:
failure_in_next_bin

{0: 0,
 1: 1,
 2: 0,
 3: 0,
 4: 0,
 5: 0,
 6: 0,
 7: 0,
 8: 0,
 9: 0,
 10: 1,
 11: 0,
 12: 0,
 13: 0,
 14: 0,
 15: 0,
 16: 0,
 17: 0,
 18: 1,
 19: 0,
 20: 0,
 21: 0,
 22: 0,
 23: 1,
 24: 0,
 25: 0,
 26: 0,
 27: 0,
 28: 0}

In [198]:
# Add failure in bin identifier to turbine dataframe
cleaned_t11_master_fault_df["failure_in_bin"] = cleaned_t11_master_fault_df["Time Bin"].apply(lambda x: failure_in_bin[x])

In [199]:
# Add failure in NEXT bin identifier to turbine dataframe
cleaned_t11_master_fault_df["failure_in_next_bin"] = cleaned_t11_master_fault_df["Time Bin"].apply(lambda x: failure_in_next_bin[x])

In [200]:
# Check that our lambda function translated from the dictionary effectively
cleaned_t11_master_fault_df['failure_in_bin'].value_counts()

0    50174
1     8531
Name: failure_in_bin, dtype: int64

In [201]:
cleaned_t11_master_fault_df['failure_in_next_bin'].value_counts()

0    50217
1     8488
Name: failure_in_next_bin, dtype: int64

In [241]:
# Size of buckets can differ dramatically... need to know why, as questioned earlier... 
cleaned_t11_master_fault_df["Time Bin"].value_counts()

22    3010
24    2572
21    2541
10    2502
7     2444
23    2440
8     2211
20    2203
25    2148
26    2134
6     2127
11    2124
9     2099
5     2090
19    1976
27    1958
2     1859
18    1840
28    1806
4     1780
17    1709
1     1706
3     1691
13    1680
16    1659
15    1649
12    1640
14    1563
0     1544
Name: Time Bin, dtype: int64

# Need to transpose the sig faults df

In [36]:
import plotly.express as px

# Visualize the data with bins and failure points

fig = px.line(t11_df, x='times_stamp', y="Gen_Bear2_Temp_Avg")
for time in t11_failures_df['timestamp']: 
    fig.add_vline(x=time, line_color='red')

for bin in t11_df["Time Bin"].unique():

    time_bin = t11_df[t11_df["Time Bin"] == bin]
    
    start = time_bin.times_stamp.iloc[1]
    end = time_bin.times_stamp.iloc[-1]

    fig.add_vrect(x0=start, x1=end)


fig.show()


In [202]:
cleaned_t11_master_fault_df.head()

Unnamed: 0,TimeDetected,Turbine_ID,Remark_Condensed 37,Remark_Condensed 81,Remark_Condensed 82,Remark_Condensed 83,Remark_Condensed 85,Remark_Condensed 86,Remark_Condensed 87,Remark_Condensed160,...,Remark_CondensedUser,Remark_CondensedWATCHDOG,Remark_CondensedWS1,Remark_CondensedYaw,Remark_CondensedYawSpeedFault:,Remark_CondensedYawcontr.,Remark_CondensedYawing,Time Bin,failure_in_bin,failure_in_next_bin
68215,2016-01-01 00:02:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40222,2016-01-01 00:32:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47031,2016-01-01 01:02:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4748,2016-01-01 01:32:00+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40227,2016-01-01 02:02:17+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


SUPERVISED LEARNING TEST


In [38]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [203]:
# Starting with failure_in_next_bin, as we want a predictive model. 

# Create target
y = cleaned_t11_master_fault_df['failure_in_next_bin']

# Create features
X = cleaned_t11_master_fault_df.drop(columns=["Turbine_ID", "TimeDetected", "Time Bin", "failure_in_bin", "failure_in_next_bin"])

In [204]:
X.head()

Unnamed: 0,Remark_Condensed 37,Remark_Condensed 81,Remark_Condensed 82,Remark_Condensed 83,Remark_Condensed 85,Remark_Condensed 86,Remark_Condensed 87,Remark_Condensed160,Remark_Condensed168,Remark_Condensed171,...,Remark_CondensedThermoerror,Remark_CondensedToo,Remark_CondensedTrip,Remark_CondensedUser,Remark_CondensedWATCHDOG,Remark_CondensedWS1,Remark_CondensedYaw,Remark_CondensedYawSpeedFault:,Remark_CondensedYawcontr.,Remark_CondensedYawing
68215,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40222,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47031,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [205]:
X.describe()

Unnamed: 0,Remark_Condensed 37,Remark_Condensed 81,Remark_Condensed 82,Remark_Condensed 83,Remark_Condensed 85,Remark_Condensed 86,Remark_Condensed 87,Remark_Condensed160,Remark_Condensed168,Remark_Condensed171,...,Remark_CondensedThermoerror,Remark_CondensedToo,Remark_CondensedTrip,Remark_CondensedUser,Remark_CondensedWATCHDOG,Remark_CondensedWS1,Remark_CondensedYaw,Remark_CondensedYawSpeedFault:,Remark_CondensedYawcontr.,Remark_CondensedYawing
count,58705.0,58705.0,58705.0,58705.0,58705.0,58705.0,58705.0,58705.0,58705.0,58705.0,...,58705.0,58705.0,58705.0,58705.0,58705.0,58705.0,58705.0,58705.0,58705.0,58705.0
mean,1.7e-05,0.000119,1.7e-05,0.00017,1.7e-05,6.8e-05,1.7e-05,1.7e-05,3.4e-05,1.7e-05,...,1.7e-05,0.000256,3.4e-05,0.000767,8.5e-05,8.5e-05,0.029384,1.7e-05,0.018039,0.001226
std,0.004127,0.010919,0.004127,0.013051,0.004127,0.008254,0.004127,0.004127,0.005837,0.004127,...,0.004127,0.015983,0.005837,0.027676,0.009229,0.009229,0.168882,0.004127,0.133095,0.035
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [206]:
#Check balance of target values
y.value_counts()

0    50217
1     8488
Name: failure_in_next_bin, dtype: int64

In [207]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [208]:
y_train.value_counts()

0    37662
1     6366
Name: failure_in_next_bin, dtype: int64

In [209]:
# Scaling the data to assist the algo
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [210]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

clf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
clf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [211]:
# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5279349561270654

In [212]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[9245, 3310],
       [1444,  678]], dtype=int64)

In [213]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.86      0.74      0.32      0.80      0.49      0.25     12555
          1       0.17      0.32      0.74      0.22      0.49      0.23      2122

avg / total       0.76      0.68      0.38      0.71      0.49      0.24     14677



In [214]:
# List the features sorted in descending order by feature importance
importances = clf.feature_importances_
sorted(zip(clf.feature_importances_, X_train.columns), reverse=True)

[(0.14870252832718212, 'Remark_CondensedNac.vent'),
 (0.08071264352053666, 'Remark_CondensedYaw'),
 (0.06525734422389032, 'Remark_CondensedGearoilCooler'),
 (0.057657049170235106, 'Remark_CondensedYawing'),
 (0.032266787192323745, 'Remark_CondensedGen'),
 (0.030695101015269722, 'Remark_CondensedStart'),
 (0.029184534524490734, 'Remark_CondensedPause'),
 (0.02584666799256853, 'Remark_CondensedNew'),
 (0.022439101017781855, 'Remark_CondensedEmergency'),
 (0.022428624268018806, 'Remark_CondensedExternal'),
 (0.01853165046513347, 'Remark_CondensedMain'),
 (0.016523168634531352, 'Remark_CondensedStop'),
 (0.016478878410975492, 'Remark_CondensedPower'),
 (0.016214651137493298, 'Remark_CondensedGen.'),
 (0.016207974455710608, 'Remark_CondensedFeedback'),
 (0.013887919666493729, 'Remark_CondensedSlip:'),
 (0.013795055990478873, 'Remark_CondensedKey'),
 (0.013143029671228643, 'Remark_CondensedGenerator'),
 (0.012813297265610866, 'Remark_CondensedPitch'),
 (0.01279391498674932, 'Remark_Condensed

In [216]:
testing_full_df = cleaned_t11_master_fault_df.drop(columns=["Turbine_ID", "TimeDetected", "Time Bin", "failure_in_bin", "failure_in_next_bin"])
testing_full_df = X_scaler.transform(testing_full_df)

In [217]:
full_prediction = clf.predict(testing_full_df)

In [218]:
len(full_prediction)

58705

In [219]:
full_prediction_df = cleaned_t11_master_fault_df.copy()
full_prediction_df['prediction'] = full_prediction


In [220]:
full_prediction_df.head()

Unnamed: 0,TimeDetected,Turbine_ID,Remark_Condensed 37,Remark_Condensed 81,Remark_Condensed 82,Remark_Condensed 83,Remark_Condensed 85,Remark_Condensed 86,Remark_Condensed 87,Remark_Condensed160,...,Remark_CondensedWATCHDOG,Remark_CondensedWS1,Remark_CondensedYaw,Remark_CondensedYawSpeedFault:,Remark_CondensedYawcontr.,Remark_CondensedYawing,Time Bin,failure_in_bin,failure_in_next_bin,prediction
68215,2016-01-01 00:02:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40222,2016-01-01 00:32:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47031,2016-01-01 01:02:18+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4748,2016-01-01 01:32:00+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40227,2016-01-01 02:02:17+00:00,T11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [222]:
predicted_dates = full_prediction_df[full_prediction_df["prediction"] == 1]
predicted_dates = predicted_dates[['TimeDetected', 'prediction']]
predicted_dates.head()

Unnamed: 0,TimeDetected,prediction
82057,2016-01-01 16:05:02+00:00,1
47006,2016-01-02 03:38:02+00:00,1
40215,2016-01-02 05:16:55+00:00,1
97457,2016-01-02 06:05:38+00:00,1
7708,2016-01-02 06:12:00+00:00,1


In [223]:
len(predicted_dates)
sampled_predicted_dates = predicted_dates.sample(n=128)

In [237]:
# Visualize the data with bins and failure points

fig = px.line(cleaned_t11_master_fault_df.iloc[::250, :], x='TimeDetected', y="Remark_CondensedExternal")
for time in t11_sig_fault_sorted_df['timestamp']: 
    fig.add_vline(x=time, line_color='red')

for bin in cleaned_t11_master_fault_df["Time Bin"].unique():

    time_bin = cleaned_t11_master_fault_df[cleaned_t11_master_fault_df["Time Bin"] == bin]
    
    start = time_bin.TimeDetected.iloc[1]
    end = time_bin.TimeDetected.iloc[-1]

    fig.add_vrect(x0=start, x1=end)


for time in sampled_predicted_dates['TimeDetected']:
    fig.add_vline(x=time, line_color='yellow')

fig.show()