In [1]:
import pandas as pd
import psycopg2 as psy
import plotly.express as px
import numpy as np

In [2]:
# Pull the full dataframe from the AWS RDS server. 
main_df = pd.read_sql("main", con= "postgresql://postgres:password@wind-turbine-analysis.chv2nnusygyy.us-west-1.rds.amazonaws.com:5432/wind_turbine_analysis")
main_df.head()

Unnamed: 0,index,time_stamp,turbine_id,amb_temp_avg,amb_winddir_abs_avg,amb_winddir_relative_avg,amb_windspeed_avg,blds_pitchangle_avg,cont_hub_temp_avg,cont_top_temp_avg,...,hvtrafo_phase1_temp_avg,hvtrafo_phase2_temp_avg,hvtrafo_phase3_temp_avg,hyd_oil_temp_avg,nac_direction_avg,nac_temp_avg,rtr_rpm_avg,spin_temp_avg,suspect,wind_bucket
0,57456,2016-07-16 20:50:00,T11,26,85.9,-3.2,13.2,6.9,34,55,...,97,113,104,50,89.1,38,14.9,28,0.0,13
1,57457,2016-07-16 20:50:00,T01,26,103.5,-3.5,19.9,17.8,34,52,...,90,112,121,52,107.0,42,14.9,27,0.0,20
2,57458,2016-07-16 20:50:00,T07,25,98.3,0.5,19.6,84.6,33,56,...,80,93,101,46,97.7,40,0.2,26,1.0,20
3,57459,2016-07-16 21:00:00,T11,26,83.0,-6.1,12.0,4.4,34,55,...,98,113,104,47,89.1,38,14.9,28,0.0,12
4,57460,2016-07-16 21:00:00,T01,26,108.5,1.5,19.5,17.5,34,52,...,91,112,121,46,107.0,42,14.9,27,0.0,20


In [3]:
# clean incoming dataset, this can be resolved at the database level eventually. 
main_df.drop(columns=["index", "suspect"], inplace=True)

main_df['time_stamp'] = pd.to_datetime(main_df['time_stamp'], utc=True)
main_df.dtypes

time_stamp                     datetime64[ns, UTC]
turbine_id                                  object
amb_temp_avg                                 int64
amb_winddir_abs_avg                        float64
amb_winddir_relative_avg                   float64
amb_windspeed_avg                          float64
blds_pitchangle_avg                        float64
cont_hub_temp_avg                            int64
cont_top_temp_avg                            int64
cont_vcp_chokcoiltemp_avg                    int64
cont_vcp_temp_avg                            int64
cont_vcp_wtrtemp_avg                         int64
gear_bear_temp_avg                           int64
gear_oil_temp_avg                            int64
gen_bear2_temp_avg                           int64
gen_bear_temp_avg                            int64
gen_phase1_temp_avg                          int64
gen_phase2_temp_avg                          int64
gen_phase3_temp_avg                          int64
gen_rpm_avg                    

In [4]:
main_df["turbine_id"].unique()

array(['T11', 'T01', 'T07', 'T06'], dtype=object)

In [5]:
# Read in each turbines data
turbine_dataframes = {}

for turbine in main_df["turbine_id"].unique():

    turbine_dataframes[turbine] = main_df[main_df["turbine_id"] == turbine].drop_duplicates("time_stamp")

In [6]:
turbine_dataframes["T01"]

Unnamed: 0,time_stamp,turbine_id,amb_temp_avg,amb_winddir_abs_avg,amb_winddir_relative_avg,amb_windspeed_avg,blds_pitchangle_avg,cont_hub_temp_avg,cont_top_temp_avg,cont_vcp_chokcoiltemp_avg,...,grd_rtrinvphase3_temp_avg,hvtrafo_phase1_temp_avg,hvtrafo_phase2_temp_avg,hvtrafo_phase3_temp_avg,hyd_oil_temp_avg,nac_direction_avg,nac_temp_avg,rtr_rpm_avg,spin_temp_avg,wind_bucket
1,2016-07-16 20:50:00+00:00,T01,26,103.5,-3.5,19.9,17.8,34,52,128,...,46,90,112,121,52,107.0,42,14.9,27,20
4,2016-07-16 21:00:00+00:00,T01,26,108.5,1.5,19.5,17.5,34,52,128,...,46,91,112,121,46,107.0,42,14.9,27,20
8,2016-07-16 21:10:00+00:00,T01,26,114.5,7.5,18.6,16.3,34,52,128,...,46,91,112,121,46,107.0,42,14.9,27,19
11,2016-07-16 21:20:00+00:00,T01,25,96.5,-10.5,18.7,16.3,34,52,128,...,46,91,112,122,48,107.0,42,14.9,26,19
13,2016-07-16 21:30:00+00:00,T01,25,113.5,6.5,17.0,13.5,34,52,128,...,45,91,113,122,49,107.0,41,14.9,26,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227863,2016-07-16 20:00:00+00:00,T01,26,111.9,4.9,20.5,18.2,35,52,127,...,46,90,110,119,54,107.0,42,14.9,27,21
227866,2016-07-16 20:10:00+00:00,T01,26,110.0,2.9,21.1,19.6,35,52,128,...,46,90,110,119,54,107.0,42,14.9,27,21
227869,2016-07-16 20:20:00+00:00,T01,26,100.5,-6.5,19.3,17.2,35,52,128,...,46,90,111,120,54,107.0,42,14.9,27,19
227872,2016-07-16 20:30:00+00:00,T01,26,94.5,-12.5,19.2,17.0,34,52,128,...,46,90,111,120,55,107.0,42,14.9,27,19


In [7]:
# Read in the failure data for each turbine 
failures_df = pd.read_sql("major_faults", con= "postgresql://postgres:password@wind-turbine-analysis.chv2nnusygyy.us-west-1.rds.amazonaws.com:5432/wind_turbine_analysis")

turbine_failures = {}

for turbine in failures_df["turbine_id"]:

    current_failure = failures_df[failures_df["turbine_id"] == turbine]

    current_failure['time_stamp'] = pd.to_datetime(current_failure['time_stamp'], utc=True)
    current_failure.sort_values(by="time_stamp", inplace=True)
    current_failure.drop(['index'], axis=1, inplace=True)

    turbine_failures[turbine] = current_failure

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [8]:
# Let's verify
turbine_failures['T06'].dtypes

time_stamp     datetime64[ns, UTC]
turbine_id                  object
fault                       object
description                 object
dtype: object

In [9]:
turbine_dataframes['T06'].dtypes

time_stamp                     datetime64[ns, UTC]
turbine_id                                  object
amb_temp_avg                                 int64
amb_winddir_abs_avg                        float64
amb_winddir_relative_avg                   float64
amb_windspeed_avg                          float64
blds_pitchangle_avg                        float64
cont_hub_temp_avg                            int64
cont_top_temp_avg                            int64
cont_vcp_chokcoiltemp_avg                    int64
cont_vcp_temp_avg                            int64
cont_vcp_wtrtemp_avg                         int64
gear_bear_temp_avg                           int64
gear_oil_temp_avg                            int64
gen_bear2_temp_avg                           int64
gen_bear_temp_avg                            int64
gen_phase1_temp_avg                          int64
gen_phase2_temp_avg                          int64
gen_phase3_temp_avg                          int64
gen_rpm_avg                    

ADD TIME_BIN AND FAILURE_IN_NEXT_BIN TO EACH DATAFRAME

In [10]:
# Loop through dataframes in turbine dataframes
for turbine in turbine_dataframes:

    # Add time_bin and failure_in_next_bin to each dataframe by checking for failures in current bin

    failure_dates = turbine_failures[turbine]['time_stamp']
    df = turbine_dataframes[turbine]

    df["time_bin"] = pd.cut(df.time_stamp, bins=48, labels=np.arange(0,48))

    failure_in_bin = {}
    failure_in_next_bin = {}

    for bin in df["time_bin"].unique():

        time_bin = df[df["time_bin"] == bin]
        
        start = time_bin.time_stamp.iloc[0]
        end = time_bin.time_stamp.iloc[-1]

        for date in failure_dates:
            if start <= date <= end:
                failure_in_bin[bin] = 1
                break
            else:
                failure_in_bin[bin] = 0

    # Build failure in Next Bin by shifting failure in bin up one. 
    failure_in_next_bin = np.int_(pd.Series(failure_in_bin).shift(-1).fillna(0))
    failure_in_next_bin = dict(zip(failure_in_bin.keys(), failure_in_next_bin))

    # Add failure in NEXT bin identifier to turbine dataframe
    df["failure_in_next_bin"] = df["time_bin"].apply(lambda x: failure_in_next_bin[x])


In [11]:
# Let's verify
turbine_dataframes['T06'].time_bin

1493      13
1500      13
1506      13
1510      13
1514      13
          ..
226260    12
226264    12
226269    12
226273    12
226276    12
Name: time_bin, Length: 55331, dtype: category
Categories (48, int64): [0 < 1 < 2 < 3 ... 44 < 45 < 46 < 47]

APPLY A GIVEN DATAFRAME TO THE MODEL, AND APPLY TO FULL DATAFRAME, VISUALIZE RESULTS

In [12]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier

In [13]:
# What key indicators do we want to record from each test? Balanced Acc Score, F1? Save the BRFCs. Also feature importances
balanced_accuracy_scores = {}
classification_reports = {}
BRFCs = {}
feature_importances = {}
confusion_matrices = {}

# Cycle through dataframes
for turbine in turbine_dataframes:
    # Choose the dataframe to examine
    selected_df = turbine_dataframes[turbine]
    # Create target
    y = selected_df['failure_in_next_bin']

    # Create features
    X = selected_df.drop(columns=["turbine_id", "time_stamp", "time_bin", "failure_in_next_bin"])

    # Scaling the data to assist the algo
    # Create the StandardScaler instance
    scaler = StandardScaler()

    # Fit the Standard Scaler with the training data
    X_scaler = scaler.fit(X)

    # Scale the training and testing data
    X_scaled = X_scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=1, stratify=y)

    # We're going to test both 100 and 800
    clf = BalancedRandomForestClassifier(n_estimators=800, random_state=1)
    clf.fit(X_train, y_train)

    # Calculated the balanced accuracy score
    y_pred = clf.predict(X_test)

    BRFCs[turbine] = clf
    balanced_accuracy_scores[turbine] = balanced_accuracy_score(y_test, y_pred)
    classification_reports[turbine] = classification_report_imbalanced(y_test, y_pred, output_dict=True)
    feature_importances[turbine] = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
    confusion_matrices[turbine] = confusion_matrix(y_test, y_pred)



In [14]:
balanced_accuracy_scores

{'T11': 0.9059108436248651,
 'T01': 0.9236566290763557,
 'T07': 0.9276895049656895,
 'T06': 0.915454027959178}

In [29]:
for turbine in confusion_matrices.keys():
    print(f"{turbine} results: \n True Negative: {confusion_matrices[turbine][0][0]} \n False Positive: {confusion_matrices[turbine][0][1]} \n False Negative: {confusion_matrices[turbine][1][0]} \n True Positive: {confusion_matrices[turbine][1][1]} \n")

T11 results: 
 True Negative: 12010 
 False Positive: 1494 
 False Negative: 96 
 True Positive: 1142 

T01 results: 
 True Negative: 11554 
 False Positive: 1979 
 False Negative: 3 
 True Positive: 462 

T07 results: 
 True Negative: 11110 
 False Positive: 1603 
 False Negative: 31 
 True Positive: 1642 

T06 results: 
 True Negative: 10373 
 False Positive: 1773 
 False Negative: 39 
 True Positive: 1648 



In [30]:
type(confusion_matrices['T01'])

numpy.ndarray

In [31]:
for turbine in classification_reports :
    print(f"{turbine} average F1: {classification_reports[turbine]['avg_f1']}")

T11 average F1: 0.9086617839438283
T01 average F1: 0.9009719281411371
T07 average F1: 0.9008276510519735
T06 average F1: 0.8862079578278167


In [32]:
for turbine in classification_reports :
    print(f"{turbine} average Precision: {classification_reports[turbine]['avg_pre']}")

T11 average Precision: 0.9451400999445122
T01 average Precision: 0.9728172650962065
T07 average Precision: 0.9400931011565262
T06 average Precision: 0.9335056985300099


In [33]:
BRFCs

{'T11': BalancedRandomForestClassifier(n_estimators=800, random_state=1),
 'T01': BalancedRandomForestClassifier(n_estimators=800, random_state=1),
 'T07': BalancedRandomForestClassifier(n_estimators=800, random_state=1),
 'T06': BalancedRandomForestClassifier(n_estimators=800, random_state=1)}

In [34]:
for turbine in feature_importances:
    print(f"Top Five Features: {feature_importances[turbine][:5]} \n")

Top Five Features: [(0.07487088600438875, 'nac_direction_avg'), (0.053546599275278474, 'amb_winddir_abs_avg'), (0.04886667192176666, 'hyd_oil_temp_avg'), (0.04286167531553545, 'gen_bear2_temp_avg'), (0.04253294674941025, 'amb_temp_avg')] 

Top Five Features: [(0.1361711254772611, 'amb_temp_avg'), (0.11477782428163144, 'spin_temp_avg'), (0.08316737162638553, 'cont_hub_temp_avg'), (0.07676985541570423, 'hyd_oil_temp_avg'), (0.05318055401939586, 'nac_temp_avg')] 

Top Five Features: [(0.10031199462962523, 'amb_temp_avg'), (0.06499602048143224, 'spin_temp_avg'), (0.05186142109027726, 'nac_direction_avg'), (0.050765717104560126, 'nac_temp_avg'), (0.04273564748370064, 'hvtrafo_phase3_temp_avg')] 

Top Five Features: [(0.09484602003523201, 'amb_temp_avg'), (0.06847494169953755, 'spin_temp_avg'), (0.05726062687290253, 'cont_hub_temp_avg'), (0.056049580881294254, 'nac_direction_avg'), (0.04280118336589087, 'amb_winddir_abs_avg')] 



APPLY TO FULL DATAFRAME AND SAVE DATAFRAME FOR EACH

In [None]:
prediction_dataframes = {}

for turbine in turbine_dataframes:
    # Apply each turbine's BRFC to it's OWN full dataframe
    # Choose the dataframe to examine
    selected_df = turbine_dataframes[turbine].copy()

    # Create features
    X = selected_df.drop(columns=["turbine_id", "time_stamp", "time_bin", "failure_in_next_bin"])

    # Fit the Standard Scaler with the full data
    X_scaler = scaler.fit(X)

    # Scale the full data
    X_scaled = X_scaler.transform(X)

    # Create a prediction for the FULL, SCALED dataframe
    full_pred = BRFCs[turbine].predict(X_scaled)

    df_with_prediction = selected_df.copy()

    df_with_prediction["prediction"] = full_pred

    prediction_dataframes[turbine] = df_with_prediction.copy()

: 

In [None]:
# Verify
prediction_dataframes['T11']['prediction'].value_counts()

: 

In [None]:
#Verify
prediction_dataframes['T01']

: 

In [None]:
len(prediction_dataframes['T01'])

: 

In [None]:
turbine_failures['T01'].time_stamp

: 

BUILD FIGS AND SAVE FOR EACH

In [None]:
# The Sampling of the data doesn't produce the best picture of the accuracy of the data... just gives a sense of where the points clustered
figures = {}

for turbine in prediction_dataframes:

    selected_df = prediction_dataframes[turbine]

    predicted_dates = selected_df[selected_df["prediction"] == 1]
    predicted_dates = predicted_dates[['time_stamp', 'prediction']]

    num_samples = int(len(predicted_dates) / 100)
    print(num_samples)

    sampled_predicted_dates = predicted_dates.iloc[::100, :]

    fig = px.line(selected_df, x='time_stamp', y="amb_temp_avg")

    for bin in selected_df["time_bin"].unique():

        time_bin = selected_df[selected_df["time_bin"] == bin]
        
        start = time_bin.time_stamp.iloc[0]
        end = time_bin.time_stamp.iloc[-1]

        fig.add_vrect(x0=start, x1=end)


    for time in sampled_predicted_dates['time_stamp']:
        fig.add_vline(x=time, line_color='yellow')

    for time in turbine_failures[turbine].time_stamp: 
        fig.add_vline(x=time, line_color='red')

    figures[turbine] = fig

    print(f"Finished fig {turbine}")




: 

In [None]:
figures['T11']

: 

In [None]:
# Next step find percentage of datapoints per bin predicting failure in next bin. 
percent_flagged_figures = {}

for turbine in prediction_dataframes:

    selected_df = prediction_dataframes[turbine]

    num_flagged = selected_df[["time_bin", "prediction"]].groupby("time_bin").sum()["prediction"].to_list()
    total_datapoints = selected_df["time_bin"].value_counts().sort_index(ascending=True).to_list()

    percent_flagged = [m/n for m, n in zip(num_flagged, total_datapoints)]

    formatted_list = []
    for item in percent_flagged:
        formatted_list.append("%.2f"%item)

    fig = px.bar(x=np.arange(1, len(percent_flagged) + 1), y=percent_flagged, text=formatted_list, labels={
                        "x": "Bin Number",
                        "y": "Percent of readings indicating failure"})
    fig.update_traces(textposition="outside", cliponaxis=False)

    percent_flagged_figures[turbine] = fig

: 

In [None]:
percent_flagged_figures['T11']

: 

In [None]:
# test to apply one turbine's BRFC to another turbine's data
data_from_turbine = 'T07'
BRFC_from_turbine = 'T01'

selected_df = turbine_dataframes[data_from_turbine].copy()

# Create features
X = selected_df.drop(columns=["turbine_id", "time_stamp", "time_bin", "failure_in_next_bin"])

# Fit the Standard Scaler with the full data
X_scaler = scaler.fit(X)

# Scale the full data
X_scaled = X_scaler.transform(X)

# Create a prediction for the FULL, SCALED dataframe
full_pred = BRFCs[BRFC_from_turbine].predict(X_scaled)

df_with_prediction = selected_df.copy()

df_with_prediction["prediction"] = full_pred


: 

In [None]:
actual_values = selected_df['failure_in_next_bin'].to_list()

: 

In [None]:
balanced_accuracy_score(actual_values, full_pred)

: 

In [None]:
classification_report_imbalanced(actual_values, full_pred, output_dict=True)

: 

In [None]:
sorted(zip(BRFCs[BRFC_from_turbine].feature_importances_, X.columns), reverse=True)

: 

In [None]:
fig = px.line(df_with_prediction, x='time_stamp', y="amb_temp_avg")

for bin in df_with_prediction["time_bin"].unique(): 

    predicted_dates = df_with_prediction[df_with_prediction["prediction"] == 1]
    predicted_dates = predicted_dates[['time_stamp', 'prediction']]

    num_samples = int(len(predicted_dates) / 100)

    sampled_predicted_dates = predicted_dates.iloc[::100, :]

    time_bin = df_with_prediction[df_with_prediction["time_bin"] == bin]
        
    start = time_bin.time_stamp.iloc[0]
    end = time_bin.time_stamp.iloc[-1]

    fig.add_vrect(x0=start, x1=end)


for time in sampled_predicted_dates['time_stamp']:
    fig.add_vline(x=time, line_color='yellow')

for time in turbine_failures[data_from_turbine].time_stamp: 
    fig.add_vline(x=time, line_color='red')

fig.show()


: 

: 