In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.losses import BinaryCrossentropy
from keras.optimizers import Adam
from keras.layers import Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the training CSV; the file's second column (position 1) becomes the DataFrame index.
data=pd.read_csv("dane-treningowe.csv",index_col=1)

In [3]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Every output row holds the n_in previous observations, the current
    observation and the n_out - 1 following observations, side by side.

    Arguments:
    data: Observations as a list, nested list, NumPy array (1-D or 2-D),
        pandas Series or DataFrame. Rows are consecutive steps.
    n_in: Number of lag observations as input (X).
    n_out: Number of observations as output (y).
    dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
    Pandas DataFrame of series framed for supervised learning. Columns are
    named 'varK(t-i)', 'varK(t)' and 'varK(t+i)'; the index of the input
    is kept.
    source: https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/
    """
    # Build the frame first and read the variable count from it: inputs
    # without a second dimension (1-D arrays, Series) have no shape[1], and
    # a nested list has more than one variable.
    df = pd.DataFrame(data)
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # Shifting leaves NaN in the first n_in and last n_out - 1 rows; rows that
    # already contained NaN in the input are dropped as well.
    if dropnan:
        agg = agg.dropna()
    return agg

In [4]:
def df_lag(i,o,X_train,X_test,Y_train,Y_test,lstm,sampling_strategy=0.3,random_state=None):
    """
    Turn train/test features into lagged frames and oversample the training part.

    Arguments:
    i: Number of lag steps passed to series_to_supervised as n_in.
    o: Number of forward steps passed to series_to_supervised as n_out.
    X_train, X_test: Feature tables whose rows are consecutive observations.
    Y_train, Y_test: Labels, one per row of the matching feature table.
    lstm: If truthy, both feature arrays are reshaped to
        (samples, 1, features).
    sampling_strategy: Forwarded to RandomOverSampler (default 0.3, the
        value that used to be hard-coded).
    random_state: Forwarded to RandomOverSampler; pass an int for a
        reproducible resample. None keeps the previous unseeded behaviour.
    Returns:
    List [X_train, X_test, Y_train, Y_test]. Only the training pair is
    oversampled.
    """
    X_train, Y_train = _frame_with_labels(X_train, Y_train, i, o)
    X_test, Y_test = _frame_with_labels(X_test, Y_test, i, o)
    oversample = RandomOverSampler(sampling_strategy=sampling_strategy,
                                   random_state=random_state)
    X_train, Y_train = oversample.fit_resample(X_train, Y_train)
    if lstm:
        X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
        X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
    return [X_train, X_test, Y_train, Y_test]


def _frame_with_labels(X, Y, n_in, n_out):
    """Frame X with series_to_supervised and keep the labels of the surviving rows."""
    # A positional index makes the framed frame's index equal to row numbers.
    framed = series_to_supervised(pd.DataFrame(X).reset_index(drop=True), n_in, n_out)
    kept = framed.index.to_numpy()
    # Without NaN in X this is exactly rows n_in .. len - n_out. Selecting by
    # surviving position also stays aligned when framing drops further rows
    # because X itself contained NaN.
    Y = Y.iloc[kept] if hasattr(Y, "iloc") else np.asarray(Y)[kept]
    return framed.values, Y

In [8]:
def data_cl(data,n_in,n_out,turbine,lstm=False,time_window=72):
    """
    Build lagged, oversampled training arrays for a single turbine.

    Arguments:
    data: Raw training DataFrame; must contain the columns referenced below
        ("Unnamed: 0", "Time_ID", "Turbine_ID", "Predicted_power",
        "Time_to_failure" and the sensor averages).
    n_in: Number of lag steps, forwarded to df_lag.
    n_out: Number of forward steps, forwarded to df_lag.
    turbine: Value of Turbine_ID to keep.
    lstm: Forwarded to df_lag; requests 3-D feature arrays.
    time_window: Rows with Time_to_failure below this value are labelled 1.
        # NOTE(review): 72 presumably means 72 ten-minute samples = 12 hours;
        # confirm against the data's sampling interval.
    Returns:
    List [X_train, X_test, y_train, y_test] as produced by df_lag.
    """
    data = data.drop(columns=["Unnamed: 0", "Time_ID"])
    # Work on an explicit copy so the column assignments below do not write
    # into a slice of the caller's frame.
    data = data[data.Turbine_ID == turbine].copy()
    # Efficiency must be computed before Grd_Prod_Pwr_Avg is overwritten below.
    data["Efficiency"] = (data['Grd_Prod_Pwr_Avg'] / data['Predicted_power']).round(2)
    columns = ['Grd_Prod_Pwr_Avg', 'Amb_WindSpeed_Avg', 'Amb_Temp_Avg',
               'Nac_Temp_Avg', 'Rtr_RPM_Avg', 'Gear_Bear_Temp_Avg', 'Gen_RPM_Avg',
               'Gen_Bear2_Temp_Avg', 'Gen_Bear_Temp_Avg', 'Amb_WindDir_Relative_Avg']
    # Replace every listed signal with 1 - signal / Predicted_power.
    data.loc[:, columns] = 1 - data.loc[:, columns].div(data["Predicted_power"], axis=0)
    # Label via a boolean mask: selecting by index labels would also mark
    # unrelated rows whenever an index label occurs more than once.
    data["Failure_window"] = (data.Time_to_failure < time_window).astype("int64")
    data = data.sort_index()
    # Positional selection tied to the training CSV's column layout.
    X = data.iloc[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 17]]
    # Presumably column 18 is the Failure_window column added above; confirm
    # against the CSV layout.
    y = data.iloc[:, 18]
    # df_lag builds lag features by shifting rows, so the rows must stay in
    # sorted index order. The default shuffled split would make every "lag" a
    # random row, unlike data_cl_test, which frames unshuffled rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False)
    return df_lag(n_in, n_out, X_train, X_test, y_train, y_test, lstm)

# Trenowanie na pełnym zbiorze treningowym

In [9]:
# Fit one XGBoost classifier per turbine; models ends up in the same order as
# the sorted turbines array, which later cells rely on.
turbines = data.Turbine_ID.unique()
turbines.sort()
models = []
for turbine in turbines:
    data_ = data_cl(data, n_in=30, n_out=6, turbine=turbine, lstm=False, time_window=72)
    X_train, X_test, y_train, y_test = data_
    # Merge the held-out part back in so the classifier is fitted on all rows.
    X_train = np.concatenate((X_train, X_test))
    y_train = np.concatenate((y_train, y_test))
    model = xgb.XGBClassifier(eta=0.1, n_estimators=100, max_depth=2, gamma=1)
    model.fit(X_train, y_train)
    models.append(model)





















In [11]:
# Load the test CSV; the file's second column (position 1) becomes the DataFrame index.
data_test=pd.read_csv("dane-testowy.csv",index_col=1)

In [12]:
def data_cl_test(data,n_in,n_out,turbine,lstm=False):
    """
    Build the lagged feature array for a single turbine from unlabelled data.

    Applies the same column transformations as data_cl, without labels,
    splitting or oversampling.

    Arguments:
    data: Raw DataFrame; must contain "Unnamed: 0", "Time_ID", "Turbine_ID",
        "Predicted_power" and the sensor averages listed below.
    n_in: Number of lag steps passed to series_to_supervised.
    n_out: Number of forward steps passed to series_to_supervised.
    turbine: Value of Turbine_ID to keep.
    lstm: If truthy, the result is reshaped to (samples, 1, features), the
        same layout df_lag produces for training.
    Returns:
    NumPy array of framed features. It has n_in + n_out - 1 fewer rows than
    the turbine has observations (more if any row contains NaN).
    """
    data = data.drop(columns=["Unnamed: 0", "Time_ID"])
    # Work on an explicit copy so the column assignments below do not write
    # into a slice of the caller's frame.
    data = data[data.Turbine_ID == turbine].copy()
    # Efficiency must be computed before Grd_Prod_Pwr_Avg is overwritten below.
    data["Efficiency"] = (data['Grd_Prod_Pwr_Avg'] / data['Predicted_power']).round(2)
    columns = ['Grd_Prod_Pwr_Avg', 'Amb_WindSpeed_Avg', 'Amb_Temp_Avg',
               'Nac_Temp_Avg', 'Rtr_RPM_Avg', 'Gear_Bear_Temp_Avg', 'Gen_RPM_Avg',
               'Gen_Bear2_Temp_Avg', 'Gen_Bear_Temp_Avg', 'Amb_WindDir_Relative_Avg']
    # Replace every listed signal with 1 - signal / Predicted_power.
    data.loc[:, columns] = 1 - data.loc[:, columns].div(data["Predicted_power"], axis=0)
    data = data.sort_index()
    # Positional selection tied to the test CSV's column layout.
    X = data.iloc[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 15]]
    X_test = series_to_supervised(X, n_in, n_out).values
    # Honour the lstm flag, which was previously accepted but ignored.
    if lstm:
        X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
    return X_test

# Prognoza awarii z 12-godzinnym wyprzedzeniem na zbiorze testowym

In [13]:
# Score each turbine's test rows with the classifier fitted for that turbine;
# models[i] belongs to turbines[i].
predictions = []
for i, turbine in enumerate(turbines):
    X_test = data_cl_test(data_test, n_in=30, n_out=6, turbine=turbine, lstm=False)
    y_test = models[i].predict(X_test)
    predictions.append(y_test)



In [27]:
# Write each turbine's predictions to its own CSV file, as a single column
# named after the turbine and without an index column.
for i, turbine in enumerate(turbines):
    pred = pd.DataFrame(predictions[i], columns=[turbine])
    pred.to_csv("prediction_" + turbine + ".csv", index=False)