<a href="https://colab.research.google.com/github/smedegaard/krafthack/blob/main/krafthack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Krafthack 2022
Authors and developers: Anders Pedersen and Håkon Holte

In [None]:
try:
  import colab
  !pip install --upgrade pip
except:
  pass

[0m

# Data Exploration

In [None]:
# Import necessary packages and set some global parameters
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import io
import seaborn as sns

# Default size (in inches) for every matplotlib figure created in this notebook
plt.rcParams.update({"figure.figsize": (12, 10)})

In [None]:
# Get and view dataset.
# DATASET_PATH must be set to the location of the parquet file before running this cell;
# it is intentionally left empty here. Failing fast gives a clearer message than the
# error raised from inside the parquet reader for an empty path.
DATASET_PATH = ''
if not DATASET_PATH:
  raise ValueError("DATASET_PATH is empty - set it to the path of the parquet dataset before running this cell.")

dataset_df = pd.read_parquet(DATASET_PATH)
dataset_df.head()

In [None]:
# Summarise the dataset: its dimensions first ...
n_rows, n_cols = dataset_df.shape

# ... then count, mean and the other statistics reported by describe()
dataset_description = dataset_df.describe()

print("Dataset consists of:\n", n_rows, "rows\n", n_cols, "columns\n")
print("Description of dataset:\n", dataset_description)

In [None]:
# Create correlation plot to consider correlations between variables.
# Only numeric/boolean columns are correlated: the dataset also carries the string
# column 'mode', and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
# (older versions dropped them silently). Selecting by dtype works on every version.
numeric_df = dataset_df.select_dtypes(include=["number", "bool"])
dataset_corr = numeric_df.corr()

fig, ax = plt.subplots(figsize=(18, 15))
sns.heatmap(dataset_corr, annot=True, cmap=sns.diverging_palette(220, 10, as_cmap=True), square=True, ax=ax)
ax.set_title("Pairwise correlation of numeric variables");

In [None]:
# Scatterplot of two variables, one colour per group of `by_var` (mode: start or operational).
# Change x_var and y_var to the desired variables.
x_var = "Bolt_1_Tensile"
y_var = "Turbine_Guide Vane Opening"
by_var = "mode"

groups = dataset_df.groupby(by_var)
fig, ax = plt.subplots(figsize=(12, 10))
for mode_name, mode_df in groups:
  # marker-only plot (no connecting line) so every observation is a single small dot
  ax.plot(mode_df[x_var], mode_df[y_var], marker='o', linestyle='', markersize=1, label=mode_name)
ax.set(xlabel=x_var, ylabel=y_var)
ax.legend()

# Data Preparation

In [None]:
# Import necessary packages
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import math
import warnings as wgs

In [None]:
# Variables not to be included as features: the tensile and torsion readings of
# bolts 1-6, the bolt 1 steel temperature and the raw 'mode' column.
drop_vars = (
    [f'Bolt_{bolt}_Tensile' for bolt in range(1, 7)]
    + [f'Bolt_{bolt}_Torsion' for bolt in range(1, 7)]
    + ['Bolt_1_Steel tmp', 'mode']
)

# Create mode indicator variable: 1 where mode is 'start', 0 otherwise
dataset_df['is_mode_start'] = dataset_df['mode'].eq('start').mul(1)

In [None]:
# Remove null values. The vibration variables are dropped first, as these have many
# occurrences of null values; afterwards every row that still contains a null is removed.
vibration_vars = ['lower_bearing_vib_vrt', 'turbine_bearing_vib_vrt']
X_clean = dataset_df.drop(columns=vibration_vars).dropna()

In [None]:
# Custom data split function to split data into equally sized chunks
def split_dataframe(df, chunk_size = 10000):
    """Split a DataFrame into consecutive chunks of at most `chunk_size` rows.

    Rows are taken strictly by position (`.iloc`), so the result does not depend
    on the kind of index the frame carries. Plain `df[a:b]` slicing can be
    interpreted as label-based for some index types (e.g. a float index).

    Parameters:
        df: pandas DataFrame (or Series) to split.
        chunk_size: maximum number of rows per chunk; must be a positive integer.

    Returns:
        A list of chunks in original row order. Every chunk has `chunk_size`
        rows except possibly the last one; an empty input gives an empty list.

    Raises:
        ValueError: if `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer, got {}".format(chunk_size))
    return [df.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size)]

In [None]:
# Use split_dataframe to cut the cleaned dataframe into chunks of chunk_size rows each.
# The chunks are wrapped in a Series so they can later be selected by an array of chunk indices.
chunk_size = 10000
clean_chunks = split_dataframe(X_clean, chunk_size)
df_chunks = pd.Series(clean_chunks)

In [None]:
# Set size of training data set and shuffle chunk indices.
# A seeded generator is used so the train/validation/test assignment is reproducible
# across kernel restarts (change SPLIT_SEED to draw a different split).
train_size = 0.8
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
shuffled_idxs = split_rng.permutation(len(df_chunks))

# Set chunks to be placed in train, validation and test set.
# First the test chunks are cut off: everything after the first train_size share.
n_train_temp = math.ceil(train_size*len(shuffled_idxs))
train_chunks_temp = shuffled_idxs[:n_train_temp]
test_chunks = shuffled_idxs[n_train_temp:]
# Then the remaining chunks are divided again: a (2 - 1/train_size) share stays in training
# (0.75 for train_size = 0.8), which leaves validation with roughly the same overall
# share as the test set (1 - train_size).
n_train = math.ceil((2-1/train_size)*len(train_chunks_temp))
train_chunks = train_chunks_temp[:n_train]
val_chunks = train_chunks_temp[n_train:]

# pd.concat fails with an unhelpful message on an empty list, so report empty splits explicitly
for split_name, split_chunks in (("train", train_chunks), ("validation", val_chunks), ("test", test_chunks)):
  if len(split_chunks) == 0:
    raise ValueError("The {} split received no chunks ({} chunks in total) - use a smaller chunk_size.".format(split_name, len(df_chunks)))

# Set training, validation and test data
train_data = pd.concat(list(df_chunks[train_chunks]))
val_data = pd.concat(list(df_chunks[val_chunks]))
test_data = pd.concat(list(df_chunks[test_chunks]))

# Set up train, validation and test feature dataframes
X_train = train_data.drop(drop_vars, inplace=False, axis=1)
X_val = val_data.drop(drop_vars, inplace=False, axis=1)
X_test = test_data.drop(drop_vars, inplace=False, axis=1)

# Set up train, validation and test target dataframes.
# Explicit copies, so that later column assignments modify independent frames
# instead of triggering pandas' SettingWithCopyWarning.
target_vars = ['Bolt_1_Tensile', 'Bolt_2_Tensile', 'Bolt_3_Tensile', 'Bolt_4_Tensile', 'Bolt_5_Tensile', 'Bolt_6_Tensile']
y_train_df = train_data[target_vars].copy()
y_val_df = val_data[target_vars].copy()
y_test_df = test_data[target_vars].copy()

In [None]:
# Scaling features for using chunk split.
# Both scalers are fitted on the training features only and then applied unchanged to the
# validation and test features, so no information from those sets leaks into the scaling.
# scaler1 maps every column in keep_vars to [0, 1]; scaler2 maps 'Unit_4_Reactive Power'
# to [-1, 1]. 'is_mode_start' is left as it is.
# MinMaxScaler scales each column independently, so fitting once on all columns gives the
# same per-column result as fitting column by column - and leaves scaler1 fitted on every
# column, so it can be reused on new data (a per-column loop leaves it fitted on the last
# column only).
with wgs.catch_warnings():
  wgs.simplefilter("ignore")
  scaler1 = MinMaxScaler()
  scaler2 = MinMaxScaler((-1, 1))
  keep_vars = dataset_df.columns.drop([*drop_vars, 'Unit_4_Reactive Power', 'is_mode_start', 'lower_bearing_vib_vrt', 'turbine_bearing_vib_vrt'])
  unit_range_vars = list(keep_vars)
  if unit_range_vars:
    scaler1.fit(X_train[unit_range_vars])
    for features_df in (X_train, X_val, X_test):
      features_df[unit_range_vars] = scaler1.transform(features_df[unit_range_vars])
  scaler2.fit(X_train[['Unit_4_Reactive Power']])
  for features_df in (X_train, X_val, X_test):
    features_df[['Unit_4_Reactive Power']] = scaler2.transform(features_df[['Unit_4_Reactive Power']])

# Verify that values are scaled
X_train.head()

In [None]:
# Scaling targets for using chunk split.
# As for the features, validation and test targets are transformed with the scaler fitted on
# the train targets. A separate scaler is fitted for each of the six bolt tensile columns and
# kept in y_scalers (same order as the columns).
with wgs.catch_warnings():
  wgs.simplefilter("ignore")
  y_scalers = []
  for idx in range(6):
    target_scaler = MinMaxScaler()
    train_col = y_train_df.columns[idx]
    # sklearn expects a 2-D array, hence the reshape of the single column to (n_rows, 1)
    train_values = y_train_df[train_col].values.reshape(-1, 1)
    target_scaler.fit(train_values)
    y_train_df[train_col] = target_scaler.transform(train_values)
    for targets_df in (y_val_df, y_test_df):
      col = targets_df.columns[idx]
      targets_df[col] = target_scaler.transform(targets_df[col].values.reshape(-1, 1))
    y_scalers.append(target_scaler)

# Verify that values are scaled
y_train_df.head()

# Model Training

In [None]:
# Import necessary libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
from keras.regularizers import l2, l1

In [None]:
# Convert to tensors for chunk data.
# Training features and targets are re-chunked (chunk_size rows each) and every chunk becomes
# its own tensor, so the model can be fitted chunk by chunk.
X_train_chunks = pd.Series(split_dataframe(X_train, chunk_size))
X_train_tensors = list(map(tf.convert_to_tensor, X_train_chunks))

y_train_chunks = pd.Series(split_dataframe(y_train_df, chunk_size))
y_train_tensors = list(map(tf.convert_to_tensor, y_train_chunks))

# Validation data is kept as one tensor; only the first target column is used
X_val_tensor = tf.convert_to_tensor(X_val)
y_val_tensor = tf.convert_to_tensor(y_val_df.iloc[:, 0])

In [None]:
# Set some hyperparameters
NUM_EPOCHS = 5
BATCH_SIZE = 32 #16
LEARNING_RATE = 0.01
REGULARIZATION_PARAM = 0.02
LOSS_FUNC = tf.keras.losses.MeanSquaredError()
METRIC_NAME = 'MSE'
DROPOUT_RATE = 0.1  # NOTE(review): not referenced by the layers defined in this cell


def _regularized_dense(units):
  """ReLU dense layer with L2 kernel / L1 bias regularisation and explicit initialisers."""
  return tf.keras.layers.Dense(
      units,
      activation='relu',
      kernel_regularizer=l2(REGULARIZATION_PARAM),
      bias_regularizer=l1(REGULARIZATION_PARAM),
      kernel_initializer=tf.keras.initializers.GlorotNormal(),
      bias_initializer=tf.keras.initializers.TruncatedNormal(mean=0.1, stddev=0.1),
  )


# Define fully connected network: 7 inputs -> 32 -> 64 -> 32 -> 1 output
# NOTE(review): the output layer uses ReLU, so predictions can never be negative - confirm
# this is intended for the scaled target.
model1 = tf.keras.models.Sequential([
    tf.keras.Input(shape=(7,)),
    _regularized_dense(32),
    tf.keras.layers.Dense(64, activation='relu'),
    _regularized_dense(32),
    tf.keras.layers.Dense(1, activation='relu'),
])
model1.output_shape

In [None]:
# Configure early stopping
MIN_DELTA = 0  # a new loss must beat the best loss by more than this to count as an improvement
PATIENCE = 5   # number of non-improving evaluations tolerated by the training loop

# Helper function for checking validation loss and early stopping
def check_loss_and_early_stopping(val_loss_history, best_val_loss, es_counter, best_model_idx, model_idx):
    """Update the early-stopping bookkeeping after a new validation loss was recorded.

    Parameters:
        val_loss_history: validation losses so far; the last element is the newest one.
        best_val_loss: best (lowest) validation loss seen before this call.
        es_counter: number of consecutive evaluations without improvement so far.
        best_model_idx: index of the model state that produced `best_val_loss`.
        model_idx: index of the model state that produced the newest loss.

    Returns:
        Tuple `(best_val_loss, es_counter, best_model_idx)` with updated values:
        - first evaluation: the newest loss becomes the best loss and `model_idx` is
          recorded as the best index; the counter is returned unchanged;
        - no improvement (newest - best >= -MIN_DELTA): counter is increased by one;
        - improvement: counter is reset to 0, newest loss and `model_idx` become the best.

    Raises:
        ValueError: if `val_loss_history` is empty.
    """
    if len(val_loss_history) == 0:
        raise ValueError("val_loss_history must contain at least one validation loss")

    latest_loss = val_loss_history[-1]
    if len(val_loss_history) == 1:
        # The very first evaluation is the best one by definition, so the index of the
        # model state that produced it has to be remembered as well.
        return latest_loss, es_counter, model_idx
    if (latest_loss - best_val_loss) >= -MIN_DELTA:
        return best_val_loss, es_counter + 1, best_model_idx
    return latest_loss, 0, model_idx

In [None]:
# Set metric for monitoring validation loss: MAPE when requested through METRIC_NAME,
# mean squared error in every other case
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
loss = mean_absolute_percentage_error if METRIC_NAME == 'MAPE' else mean_squared_error

In [None]:
# Compile model (plain SGD, loss from LOSS_FUNC, MSE and MAPE tracked as metrics) and view summary
sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE)
model1.compile(optimizer=sgd_optimizer, loss=LOSS_FUNC, metrics=['mse', 'mape'])

model1.summary()

In [None]:
# Train fully connected model for bolt 1 with chunks.
#
# The model is fitted one chunk at a time. After every VAL_EVERY_N_CHUNKS-th chunk (never
# after chunk 0) the validation set is scored and the early-stopping bookkeeping is updated.
#
# Every History object returned by model1.fit refers to the same model1 instance, so keeping
# the History objects does not preserve earlier model states. The weights are therefore
# snapshotted whenever the validation loss improves and loaded back into model1 once training
# ends, so model1 (and with it every entry of model_history) ends up in the best validated state.
VAL_EVERY_N_CHUNKS = 7
history = 0
es_counter = 0
best_val_loss = float("inf")
best_model_idx = 0
best_weights = None
val_loss_history = []
model_history = []

for epoch in range(NUM_EPOCHS):
  if es_counter > PATIENCE:
    print("Patience reached, stopping early at epoch #{}".format(epoch))
    break
  print('epoch #{}'.format(epoch))
  for i in range(0, len(X_train_tensors)):
    # column 0 of the target tensor is the first bolt's tensile value
    model_history.append(model1.fit(x=X_train_tensors[i], y=y_train_tensors[i][:, 0], batch_size=BATCH_SIZE))
    if (i%VAL_EVERY_N_CHUNKS == 0 and i != 0):
      print("Scoring validation set...")
      val_preds = model1.predict(X_val_tensor)
      print("Calculating loss...")
      val_loss_history.append(loss(y_val_tensor, val_preds))
      print("Checking loss and early stopping...")
      previous_best_val_loss = best_val_loss
      # i+epoch*len(X_train_tensors) is the position of the latest fit in model_history
      best_val_loss, es_counter, best_model_idx = check_loss_and_early_stopping(val_loss_history, best_val_loss, es_counter, best_model_idx,
                                                                                i+epoch*len(X_train_tensors))
      if best_val_loss < previous_best_val_loss:
        # new best validation loss: remember the weights that produced it
        best_weights = model1.get_weights()
      print("Current validation loss:", val_loss_history[-1])
      print("Best validation loss:", best_val_loss)
      print("Early stopping counter:", es_counter)
      if es_counter > PATIENCE:
        break

# Put the model back into its best validated state (skipped when no validation was ever run)
if best_weights is not None:
  model1.set_weights(best_weights)


# Evaluate and save Model

In [None]:
# Get the best model through the training record stored at best_model_idx.
# NOTE(review): every entry of model_history comes from model1.fit, so presumably each
# `.model` is the same model1 object in its current state - verify this selects the intended weights.
best_history = model_history[best_model_idx]
best_model = best_history.model

In [None]:
# Score model on test set (first target column only, matching the column used for training)
X_test_tensor = tf.convert_to_tensor(X_test)
y_test_tensor = tf.convert_to_tensor(y_test_df.iloc[:, 0])

y_pred = best_model.predict(X_test_tensor)

test_mse = mean_squared_error(y_test_tensor, y_pred)
test_mape = mean_absolute_percentage_error(y_test_tensor, y_pred)
print('mse:', test_mse, 'mape:', test_mape)

In [None]:
# Save the model to Google Drive.
# model_name is appended to the models directory to form the save path; it is left empty here.
model_name = ""
MODELS_DIR = "/content/gdrive/My Drive/Datasets/Krafthack/Models/"
savepath = MODELS_DIR + model_name
tf.keras.models.save_model(best_model, filepath=savepath)