# Import packages and data sets

In [None]:
import os
import sys
# Absolute directories used by this notebook.
# data_dir holds the pickled data frames and the formation dictionary,
# model_dir receives the saved Keras models; history_dir and fig_dir are
# declared here for use elsewhere in the notebook.
data_dir = '/home/ec2-user/pwp-summer-2019/master_thesis_nhh_2019/processed_data/'
history_dir = "/home/ec2-user/SageMaker/LSTM/History/"
model_dir = "/home/ec2-user/SageMaker/LSTM/Models/"
fig_dir = "/home/ec2-user/SageMaker/OverleafMasterThesis/Images/"

import pandas as pd
import numpy as np
import random
import math
import joblib

import matplotlib.pylab as plt

from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import (Dense, Dropout, Input, Embedding, 
                          Dropout, Conv1D, MaxPooling1D, 
                          BatchNormalization)
from keras import callbacks
from keras.models import load_model

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adam
from keras.initializers import Zeros

#from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight

# Show up to 999 columns when a data frame is displayed.
pd.options.display.max_columns = 999

In [None]:
from Functions import (setup_lstm_stratify, feature_engineering, DataGenerator)

In [None]:
# Load the pickled train / validation / test frames from data_dir.
df_train, df_val, df_test = (
    pd.read_pickle(data_dir + pickle_name)
    for pickle_name in ('df_train', 'df_val', 'df_test')
)

# The number of entries in the formation dictionary is the number of target classes.
formation_dictionary = joblib.load(data_dir + 'formation_dictionary.pkl')
n_formation = len(formation_dictionary)

# Feature engineering and outlier removal

In [None]:
# Keyword arguments passed to every feature_engineering(...) instance below.
#
# The outlier cut-offs are the 99.95th percentiles over training + validation
# rows. The combined frame is built once with pd.concat instead of calling
# DataFrame.append three times (append was deprecated in pandas 1.4 and
# removed in pandas 2.0).
outlier_columns = ['gr', 'rmed', 'rdep']
train_val_subset = pd.concat([df_train[outlier_columns], df_val[outlier_columns]])

params_features = {
    'outlier_values': {
        column: train_val_subset[column].quantile(0.9995)
        for column in outlier_columns
    },
    'above_below_variables': ['gr', 'rdep', 'rmed'],
    'y_variable': 'formation_2',
    'num_shifts': 1,
    'cols_to_remove': ['depth', 'dts', 'hgr', 'hnphi',
                       'hrdep', 'hrhob', 'hrmed', 'hrsh', 'rsh', 'field', 'main_area', 'md'],
    # Overridden with 0 for the blind wells so that cleaning() keeps every row.
    'thresh': 7,
    'log_variables': ['rmed', 'rdep'],
    'var1_ratio': 'gr',
}

In [None]:
# --- Training wells -------------------------------------------------------
train_class = feature_engineering(df_train, **params_features)
train_class.remove_outliers()
train_class.cleaning()
train_class.xyz()

# Keep only the columns with non-null values in at least 90 % of the rows.
min_non_null = int(len(train_class.df) * 0.9)
train_class.df = train_class.df.dropna(axis=1, thresh=min_non_null)

df_train = train_class.df
columns_class = df_train.columns

# --- Validation wells -----------------------------------------------------
# Same pipeline, but the column set is taken from the training frame instead
# of being re-derived from the validation data.
val_class = feature_engineering(df_val, **params_features)
val_class.remove_outliers()
val_class.cleaning()
val_class.xyz()
df_val = val_class.df[columns_class]

# Sanity check: both frames must have the same number of columns.
same_width = df_val.shape[1] == df_train.shape[1]
print('Control: ', same_width)

## Normalize the data

In [None]:
# Columns that are standardised by the scaler.
norm_keys = [
    'tvd', 'gr',
    'x', 'y', 'z',
    'rmed', 'rdep',
]
# Columns that are carried through unscaled.
not_norm_keys = [
    'title',
    'formation',
    'formation_2',
    'group',
]

In [None]:
# Fit the standardiser on the norm_keys columns of training + validation rows.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0. (MinMaxScaler was tried as an alternative scaler.)
# NOTE(review): the validation rows contribute to the scaling statistics —
# confirm that this is intended.
scaler = StandardScaler().fit(
    pd.concat([df_train[norm_keys], df_val[norm_keys]])
)

In [None]:
# Standardise the numeric training columns, keeping the original row index.
df_train_scaled = pd.DataFrame(
    scaler.transform(df_train[norm_keys]),
    index=df_train.index,
    columns=norm_keys,
)

# Put the unscaled identifier / label columns back next to the scaled ones.
df_train = pd.concat([df_train_scaled, df_train[not_norm_keys]], axis=1)

In [None]:
# Standardise the numeric validation columns with the already-fitted scaler,
# keeping the original row index.
df_val_scaled = pd.DataFrame(
    scaler.transform(df_val[norm_keys]),
    index=df_val.index,
    columns=norm_keys,
)

# Put the unscaled identifier / label columns back next to the scaled ones.
df_val = pd.concat([df_val_scaled, df_val[not_norm_keys]], axis=1)

## Fill remaining NaNs (columns above the NaN threshold were already dropped during feature engineering)

In [None]:
# Replace every remaining missing value with 0 in both frames.
df_train, df_val = df_train.fillna(0), df_val.fillna(0)

# Choose variables

In [None]:
# Target column; every column that is neither the target nor one of the
# identifier columns becomes a model input.
y_var = 'formation_2'
non_feature_columns = (y_var, 'title', 'formation', 'group')
wvars = [name for name in df_train.columns if name not in non_feature_columns]
print(wvars)

# LSTM Model

In [None]:
def lstm_model(
    df_train,
    df_val,
    wvars,
    batch_size,
    win,
    n_formation,
    callback_patience = 10,
    epochs = 100,
    generator = True,
    seed = 42,
    verbose = 1
):
    """Build, train and save a single-layer stateful LSTM classifier.

    The network is LSTM(50) -> Dropout(0.1) -> Dense(n_formation, softmax),
    compiled with categorical cross-entropy and Adam(0.01). Training stops
    early once the monitored quantity has not improved for
    ``callback_patience`` epochs, and the best weights are restored.

    The target column name is read from the module-level ``y_var``, which
    must be defined before this function is called.

    Parameters
    ----------
    df_train, df_val
        Frames handed to ``setup_lstm_stratify`` to build the training and
        validation arrays.
    wvars
        Input column names; ``len(wvars)`` is the second dimension of the
        fixed batch input shape.
    batch_size
        Fixed batch size of the stateful model; also used for array set-up
        and for ``fit``.
    win
        Last dimension of the batch input shape.
    n_formation
        Number of output classes (units of the softmax layer).
    callback_patience
        Patience of the early-stopping callback.
    epochs
        Maximum number of training epochs.
    generator
        If True, train with ``fit_generator`` on ``DataGenerator`` objects
        and save to ``lstm_model_generator.h5``; otherwise train with
        ``fit`` on the in-memory arrays and save to ``lstm_model.h5``.
    seed
        Seed for Python's ``random`` and NumPy's global RNG. Pass ``None``
        to leave both untouched.
    verbose
        Verbosity forwarded to the Keras training call.

    Returns
    -------
    The history object returned by the Keras training call.
    """
    # The seed argument used to be accepted but silently ignored.
    # NOTE(review): backend-level RNGs (e.g. TensorFlow) are not seeded here;
    # seed them separately if bit-exact reproducibility is required.
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # Use a local name that does not shadow this function.
    model = Sequential()
    # NOTE(review): Keras recurrent layers read batch_input_shape as
    # (batch, timesteps, features) — confirm that setup_lstm_stratify returns
    # arrays shaped (n, len(wvars), win).
    model.add(LSTM(50,
                   batch_input_shape=(batch_size, len(wvars), win),
                   # return_sequences=True would be needed for stacked LSTM layers
                   stateful=True,
                   kernel_initializer=Zeros()))
    model.add(Dropout(0.1))
    model.add(Dense(n_formation, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(0.01),
                  metrics=['accuracy'])

    early_stopping_cb = callbacks.EarlyStopping(patience=callback_patience,
                                                restore_best_weights=True)

    # Training arrays
    x_train, y_train = setup_lstm_stratify(
        df = df_train,
        n_val = n_formation,
        y_var = y_var,
        wvars = wvars,
        batch_size = batch_size,
        win = win
    )
    # Validation arrays
    x_val, y_val = setup_lstm_stratify(
        df = df_val,
        n_val = n_formation,
        y_var = y_var,
        wvars = wvars,
        batch_size = batch_size,
        win = win
    )

    if generator:
        training_generator = DataGenerator(x_train, y_train, batch_size)
        validation_generator = DataGenerator(x_val, y_val, batch_size)

        history = model.fit_generator(
            generator = training_generator,
            epochs = epochs,
            validation_data = validation_generator,
            shuffle = False,
            callbacks = [early_stopping_cb],
            verbose = verbose,
            use_multiprocessing=True
        )
        model.save(model_dir+'lstm_model_generator.h5')
    else:
        # NOTE(review): unlike the generator branch, shuffle is left at the
        # Keras default here; for a stateful LSTM confirm whether
        # shuffle=False is what is wanted.
        history = model.fit(x_train, y_train,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_data=(x_val, y_val),
                            verbose=verbose,
                            callbacks=[early_stopping_cb])
        model.save(model_dir+'lstm_model.h5')

    return history

# Run model

In [None]:
# batch_size: fixed batch size of the stateful model (training and prediction).
# win: last dimension of the LSTM batch input shape.
batch_size, win = 128, 9

In [None]:
# Train on the in-memory arrays (generator=False), which saves lstm_model.h5.
training_options = dict(
    callback_patience=10,
    epochs=30,
    generator=False,
    seed=42,
    verbose=1,
)
history = lstm_model(
    df_train,
    df_val,
    wvars,
    batch_size,
    win,
    n_formation,
    **training_options
)

# Blind wells

### Load data

In [None]:
# Reload the untouched blind-well frame so this section can be run on its own.
test_pickle_path = data_dir + 'df_test'
df_test = pd.read_pickle(test_pickle_path)

### Clean data and set up 

In [None]:
# Same feature engineering as for train/validation, except that
# remove_outliers() is not called and thresh is set to 0 so that cleaning()
# does not remove any rows from the blind wells.
test_class = feature_engineering(df_test, **params_features)
test_class.thresh = 0
test_class.cleaning()
test_class.xyz()

# Restrict to the columns kept for the training frame.
df_test = test_class.df[columns_class]
not_norm_key_test = list(not_norm_keys)

In [None]:
# Standardise the blind-well features with the scaler fitted on train + val.
# The training and validation frames get fillna(0) after scaling, but the
# blind wells never did; cleaning() is run with thresh = 0 here, so rows with
# missing features are presumably still present and would reach predict() as
# NaN. Fill the scaled features with 0 to match train/val. The label and
# identifier columns are left as they are.
df_test_scaled = pd.DataFrame(
    scaler.transform(df_test[norm_keys]),
    index=df_test.index,
    columns=norm_keys,
).fillna(0)

df_test = pd.concat([df_test_scaled, df_test[not_norm_key_test]], axis=1)

In [None]:
# Build the blind-well arrays with the same settings as for training.
test_setup_kwargs = dict(
    df=df_test,
    n_val=n_formation,
    y_var=y_var,
    wvars=wvars,
    batch_size=batch_size,
    win=win,
)
x_test, y_test = setup_lstm_stratify(**test_setup_kwargs)

## Load model

In [None]:
# Load the model saved by the non-generator training branch.
saved_model_path = model_dir + 'lstm_model.h5'
lstm_model_loaded = load_model(saved_model_path)

## Predict on blind wells

In [None]:
# Class scores for every blind-well sample.
prediction_lstm = lstm_model_loaded.predict(x_test, batch_size=batch_size)

# For each row, take the column with the highest value as the class label
# (assumes y_test is one-hot encoded — TODO confirm against setup_lstm_stratify).
true_labels = pd.DataFrame(y_test).idxmax(axis=1)
predicted_labels = pd.DataFrame(prediction_lstm).idxmax(axis=1)

test_set = true_labels.to_frame(name='formation_2')
test_set['predicted'] = predicted_labels

### If generator function is used:

## Performance

In [None]:
from sklearn import metrics

# Micro-averaged F1 of predicted vs. true labels on the blind wells
# (left as the cell's last expression so the notebook displays it).
metrics.f1_score(
    y_true=test_set["formation_2"],
    y_pred=test_set["predicted"],
    average='micro',
)