# *KAGGLE CHALLENGE: LANL Earthquake Prediction*

Un projet de Matthieu Dagommer, Paul Boulgakoff, Godefroy Bichon, Germain L'Hostis

Versions utilisées:

Python: 3.10.4
Torch: 1.11

In [None]:
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt 
import time

import torch as th
th.cuda.empty_cache()

import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn

#from torchsummary import summary 

import gzip
import pickle

import pandas as pd
import scipy
import scipy.stats as stats
from scipy.stats import kurtosis, skew
import csv
import os
import pickle
import gc

from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
### Ensuring a GPU is connected (falls back to CPU otherwise)

print(th.cuda.is_available())
# get_device_name() raises when no CUDA device is present, so only query it
# when one is available; otherwise the CPU fallback below could never be reached.
if th.cuda.is_available():
    print(th.cuda.get_device_name())
device = th.device('cuda' if th.cuda.is_available() else 'cpu')

In [None]:
# Output directory for trained models; exist_ok lets the cell be re-run safely.
os.makedirs("./Models", exist_ok=True)

# *General Hyperparameters*

In [None]:
### Setting General Hyperparameters

batch_size = 100 # number of patches per batch
valid_rate = 0.1 # fraction of the patches held out for validation
overlap_rate = 0.2 # overlap between consecutive patches, as a fraction of patch_size

# Parameters

nrows = 50_000_000 # number of rows read from train.csv
seed = 1 # seed passed to the PyTorch and NumPy random generators
patch_size = 150_000 # number of samples per patch

In [None]:
# Seed both random generators used in this notebook (PyTorch and NumPy).
th.manual_seed(seed)
np.random.seed(seed)

# *Data Preparation*



In [None]:
### Loading Data

# Alternative root when the data sits in the working directory:
#rootpath = os.getcwd() + "/"
rootpath = "../input/LANL-Earthquake-Prediction/"

# Only the first `nrows` rows are read; explicit dtypes keep the memory footprint down
# (int16 for the acoustic signal, float64 for the time to failure).
train_data = pd.read_csv(rootpath + "train.csv", usecols = ['acoustic_data', 'time_to_failure'], \
                         dtype = {'acoustic_data': np.int16, 'time_to_failure': np.float64}, nrows = nrows)

In [None]:
# Convert both columns to 1-D tensors: X is the acoustic signal, Y the time to failure.
X = th.squeeze(th.tensor(train_data['acoustic_data'].values, dtype = th.int16))
Y = th.squeeze(th.tensor(train_data['time_to_failure'].values, dtype = th.float64))

# The dataframe is no longer needed once the tensors exist; free it.
del train_data
gc.collect()

In [None]:
### Function to create patched sequences with some overlap out of the training data

def patching(patch_size, X, Y, overlap_rate):
    """Cut the signal into fixed-length, possibly overlapping patches.

    Parameters:
        patch_size: number of samples per patch.
        X: 1-D tensor holding the acoustic signal.
        Y: 1-D tensor holding the time to failure, aligned with X.
        overlap_rate: overlap between consecutive patches, expressed as a
            fraction of patch_size (must be < 1).

    Returns:
        (n_patch, X_patch, Y_patch) where n_patch is the number of patches
        before filtering, X_patch has shape (n_kept, patch_size) and Y_patch
        has shape (n_kept,). The label of a patch is the value of Y at the
        last sample of the patch.

    Raises:
        ValueError: if the overlap leaves no positive stride between patches.
    """
    overlap = int(patch_size*overlap_rate)
    stride = patch_size - overlap
    if stride <= 0:
        raise ValueError("overlap_rate must leave a positive stride between patches")

    L = X.shape[0] # total length of the training acoustic signal
    # Clamp at zero so a signal shorter than one patch gives an empty result.
    n_patch = max(int(np.floor((L-patch_size)/stride+1)), 0)
    ids_no_seism = []

    X_patch = th.zeros(n_patch,patch_size)
    Y_patch = th.zeros(n_patch)

    for i in range(n_patch):
        start = i*stride
        end = start + patch_size
        X_patch[i,:] = X[start:end]
        # Label = time to failure at the LAST sample of the patch. Indexing
        # Y[end] would read one sample past the patch and go out of bounds
        # when the last patch ends exactly at the end of the signal.
        Y_patch[i] = Y[end - 1]

        # Keep only patches in which Y never drops to (almost) zero, i.e.
        # patches that do not contain a seism.
        if th.min(Y[start:end]) > 0.001:
            ids_no_seism.append(i)

    X_patch = X_patch[ids_no_seism]
    Y_patch = Y_patch[ids_no_seism]

    return(n_patch, X_patch, Y_patch)

In [None]:
### Normalizing Data

# `ss` standardises the acoustic signal; `mm` rescales the time to failure.
# Both scalers are kept as globals: `mm` is passed to the plotting function further down.
ss = StandardScaler()
mm = MinMaxScaler()
    
# The scalers expect a 2-D column, hence reshape(-1, 1); squeeze restores the 1-D layout.
# NOTE(review): both scalers are fitted on the whole signal, before the train/validation split.
X = th.squeeze(th.from_numpy(ss.fit_transform(np.array(X).reshape(-1, 1))))
Y = th.squeeze(th.from_numpy(mm.fit_transform(np.array(Y).reshape(-1, 1))))

In [None]:
### Distribution of data in patches

# The first return value (number of patches before filtering) is not needed here.
_, X_patch, Y_patch = patching(patch_size, X, Y, overlap_rate = overlap_rate)

# The full-length signals are no longer needed once the patches exist; free them.
del X; del Y
gc.collect()

print(X_patch.shape)
print("There are ", X_patch.shape[0], "time series available for training and validation after patching with overlap. \n")

In [None]:
### Initializing Data Sets


# Shuffling data
# np.random.shuffle works IN PLACE and returns None. Using its return value as
# an index (X_patch[None, :]) only adds a dummy axis and leaves the patches in
# their original order, so the shuffled array itself must be used as the index.
idx = np.arange(X_patch.shape[0])
np.random.shuffle(idx)
shuffled_idx = th.from_numpy(idx)
X_patch = X_patch[shuffled_idx,:]
Y_patch = Y_patch[shuffled_idx]
# NOTE(review): consecutive patches overlap (overlap_rate), so a random split
# can put overlapping patches on both sides of the train/validation boundary.

N_samples = X_patch.shape[0]
N_valid = int(valid_rate*N_samples) # number of validation patches

X_valid = X_patch[:N_valid,:].cpu()
Y_valid = Y_patch[:N_valid].cpu()


# Inputs of nn.Conv1d must have the following shape: (N, C_in, *),
# where N: number of samples (batch size), C_in: number of input channels,
# *: can be any dimension (150_000 for our time series)

# Add channel dimension
X_valid = th.unsqueeze(X_valid, 1)
X_train = X_patch[N_valid:,:].cpu()
Y_train = Y_patch[N_valid:].cpu()
N_train = X_train.shape[0]

n_batch = int(X_train.shape[0] / batch_size) # Number of batch per epoch

X_train = X_train.reshape([N_train, 1, patch_size])

# *Function for Results plotting*

In [None]:
### Plotting Graphs

def _errors_in_original_units(scaler, errors):
    """Convert errors expressed in scaled units back to the original units.

    An error is a DIFFERENCE of two scaled values, so only the scale factor of
    the scaler applies. `inverse_transform` alone would also add the scaler's
    offset to every error; subtracting the image of zero removes that offset.
    """
    errors = np.asarray(errors, dtype=float).reshape(-1, 1)
    offset = scaler.inverse_transform(np.zeros((1, 1)))
    return (scaler.inverse_transform(errors) - offset).ravel()


def plot_and_save_results(train_losses, valid_losses, best_mvd, best_mtd, min_tl, min_vl, mm, model_name):
    """Plot the learning curves and error histograms, then save plot and run metadata.

    Parameters:
        train_losses, valid_losses: per-epoch losses.
        best_mvd, best_mtd: validation / training errors at the best epoch,
            in the scaled units produced by `mm`.
        min_tl, min_vl: training / validation loss at the best epoch.
        mm: fitted scaler of the target, used to express errors in original units.
        model_name: name of the sub-directory of 'Models/' (and file prefix)
            where the outputs are written.

    Side effects:
        Writes '<name>_plot.jpg', '<name>.p' and '<name>display.p' under
        'Models/<name>/' and shows the figure.

    Also reads these notebook globals: N_samples, N_train, overlap_rate,
    learning_rate, num_epochs, seed, batch_size, valid_differences.
    """
    best_epoch = np.fromiter(valid_losses, dtype=float).argmin()

    fig, ax = plt.subplots(1, 2, figsize = (20,10))
    plt.rcParams['font.size'] = '20'
    ax[0].set(title = "Losses: Training and Validation")
    ax[0].set_xlabel("epochs", fontsize = 20)
    ax[0].set_ylabel("MSE", fontsize = 20)
    ax[0].plot(train_losses,"r", label = "Training", linewidth = 3)
    ax[0].plot(valid_losses, "b", label = "Validation", linewidth = 3)
    ax[0].legend(loc = "upper right", fontsize = 18)
    ax[0].axvline(x=int(best_epoch), color = 'black', linestyle ="--", linewidth = 3)
    ax[0].annotate("Best epoch: {}\nMSE_train: {:.3f}\nMSE_valid: {:.3f}".format(int(best_epoch), min_tl, min_vl), \
                   xy = (0.5,0.5), xycoords = 'axes fraction')

    # Errors in original units, as flat 1-D arrays.
    best_mvd_plot = _errors_in_original_units(mm, best_mvd)
    best_mtd_plot = _errors_in_original_units(mm, best_mtd)

    N_valid = best_mvd_plot.shape[0]

    # Summary statistics of the validation error.
    mean = float(np.mean(best_mvd_plot))
    std_dev = float(np.std(best_mvd_plot))
    kurt = float(kurtosis(best_mvd_plot))
    skewn = float(skew(best_mvd_plot))
    q1 = float(np.quantile(best_mvd_plot, 0.25))
    median = float(np.quantile(best_mvd_plot, 0.5))
    q3 = float(np.quantile(best_mvd_plot, 0.75))
    mae = np.absolute(best_mvd_plot).sum() / N_valid
    mse = np.square(best_mvd_plot).sum() / N_valid

    text = "mean: {:.3f}\nstd: {:.3f}\nkurt: {:.3f}\nskew: {:.3f}\nq1: {:.3f}\nmed: {:.3f}\nq3: {:.3f}\niqr: {:.3f}\nmae: {:.3f}\nmse: {:.3f}\n".format(mean, std_dev, kurt, skewn, q1, median, q3, q3-q1, mae, mse)

    ax[1].hist(best_mvd_plot, alpha = 0.3, label = "validation set", bins = 100, density = True, range = (-16, 16))
    ax[1].set(title = "TTF error distributions at best epoch")
    ax[1].set_xlabel("Error (seconds)", fontsize = 20)
    ax[1].set_ylabel("Density", fontsize = 20)
    ax[1].hist(best_mtd_plot, alpha = 0.3, label = "training set", bins = 100, density = True, range = (-16, 16))
    ax[1].annotate(text, xy =(-15, 0.1))
    ax[1].legend()

    # Every output of this run shares the same path prefix.
    out_prefix = 'Models/' + model_name + '/' + model_name

    fig.savefig(out_prefix + "_plot.jpg")
    plt.show()

    # Lightweight description of the run (hyperparameters and data set sizes).
    model_features_display = {"N_samples": N_samples, "N_train": N_train, "N_valid": N_valid, \
                  "overlap_rate": overlap_rate, "learning_rate": learning_rate, "num_epochs": num_epochs, \
                 "seed": seed, "batch_size": batch_size}

    # Full description: the above plus the loss curves and error arrays.
    model_features = {**model_features_display, "train_losses": train_losses, "valid_losses": valid_losses, \
                     "valid_differences": valid_differences, "best_mtd": best_mtd, "best_mvd": best_mvd, "min_tl": min_tl, \
                      "min_vl": min_vl}

    # Context managers guarantee that both files are flushed and closed.
    with open(out_prefix + ".p", "wb") as f:
        pickle.dump(model_features, f)

    with open(out_prefix + "display.p", "wb") as f:
        pickle.dump(model_features_display, f)

# *LSTM*

An LSTM (Long Short-Term Memory) network is a type of recurrent neural network that is well suited to processing temporal sequences.

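PyTorch notes (the quote below describes the default axis order; the model in this notebook is built with `batch_first=True`, so its inputs are shaped (batch, sequence, features) instead):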

"Before getting to the example, note a few things. Pytorch’s LSTM expects all of its inputs to be 3D tensors. The semantics of the axes of these tensors is important. The first axis is the sequence itself, the second indexes instances in the mini-batch, and the third indexes elements of the input."

https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html


In [None]:
### Hyperparameters specific to LSTM

num_epochs = 1000 # number of training epochs
learning_rate = 0.001 # learning rate given to the Adam optimizer
hidden_size = 1 # number of features in the hidden state
num_layers = 1 # number of stacked lstm layers => should stay at 1 unless you want to combine two LSTMs together
N_sub_patches = 250 # number of sub-patches each patch is cut into (= LSTM sequence length)
N_features = 10 # number of statistics computed per sub-patch
# NOTE(review): this replaces the `device` chosen in the GPU check cell with a hard-coded string.
device = "cuda"
n_batch = 1 # One batch with all training data (replaces the value computed when the data sets were built)

In [None]:
def feature_expansion(X, N_sub_patches, L_sub_patch):
    """Summarise every sub-patch of every sequence by ten statistics.

    X is a 2-D array (samples x sequence length) whose rows are cut into
    N_sub_patches consecutive windows of L_sub_patch samples each.

    Returns an array of shape (samples, N_sub_patches, 10) whose last axis
    holds, in order: mean, std, skewness, kurtosis, min, max, first quartile,
    median, third quartile and inter-quartile range.
    """
    n_rows = X.shape[0]
    windows = X.reshape(n_rows, N_sub_patches, L_sub_patch)

    # Quartiles are named because the inter-quartile range is derived from them.
    lower_q, middle_q, upper_q = (np.quantile(windows, q, axis = 2) for q in (0.25, 0.5, 0.75))

    # One (samples, N_sub_patches) array per statistic, in output order.
    per_window_stats = [
        np.mean(windows, axis = 2),
        np.std(windows, axis = 2),
        np.array(scipy.stats.skew(windows, axis = 2), dtype = np.double),
        np.array(scipy.stats.kurtosis(windows, axis = 2), dtype = np.double),
        np.min(windows, axis = 2),
        np.max(windows, axis = 2),
        lower_q,
        middle_q,
        upper_q,
        upper_q - lower_q,
    ]

    # Stacking on a new last axis gives the feature dimension.
    return np.stack(per_window_stats, axis = 2)
    

def lstm_feature_engineering(X_train, Y_train, X_valid, Y_valid, patch_size, N_sub_patches):
    """Turn raw patches into per-sub-patch statistical features for the LSTM.

    Parameters:
        X_train, X_valid: tensors of shape (N, 1, L) or (N, L).
        Y_train, Y_valid: target tensors, returned as tensor copies.
        patch_size: unused; kept so that existing calls keep working
            (the sequence length is read from X_train).
        N_sub_patches: number of sub-patches each sequence is cut into.

    Returns:
        (X_train, Y_train, X_valid, Y_valid) where both X tensors have shape
        (N, N_sub_patches, 10); see feature_expansion for the feature order.
    """
    # Flatten everything after the batch axis. A bare th.squeeze would also
    # drop the batch axis whenever a set holds a single sample.
    X_train = np.array(X_train.reshape(X_train.shape[0], -1)); Y_train = np.array(Y_train)
    X_valid = np.array(X_valid.reshape(X_valid.shape[0], -1)); Y_valid = np.array(Y_valid)

    L_seq = X_train.shape[1]
    L_sub_patch = int(L_seq / N_sub_patches)

    X_train = th.from_numpy(feature_expansion(X_train, N_sub_patches, L_sub_patch))
    X_valid = th.from_numpy(feature_expansion(X_valid, N_sub_patches, L_sub_patch))

    Y_train = th.from_numpy(Y_train); Y_valid = th.from_numpy(Y_valid)

    return X_train, Y_train, X_valid, Y_valid

In [None]:
# Feature tensors for the LSTM; the trailing underscore distinguishes them from the raw patches.
X_train_, Y_train_, X_valid_, Y_valid_ = lstm_feature_engineering(X_train, Y_train, X_valid, Y_valid, patch_size, N_sub_patches = N_sub_patches)
# Number of features per sub-patch, read from the data itself.
N_features = X_train_.shape[-1]

In [None]:
# Inspired from https://cnvrg.io/pytorch-lstm/

class Lstm(nn.Module):
    """LSTM regressor whose prediction is read directly from the hidden state.

    The prediction for a sequence is the first hidden-state component at the
    last time step; there is no output layer after the LSTM.
    """

    def __init__(self, N_features, hidden_size, num_layers, seq_length):
        super().__init__()

        # Keep the configuration on the instance.
        self.seq_length = seq_length    # sequence length
        self.hidden_size = hidden_size  # size of the hidden state
        self.N_features = N_features    # number of input features
        self.num_layers = num_layers    # number of stacked LSTM layers

        # batch_first=True => input shape is (N_batch, L_seq, N_features)
        self.lstm = nn.LSTM(
            input_size=N_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Return one prediction per sequence of the batch x, as a 1-D tensor."""
        # Input is cast to float32 to match the LSTM parameters.
        hidden_states, _ = self.lstm(x.float())
        # Last time step, first hidden component.
        return hidden_states[:, -1, 0]

In [None]:
# The LSTM sees one time step per sub-patch.
L_seq = N_sub_patches

In [None]:
model = Lstm(N_features, hidden_size, num_layers, L_seq) #our lstm class
# Move the parameters to the GPU (requires a CUDA device).
model.cuda()

In [None]:
loss = th.nn.MSELoss()    # mean-squared error for regression
# Adam over all model parameters, with the LSTM-specific learning rate.
optimizer = th.optim.Adam(model.parameters(), lr=learning_rate) 

In [None]:
# Training LSTM
# Full-batch training: every epoch is one optimisation step on the whole
# training set, followed by an evaluation on the validation set.

th.cuda.empty_cache()

train_losses, valid_losses = [], []
best_mvd, best_mtd = [], []  # validation / training errors at the best epoch

min_vl = 1000
min_tl = 1000

start = time.perf_counter()

for epoch in range(num_epochs):

    optimizer.zero_grad()  # clear the gradients left by the previous step
    outputs = model(X_train_.cuda())  # forward pass
    _loss = loss(outputs, Y_train_.cuda())
    _loss.backward()  # back-propagation: compute the gradients of the loss
    running_loss = _loss.item()
    optimizer.step()  # update the parameters from the gradients

    model.eval()
    with th.no_grad():

        Y_valid_pred = th.squeeze(model(X_valid_.cuda())).cpu()
        # Plain floats are stored so that min_vl can be formatted and pickled
        # without dragging a tensor along.
        valid_loss = loss(Y_valid_pred, Y_valid_).item()
        valid_differences = (Y_valid_pred - Y_valid_).numpy()

        valid_losses.append(valid_loss)
        train_losses.append(running_loss / n_batch)

        if valid_loss < min_vl:

            min_vl = valid_loss
            min_tl = running_loss

            best_mvd = valid_differences

            Y_final = th.squeeze(model(X_train_.cuda())).cpu()

            # Same sign convention as the validation error (prediction minus
            # target); with the opposite sign the training histogram would be
            # mirrored with respect to the validation one.
            best_mtd = (Y_final - Y_train_).numpy()

    model.train()

    if epoch%10 == 0:
        print("Epoch: {}\t".format(epoch),
                "train Loss: {:.5f}.. ".format(train_losses[-1]),
                "valid Loss: {:.5f}.. ".format(valid_losses[-1]))

print("---------- Best : {:.3f}".format(min(valid_losses)), " at epoch "
    , np.fromiter(valid_losses, dtype=float).argmin(), " / ", epoch + 1)

end = time.perf_counter()
print("\ntime elapsed: {:.3f}".format(end - start))

In [None]:
#model_name = input("Choose a name for the model: ")
model_name = "Lstm"
# makedirs also creates a missing 'Models' parent, and exist_ok lets the cell be re-run.
os.makedirs('Models/' + model_name, exist_ok=True)

In [None]:
# Plot the curves and pickle the run metadata under Models/<model_name>/.
plot_and_save_results(train_losses, valid_losses, best_mvd, best_mtd, min_tl, min_vl, mm, model_name)
# NOTE(review): this saves the weights `model` holds right now (after the last epoch);
# the training loop keeps no copy of the weights of the best validation epoch.
th.save(model.state_dict(), 'Models/' + model_name + '/' + model_name)

In [None]:
# Free the raw training patches before the test data is loaded.
del X_train; gc.collect()

In [None]:
# Free the remaining raw patches and targets.
del X_valid; gc.collect()
del Y_valid; gc.collect()
del Y_train; gc.collect()

# *Retrieve Test Data*

In [None]:
### Retrieve Test Data

#rootpath = os.getcwd() + "/"
rootpath = "../input/LANL-Earthquake-Prediction/"

sample_submission = pd.read_csv(rootpath + "sample_submission.csv")

# Load the segments in the order of the submission file. Predictions are later
# written into `sample_submission` by position, and os.listdir returns files in
# arbitrary order, which would pair predictions with the wrong seg_id.
X_test_ = []

for seg_id in sample_submission["seg_id"]:
    temp_df = pd.read_csv(rootpath + "test/" + seg_id + ".csv")
    X_test_.append(temp_df["acoustic_data"].values)

# From here on, patch_size is the length of a test segment.
patch_size = X_test_[0].shape[0]

X_test = th.zeros((len(X_test_), patch_size))
for i, segment in enumerate(X_test_):
    X_test[i,:] = th.as_tensor(segment, dtype = th.float32)

del X_test_; gc.collect()

# Placeholder targets taken from the sample submission.
Y_test = th.tensor(sample_submission["time_to_failure"])

# Add the channel dimension: (N_test, 1, patch_size).
N_test = X_test.shape[0]
X_test = X_test.reshape([N_test, 1, patch_size])

# *Submission*

In [None]:
### Load Model

# Rebuild the architecture, then load the weights saved after training.
model = Lstm(N_features, hidden_size, num_layers, L_seq).cuda()
model.load_state_dict(th.load('Models/Lstm/Lstm'))

In [None]:
L_sub_patch = int(patch_size / N_sub_patches)
X_test = np.array(th.squeeze(X_test))

# The model was trained on a signal standardised with `ss`; apply the same
# transformation to the test signal, otherwise the features fed to the model
# are on a different scale from the ones it was trained on.
# float32 scalars keep the large array in float32.
X_test = (X_test - np.float32(ss.mean_[0])) / np.float32(ss.scale_[0])

X_test_expanded = feature_expansion(X_test, N_sub_patches, L_sub_patch)
X_test_expanded = th.from_numpy(X_test_expanded).cuda()

model.eval()
with th.no_grad():
    Y_test_predicted = model(X_test_expanded)

In [None]:
# The model predicts in the MinMax-scaled units of the training target; map the
# predictions back to the original units with the fitted scaler `mm`.
Y_test_predicted = np.array(Y_test_predicted.cpu()).reshape(-1, 1)
Y_test_predicted = mm.inverse_transform(Y_test_predicted).ravel()

In [None]:
# Predictions are assigned by position: row i receives prediction i.
sample_submission['time_to_failure'] = Y_test_predicted

In [None]:
# Write the submission file without the dataframe index column.
sample_submission.to_csv('submission.csv', index = False)