In [None]:
import numpy as np 
import pandas as pd 
import os
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
from lightgbm import LGBMRegressor
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse




# Location of the competition data and of the notebook's own outputs.
root_filepath = "../input/predict-volcanic-eruptions-ingv-oe/"
save_filepath = "./"

# Output folders. exist_ok=True keeps this cell re-runnable: a plain os.mkdir
# raises FileExistsError the second time the cell is executed.
os.makedirs("./preprocessed", exist_ok=True)
os.makedirs("./predictions", exist_ok=True)

In [None]:
def get_signals(id, folder="train"):
    """Read the CSV of one segment and return it as a DataFrame.

    Parameters
    ----------
    id : segment identifier; its string form is used as the file name.
    folder : sub-directory of ``root_filepath`` to read from ("train" by default).

    Returns
    -------
    pandas.DataFrame with the contents of ``<root_filepath><folder>/<id>.csv``.
    """
    csv_path = f"{root_filepath}{folder}/{str(id)}.csv"
    return pd.read_csv(csv_path)


Read the input files and do a little **preprocessing**

In [None]:
# Segment ids with their regression target.
train = pd.read_csv(root_filepath + "train.csv")

# The sample submission lists the test segment ids; its placeholder target is dropped.
test = pd.read_csv(root_filepath + "sample_submission.csv").drop(columns="time_to_eruption")

# Training segments ordered by ascending target, used below to pick example segments.
sorted_train = train.sort_values(by="time_to_eruption")

# Distribution of the target.
plt.hist(train["time_to_eruption"], bins=100)
plt.show()

Showing the Fourier transform (real part) of the signals for examples with a low and a high time to eruption.

In [None]:
# Compare the spectra of two example segments, one sensor per row:
# left column  -> segment at position 12 of sorted_train (small time_to_eruption),
# right column -> segment at position -12 (large time_to_eruption).
segment_ids = list(sorted_train["segment_id"])

# Load each example segment once; previously the same CSV was re-read
# (and the id list rebuilt) for every one of the 20 subplots.
low_signals = get_signals(segment_ids[12])
high_signals = get_signals(segment_ids[-12])

plt.figure(figsize=(25,20))
for i in range(20):
    plt.subplot(10,2,i+1)
    sensor = i//2+1
    if i%2==0:
        source, label = low_signals, "MIN"
    else:
        source, label = high_signals, "maX"
    # Real part of the FFT, shifted so that the zero-frequency bin is centred.
    # Missing readings are replaced by 0 before transforming.
    spectrum = np.fft.fftshift(np.real(np.fft.fft(source[f"sensor_{sensor}"].fillna(0))))
    plt.plot(spectrum)
    plt.title(f"{sensor} {label} sensor")
        


Showing the same signals without fft

In [None]:
# Same two example segments as in the spectrum plot, shown in the time domain:
# left column  -> segment at position 12 of sorted_train (small time_to_eruption),
# right column -> segment at position -12 (large time_to_eruption).
segment_ids = list(sorted_train["segment_id"])

# Load each example segment once instead of re-reading its CSV for every subplot.
low_signals = get_signals(segment_ids[12])
high_signals = get_signals(segment_ids[-12])

plt.figure(figsize=(25,20))
for i in range(20):
    plt.subplot(10,2,i+1)
    sensor = i//2+1
    if i%2==0:
        source, label = low_signals, "MIN"
    else:
        source, label = high_signals, "maX"
    # Missing readings are drawn as 0.
    plt.plot(source[f"sensor_{sensor}"].fillna(0))
    plt.title(f"{sensor} {label} sensor")


Let's build the following features:
    - for every sensor:
        * 5,10,25,30,60,70,90% quantile
        * signal mean
        * signal std
        * signal variance
        * skew
    - for its fft compute:
        * min
        * max
        * mean
        * std

In [None]:
def create_feature_particle(signal, segment_id, sensor_id):
    """Compute the summary features of one sensor signal as a one-row DataFrame.

    Missing readings are replaced by 0 before any statistic is computed.

    Parameters
    ----------
    signal : pandas.Series holding the readings of a single sensor.
    segment_id : label used as the index of the returned row.
    sensor_id : sensor number, appended to every column name as ``_s<sensor_id>``.

    Returns
    -------
    pandas.DataFrame with a single row indexed by ``segment_id`` and 15 columns:
    seven quantiles (5, 10, 25, 30, 60, 70, 90 %), mean, std, variance and skew
    of the signal, followed by mean, std, min and max of the real part of its FFT.
    """
    signal = signal.fillna(0)
    # Only the real part of the transform is used for the spectral statistics.
    # NOTE(review): the magnitude (np.abs) may have been intended - confirm
    # before changing, since cached feature files depend on this definition.
    furier = np.real(np.fft.fft(signal))

    # Column label -> probability; evaluated in a single np.quantile call.
    quantile_levels = {
        "5th": 0.05,
        "10th": 0.1,
        "25th": 0.25,
        "30th": 0.3,
        "60th": 0.6,
        "70th": 0.7,
        "90th": 0.9,
    }
    quantile_values = np.quantile(signal, list(quantile_levels.values()))

    # Build the whole row first and create the DataFrame once, instead of
    # enlarging an empty frame cell by cell with .loc (slow, and it is called
    # once per sensor per segment). Insertion order fixes the column order.
    features = {
        f"{label}_quantile_s{sensor_id}": value
        for label, value in zip(quantile_levels, quantile_values)
    }
    features[f"mean_s{sensor_id}"] = signal.mean()
    features[f"std_s{sensor_id}"] = signal.std()
    features[f"var_s{sensor_id}"] = signal.var()
    features[f"skew_s{sensor_id}"] = signal.skew()
    features[f"fft_mean_s{sensor_id}"] = furier.mean()
    features[f"fft_std_s{sensor_id}"] = furier.std()
    features[f"fft_min_s{sensor_id}"] = furier.min()
    features[f"fft_max_s{sensor_id}"] = furier.max()

    return pd.DataFrame(features, index=[segment_id])



    

In [None]:
def create_na_feat(segment_id, folder="train"):
    """Return the share of missing readings per sensor for one segment.

    Parameters
    ----------
    segment_id : segment to load through ``get_signals``; also the row index.
    folder : data sub-directory passed on to ``get_signals``.

    Returns
    -------
    pandas.DataFrame with one row indexed by ``segment_id`` and the columns
    ``na_percent_s1`` ... ``na_percent_s10``. Despite the column name, each
    value is a fraction between 0 and 1 (missing count / number of rows).
    """
    data = get_signals(segment_id, folder=folder)
    # isna().mean() equals isna().sum() / len(column). The row is assembled in
    # one go rather than by enlarging an empty DataFrame one cell at a time.
    na_fractions = {
        f"na_percent_s{i}": data[f"sensor_{i}"].isna().mean()
        for i in range(1, 11)
    }
    return pd.DataFrame(na_fractions, index=[segment_id])

In [None]:
def make_features(train, folder="train"):
    """Build the feature table for every segment listed in ``train``.

    Parameters
    ----------
    train : DataFrame with a ``segment_id`` column; one output row per entry.
    folder : data sub-directory the segment CSVs are read from.

    Returns
    -------
    pandas.DataFrame whose first column is ``segment_id``, followed by the
    per-sensor features from ``create_feature_particle`` for sensors 1-10 and
    finally the ten ``na_percent_s<i>`` columns.
    """
    sensor_ids = range(1, 11)
    segments = []

    for ci, seg in enumerate(train["segment_id"]):
        # Lightweight progress report every 100 segments.
        if ci % 100 == 0:
            print(ci)

        signals = get_signals(seg, folder=folder)
        segment_row = [
            create_feature_particle(signals[f"sensor_{i}"], seg, i)
            for i in sensor_ids
        ]
        # Missing-value share per sensor, taken from the frame that is already
        # in memory. Calling create_na_feat here would read the same CSV from
        # disk a second time for every segment.
        na_row = pd.DataFrame(
            {f"na_percent_s{i}": signals[f"sensor_{i}"].isna().mean() for i in sensor_ids},
            index=[seg],
        )
        segments.append(pd.concat(segment_row + [na_row], axis=1))

    featured_train = pd.concat(segments, axis=0)
    # The row index holds the segment ids; expose it as the first column.
    return featured_train.rename_axis("segment_id").reset_index()

# **A cell to preprocess the train and the test set**

In [None]:
# This cell is disabled: the code below is wrapped in a string literal, so
# nothing in it is executed (the string is only echoed as the cell output).
# When enabled it builds the train/test feature tables with make_features and
# saves them under save_filepath + "preprocessed/".
# NOTE(review): to_csv is called without index=False, so pandas also writes the
# row index; reading such a file back with read_csv yields an extra
# "Unnamed: 0" column.
"""featured_train = pd.merge(make_features(train), train, on="segment_id")
featured_train.to_csv(save_filepath + "preprocessed/featured_train.csv")
featured_test = make_features(test, folder="test")
featured_test.to_csv(save_filepath + "preprocessed/featured_test.csv")
print("Save successful!")"""

In [None]:
def _drop_saved_index(frame):
    """Remove 'Unnamed: N' columns that read_csv creates for a saved row index."""
    return frame.loc[:, ~frame.columns.str.startswith("Unnamed:")]


# Pre-computed feature tables. The preprocessing cell saves them with the
# default to_csv settings, which also writes the row index; on reading, that
# index comes back as an "Unnamed: 0" column and would otherwise be fed to the
# model as a meaningless feature. Such columns are dropped only if present.
featured_train = _drop_saved_index(pd.read_csv("../input/featured-train/featured_train.csv"))
featured_test = _drop_saved_index(pd.read_csv("../input/featured-train/featured_test.csv"))



In [None]:
# Correlation of every signal feature with the target.
# The missing-value features are excluded by name. Slicing the last 10 columns
# was off by one: time_to_eruption is the last column of featured_train, so
# that slice covered only na_percent_s2..s10 and left na_percent_s1 in.
na_cols = [col for col in featured_train.columns if col.startswith("na_percent_")]
f_train = featured_train.drop(columns=["segment_id", "time_to_eruption"] + na_cols)

# Despite its name this is a Series: one correlation coefficient per feature.
corr_matrix = f_train.corrwith(featured_train["time_to_eruption"])

fig = plt.figure(figsize=(8, 30))
sns.scatterplot(x=list(corr_matrix), y=corr_matrix.index)

In [None]:
# Features whose absolute correlation with the target is below 0.01.
is_weak = corr_matrix.abs() < 0.01
dropped_cols = list(corr_matrix.index[is_weak])
print(dropped_cols)

In [None]:
# Model inputs: every column except the segment id and the target.
X = featured_train.drop(["segment_id", "time_to_eruption"], axis=1)
y = featured_train["time_to_eruption"]

# Recursive feature elimination down to 70 features, removing 3 per round.
# The tree gets a fixed random_state: an unseeded DecisionTreeRegressor can
# rank features differently between runs, making the selection irreproducible.
rfe_test = RFE(
    estimator=DecisionTreeRegressor(random_state=101),
    n_features_to_select=70,
    step=3,
)
rfe_test.fit(X,y)

In [None]:
# Boolean selection mask from the fitted RFE, one entry per column of X.
mask = rfe_test.support_

# Only the columns before the last ten are inspected; zip stops at the shorter
# sequence, so the trailing mask entries are ignored.
candidate_cols = list(X.columns)[:-10]
removed = [col for col, keep in zip(candidate_cols, mask) if not keep]
print(removed)

In [None]:
# Hold out 20% of the training rows for validation (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=103,
)

In [None]:
def rmse(true, pred):
    """Root mean squared error between ``true`` and ``pred``.

    Computed as the square root of ``mse(true, pred)``.
    """
    mean_squared = mse(true, pred)
    return np.sqrt(mean_squared)

In [None]:
# Hyper-parameters of the LightGBM regressor.
params = {
    'num_leaves': 29,
    'n_estimators': 289,
    'max_depth': 8,
    # The dict used to contain both 'min_child_samples': 507 and
    # 'min_data_in_leaf': 13. LightGBM documents min_child_samples as an alias
    # of min_data_in_leaf, so the two entries contradicted each other. Per the
    # LightGBM parameter docs the primary name wins, i.e. 13 was the value in
    # effect; it is kept under the scikit-learn name so only one entry remains.
    # NOTE(review): confirm against the alias warning in an earlier training log.
    'min_child_samples': 13,
    'learning_rate': 0.0812634327662599,
    # NOTE(review): bagging_fraction has no effect unless bagging_freq > 0,
    # and bagging_freq is not set here (LightGBM default: 0). Left as is so
    # the fitted model does not change; enabling bagging at ~2% would.
    'bagging_fraction': 0.020521665677937423,
    'feature_fraction': 0.05776459974779927,
    'random_state': 101
}

lgb_model = LGBMRegressor(**params)
lgb_model.fit(X_train, y_train)
pred = lgb_model.predict(X_val)

# Validation error; arguments follow rmse(true, pred).
print(rmse(y_val, pred))

In [None]:
# Predict on the test features and write the submission file.
test_features = featured_test.drop(columns=["segment_id"])

# Build the frame with explicit column names. Concatenating an unnamed
# prediction frame produced a column called 0 instead of "time_to_eruption".
# The ids are taken from featured_test so each prediction stays paired with
# the row it was computed from.
submission = pd.DataFrame({
    "segment_id": featured_test["segment_id"],
    "time_to_eruption": lgb_model.predict(test_features),
})

# index=False: the row index is not part of the submission format
# (segment_id, time_to_eruption, as in sample_submission.csv).
submission.to_csv("./submission.csv", index=False)