In [None]:
# standard library
import glob
import os

# numerics, data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import progress bar
from tqdm import tqdm

# import library for filter
from scipy.signal import butter,filtfilt

# import library for stat
from scipy.stats import skew,kurtosis

# import library for FFT and power spectra calculations
from numpy.fft import fft,fftfreq

# Import the functions we'll use for the STFT
import librosa as lr
from librosa.core import stft, amplitude_to_db


# from lightgbm import LGBMRegressor
# # import library for machine learning
from sklearn import preprocessing, model_selection, metrics
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score


# Import the necessary modules
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>Data loading and features extraction</center></h3>

## 1: load the data
### 1.1: create df with the sequence ID in training set

In [None]:
# List every training segment file (one CSV per segment).
file_path = r'/kaggle/input/predict-volcanic-eruptions-ingv-oe/train'
all_files = glob.glob(file_path + "/*.csv")

# filename_list keeps its original shape: a list whose single element is
# the list of paths.
filename_list = [all_files]

# The segment id is the file name without its directory and extension,
# parsed as an integer.
list_sequence = [
    int(os.path.splitext(os.path.basename(path))[0]) for path in all_files
]

# Building the frame from a dict yields the 'segment_id' column even when
# no file matched; assigning `.columns` on an empty frame raises ValueError.
df_list_sequence = pd.DataFrame({'segment_id': list_sequence})
df_list_sequence.head()

### 1.2: create df with the sequence ID in testing set

In [None]:
# List every test segment file (one CSV per segment).
file_path_test = r'/kaggle/input/predict-volcanic-eruptions-ingv-oe/test'
all_files_test = glob.glob(file_path_test + "/*.csv")

# The list of paths is stored once; the original cell appended it twice
# through a copy-pasted statement.
filename_list_test = [all_files_test]

# The segment id is the file name without its directory and extension,
# parsed as an integer.
list_sequence_test = [
    int(os.path.splitext(os.path.basename(path))[0]) for path in all_files_test
]

# Building the frame from a dict yields the 'segment_id' column even when
# no file matched; assigning `.columns` on an empty frame raises ValueError.
df_list_sequence_test = pd.DataFrame({'segment_id': list_sequence_test})
df_list_sequence_test.head()

### 1.3: load file with segment id and time to eruption

In [None]:
# Table read from train.csv; the next cell uses its `time_to_eruption` column.
TRAIN_LABELS_CSV = "../input/predict-volcanic-eruptions-ingv-oe/train.csv"
train = pd.read_csv(TRAIN_LABELS_CSV)
train.head()

In [None]:
# Distribution of the target: 20-bin histogram, then its skewness.
time_to_eruption = train['time_to_eruption']
_ = time_to_eruption.hist(bins=20)
print(time_to_eruption.skew())

## 2: attributes for feature engineering

In [None]:
# # Filter requirements.
# T = 600.0         # Sample Period
# fs = 100.0       # sample rate, Hz
# cutoff = 20      # desired cutoff frequency of the filter, Hz ,
# nyq = 0.5 * fs  # Nyquist Frequency
# normal_cutoff = cutoff / nyq
# order = 2       # sin wave can be approx represented as quadratic
# n = int(T * fs) # total number of samples

# # Prepare the STFT
# HOP_LENGTH = 2**6
# SIZE_WINDOW = 2**9

In [None]:
# def butter_lowpass_filter(data, cutoff, fs, order):
#     """
#     remove frequencies higher than cutoff (low-pass filter)
#     Parameters
#     ----------
#     data : array_like
#     cutoff: int
#     fs: float
#     order: int
#     Returns
#     -------
#     y:numpy.ndarray
#     """
#     # Get the filter coefficients 
#     b, a = butter(order, normal_cutoff, btype='low', analog=False)
#     y = filtfilt(b, a, data)
#     return y

# def compute_stat(var):
#     var_sum = np.sum(var)
#     var_max = np.max(var)
#     var_mean = np.mean(var)
#     var_std = np.std(var)
# #     var_p01 = np.percentile(var,1)
# #     var_p05 = np.percentile(var,5)
# #     var_p15 = np.percentile(var,15)
#     var_p25 = np.percentile(var,25)
# #     var_p35 = np.percentile(var,35)
# #     var_p45 = np.percentile(var,45)
#     var_p50 = np.percentile(var,50)
# #     var_p55 = np.percentile(var,55)
#     var_p75 = np.percentile(var,75)
# #     var_p85 = np.percentile(var,85)
# #     var_p90 = np.percentile(var,90)
#     var_p95 = np.percentile(var,95)
#     var_p99 = np.percentile(var,99)
#     var_skew = skew(var)
#     var_kurtosis = kurtosis(var)
#     result=[[var_sum,var_max,var_mean,var_std,var_skew,var_kurtosis,\
#             var_p25,var_p50,var_p75,var_p95,var_p99]]
#     X = pd.DataFrame(result)
    
#     return X

In [None]:
# def extract_feature(list_files,list_sequence):
#     features_env  = pd.DataFrame()
#     features_fft  = pd.DataFrame()
#     features_centroids = pd.DataFrame()
#     features_bandwidths = pd.DataFrame()

#     for filename in list_files:
#         # load the file
#         sequence = pd.read_csv(filename)
#         # replace NaN value by '0'
#         sequence = sequence.fillna(0)

#         # -----------------------------     filter the signals
#         df_filter = pd.DataFrame()    
#         for i in range(0,sequence.shape[1]):
#             filter_sequence = pd.DataFrame(butter_lowpass_filter(sequence.iloc[:,i], cutoff, fs, order=2))
#             df_filter = pd.concat([df_filter,filter_sequence],axis=1)

#         # ------------------------------       envelop
#         # calculate absolute signal
#         df_sequence_abs = df_filter.apply(np.abs)
#         features_env_1 = pd.DataFrame()
#         for i in range(0,df_sequence_abs.shape[1]):
#             stat = compute_stat(df_sequence_abs.iloc[:,i])
#             features_env_1 = pd.concat([features_env_1,stat],axis=1)
            
#         features_env = pd.concat([features_env,features_env_1]) 
        
#         # ------------------------------        fft
#         # # FFT parameters
#         n=len(filter_sequence)    # number of point
#         Lx =600                   # time period
#         freqs = fftfreq(n)        # Creates all the necessary frequencies
#         mask = freqs > 0          # mask array to be used for power spectra
#         ## fft
# #         fft_stat = pd.DataFrame()
#         features_fft_1 = pd.DataFrame()
#         features_bandwidths_1 = pd.DataFrame()
#         features_centroids_1  = pd.DataFrame()
#         for i in range(0,df_filter.shape[1]):
#             #----------------------------- extract features from fft
#             fft_vals = fft(df_filter.iloc[:, i].values)
#             fft_theo = 2.0*np.abs(fft_vals/n)
#             stat_fft = compute_stat(fft_theo[mask])
#             features_fft_1 = pd.concat([features_fft_1,stat_fft],axis=1) 
            
#             # ------------------------ extract features from spectogram
#             # convert signal to numpy array float
#             arr = np.array(df_filter.iloc[:, i].values.astype(float))
#             # create the spectogramme
#             spec = stft(arr, hop_length=HOP_LENGTH, n_fft=SIZE_WINDOW)
#             # Convert into decibels
#             spec_db = amplitude_to_db(np.abs(spec))
#             # Calculate the spectral centroid and bandwidth for the spectrogram
#             bandwidths = lr.feature.spectral_bandwidth(S=np.abs(spec_db))[0]
#             centroids  = lr.feature.spectral_centroid(S=np.abs(spec_db))[0]
#             # remove high values at the beginning and end
#             centroids = centroids[3:-3]
#             bandwidths = bandwidths[3:-3]
#             # feature extraction 
#             stat_bandwidths  = compute_stat(bandwidths)
#             stat_centroids   = compute_stat(centroids)
#             features_bandwidths_1  = pd.concat([features_bandwidths_1,stat_bandwidths],axis=1)
#             features_centroids_1  = pd.concat([features_centroids_1,stat_centroids],axis=1)
         
#         features_fft = pd.concat([features_fft,features_fft_1])
#         features_bandwidths = pd.concat([features_bandwidths,features_bandwidths_1])
#         features_centroids = pd.concat([features_centroids,features_centroids_1])
        
#     # add column name
#     name_envelop = ['env_s1_sum','env_s1_max','env_s1_mean','env_s1_std','env_s1_skew','env_s1_kurtosis','env_s1_p25','env_s1_p50','env_s1_p75','env_s1_p95','env_s1_p99',
#                     'env_s2_sum','env_s2_max','env_s2_mean','env_s2_std','env_s2_skew','env_s2_kurtosis','env_s2_p25','env_s2_p50','env_s2_p75','env_s2_p95','env_s2_p99',
#                     'env_s3_sum','env_s3_max','env_s3_mean','env_s3_std','env_s3_skew','env_s3_kurtosis','env_s3_p25','env_s3_p50','env_s3_p75','env_s3_p95','env_s3_p99',
#                     'env_s4_sum','env_s4_max','env_s4_mean','env_s4_std','env_s4_skew','env_s4_kurtosis','env_s4_p25','env_s4_p50','env_s4_p75','env_s4_p95','env_s4_p99',
#                     'env_s5_sum','env_s5_max','env_s5_mean','env_s5_std','env_s5_skew','env_s5_kurtosis','env_s5_p25','env_s5_p50','env_s5_p75','env_s5_p95','env_s5_p99',
#                     'env_s6_sum','env_s6_max','env_s6_mean','env_s6_std','env_s6_skew','env_s6_kurtosis','env_s6_p25','env_s6_p50','env_s6_p75','env_s6_p95','env_s6_p99',
#                     'env_s7_sum','env_s7_max','env_s7_mean','env_s7_std','env_s7_skew','env_s7_kurtosis','env_s7_p25','env_s7_p50','env_s7_p75','env_s7_p95','env_s7_p99',
#                     'env_s8_sum','env_s8_max','env_s8_mean','env_s8_std','env_s8_skew','env_s8_kurtosis','env_s8_p25','env_s8_p50','env_s8_p75','env_s8_p95','env_s8_p99',
#                     'env_s9_sum','env_s9_max','env_s9_mean','env_s9_std','env_s9_skew','env_s9_kurtosis','env_s9_p25','env_s9_p50','env_s9_p75','env_s9_p95','env_s9_p99',
#                     'env_s10_sum','env_s10_max','env_s10_mean','env_s10_std','env_s10_skew','env_s10_kurtosis','env_s10_p25','env_s10_p50','env_s10_p75','env_s10_p95','env_s10_p99']
#     features_env.columns = name_envelop
# # #
#     name_fft = ['fft_s1_sum','fft_s1_max','fft_s1_mean','fft_s1_std','fft_s1_skew','fft_s1_kurtosis','fft_s1_p25','fft_s1_p50','fft_s1_p75','fft_s1_p95','fft_s1_p99',
#                     'fft_s2_sum','fft_s2_max','fft_s2_mean','fft_s2_std','fft_s2_skew','fft_s2_kurtosis','fft_s2_p25','fft_s2_p50','fft_s2_p75','fft_s2_p95','fft_s2_p99',
#                     'fft_s3_sum','fft_s3_max','fft_s3_mean','fft_s3_std','fft_s3_skew','fft_s3_kurtosis','fft_s3_p25','fft_s3_p50','fft_s3_p75','fft_s3_p95','fft_s3_p99',
#                     'fft_s4_sum','fft_s4_max','fft_s4_mean','fft_s4_std','fft_s4_skew','fft_s4_kurtosis','fft_s4_p25','fft_s4_p50','fft_s4_p75','fft_s4_p95','fft_s4_p99',
#                     'fft_s5_sum','fft_s5_max','fft_s5_mean','fft_s5_std','fft_s5_skew','fft_s5_kurtosis','fft_s5_p25','fft_s5_p50','fft_s5_p75','fft_s5_p95','fft_s5_p99',
#                     'fft_s6_sum','fft_s6_max','fft_s6_mean','fft_s6_std','fft_s6_skew','fft_s6_kurtosis','fft_s6_p25','fft_s6_p50','fft_s6_p75','fft_s6_p95','fft_s6_p99',
#                     'fft_s7_sum','fft_s7_max','fft_s7_mean','fft_s7_std','fft_s7_skew','fft_s7_kurtosis','fft_s7_p25','fft_s7_p50','fft_s7_p75','fft_s7_p95','fft_s7_p99',
#                     'fft_s8_sum','fft_s8_max','fft_s8_mean','fft_s8_std','fft_s8_skew','fft_s8_kurtosis','fft_s8_p25','fft_s8_p50','fft_s8_p75','fft_s8_p95','fft_s8_p99',
#                     'fft_s9_sum','fft_s9_max','fft_s9_mean','fft_s9_std','fft_s9_skew','fft_s9_kurtosis','fft_s9_p25','fft_s9_p50','fft_s9_p75','fft_s9_p95','fft_s9_p99',
#                     'fft_s10_sum','fft_s10_max','fft_s10_mean','fft_s10_std','fft_s10_skew','fft_s10_kurtosis','fft_s10_p25','fft_s10_p50','fft_s10_p75','fft_s10_p95','fft_s10_p99']
#     features_fft.columns = name_fft

#     name_centroid = ['cent_s1_sum','cent_s1_max','cent_s1_mean','cent_s1_std','cent_s1_skew','cent_s1_kurtosis','cent_s1_p25','cent_s1_p50','cent_s1_p75','cent_s1_p95','cent_s1_p99',
#                     'cent_s2_sum','cent_s2_max','cent_s2_mean','cent_s2_std','cent_s2_skew','cent_s2_kurtosis','cent_s2_p25','cent_s2_p50','cent_s2_p75','cent_s2_p95','cent_s2_p99',
#                     'cent_s3_sum','cent_s3_max','cent_s3_mean','cent_s3_std','cent_s3_skew','cent_s3_kurtosis','cent_s3_p25','cent_s3_p50','cent_s3_p75','cent_s3_p95','cent_s3_p99',
#                     'cent_s4_sum','cent_s4_max','cent_s4_mean','cent_s4_std','cent_s4_skew','cent_s4_kurtosis','cent_s4_p25','cent_s4_p50','cent_s4_p75','cent_s4_p95','cent_s4_p99',
#                     'cent_s5_sum','cent_s5_max','cent_s5_mean','cent_s5_std','cent_s5_skew','cent_s5_kurtosis','cent_s5_p25','cent_s5_p50','cent_s5_p75','cent_s5_p95','cent_s5_p99',
#                     'cent_s6_sum','cent_s6_max','cent_s6_mean','cent_s6_std','cent_s6_skew','cent_s6_kurtosis','cent_s6_p25','cent_s6_p50','cent_s6_p75','cent_s6_p95','cent_s6_p99',
#                     'cent_s7_sum','cent_s7_max','cent_s7_mean','cent_s7_std','cent_s7_skew','cent_s7_kurtosis','cent_s7_p25','cent_s7_p50','cent_s7_p75','cent_s7_p95','cent_s7_p99',
#                     'cent_s8_sum','cent_s8_max','cent_s8_mean','cent_s8_std','cent_s8_skew','cent_s8_kurtosis','cent_s8_p25','cent_s8_p50','cent_s8_p75','cent_s8_p95','cent_s8_p99',
#                     'cent_s9_sum','cent_s9_max','cent_s9_mean','cent_s9_std','cent_s9_skew','cent_s9_kurtosis','cent_s9_p25','cent_s9_p50','cent_s9_p75','cent_s9_p95','cent_s9_p99',
#                     'cent_s10_sum','cent_s10_max','cent_s10_mean','cent_s10_std','cent_s10_skew','cent_s10_kurtosis','cent_s10_p25','cent_s10_p50','cent_s10_p75','cent_s10_p95','cent_s10_p99']
#     features_centroids.columns = name_centroid

#     name_bandwidths = ['band_s1_sum','band_s1_max','band_s1_mean','band_s1_std','band_s1_skew','band_s1_kurtosis','band_s1_p25','band_s1_p50','band_s1_p75','band_s1_p95','band_s1_p99',
#                     'band_s2_sum','band_s2_max','band_s2_mean','band_s2_std','band_s2_skew','band_s2_kurtosis','band_s2_p25','band_s2_p50','band_s2_p75','band_s2_p95','band_s2_p99',
#                     'band_s3_sum','band_s3_max','band_s3_mean','band_s3_std','band_s3_skew','band_s3_kurtosis','band_s3_p25','band_s3_p50','band_s3_p75','band_s3_p95','band_s3_p99',
#                     'band_s4_sum','band_s4_max','band_s4_mean','band_s4_std','band_s4_skew','band_s4_kurtosis','band_s4_p25','band_s4_p50','band_s4_p75','band_s4_p95','band_s4_p99',
#                     'band_s5_sum','band_s5_max','band_s5_mean','band_s5_std','band_s5_skew','band_s5_kurtosis','band_s5_p25','band_s5_p50','band_s5_p75','band_s5_p95','band_s5_p99',
#                     'band_s6_sum','band_s6_max','band_s6_mean','band_s6_std','band_s6_skew','band_s6_kurtosis','band_s6_p25','band_s6_p50','band_s6_p75','band_s6_p95','band_s6_p99',
#                     'band_s7_sum','band_s7_max','band_s7_mean','band_s7_std','band_s7_skew','band_s7_kurtosis','band_s7_p25','band_s7_p50','band_s7_p75','band_s7_p95','band_s7_p99',
#                     'band_s8_sum','band_s8_max','band_s8_mean','band_s8_std','band_s8_skew','band_s8_kurtosis','band_s8_p25','band_s8_p50','band_s8_p75','band_s8_p95','band_s8_p99',
#                     'band_s9_sum','band_s9_max','band_s9_mean','band_s9_std','band_s9_skew','band_s9_kurtosis','band_s9_p25','band_s9_p50','band_s9_p75','band_s9_p95','band_s9_p99',
#                     'band_s10_sum','band_s10_max','band_s10_mean','band_s10_std','band_s10_skew','band_s10_kurtosis','band_s10_p25','band_s10_p50','band_s10_p75','band_s10_p95','band_s10_p99']
#     features_bandwidths.columns = name_bandwidths


#     # add segment_id
#     features = pd.concat([list_sequence,features_env,features_fft,features_centroids,features_bandwidths],axis=1)
# #     features = features_bandwidths
#     return features


## FEATURES EXTRACTION

In [None]:
# train_features = extract_feature(all_files,df_list_sequence)
# train_features = pd.merge(train_features, train, on=['segment_id', 'segment_id'])
# train_features.to_csv('train_features2.csv')

In [None]:
# Disabled, like the training extraction in the previous cell:
# `extract_feature` only exists as commented-out code above, so calling it
# raises NameError on a fresh kernel. The pre-computed test features are
# read from ../input/features2/test_features2.csv further down.
# To regenerate them, uncomment the helper cells above and the lines below.
# test_features = extract_feature(all_files_test,df_list_sequence_test)
# test_features.to_csv('test_features2.csv')
# test_features.head()

## 2: Load the feature data

In [None]:
# Pre-computed training features; the target column is split off into its
# own single-column frame and the bookkeeping columns are discarded.
train_features = pd.read_csv("../input/features2/train_features2.csv")
target = train_features.pop('time_to_eruption').to_frame()
train_features = train_features.drop(columns=['Unnamed: 0', 'index'])
train_features.head(2)

In [None]:
# Pre-computed test features, minus the bookkeeping columns.
# NOTE: 'segment_id' is deliberately kept (its drop stays disabled below).
test_features = (
    pd.read_csv("../input/features2/test_features2.csv")
    .drop(columns=['Unnamed: 0', 'index'])
)
# test_features = test_features.drop(columns=['segment_id'])
test_features.head(2)

### 2.1: Scale the data

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training features only, then apply the same
# fitted scaling to both the training and the test features.
scaler = RobustScaler()
scaler.fit(train_features)
train_features_scaled = scaler.transform(train_features)
test_features_scaled = scaler.transform(test_features)

In [None]:
# Wrap the scaled arrays back into DataFrames and keep the feature names, so
# the correlation and importance listings show names instead of bare integers.
# Both frames are labelled with the training columns: the scaler was fitted on
# them and its output keeps the same column positions.
df_train_features_scaled = pd.DataFrame(
    train_features_scaled,
    columns=train_features.columns,
    index=train_features.index,
)
df_test_features_scaled = pd.DataFrame(
    test_features_scaled,
    columns=train_features.columns,
    index=test_features.index,
)

### 2.2: calculate correlation between feature data and time to eruption

In [None]:
# Absolute correlation of each scaled feature with the target, strongest first.
# NOTE(review): the [1:-1] slice skips the first AND the last column; the last
# one looks like a regular feature now that 'time_to_eruption' is dropped
# before scaling -- confirm this is intended.
candidate_columns = df_train_features_scaled.columns[1:-1]
eruption_time = target['time_to_eruption']
df_cor = (
    df_train_features_scaled[candidate_columns]
    .apply(lambda column: column.corr(eruption_time))
    .abs()
    .sort_values(ascending=False)
)
df_cor

### 2.3: select meaningful features

In [None]:
# Keep the 100 strongest correlations and expose the feature label as a column.
best_corr = df_cor.head(100).to_frame().reset_index()
best_corr.head(2)

In [None]:
# First column of best_corr: the feature labels, ordered by correlation.
first_column = best_corr.columns[0]
col_name_feature_importance = best_corr[first_column]
col_name_feature_importance

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>3: MODEL</center></h3>

## 3.0: functions to evaluate the models

In [None]:
import math
def score_RMSE(y_pred):
    """Return the RMSE of `y_pred` against the notebook-level `y_test`, as a string."""
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    return str(rmse)

def get_best_score(grid):
    """Print and return the square root of the negated best score of a search.

    Parameters
    ----------
    grid : object exposing `best_score_`, `best_params_` and `best_estimator_`
        The score is negated before the square root, so it is expected to be
        a negated squared error (e.g. scoring='neg_mean_squared_error').

    Returns
    -------
    The value np.sqrt(-grid.best_score_).
    """
    rmse = np.sqrt(-grid.best_score_)
    # Same three lines of output as before: score, parameters, estimator.
    for item in (rmse, grid.best_params_, grid.best_estimator_):
        print(item)
    return rmse

from statsmodels.graphics.api import abline_plot
def model_evaluation(prediction, y_true=None):
    """Print regression metrics and plot diagnostics for a set of predictions.

    Prints R2, mean absolute percentage error, mean absolute error, RMSE and
    the signed residual of largest magnitude, then draws two panels:
    predicted vs true and predicted vs residuals.

    Parameters
    ----------
    prediction : array_like
        Predicted target values, one per sample.
    y_true : array_like, optional
        True target values. Defaults to the notebook-level `y_test`, so the
        existing calls `model_evaluation(prediction)` keep working.
    """
    if y_true is None:
        y_true = y_test

    # Flatten both sides to 1-D before any arithmetic: an (n, 1) target minus
    # an (n,) prediction would broadcast to an (n, n) matrix and corrupt the
    # element-wise metrics.
    y_true = np.asarray(y_true, dtype=float).ravel()
    prediction = np.asarray(prediction, dtype=float).ravel()
    residuals = y_true - prediction

    print("R2 (explained variance):", round(metrics.r2_score(y_true, prediction), 2))
    # The percentage error is relative to the true value, as the label states.
    print("Mean Absolute Perc Error (Σ(|y-pred|/y)/n):", np.mean(np.abs(residuals / y_true)))
    print("Mean Absolute Error (Σ|y-pred|/n):", "{:,f}".format(metrics.mean_absolute_error(y_true, prediction)))
    print("Root Mean Squared Error (sqrt(Σ(y-pred)^2/n)):", "{:,f}".format(np.sqrt(metrics.mean_squared_error(y_true, prediction))))

    ## residuals: locate the signed residual with the largest magnitude
    max_idx = int(np.argmax(np.abs(residuals)))
    max_error = residuals[max_idx]
    max_true = y_true[max_idx]
    max_pred = prediction[max_idx]
    print("Max Error:", "{}".format(max_error))

    ## Plot predicted vs true
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
    ax[0].scatter(prediction, y_true, color="black")
    abline_plot(intercept=0, slope=1, color="red", ax=ax[0])
    # max_true - max_error equals the prediction, so the dashed segment joins
    # the worst point to the identity line.
    ax[0].vlines(x=max_pred, ymin=max_true, ymax=max_true - max_error, color='red', linestyle='--', alpha=0.7, label="max error")
    ax[0].grid(True)
    ax[0].set(xlabel="Predicted", ylabel="True", title="Predicted vs True")
    ax[0].legend()

    ## Plot predicted vs residuals
    ax[1].scatter(prediction, residuals, color="red")
    ax[1].vlines(x=max_pred, ymin=0, ymax=max_error, color='black', linestyle='--', alpha=0.7, label="max error")
    ax[1].grid(True)
    ax[1].set(xlabel="Predicted", ylabel="Residuals", title="Predicted vs Residuals")
    ax[1].hlines(y=0, xmin=np.min(prediction), xmax=np.max(prediction))
    ax[1].legend()
    plt.show()


## 3.1: Linear regression

### 3.1.1: Find the optimal number of features to use for ML

In [None]:
# linreg_score = []
# number_feature = []
# for i in range (1,100):
#     # select the i best-correlated features
#     columns = col_name_feature_importance[:i]
#     X = df_train_features_scaled[columns].values
#     y = target.values
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)
    
#     linreg = LinearRegression()
#     parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
#     grid_linear = GridSearchCV(linreg, parameters, cv=12, verbose=1 , scoring = 'neg_mean_squared_error')
#     grid_linear.fit(X_train, y_train)

#     sc_linear = get_best_score(grid_linear)
#     linreg_score.append(sc_linear)
#     number_feature.append(i)

# result =  pd.DataFrame(zip(number_feature,linreg_score),columns = ['number_feat', 'best_score'])

In [None]:
# # PLOT RESULT:
# result.plot('number_feat', 'best_score')

# # Returns index of minimun best_score
# index = result[['best_score']].idxmin() 

# # get the number of features used to have the best score 
# print(result['number_feat'][index])

### 3.1.2: Gridsearch

In [None]:
# # get the 11 best correlated columns
# columns = col_name_feature_importance[:11]

# # split the data
# X = df_train_features_scaled.values
# y = target.values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

# # linear regression model and gridsearch
# linreg = LinearRegression()
# parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
# grid_linear = GridSearchCV(linreg, parameters, cv=12, verbose=1 , scoring = 'neg_mean_squared_error')
# grid_linear.fit(X_train, y_train)

# sc_linear = get_best_score(grid_linear)

### 3.1.3: run linear regression with best score

In [None]:
# model_LinReg = linear_model.LinearRegression(copy_X= True, fit_intercept= True, normalize= False)
# model_LinReg.fit(X_train,y_train)
# prediction = model_LinReg.predict(X_test)
# model_evaluation(prediction)

## 3.2: XGBRegressor

In [None]:
from xgboost import XGBRegressor

# Baseline model trained on every scaled feature; the following cells read
# its feature importances.
X, y = df_train_features_scaled, target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=12, test_size=0.25
)

xgb = XGBRegressor(n_estimators=1500)
xgb.fit(X_train, y_train)

In [None]:
# Display the per-feature importance scores of the fitted model.
xgb.feature_importances_

In [None]:
# Column positions ordered from most to least important.
sorted_idx = np.flip(np.argsort(xgb.feature_importances_))

In [None]:
# Print each [column label, importance] pair in decreasing importance order
# and collect the labels in that same order.
feature_importance = []
for position in sorted_idx:
    column_label = X_train.columns[position]
    print([column_label, xgb.feature_importances_[position]])
    feature_importance.append(column_label)

In [None]:
from xgboost import plot_importance
# Importance chart of the fitted model, capped at 100 features by max_num_features.
plot_importance(xgb, max_num_features = 100)
plt.show()

In [None]:
# From here on `xgb` is the xgboost module alias, no longer the baseline model.
import xgboost as xgb

# Keep the first 78 entries of feature_importance.
best_feature_importance = feature_importance[:78]

X = df_train_features_scaled[best_feature_importance]
y = target.values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=12, test_size=0.25
)

# DMatrix holding the training split and its labels.
train_dmatrix = xgb.DMatrix(X_train, label=y_train)

# # Instantiate the regressor: gbm
# gbm = xgb.XGBRegressor(objective='reg:linear',n_estimators=1500)

# gbm_param_grid = {
#     'colsample_bytree': [0.1,0.2,0.3,0.4,0.6,0.8],
#     'max_depth': range(3,10,1),   
#     'eta' : [0.01,0.02,0.03,0.04,0.05,0.1,0.15,0.2,0.3,0.4],
#     'lambda': [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
#     'alpha': [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
#     'subsample': [0.2,0.3,0.4,0.5,0.6,0.7,0.8]
# }

# randomized_mse = RandomizedSearchCV(estimator=gbm,
#                                     param_distributions=gbm_param_grid,
#                                     scoring="neg_mean_squared_error",
#                                     n_iter=200,cv=4, verbose=1)

# # Fit randomized_mse to the data
# grid_xgb =  randomized_mse.fit(X_train,y_train)

# # Print the best parameters and lowest RMSE
# sc_xgb = get_best_score(grid_xgb)

In [None]:
# Train the tuned regressor on the training split and evaluate it on the
# hold-out split.
import xgboost

# 'reg:squarederror' is the current name of the deprecated 'reg:linear'
# objective. learning_rate / reg_alpha are the scikit-learn-API names of
# eta / alpha, matching the reg_lambda spelling already used here.
# The DMatrix rebuilt here in the original is dropped: it was never used by
# this cell and the previous cell already builds the identical object.
# `xgb` ends up bound to the fitted model, as before.
xgb = xgboost.XGBRegressor(objective='reg:squarederror', n_estimators=1500, subsample=0.6,
                           max_depth=9, learning_rate=0.01, colsample_bytree=0.4,
                           reg_alpha=0.3, reg_lambda=0.2)

# subsample': 0.4, 'max_depth': 9, 'lambda': 0.8, 'eta': 0.02, 'colsample_bytree': 0.8, 'alpha': 0.2

# Fit the regressor to the training set
xgb.fit(X_train, y_train)

# Predict the hold-out targets and report the metrics and diagnostic plots
y_pred = xgb.predict(X_test)

model_evaluation(y_pred)

with feature: 66 
Best parameters found:  {objective='reg:linear',n_estimators = 1500,subsample = 0.5,max_depth = 9, eta = 0.02,colsample_bytree = 0.8,alpha = 0,reg_lambda = 0.3}
Lowest RMSE found:  4,262,837.775371

with feature: 68 
Best parameters found:  {subsample = 0.6,max_depth = 8, eta = 0.01,colsample_bytree = 1,alpha = 0.5,reg_lambda = 0.2}
Lowest RMSE found:  4,259,511.986089

with feature: 41
Best parameters found:  {subsample = 0.6,max_depth = 9, eta = 0.01,colsample_bytree = 0.4,alpha = 0.3,reg_lambda = 0.2}
Lowest RMSE found:  4,259,582.235790

with feature: 51
Best parameters found:  {'subsample': 0.7, 'max_depth': 7, 'eta': 0.1, 'colsample_bytree': 0.8}
Lowest RMSE found:  7,005,340.813945933

with feature: 56
Best parameters found:  {'subsample': 0.8, 'max_depth': 8, 'eta': 0.1, 'colsample_bytree': 0.8}
Lowest RMSE found:  6,986,756.825361339

with feature: 61
Best parameters found:  {'subsample': 0.8, 'max_depth': 8, 'eta': 0.1, 'colsample_bytree': 0.6}
Lowest RMSE found:  6,796,230.816416972

with feature: 71
Best parameters found:  {'subsample': 0.7, 'max_depth': 8, 'eta': 0.15, 'colsample_bytree': 0.8}
Lowest RMSE found:  6,862,673.756157105

with feature: 61
Best parameters found:  {'subsample': 0.4, 'n_estimators': 500, 'max_depth': 9, 'eta': 0.05, 'colsample_bytree': 0.6}
Lowest RMSE found:  6,569,617.031801722

with feature: 66
Best parameters found:  {'subsample': 0.6, 'n_estimators': 500, 'max_depth': 8, 'eta': 0.03, 'colsample_bytree': 0.4}
Lowest RMSE found:  6,532,065.060200441

with feature: 66
Best parameters found:  {'subsample': 0.5, 'n_estimators': 900, 'max_depth': 7, 'eta': 0.04, 'colsample_bytree': 0.6}
Lowest RMSE found:  6,410,294.69215607

with feature: 67
Best parameters found:  {'subsample': 0.5, 'n_estimators': 900, 'max_depth': 7, 'eta': 0.02, 'colsample_bytree': 0.8}
Lowest RMSE found:  6441142.32956806

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center> XGBRegressor: xg_reg</center></h3>

In [None]:
# `xgb` is (re)bound to the xgboost module alias here.
import xgboost as xgb

# Keep the first 68 entries of feature_importance; use the full training set.
best_feature_importance = feature_importance[:68]

X = df_train_features_scaled.loc[:, best_feature_importance]
y = target.to_numpy()

# DMatrix holding the selected features and their labels.
train_dmatrix = xgb.DMatrix(X, label=y)

# # Instantiate the regressor: gbm
# gbm = xgb.XGBRegressor(objective='reg:linear',n_estimators=1500)

In [None]:
# Final model: fit on the full training set restricted to the first 68 entries
# of feature_importance, then predict the test set.
import xgboost

best_feature_importance = feature_importance[:68]

X = df_train_features_scaled[best_feature_importance]
y = target.values

# 'reg:squarederror' is the current name of the deprecated 'reg:linear'
# objective. learning_rate / reg_alpha are the scikit-learn-API names of
# eta / alpha, matching the reg_lambda spelling already used here.
# The DMatrix rebuilt here in the original is dropped: it was never used by
# this cell and the previous cell already builds the identical object.
# `xgb` ends up bound to the fitted model, as before.
xgb = xgboost.XGBRegressor(objective='reg:squarederror', n_estimators=1500, subsample=0.6,
                           max_depth=8, learning_rate=0.01, colsample_bytree=1,
                           reg_alpha=0.5, reg_lambda=0.2)

# Fit the regressor to the full training set
xgb.fit(X, y)

# The selection gets its own name instead of overwriting `test_features_scaled`:
# rebinding that array to a 68-column subset would corrupt a re-run of the cell
# that builds df_test_features_scaled from it.
test_features_selected = df_test_features_scaled[best_feature_importance]

# Predict the target for every test row
y_pred = xgb.predict(test_features_selected)


In [None]:
# y_pred follows the row order of `test_features` (the pre-computed CSV), which
# need not match the order glob returns the test files in today. Prefer the ids
# stored with the features so each prediction stays attached to its segment;
# fall back to the glob-derived ids when the feature table carries no id.
# NOTE(review): assumes the 'segment_id' column of test_features holds the raw
# segment ids -- confirm against the features CSV.
if 'segment_id' in test_features.columns:
    submission_ids = test_features['segment_id'].to_numpy()
else:
    submission_ids = df_list_sequence_test['segment_id'].to_numpy()

assert len(submission_ids) == len(y_pred), "segment ids and predictions differ in length"

submission = pd.DataFrame({
    'segment_id': submission_ids,
    'time_to_eruption': y_pred,
})
submission.to_csv('submission5.csv', header=True, index=False)
# Last expression of the cell so the preview is actually displayed.
submission.head()

In [None]:
# Display the final submission table.
submission