In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# The file listing under /kaggle/input is intentionally not produced here:
# nothing was printed or collected from it, and no later cell uses the result.

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

### File paths

In [None]:
# Root directory of the competition dataset; every path below is derived from it
dataset_root = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/'

# Directory with the training segment CSVs (one file per segment_id)
file_path_train_segments = dataset_root + 'train/'
# Directory with the test segment CSVs (one file per segment_id)
file_path_test_segments = dataset_root + 'test/'
# Training table with the time_to_eruption label for each segment_id
train_data_path = dataset_root + 'train.csv'
# Sample submission file
sample_submissioin_path = dataset_root + 'sample_submission.csv'

# Exploratory Data Analysis [EDA]

### Load train.csv

In [None]:
# Read train.csv (the labelled training table) into a DataFrame
train_data = pd.read_csv(train_data_path)

### Perform basic checks

In [None]:
# Quick sanity check: table dimensions and number of nulls per column
missing_per_column = train_data.isnull().sum()
print('Shape :', train_data.shape)
print('Missing data:')
print(missing_per_column)

In [None]:
# Target values in row order, drawn on a wide figure
rcParams['figure.figsize'] = [15, 4]
train_data['time_to_eruption'].plot()
plt.show()

In [None]:
# Summary statistics of the numeric columns of the training table
train_data.describe()

In [None]:
# Plot a graph

def plot_graph(df, features):
    """Draw the first 100 rows of each listed column as overlaid line plots.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains the columns to draw.
    features : iterable of str
        Column names; each one is drawn as its own line.

    Side effects
    ------------
    Sets the global ``rcParams['figure.figsize']`` to ``[15, 5]`` and shows
    the figure.
    """
    rcParams['figure.figsize'] = [15, 5]

    # Slice once; every line is drawn from the same 100-row window.
    head_rows = df.iloc[0:100]
    for column_name in features:
        head_rows[column_name].plot()

    plt.legend()
    plt.show()

In [None]:
# Column names are generated from the sensor numbers so the three lists
# cannot drift apart through a typing mistake.
sensor_numbers = range(1, 11)
cols_org = [f'sensor_{number}' for number in sensor_numbers]   # raw sensor columns
cols_rms = [f'{name}_rms' for name in cols_org]                # per-segment RMS columns
cols_mean = [f'{name}_mean' for name in cols_org]              # per-segment mean columns

In [None]:
# Load one training segment (id 1136037770) and plot its raw sensor columns.
# The path is built from file_path_train_segments instead of repeating the
# dataset directory a second time.
example_segment_id = 1136037770
train_data_segments = pd.read_csv(f'{file_path_train_segments}{example_segment_id}.csv')
plot_graph(train_data_segments, cols_org)

In [None]:
# Load train/test CSV data

def load_train_csv(file_path, filename, df):
    """Add per-column RMS and mean features of one segment file to ``df``.

    Reads ``<file_path><filename>.csv`` and, for every column ``c`` of that
    file, writes two values into the row(s) of ``df`` whose ``segment_id``
    equals ``filename``:

    * ``c_rms``  -- sqrt(sum of squared values / number of rows in the file)
    * ``c_mean`` -- arithmetic mean of the column

    Nothing in the function is specific to training data despite its name.

    Parameters
    ----------
    file_path : str
        Directory prefix, including the trailing separator.
    filename : int or str
        Segment id; also the base name of the CSV file.
    df : pandas.DataFrame
        Table with a ``segment_id`` column. It is modified in place; the
        feature columns are created on first use.

    Returns
    -------
    tuple
        Always the empty tuple (kept so existing callers see no change).

    Notes
    -----
    Missing readings are skipped by both the sum and the mean, but the RMS
    denominator is the total row count, so gaps pull the RMS toward zero.
    """
    input_data = pd.read_csv(file_path + str(filename) + '.csv')
    total_count = input_data.shape[0]

    # The row selector is identical for every column, so build it only once.
    segment_rows = df['segment_id'] == filename

    for cols in input_data.columns:
        values = input_data[cols]

        # Vectorised square instead of calling a Python lambda per element.
        df.loc[segment_rows, cols + '_rms'] = np.sqrt((values ** 2).sum() / total_count)
        df.loc[segment_rows, cols + '_mean'] = values.mean()

    return ()

In [None]:
import datetime as dt

print('Started at : ', dt.datetime.now().time())

# This loop has to run: the *_rms / *_mean columns it adds to train_data are
# read by the plotting and modelling cells further down.
# The ids are copied into a list first because load_train_csv adds columns to
# train_data while the loop is running.
segment_ids = train_data['segment_id'].tolist()

i = 0
for i, filename in enumerate(segment_ids, start=1):
    load_train_csv(file_path_train_segments, int(filename), train_data)

    # Lightweight progress report every 500 segments
    if i % 500 == 0:
        print(i, ' -> processed')

print('Finished at : ', dt.datetime.now().time())

In [None]:
# Write train_data to train_data.csv in the working directory, without the index column
train_data.to_csv('train_data.csv', index=False)

### Line plot

In [None]:
# One figure per feature family (RMS first, then mean); plot_graph draws the first 100 rows
for feature_columns in (cols_rms, cols_mean):
    plot_graph(train_data, feature_columns)

### Boxplot

In [None]:
# One boxplot figure per feature family (RMS first, then mean)
for feature_columns in (cols_rms, cols_mean):
    train_data.boxplot(column=feature_columns)
    plt.show()

### Plot graph : Time to eruption vs Sensor data [ RMS and Mean ]

In [None]:
rcParams['figure.figsize'] = [15, 4]

# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, and the keyword form also works on older releases.
# All features of one family are overlaid on the same axes, then shown.
for feature_group in (cols_rms, cols_mean):
    for cols in feature_group:
        sns.scatterplot(x=cols, y='time_to_eruption', data=train_data)
    plt.show()

### Check on linear relationship between Time to eruption and sensor data [ RMS ]

In [None]:
rcParams['figure.figsize'] = [15, 5]

# 2 x 5 grid: one panel per RMS feature, filled row by row
fig, ax = plt.subplots(2, 5)

for i, cols in enumerate(cols_rms):
    # The first five features go to the top row, the rest to the bottom row
    panel = ax[int(i >= 5)][i % 5]
    panel.plot(cols, 'time_to_eruption', '+', data=train_data)
    panel.set_xlabel(cols)
plt.show()

### Check on linear relationship between Time to eruption and sensor data [ Mean ]

In [None]:
rcParams['figure.figsize'] = [15, 5]

# 2 x 5 grid: one panel per mean feature, filled row by row
fig, ax = plt.subplots(2, 5)

for i, cols in enumerate(cols_mean):
    # The first five features go to the top row, the rest to the bottom row
    panel = ax[int(i >= 5)][i % 5]
    panel.plot(cols, 'time_to_eruption', '+', data=train_data)
    panel.set_xlabel(cols)

plt.show()

### Split the data

In [None]:
# Predictors: the per-sensor RMS columns; target: time_to_eruption
X = train_data[cols_rms]
y = train_data['time_to_eruption']

### Scale the data

In [None]:
# Keep the fitted scaler in a variable: any data given to the model later has
# to go through exactly the same transform as the training features.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=15)

### Build the model

In [None]:
# Hyper-parameters collected in one place, then unpacked into the estimator
xgb_params = dict(
    random_state=15,
    learning_rate=0.1,
    n_estimators=1200,
    reg_lambda=6,
    reg_alpha=80,
    max_depth=4,
)
model_xgb = XGBRegressor(**xgb_params)

### Cross validate the model

In [None]:
# 10-fold cross-validation on the training split; no `scoring` is passed,
# so the estimator's default score is what gets reported
xgb_score = cross_val_score(model_xgb, X_train, y_train, cv=10, verbose=1)

In [None]:
# Per-fold scores followed by their mean and standard deviation
cv_summary = (
    ('Scores  : ', xgb_score),
    ('Average : ', xgb_score.mean()),
    ('STD     : ', xgb_score.std()),
)
for label, value in cv_summary:
    print(label, value)

### Train the model

In [None]:
# `eval_metric` and `early_stopping_rounds` are estimator parameters in recent
# XGBoost releases; fit() no longer accepts them, so they are set here.
model_xgb.set_params(eval_metric="mae", early_stopping_rounds=100)

# validation_0 is the training split, validation_1 the held-out split.
# With several evaluation sets, early stopping follows the last one.
# NOTE(review): the held-out split therefore also decides when training stops,
# so scores reported on that same split later are likely optimistic.
model_xgb.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_test, y_test)],
              verbose=False)

### Model learning curve

In [None]:
rcParams['figure.figsize'] = [5, 4]

# MAE history recorded during fit for each evaluation set
results = model_xgb.evals_result()
train_mae = results['validation_0']['mae']
valid_mae = results['validation_1']['mae']

epoch = len(train_mae)
x_axes = range(0, epoch)

plt.plot(x_axes, train_mae, label='Training')
plt.plot(x_axes, valid_mae, label='Validation')
plt.ylabel("mae")
plt.xlabel("Estimators")
plt.legend()
plt.show()

In [None]:
# Number of trees kept by early stopping. `best_ntree_limit` is gone from
# recent XGBoost releases; `best_iteration` is zero-based, hence the + 1.
model_xgb.best_iteration + 1

### Test the model

In [None]:
# Score the held-out split: R2 (printed as a percentage) and mean absolute error
y_predict = model_xgb.predict(X_test)
r2_percent = r2_score(y_test, y_predict) * 100
mae_value = mean_absolute_error(y_test, y_predict)
print(f'R2 Score : {r2_percent}')
print(f'MAE      : {mae_value}')

In [None]:
rcParams['figure.figsize'] = [15, 4]

# Three separate figures over the held-out rows:
# actual values, predicted values, absolute error
row_positions = range(y_test.shape[0])
for series in (y_test, y_predict, abs(y_test - y_predict)):
    plt.plot(row_positions, series)
    plt.show()

### Preparation for sample submission

In [None]:
# Read the sample submission; its segment_id column lists the segments to predict
sample_submissioin_data = pd.read_csv(sample_submissioin_path)

In [None]:
import datetime as dt

print('Started at : ', dt.datetime.now().time())

# This loop has to run: it creates the *_rms / *_mean columns of
# sample_submissioin_data that the following cells plot and predict from.
# The ids are copied into a list first because load_train_csv adds columns to
# the table while the loop is running.
test_segment_ids = sample_submissioin_data['segment_id'].tolist()

i = 0
for i, filename in enumerate(test_segment_ids, start=1):
    load_train_csv(file_path_test_segments, int(filename), sample_submissioin_data)

    # Lightweight progress report every 500 segments
    if i % 500 == 0:
        print(i, '-> processed')

print('Finished at : ', dt.datetime.now().time())

In [None]:
rcParams['figure.figsize'] = [15, 4]

# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally. The row positions are given as an array so they are treated
# as values rather than looked up as a column name.
row_positions = np.arange(sample_submissioin_data.shape[0])

# All features of one family are overlaid on the same axes, then shown.
for feature_group in (cols_rms, cols_mean):
    for cols in feature_group:
        sns.scatterplot(x=row_positions, y=cols, data=sample_submissioin_data)
    plt.show()

In [None]:
rcParams['figure.figsize'] = [15, 5]

# 2 x 5 grid: one panel per RMS feature of the submission table, filled row by row
fig, ax = plt.subplots(2, 5)
row_positions = range(sample_submissioin_data.shape[0])

for i, cols in enumerate(cols_rms):
    # The first five features go to the top row, the rest to the bottom row
    panel = ax[int(i >= 5)][i % 5]
    panel.plot(row_positions, cols, '+', c='y', data=sample_submissioin_data)
    panel.set_xlabel(cols)
plt.show()

In [None]:
# The model was fitted on standardized RMS features, so the submission features
# must get the same scaling before prediction. Fitting a StandardScaler on the
# training RMS columns reproduces the transform that was applied to X.
feature_scaler = StandardScaler().fit(train_data[cols_rms])
sample_test_data = pd.DataFrame(
    feature_scaler.transform(sample_submissioin_data[cols_rms]),
    columns=cols_rms,
    index=sample_submissioin_data.index,
)

In [None]:
# Predict time_to_eruption for every row of the submission feature table
sample_test_predict = model_xgb.predict(sample_test_data)

In [None]:
# Display the predicted values
sample_test_predict

In [None]:
# Pair each segment id with its predicted time to eruption
sample_submissioin = pd.DataFrame({
    'segment_id': sample_submissioin_data['segment_id'],
    'time_to_eruption': sample_test_predict,
})

### Write to a file

In [None]:
# index=False keeps the file to the two submission columns (segment_id,
# time_to_eruption); without it pandas writes an extra unnamed index column.
sample_submissioin.to_csv('sample_submissioin.csv', index=False)