In [None]:
!pip install colabcode

In [None]:
import os
from getpass import getpass

from colabcode import ColabCode

# Do not store the code-server password in the notebook: read it from the
# COLABCODE_PASSWORD environment variable, or prompt for it interactively.
colabcode_password = os.environ.get('COLABCODE_PASSWORD') or getpass('ColabCode password: ')
ColabCode(port = 10000, password = colabcode_password)

In [None]:
from tqdm import tqdm
import os
import pandas as pd
import numpy as np

from joblib import Parallel, delayed

import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Show every column when a wide DataFrame is displayed.
pd.options.display.max_columns = None

# train_csv_path is the directory prefix that gets '<segment_id>.csv' appended
# when a training segment is read; test_csv_path is presumably the matching
# directory for the test segments (not used in the cells visible here).
train_csv_path = '../input/predict-volcanic-eruptions-ingv-oe/train/'
test_csv_path = '../input/predict-volcanic-eruptions-ingv-oe/test/'

In [None]:
# Segment metadata: later cells read its `segment_id` and `time_to_eruption` columns.
train_meta_file = '../input/predict-volcanic-eruptions-ingv-oe/train.csv'
train_time = pd.read_csv(train_meta_file)

In [None]:
def load_csv(name, df, path):
    """Read one segment's CSV file and look up its target value.

    Parameters
    ----------
    name
        Segment identifier; the file read is ``<path>/<name>.csv``.
    df : pandas.DataFrame
        Metadata frame with ``segment_id`` and ``time_to_eruption`` columns.
    path : str
        Directory containing the segment CSV files. A trailing path
        separator is optional.

    Returns
    -------
    tuple
        ``(segment, target)`` where ``segment`` is the DataFrame read from the
        CSV and ``target`` is the first ``time_to_eruption`` value among the
        rows of ``df`` whose ``segment_id`` equals ``name``.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``segment_id == name``.
    """
    # os.path.join tolerates a directory given with or without a trailing slash.
    segment = pd.read_csv(os.path.join(path, str(name) + '.csv'))

    targets = df.loc[df.segment_id == name, 'time_to_eruption'].values
    if len(targets) == 0:
        # Same exception type as indexing an empty array, but with a usable message.
        raise IndexError(f'segment_id {name!r} not found in the metadata frame')
    return segment, targets[0]



def get_top_freq(df, n = 5):
    """Return, for every column of ``df``, the ``n`` strongest FFT frequencies.

    Each column is transformed with ``np.fft.fft``; the magnitudes of the
    first ``int(len(df) / 2)`` bins are scaled by ``2 / len(df)`` and the
    frequency-axis labels of the ``n`` largest magnitudes are kept, strongest
    first.

    Parameters
    ----------
    df : pandas.DataFrame
        One signal per column.
    n : int, default 5
        Number of dominant frequencies to keep per column.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with columns ``<column>_freq_1`` ...
        ``<column>_freq_<n>`` for every column of ``df`` (in column order).
        An empty frame is returned when ``df`` has no columns.

    Raises
    ------
    ValueError
        If a column yields fewer than ``n`` usable spectrum bins.
    """
    if len(df.columns) == 0:
        return pd.DataFrame()

    # The row count is used as the sampling frequency, so the axis runs from
    # 0 to len(df) / 2. Both values are the same for every column.
    sf = len(df)
    n_bins = int(sf / 2)
    # NOTE(review): linspace includes its endpoint, so consecutive labels are
    # (sf / 2) / (n_bins - 1) apart rather than exactly 1 bin apart - confirm
    # this labelling is intended before changing it (it alters feature values).
    freq_axis = sf / 2 * np.linspace(0, 1, n_bins)

    names, values = [], []
    for col in df.columns:
        amplitude = 2 / sf * np.abs(np.fft.fft(df[col]))
        spectrum = pd.Series(data = amplitude[:n_bins], index = freq_axis)
        top = spectrum.nlargest(n).index.values
        if len(top) != n:
            raise ValueError(
                f'column {col!r} has only {len(top)} usable frequency bins, need {n}'
            )
        # f-string formatting also accepts non-string column labels.
        names.extend(f'{col}_freq_{k + 1}' for k in range(n))
        values.extend(top)

    # Build the row once instead of concatenating a frame per column.
    return pd.DataFrame([values], columns = names)



def combine_data(df, meta_stats = True, freq = True, n = 5, pca = False, csv_path = None):
    """Build one feature row per segment listed in ``df``.

    For every ``segment_id`` in ``df`` the segment CSV is loaded through
    ``load_csv``, missing values are replaced by 0, and the requested feature
    groups are computed and laid side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata frame. Its ``segment_id`` column drives the loop and the
        frame itself is handed to ``load_csv`` for the target lookup.
    meta_stats : bool, default True
        Add per-column mean, min, max and the 25/75/90/95/99th percentiles.
        (No standard deviation is computed.)
    freq : bool, default True
        Add the ``n`` dominant FFT frequencies per column via ``get_top_freq``.
    n : int, default 5
        Number of frequencies per column when ``freq`` is true.
    pca : object or False, default False
        Falsy to skip. Otherwise an object exposing ``fit_transform`` (for
        example a scikit-learn ``PCA`` instance); it is re-fitted on every
        segment and its output columns are named ``sensor_1``, ``sensor_2``, ...
    csv_path : str, optional
        Directory of the segment CSV files. Defaults to the module-level
        ``train_csv_path``.

    Returns
    -------
    pandas.DataFrame
        One row per segment, in the order of ``df.segment_id``, with a fresh
        0..len-1 index. Empty when ``df`` has no rows.
    """
    if csv_path is None:
        csv_path = train_csv_path

    percentiles = [25, 75, 90, 95, 99]
    rows = []

    for seg_id in tqdm(df.segment_id):
        # Only the signal is needed here; the target returned by load_csv is unused.
        eg, _ = load_csv(seg_id, df, csv_path)
        eg = eg.fillna(0)

        if pca:
            eg = pd.DataFrame(pca.fit_transform(eg))
            eg.columns = [f'sensor_{k + 1}' for k in range(eg.shape[1])]

        parts = []

        if meta_stats:
            parts.append(pd.DataFrame(eg.mean()).T.add_suffix('_mean'))
            parts.append(pd.DataFrame(eg.min()).T.add_suffix('_min'))
            parts.append(pd.DataFrame(eg.max()).T.add_suffix('_max'))

            for p in percentiles:
                # Label percentiles with the segment's own column names so they
                # always line up with the mean/min/max features.
                pct = pd.DataFrame([np.percentile(eg, q = p, axis = 0)], columns = eg.columns)
                parts.append(pct.add_suffix(f'_{p}%'))

        if freq:
            parts.append(get_top_freq(eg, n))

        rows.append(pd.concat(parts, axis = 1) if parts else pd.DataFrame())

    if not rows:
        return pd.DataFrame()

    # Concatenate once (no quadratic re-copying) and give every row a unique index.
    return pd.concat(rows, axis = 0, ignore_index = True)

In [None]:
# Preview the first rows instead of rendering the whole metadata table.
train_time.head()

In [None]:
# Build the feature table for every segment listed in train_time
# (default feature groups: summary statistics plus top FFT frequencies).
data = combine_data(df = train_time)

In [None]:
# Persist the features. The index is written as well (the to_csv default);
# the reload cell further down removes the resulting 'Unnamed: 0' column.
data.to_csv('./combined_data.csv', index = True)

In [None]:
from IPython.display import FileLink

# Render a link to the saved feature file as this cell's output.
combined_csv_link = FileLink('./combined_data.csv')
combined_csv_link

In [None]:
# Reload the saved feature table. 'Unnamed: 0' is the index column that
# to_csv stored next to the features, so it is removed again here.
data = pd.read_csv('./combined_data.csv').drop(columns = 'Unnamed: 0')
data.head()

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    train_time['time_to_eruption'],
    test_size = 0.1,
    random_state = 0,
)
(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

## Random Forest

In [None]:
# Candidate forest sizes: 260, 265, 270, 275, 280.
p1 = {'n_estimators': range(260, 281, 5)}

rforest = RandomForestRegressor(n_jobs = -1, random_state = 0)

# 5-fold cross-validation scored by negated mean absolute error.
grid = GridSearchCV(rforest, p1, cv = 5, n_jobs = -1, scoring = 'neg_mean_absolute_error')

# Tune on the training split only: fitting the search on the full `data`
# would let the hold-out rows (x_test / y_test) influence model selection
# and make the later hold-out MAE optimistic.
grid.fit(x_train, y_train)

In [None]:
# Tabulate the cross-validation results of the random-forest search.
rf_cv_results = pd.DataFrame(grid.cv_results_)
rf_cv_results

In [None]:
# Fit one forest on the training split. 265 is one of the grid candidates
# (presumably the best-scoring one - TODO confirm against grid.cv_results_).
rforest = RandomForestRegressor(n_estimators = 265, n_jobs = -1, random_state = 0).fit(x_train, y_train)

# Mean absolute error on the held-out split.
y_pred = rforest.predict(x_test)
metrics.mean_absolute_error(y_true = y_test, y_pred = y_pred)

## Gradient Boosting Regressor

In [None]:
# Seed the booster like the random-forest cells so repeated runs give the same scores.
gboost = GradientBoostingRegressor(learning_rate = 0.1, random_state = 0)

In [None]:
# Candidate numbers of boosting stages: 500, 550, 600, 650, 700, 750.
p2 = {'n_estimators': range(500, 800, 50)}

# 5-fold cross-validation scored by negated mean absolute error.
grid2 = GridSearchCV(gboost, p2, cv = 5, n_jobs = -1, scoring = 'neg_mean_absolute_error')

# Tune on the training split only so the hold-out rows (x_test / y_test)
# stay unseen during model selection.
grid2.fit(x_train, y_train)

In [None]:
# Tabulate the cross-validation results of the gradient-boosting search.
gb_cv_results = pd.DataFrame(grid2.cv_results_)
gb_cv_results