In [None]:
import numpy as np
import pandas as pd
import os

import seaborn as sns
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old 'seaborn-white' alias was later removed, so pick whichever name this
# installation actually ships and fall back to the default style otherwise.
plt.style.use(next(
    (style_name
     for style_name in ('seaborn-v0_8-white', 'seaborn-white')
     if style_name in plt.style.available),
    'default',
))
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgbm
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from tqdm import tqdm

# Root of the competition data; the train/ and test/ folders walked below sit under it.
PATH = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/'

# Full path of every file found anywhere below /kaggle/input.
file_list = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]

# Full path of every file below the train/ folder.
file_list_train = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(PATH + 'train')
    for filename in filenames
]

# Full path of every file below the test/ folder.
file_list_test = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(PATH + 'test')
    for filename in filenames
]

In [None]:
# Quick look at what was discovered: the path stored at position 2, the parsed
# contents of the file at position 0, and the per-column NaN counts of the file
# at position 2.
third_path = file_list[2]
print(third_path)

first_frame = pd.read_csv(file_list[0])
print(first_frame)

third_nan_counts = pd.read_csv(third_path).isna().sum()
print(third_nan_counts)

In [None]:
# Show the first training file (path, then parsed contents) and how many files
# each folder holds.
first_train_path = file_list_train[0]
print(first_train_path)
print(pd.read_csv(first_train_path))
print(len(file_list_train))
print(len(file_list_test))


def _name_before_extension(path):
    """Return the piece of the last '/'-separated component that precedes its final '.'-separated piece."""
    base_name = path.split('/')[-1]
    return base_name.split('.')[-2]


# Reduce each full path to that name piece.
files_test = list(map(_name_before_extension, file_list_test))
files_train = list(map(_name_before_extension, file_list_train))

print(files_train[0:10])
print(files_test[0:10])

In [None]:
# Names that occur in both the test and the train file lists.
train_set = set(files_train)
test_set = set(files_test)
inter = test_set & train_set

print(inter)

In [None]:
# Load train.csv from the competition root (PATH already ends with a slash).
train = pd.read_csv(f'{PATH}train.csv')

In [None]:
# Distribution of the target: 100-bin density histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot with stat='density', kde=True
# and kde_kws=dict(cut=3) is its documented replacement for this hist+kde view.
sns.histplot(train['time_to_eruption'],
             bins=100,
             kde=True,
             stat='density',
             kde_kws=dict(cut=3),
             color='red',
             edgecolor='black')

In [None]:
# Summary statistics of the target column (rendered as the cell output).
train.loc[:, 'time_to_eruption'].describe()

In [None]:
# Load one test segment and draw each of its columns in its own stacked panel
# (the 10x1 layout provides room for up to ten columns).
df_segment_id = pd.read_csv(PATH+'test/473253715.csv')

df_segment_id.plot(figsize=(20,20),
                  subplots=True,
                  layout=(10,1),
                  rot=0,
                  lw=1,
                  title='segment id #473253715')  # title spelling corrected
plt.show()

In [None]:
# Show the two rows of `train` at either end of the time_to_eruption ordering.
train_by_target = train.sort_values(by='time_to_eruption')
display(train_by_target.iloc[[0, -1]])

# Segment ids entered by hand.
# NOTE(review): presumably copied from the table displayed above — confirm they still match.
segment_id_min = 601524801
segment_id_max = 1923243961

# Load the signal file of each of those two training segments.
df_segment_id_min = pd.read_csv(f'{PATH}train/{segment_id_min}.csv')
df_segment_id_max = pd.read_csv(f'{PATH}train/{segment_id_max}.csv')

In [None]:
# Draw every column of the frame loaded for segment_id_min in its own stacked panel.
df_segment_id_min.plot(figsize=(20,20),
                  subplots=True,
                  layout=(10,1),
                  rot=0,
                  lw=1,
                  title='segment id min')  # title spelling corrected
plt.show()

In [None]:
# Draw every column of the frame loaded for segment_id_max in its own stacked panel.
df_segment_id_max.plot(figsize=(20,20),
                  subplots=True,
                  layout=(10,1),
                  rot=0,
                  lw=1,
                  title='segment id max')  # title spelling corrected
plt.show()

In [None]:
def build_features(signal, ts, sensor_id):
    """Summarise one sensor trace as a single-row frame of statistical features.

    Parameters
    ----------
    signal : pandas.Series
        Numeric samples of one sensor. The callers in this notebook pass the
        column after ``fillna(0)``.
    ts : hashable
        Label used as the index of the returned row (callers pass the segment id).
    sensor_id : str
        Prefix prepended to every feature column name.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``ts`` with 23 float columns, in this order:
        sum, mean, std, var, max, min, skew, mad, kurtosis, ten quantiles
        (99, 95, 85, 75, 55, 45, 25, 15, 05, 01) and the mean/std/max/min of
        the real part of the signal's FFT.
    """
    values = np.asarray(signal, dtype=float)
    fft_real = np.real(np.fft.fft(values))

    features = {
        f'{sensor_id}_sum': signal.sum(),
        f'{sensor_id}_mean': signal.mean(),
        f'{sensor_id}_std': signal.std(),
        f'{sensor_id}_var': signal.var(),
        f'{sensor_id}_max': signal.max(),
        f'{sensor_id}_min': signal.min(),
        f'{sensor_id}_skew': signal.skew(),
        # Series.mad() was removed in pandas 2.0; this is the same quantity,
        # the mean absolute deviation around the mean.
        f'{sensor_id}_mad': (signal - signal.mean()).abs().mean(),
        f'{sensor_id}_kurtosis': signal.kurtosis(),
    }

    # Column suffix -> quantile level, kept in the original column order.
    quantile_levels = [
        ('99', 0.99), ('95', 0.95), ('85', 0.85), ('75', 0.75), ('55', 0.55),
        ('45', 0.45), ('25', 0.25), ('15', 0.15), ('05', 0.05), ('01', 0.01),
    ]
    # One vectorised call instead of ten separate passes over the data.
    quantile_values = np.quantile(values, [level for _, level in quantile_levels])
    for (suffix, _), value in zip(quantile_levels, quantile_values):
        features[f'{sensor_id}_quantile{suffix}'] = value

    # NumPy's std here is the population form (ddof=0), unlike the pandas
    # sample std used for the raw signal above.
    features[f'{sensor_id}_fft_real_mean'] = fft_real.mean()
    features[f'{sensor_id}_fft_real_std'] = fft_real.std()
    features[f'{sensor_id}_fft_real_max'] = fft_real.max()
    features[f'{sensor_id}_fft_real_min'] = fft_real.min()

    # Build the row in one go rather than enlarging an empty frame cell by cell.
    return pd.DataFrame(features, index=[ts], dtype=float)

In [None]:
# Build one feature row per training segment: each of the ten sensor columns is
# summarised by build_features (missing samples replaced with 0 first) and the
# per-sensor frames are joined side by side.
train_rows = []

# tqdm wraps the Series itself (not the enumerate object) so it can read the
# length and show a real progress bar with a total.
for seg, segment_id in enumerate(tqdm(train.segment_id)):
    signals = pd.read_csv(PATH+'train/'+str(segment_id)+'.csv')

    if seg%200 == 0:
        # The number printed is the running position in `train`, not the id itself.
        print('Processing segment_id={}'.format(seg))

    sensor_frames = []
    for sensor in range(0, 10):
        sensor_id = f'sensor_{sensor+1}'
        sensor_frames.append(build_features(signals[sensor_id].fillna(0), segment_id, sensor_id))

    train_rows.append(pd.concat(sensor_frames, axis=1))

# Stack the per-segment rows; the index holds the segment ids.
train_set = pd.concat(train_rows)

In [None]:
# Move the index into a regular 'segment_id' column, then join the columns of
# `train` onto the features using that key.
train_set = (
    train_set
    .reset_index()
    .rename(columns={'index':  'segment_id'})
    .merge(train, on='segment_id')
)

In [None]:
# Plain-text preview of the first three rows of the merged training table.
print(train_set.iloc[:3])

In [None]:
# Every file name below the test/ folder with its last four characters removed.
test_files = [
    filename[:-4]
    for dirname, _, filenames in os.walk(PATH+'test/')
    for filename in filenames
]

# Single-column frame of those names.
test = pd.DataFrame(test_files, columns=['segment_id'])

In [None]:
# Build one feature row per test segment, mirroring the training features: each
# of the ten sensor columns is summarised by build_features (missing samples
# replaced with 0 first) and the per-sensor frames are joined side by side.
test_rows = []

# tqdm wraps the Series itself (not the enumerate object) so it can read the
# length and show a real progress bar with a total.
for seg, segment_id in enumerate(tqdm(test.segment_id)):
    signals = pd.read_csv(PATH+'test/'+str(segment_id)+'.csv')

    if seg%200 == 0:
        # The number printed is the running position in `test`, not the id itself.
        print('Processing segment_id={}'.format(seg))

    sensor_frames = []
    for sensor in range(0, 10):
        sensor_id = f'sensor_{sensor+1}'
        sensor_frames.append(build_features(signals[sensor_id].fillna(0), segment_id, sensor_id))

    test_rows.append(pd.concat(sensor_frames, axis=1))

# Stack the per-segment rows; the index holds the segment ids.
test_set = pd.concat(test_rows)

In [None]:
# Move the index into a regular 'segment_id' column, then join with `test` on
# that key.
test_set = (
    test_set
    .reset_index()
    .rename(columns={'index':  'segment_id'})
    .merge(test, on='segment_id')
)

# Plain-text preview of the first three rows.
print(test_set.head(3))

In [None]:
# Target is time_to_eruption; features are every remaining column except the id.
y = train_set['time_to_eruption']
X = train_set.drop(columns=['segment_id', 'time_to_eruption'])

# Hold out 20% of the rows for validation, with a fixed seed so the split repeats.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# First three training feature rows, then the (rows, columns) shape.
print(X_train.iloc[:3])
print('np.shape(X_train) = ', X_train.shape)

In [None]:
# First three training targets, then the shape of the target vector.
print(y_train.iloc[:3])
print('np.shape(y_train) = ', y_train.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with tree depth capped at 20 and a fixed seed, fitted on the
# training split. The fitted estimator is the cell output.
rf_params = {'max_depth': 20, 'random_state': 0}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

In [None]:
# Predictions of the fitted model for the held-out validation features.
y_pred = model.predict(X_valid)

In [None]:
from sklearn.metrics import r2_score

# Despite the variable name, this is the R² score of the validation
# predictions, not a confusion matrix.
conf_mat = r2_score(y_true=y_valid, y_pred=y_pred)
print(conf_mat)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the validation split; it is stored in `mse` but this
# cell does not display it.
mse = mean_squared_error(y_valid, y_pred)

# Scatter of true targets (x) against predictions (y) on a single axes.
fig, mulreg = plt.subplots(nrows=1, ncols=1)
mulreg.scatter(y_valid, y_pred, color='r')
mulreg.set_title('Nonlinear Regression')

In [None]:
# Predict for the test segments, passing every column except the id.
test_features = test_set.drop(columns=['segment_id'])
prediction = model.predict(test_features)

In [None]:
# Two-column submission table (segment id, predicted target), written to
# submission.csv with a header row and without the index.
submission = pd.DataFrame({
    'segment_id': test_set['segment_id'],
    'time_to_eruption': prediction,
})
submission.to_csv('submission.csv', index=False, header=True)

In [None]:
from math import isnan

# One zeroed counter per sensor column name, sensor_1 through sensor_10.
sensor_result = dict.fromkeys((f"sensor_{i}" for i in range(1, 11)), 0)

print(sensor_result)

In [None]:
# Count, for every sensor, the test files in which that sensor's column mean is
# NaN (pandas skips NaN values when averaging, so this flags columns with no
# usable readings).
# The tally is rebuilt from zero here so that re-running this cell does not add
# to the counts left over from a previous run.
sensor_columns = [f"sensor_{j}" for j in range(1, 11)]
sensor_result = dict.fromkeys(sensor_columns, 0)

for test_path in file_list_test:
    df_test_stat = pd.read_csv(test_path)
    for column in sensor_columns:
        if isnan(df_test_stat[column].mean()):
            sensor_result[column] += 1

print(sensor_result)

In [None]:
# Bar chart of the per-sensor tallies held in sensor_result.
sensor_names = list(sensor_result.keys())
sensor_counts = list(sensor_result.values())

plt.figure(figsize=(20,5))
plt.bar(sensor_names, sensor_counts, width=.5, color='blue')