In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import os
from tqdm import tqdm
import gc
import pickle

# train and submission

In [None]:
# Load train.csv and sample_submission.csv from the dataset's input directory.
train = pd.read_csv("../input/predict-volcanic-eruptions-ingv-oe/train.csv")
sample_submission = pd.read_csv("../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv")

In [None]:
# Preview the first rows of `train`.
train.head()

In [None]:
# (rows, columns) of `train`.
train.shape

In [None]:
# Number of missing values in each column of `train`.
train.isna().sum(axis=0)

In [None]:
# Plot the distribution of the `time_to_eruption` column.
sns.distplot(train['time_to_eruption'])

In [None]:
# Preview the first rows of `sample_submission`.
sample_submission.head()

In [None]:
# (rows, columns) of `sample_submission`.
sample_submission.shape

# Data files

### Example 1 in train

In [None]:
# Use 1000015382.csv as a worked example.
# NOTE: despite its name, `test_data` holds a file from the train/ directory.
test_data = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train/1000015382.csv')

In [None]:
# Report the example file's (rows, columns), then preview its first rows.
print('shape of 1000015382.csv', test_data.shape)
test_data.head()

In [None]:
fig, axs = plt.subplots(2, 5, figsize=(50, 30))

# One panel per sensor column (sensor_1 .. sensor_10), filled row by row.
for idx, panel in enumerate(axs.flat):
    sensor_name = 'sensor_' + str(idx + 1)
    panel.plot(test_data[sensor_name])
    panel.set_title(sensor_name)

In [None]:
# Overlay the value distributions of three sensors on one set of axes.
for sensor_name in ('sensor_1', 'sensor_2', 'sensor_3'):
    sns.distplot(test_data[sensor_name], label=sensor_name)
plt.legend()

In [None]:
# Show the row of `train` whose segment_id matches the example file's name.
print(train.loc[train['segment_id']==1000015382])

### Example 2 in test

In [None]:
# Second example: a file taken from the test/ directory.
test_data2 = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/test/1001028887.csv')

fig, axs = plt.subplots(2, 5, figsize=(50, 30))

# One panel per sensor column (sensor_1 .. sensor_10), filled row by row.
for idx, panel in enumerate(axs.flat):
    sensor_name = 'sensor_' + str(idx + 1)
    panel.plot(test_data2[sensor_name])
    panel.set_title(sensor_name)

In [None]:
# Overlay the value distributions of sensors 1, 4 and 3 on one set of axes.
for sensor_name in ('sensor_1', 'sensor_4', 'sensor_3'):
    sns.distplot(test_data2[sensor_name], label=sensor_name)
plt.legend()

### example 3 in train

In [None]:
# Third example: another file taken from the train/ directory.
test_data3 = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train/1000745424.csv')

fig, axs = plt.subplots(2, 5, figsize=(50, 30))

# One panel per sensor column (sensor_1 .. sensor_10), filled row by row.
for idx, panel in enumerate(axs.flat):
    sensor_name = 'sensor_' + str(idx + 1)
    panel.plot(test_data3[sensor_name])
    panel.set_title(sensor_name)

In [None]:
# Overlay the value distributions of three sensors on one set of axes.
for sensor_name in ('sensor_1', 'sensor_2', 'sensor_3'):
    sns.distplot(test_data3[sensor_name], label=sensor_name)
plt.legend()

In [None]:
# Show the row of `train` whose segment_id matches the third example file's name.
print(train.loc[train['segment_id']==1000745424])

## The data in these ten-minute logs are very different from one another

***MODEL***

In [None]:
# generate feature
# collect mean / std / 5 / 10 / 20 / 40 percentile / min / max / +5000 / +10000 / +20000 self-corr
def generate_feature(train_dir='../input/predict-volcanic-eruptions-ingv-oe/train',
                     test_dir='../input/predict-volcanic-eruptions-ingv-oe/test'):
    """Compute summary features for every segment CSV in the train and test folders.

    Each returned row is a flat list laid out as::

        [segment_id,
         mean..., std..., min..., max...,            # one value per column
         q05..., q10..., q20..., q40...,             # one value per column
         lag5000, lag10000, lag20000, ...]           # three values per column

    i.e. the plain statistics are grouped by statistic, while the
    autocorrelations are grouped by column.

    Parameters
    ----------
    train_dir, test_dir : str
        Directories holding one ``<segment_id>.csv`` file per segment.
        The defaults are the paths this notebook has always read from.

    Returns
    -------
    (train_part_fea, test_part_fea) : tuple of list of list
        One feature row per CSV file found in the respective directory.

    Raises
    ------
    ValueError
        If a CSV file name (without extension) is not an integer.
    """
    lags = (5000, 10000, 20000)
    quantiles = (0.05, 0.1, 0.2, 0.4)

    def as_float_list(stat):
        # Statistics are stored as float32-rounded Python floats, one per column.
        return stat.values.astype('float32').tolist()

    def helper(path):
        data = []
        # Sorting only makes the row order reproducible between runs.
        for file in tqdm(sorted(os.listdir(path))):
            stem, ext = os.path.splitext(file)
            if ext.lower() != '.csv':
                # Skip anything in the directory that is not a segment file.
                continue
            d = pd.read_csv(os.path.join(path, file))

            # The file name is the segment id. int() parses it without
            # evaluating the text as Python code, which eval() would do.
            tmp = [int(stem)]
            tmp += as_float_list(d.mean(axis=0))
            tmp += as_float_list(d.std(axis=0))
            tmp += as_float_list(d.min(axis=0))
            tmp += as_float_list(d.max(axis=0))
            for q in quantiles:
                tmp += as_float_list(d.quantile(q, axis=0))

            # Correlation of each column with itself shifted by 5000, 10000
            # and 20000 rows. The previous code shifted by 5000 for all three
            # features, so they were identical; each lag is now applied.
            # autocorr() ignores the rows that have no shifted counterpart.
            for col in d.columns:
                tmp += [d[col].autocorr(lag=lag) for lag in lags]

            data.append(tmp)
        return data

    print('train_part: ')
    train_part_fea = helper(train_dir)
    print('test_part: ')
    test_part_fea = helper(test_dir)

    return train_part_fea, test_part_fea

In [None]:
#def na_mark(data, file_has_na_name):
#    name = set([eval(i[:-4]) for i in file_has_na_name])
#    data['na_mark'] = 0
#    data.loc[data['segment_id'].isin(name), 'na_mark'] = 1

In [None]:
train_part_fea, test_part_fea = generate_feature()

# Persist both feature lists so they can be reloaded without recomputing.
for pkl_name, fea_rows in (('train_part_fea.pkl', train_part_fea),
                           ('test_part_fea.pkl', test_part_fea)):
    with open(pkl_name, 'wb') as pkl_file:
        pickle.dump(fea_rows, pkl_file)

In [None]:
#train_part_fea = pd.read_pickle('../input/ingv-eda-basemodel/train_part_fea.pkl')
#test_part_fea = pd.read_pickle('../input/ingv-eda-basemodel/test_part_fea.pkl')

In [None]:
base_colname = ['sensor_' + str(i) for i in range(1, 11)]

# Column names follow the layout of the feature rows: the plain statistics are
# grouped by statistic, the self-correlations are grouped by sensor.
stat_suffixes = ['_mean', '_std', '_min', '_max',
                 '_5_quant', '_10_quant', '_20_quant', '_40_quant']
corr_suffixes = ['_5000_self_corr', '_10000_self_corr', '_20000_self_corr']

fea_colname = ['segment_id']
for stat_suffix in stat_suffixes:
    fea_colname.extend(sensor + stat_suffix for sensor in base_colname)
for sensor in base_colname:
    fea_colname.extend(sensor + corr_suffix for corr_suffix in corr_suffixes)

# Attach the features to the label table and to the submission template.
train_fea_frame = pd.DataFrame(train_part_fea, columns=fea_colname)
test_fea_frame = pd.DataFrame(test_part_fea, columns=fea_colname)
train = pd.merge(train, train_fea_frame, on='segment_id', how='left')
sample_submission = pd.merge(sample_submission, test_fea_frame, on='segment_id', how='left')

In [None]:
# Hold out 25% of the segments for validation; everything except the id and
# the target is used as a model input.
feature_matrix = train.drop(columns=['segment_id', 'time_to_eruption']).values
target_values = train['time_to_eruption'].values
X_train, X_val, y_train, y_val = train_test_split(
    feature_matrix, target_values, test_size=0.25, random_state=42)

In [None]:
import lightgbm as lgb

# Wrap the split arrays; the validation set reuses the training set's binning.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# MAE is both the training objective and the metric watched for early stopping.
params = {
    'num_leaves': 85,
    'n_estimators': 6000,
    'min_data_in_leaf': 10,
    'objective': 'mae',
    'max_depth': -1,
    'learning_rate': 0.01,
    'max_bins': 2048,
    "boosting": "gbdt",
    "feature_fraction": 0.91,
    "bagging_freq": 1,
    "bagging_fraction": 0.91,
    "bagging_seed": 42,
    "metric": 'mae',
    "lambda_l1": 0.1,
    "verbosity": -1,
    "nthread": -1,
    "random_state": 42,
}

# Early stopping is requested through a callback rather than the
# `early_stopping_rounds` keyword: that keyword was deprecated in LightGBM 3.3
# and removed from `lgb.train` in 4.0, whereas the callback works on both.
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'val'],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

## predict

In [None]:
# The first two columns of sample_submission are skipped (positional slice);
# the remaining columns are passed to the model as features.
test_features = sample_submission.iloc[:, 2:].values
submission = pd.DataFrame({
    'segment_id': sample_submission['segment_id'].values,
    'time_to_eruption': model.predict(test_features),
})
submission.to_csv('submission.csv', index=False)