# ****Volcanic Explorer****

### This notebook is in its preliminary phase of data analysis, so I will be performing some basic signal visualization and EDA.

In [None]:
import pandas  as pd
import numpy   as np
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from pathlib import Path

# Default size for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = (15, 10)

# Seed reused by the KFold split and the LightGBM parameters in the training cell.
random_state = 10


So, let's begin with some data loading.

In [None]:
# Resolve the competition directory once instead of repeating it in every call.
DATA_DIR = Path('../input/predict-volcanic-eruptions-ingv-oe')

# train.csv is later merged on `segment_id` and supplies the `time_to_eruption` target;
# sample_submission.csv is presumably the template for the expected submission layout.
train  = pd.read_csv(DATA_DIR / 'train.csv')
sample = pd.read_csv(DATA_DIR / 'sample_submission.csv')

# Baseline idea (disabled): fill every prediction column of `sample` with
# train['time_to_eruption'].mean() and write it to 'submission.csv'.

In [None]:
# Preview the first rows of the training metadata.
train.head()

In [None]:
# Preview the first rows of the sample submission file.
sample.head()

The `train` metadata comprises 4431 stations, each station containing 10 sensors for collecting seismic events. Small-scale earthquakes in the vicinity of a volcanic area can help to estimate whether volcanic activity is imminent.

### Now, let's take a look at an actual reading from one of the stations.

In [None]:
# Load one training file (1000015382.csv) as a worked example for the plots below.
train_csvs = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train/1000015382.csv')

In [None]:
# Report (rows, columns) of the example file.
print(train_csvs.shape)

In [None]:
# Summary statistics for each column of the example file.
train_csvs.describe()

### It looks like one of the sensors from this station might have missing values.

In [None]:
# Count the missing values in each column of the example file.
train_csvs.isna().sum()

### Let's take a look at the seismograms.
A seismogram is a graphical display of the seismic measurement recorded by a seismograph or geophone. It captures the motion of the ground whenever a seismic wave travels through it. Usually, it is a measure of acceleration. The actual measurement consists of time on one axis and amplitude on the other. The ***large spikes*** in the seismogram displayed in the image below indicate activity at a given point in time. The larger the amplitude, the stronger the event. The weaker, low-amplitude measurements may contain a lot of noise, such as ground roll, wind, or even electrical interference, depending on what kind of sensor is used.

In [None]:
# Overlay every column of the example file on a single set of axes.
overview_ax = train_csvs.plot()
overview_ax.set_xlabel('row index')
overview_ax.set_ylabel('amplitude')
# `plt.show` without parentheses only references the function; it has to be called.
plt.show()

### From the wiggle display, it seems that not all the sensors have the same amplitude characteristics. I would expect all of these sensors to have a similar amplitude range. It is possible that the sensors are far apart and therefore show different characteristics. Let's look at them side by side.

In [None]:
# One narrow panel per column so that every trace gets its own amplitude scale.
curves = train_csvs.columns
num_curves = len(curves)

f, ax = plt.subplots(nrows=1, ncols=num_curves)
# plt.subplots returns a bare Axes (not an array) when there is only one column.
ax = np.atleast_1d(ax)

for ic, col in enumerate(curves):
    curve = train_csvs[col]
    # No special case for an all-NaN column: matplotlib draws nothing for NaN
    # samples. The former NaN branch replaced the Series with a NumPy array and
    # then failed on `curve.index`, which ndarrays do not have.
    ax[ic].plot(curve.values, curve.index)
    ax[ic].set_xlabel(col)
    # Flip the vertical axis so the row index increases downwards.
    ax[ic].invert_yaxis()

plt.tight_layout()
plt.show()


### It looks like these sensors are not located in the same geographical vicinity. As you can see, some sensors don't have the spikes at the same time. The farther a sensor is from the source/epicenter of an earthquake, the longer it takes the seismic waves to travel and reach the sensor. The image below (example from the notebook by [Jesper Dramsch](https://www.kaggle.com/jesperdramsch/introduction-to-volcanology-seismograms-and-lgbm)) gives you an idea of how sensors are laid out in the field:

Digital elevation model of Etna from [Bonaccorso 2011](https://agupubs.onlinelibrary.wiley.com/doi/full/10.1029/2010GC003480):

![Bonaccorso 2011 DEM of Etna](https://i.imgur.com/2b99LHc.jpg)




### Looking at the seismogram display earlier, sensors 2, 3 & 10 are quite similar, and sensors 1, 4 & 9 are quite similar. Other sensors, especially 6 and 7, seem to pick up quite a lot of noise compared to the other sensors, which respond nicely to the stronger events.

To be continued...

# Prepare train and test sets for simple learning

In [None]:
def agg_stats(df, idx):
    """Collapse one readings table into a single row of summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Readings with one column per sensor.
    idx : str or int
        Segment identifier; stored as an integer in the ``segment_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row with a ``<column>_<statistic>`` column for each stacked
        (statistic, column) pair, plus ``segment_id``.
    """
    stat_names = ['sum', 'min', 'mean', 'std', 'median', 'skew', 'kurtosis']
    # stack() yields a Series keyed by (statistic, column) pairs.
    stacked = df.agg(stat_names).stack()
    stacked.index = [f"{column}_{stat}" for stat, column in stacked.index]
    row = stacked.to_frame().T
    row["segment_id"] = int(idx)
    return row

In [None]:
# Summarise every training file as one row of per-column statistics.
# Sorting makes the row order (and therefore the later CV split) independent of
# filesystem listing order, and lets tqdm take its total from the real file
# count instead of a hard-coded 4501.
train_files = sorted(Path("../input/predict-volcanic-eruptions-ingv-oe/train/").glob("**/*.csv"))

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call; collect the rows and concatenate once instead.
train_rows = [agg_stats(pd.read_csv(csv), csv.stem) for csv in tqdm(train_files)]
summary_stats = pd.concat(train_rows, ignore_index=True) if train_rows else pd.DataFrame()

In [None]:
# Summarise every test file as one row of per-column statistics.
# The total for the progress bar comes from the real file list rather than the
# 4501 that was copy-pasted from the training loop.
test_files = sorted(Path("../input/predict-volcanic-eruptions-ingv-oe/test/").glob("**/*.csv"))

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call; collect the rows and concatenate once instead.
test_rows = [agg_stats(pd.read_csv(csv), csv.stem) for csv in tqdm(test_files)]
test_data = pd.concat(test_rows, ignore_index=True) if test_rows else pd.DataFrame()

In [None]:
target_name = ["time_to_eruption"]

# Every statistic column is a feature; the id and the target never are. Excluding
# the target explicitly keeps it out of `features` if this cell is run again
# after the merge below.
non_feature_cols = {"segment_id", *target_name}
features = [col for col in summary_stats.columns if col not in non_feature_cols]

# Attach the target once. A second merge would create `time_to_eruption_x/_y`
# columns, so skip it when the target is already present.
if target_name[0] not in summary_stats.columns:
    summary_stats = summary_stats.merge(train, on="segment_id")
summary_stats.head()

In [None]:
# Summary statistics for each column of the merged training table.
summary_stats.describe()

In [None]:
### Training with LGBM
import gc

import lightgbm as lgbm
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

# NOTE: the stray `%%timeit` was removed. A cell magic is only valid on the first
# line of a cell, and %%timeit would re-run the training repeatedly without
# keeping `sub_preds` / `feature_importance` for the cells below.

n_fold = 2
folds = KFold(n_splits=n_fold, shuffle=True, random_state=random_state)

data = summary_stats
# 1-D target vector; passing the single-column DataFrame gives the model a 2-D y.
y = data[target_name].values.ravel()

params = {
    "n_estimators": 100,
    "boosting_type": "gbdt",
    "metric": "mae",
    "num_leaves": 66,
    "learning_rate": 0.1,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    # Was misspelled "agging_freq", so bagging_fraction had no effect.
    "bagging_freq": 3,
    "max_bins": 2048,
    "verbose": 0,
    "random_state": random_state,
    "nthread": -1,
    #"device": "gpu",
    }

# With n_estimators=100 a patience of 150 can never trigger; raise n_estimators
# (or lower this) to make early stopping effective.
early_stopping_rounds = 150

oof_preds = np.zeros(data.shape[0])
sub_preds = np.zeros(test_data.shape[0])
# Allocate as float so later .mean() / plotting see numeric columns, not object ones.
feature_importance = pd.DataFrame(
    np.zeros((n_fold, len(features))),
    index=list(range(n_fold)),
    columns=features,
)

# `fold_idx` instead of `n_fold` so the loop does not overwrite the fold count.
for fold_idx, (trn_idx, val_idx) in enumerate(folds.split(data)):
    X_train, y_train = data[features].iloc[trn_idx], y[trn_idx]
    X_val, y_val = data[features].iloc[val_idx], y[val_idx]

    model = lgbm.LGBMRegressor(**params)

    # Early stopping and log silencing go through callbacks; the `verbose` and
    # `early_stopping_rounds` keyword arguments of fit() are no longer accepted
    # by current LightGBM releases.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="mae",
        callbacks=[
            lgbm.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False),
            lgbm.log_evaluation(period=0),
        ],
    )

    feature_importance.iloc[fold_idx, :] = model.feature_importances_

    oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration_)
    # Average the test predictions over the folds.
    sub_preds += model.predict(test_data[features], num_iteration=model.best_iteration_) / folds.n_splits

    # This is a regression target, so report MAE (the metric being optimised)
    # rather than ROC-AUC, and use the names that actually exist in this loop.
    print('Fold %2d MAE: %.6f' % (fold_idx + 1, mean_absolute_error(y_val, oof_preds[val_idx])))
    del X_train, y_train, X_val, y_val
    gc.collect()

print('Full MAE score %.6f' % mean_absolute_error(y, oof_preds))
       
    

In [None]:
# Keep only features whose mean importance across folds exceeds this value.
importance_threshold = 5

# `feature_importance` may hold object-typed columns (it is allocated without
# data in the training cell); cast so the mean and the boxplot treat the values
# as numbers.
importance = feature_importance.astype(float)

best = importance.mean().sort_values(ascending=False)
best_idx = best[best > importance_threshold].index

plt.figure(figsize=(14,26))
sns.boxplot(data=importance[best_idx], orient="h")
plt.title("Features Importance per Fold")
plt.tight_layout()
plt.show()

## Submit Prediction

In [None]:
# Pair each test segment id with its fold-averaged prediction and write the
# two-column file (header row, no index column).
submission = test_data[["segment_id"]].assign(time_to_eruption=sub_preds)
submission.to_csv('submission.csv', index=False)