In [None]:
import datetime
import gc
import logging
import os
import re
import time
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from tqdm import tqdm_notebook
# Number of consecutive samples that form one training segment.
rows = 150000

# Input locations of the competition files.
train_path = '../input/LANL-Earthquake-Prediction/train.csv'
test_path = '../input/LANL-Earthquake-Prediction/test'
submission_path = '../input/LANL-Earthquake-Prediction/sample_submission.csv'

# Read the training CSV with explicit dtypes for both columns.
train_df = pd.read_csv(
    train_path,
    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},
)

# Only complete segments are counted; a trailing partial segment is ignored.
segments = len(train_df) // rows
print("Số lượng segments là: ", segments)


In [None]:


# Load one test segment for inspection.
sample_test_df = pd.read_csv(test_path + "/seg_00030f.csv")
sample_test_df

# Overview of both signals on a shared x-axis, keeping every 100th sample.
fig, ax1 = plt.subplots(figsize=(16, 8))
ax1.set_title("Xu hướng của acoustic_data và time_to_failure trong 1% dữ liệu (mỗi điểm cách nhau 100 bước nhảy)")
ax1.plot(train_df.acoustic_data.values[::100], color='b')
ax1.set_ylabel('acoustic_data', color='b')
ax1.legend(['acoustic_data'])

# Second y-axis so the target can be read on its own scale.
ax2 = ax1.twinx()
ax2.plot(train_df.time_to_failure.values[::100], color='r')
ax2.set_ylabel('time_to_failure', color='r')
ax2.legend(['time_to_failure'], loc=(0.875, 0.9))

# First 7.5 million rows: target on top, raw signal below.
fig, ax = plt.subplots(2, 1, figsize=(20, 12))
ax[0].plot(train_df.index.values[:7500000], train_df.time_to_failure.values[:7500000], c="red")
ax[0].set_title("Đồ thị của time_to_failure trong 7.5 triệu hàng đầu tiên")
ax[0].set_ylabel("time_to_failure (ms)")

ax[1].plot(train_df.index.values[:7500000], train_df.acoustic_data.values[:7500000], c="green")
ax[1].set_title("Đồ thị của acoustic_data trong 7.5 triệu hàng đầu tiên")
ax[1].set_ylabel("acoustic_data")

# Target over the first 150000 rows.
fig, ax = plt.subplots(figsize=(25, 8))
ax.plot(train_df.index.values[:150000], train_df.time_to_failure.values[:150000], c="red")
ax.set_ylabel("Time_to_failure (ms)")
ax.set_title("Sự thay đổi của time_to_failure trong 150000 điểm dữ liệu đầu")
plt.show()

# Step sizes between consecutive target values (top) and a close-up of the
# first 4000 target values (bottom).
fig, ax = plt.subplots(2, 1, figsize=(25, 10))

ax[0].plot(train_df.index.values[0:149999], np.diff(train_df.time_to_failure.values[0:150000]), c="r")
ax[0].set_ylabel("Độ lớn các bước giảm")
ax[0].set_title("Chênh lệch độ lớn giữa các giá trị time_to_failure")

ax[1].plot(train_df.index.values[0:4000], train_df.time_to_failure.values[0:4000], c="r")
ax[1].set_ylabel("time_to_failure (ms)")
ax[1].set_title("Xu hướng của time_to_failure trong một đường ngang")

train_7m5 = train_df[:7500000]
train_7m5.describe()

# Give the histogram its own axes: without an explicit `ax`, pandas draws on
# the current figure, which here is the 2x1 figure created just above, so the
# bars would land on top of its last panel.
fig, ax = plt.subplots()
train_7m5['acoustic_data'].hist(bins=30, range=[-15, 15], align='mid', ax=ax)
ax.set_title("Mật độ của acoustic_data trong 7500000 giá trị đầu")
ax.set_xlabel('acoustic data')
ax.set_ylabel('examples')
plt.show()

# Plot the raw acoustic signal of a few test segments, one figure per file.
nameOfFileTest = ['seg_0012b5.csv', 'seg_00030f.csv', 'seg_00184e.csv']
for name in nameOfFileTest:
    fig, ax = plt.subplots(figsize=(16, 8))
    seg = pd.read_csv(test_path + '/' + name)
    ax.plot(seg.acoustic_data.values, c="green")
    ax.set_ylabel("acoustic_data")
    ax.set_title("Test " + name)
    plt.show()

# Empty training frames, one row per training segment. They are created once
# here, outside the plotting loop; feature columns are added on first write.
X_train = pd.DataFrame(index=range(segments), dtype=np.float64)
y_train = pd.DataFrame(index=range(segments), dtype=np.float64, columns=['time_to_failure'])

# Acoustic samples of the last test segment read above, kept as a
# module-level Series under its original name.
xc = pd.Series(seg['acoustic_data'].values)
def _segment_features(x):
    """Compute the summary features of one acoustic segment.

    Args:
        x: pandas Series with the acoustic samples of a single segment. It
            must be at least as long as the largest rolling window (10000),
            otherwise the rolling statistics are taken over empty arrays.

    Returns:
        dict mapping feature name to scalar value, in the order in which the
        columns are meant to be written.
    """
    feats = {}
    abs_x = np.abs(x)

    # Whole-segment statistics.
    feats['ave'] = x.mean()
    feats['std'] = x.std()
    feats['max'] = x.max()
    feats['min'] = x.min()
    feats['sum'] = x.sum()
    # Mean absolute deviation written out explicitly: Series.mad() is not
    # available in every pandas release.
    feats['mad'] = (x - x.mean()).abs().mean()
    feats['skew'] = x.skew()

    feats['abs_min'] = abs_x.min()
    feats['abs_max'] = abs_x.max()
    feats['abs_mean'] = abs_x.mean()
    feats['abs_std'] = abs_x.std()

    for label, q in (('q01', 0.01), ('q05', 0.05), ('q95', 0.95), ('q99', 0.99)):
        feats[label] = np.quantile(x, q)
    for label, q in (('q01', 0.01), ('q05', 0.05), ('q95', 0.95), ('q99', 0.99)):
        feats['abs_' + label] = np.quantile(abs_x, q)

    # Statistics over the head and tail of the segment. Each statistic gets
    # its own column prefix so min/max no longer overwrite the std columns.
    windows = (
        ('first_10000', x[:10000]),
        ('last_10000', x[-10000:]),
        ('first_50000', x[:50000]),
        ('last_50000', x[-50000:]),
    )
    for prefix, method in (('average', 'mean'), ('std', 'std'), ('min', 'min'), ('max', 'max')):
        for suffix, chunk in windows:
            feats[prefix + '_' + suffix] = getattr(chunk, method)()

    # np.percentile expects q on a 0-100 scale.
    for pct in (10, 25, 50, 75, 90):
        feats['{}q'.format(pct)] = np.percentile(x, pct)

    # Spectrum of THIS segment (the argument), not of a module-level series.
    zc = np.fft.fft(x.values)
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    feats['Rmean'] = realFFT.mean()
    feats['Rstd'] = realFFT.std()
    feats['Rmax'] = realFFT.max()
    feats['Rmin'] = realFFT.min()
    feats['Imean'] = imagFFT.mean()
    feats['Istd'] = imagFFT.std()
    feats['Imax'] = imagFFT.max()
    feats['Imin'] = imagFFT.min()
    real_tail_5k = realFFT[-5000:]
    feats['Rmean_last_5000'] = real_tail_5k.mean()
    # Double underscore kept so the existing column name does not change.
    feats['Rstd__last_5000'] = real_tail_5k.std()
    feats['Rmax_last_5000'] = real_tail_5k.max()
    feats['Rmin_last_5000'] = real_tail_5k.min()
    real_tail_15k = realFFT[-15000:]
    feats['Rmean_last_15000'] = real_tail_15k.mean()
    feats['Rstd_last_15000'] = real_tail_15k.std()
    feats['Rmax_last_15000'] = real_tail_15k.max()
    feats['Rmin_last_15000'] = real_tail_15k.min()

    # Rolling-window statistics; dropna() removes the leading incomplete windows.
    for w in [10, 100, 1000, 10000]:
        tag = str(w)
        x_roll_abs_mean = x.abs().rolling(w).mean().dropna().values
        x_roll_mean = x.rolling(w).mean().dropna().values
        x_roll_std = x.rolling(w).std().dropna().values
        x_roll_min = x.rolling(w).min().dropna().values
        x_roll_max = x.rolling(w).max().dropna().values

        feats['ave_roll_std_' + tag] = x_roll_std.mean()
        feats['std_roll_std_' + tag] = x_roll_std.std()
        feats['max_roll_std_' + tag] = x_roll_std.max()
        feats['min_roll_std_' + tag] = x_roll_std.min()
        feats['q01_roll_std_' + tag] = np.quantile(x_roll_std, 0.01)
        feats['q05_roll_std_' + tag] = np.quantile(x_roll_std, 0.05)
        feats['q10_roll_std_' + tag] = np.quantile(x_roll_std, 0.10)
        feats['q95_roll_std_' + tag] = np.quantile(x_roll_std, 0.95)
        feats['q99_roll_std_' + tag] = np.quantile(x_roll_std, 0.99)

        feats['ave_roll_mean_' + tag] = x_roll_mean.mean()
        feats['std_roll_mean_' + tag] = x_roll_mean.std()
        feats['max_roll_mean_' + tag] = x_roll_mean.max()
        feats['min_roll_mean_' + tag] = x_roll_mean.min()
        feats['q05_roll_mean_' + tag] = np.quantile(x_roll_mean, 0.05)
        feats['q95_roll_mean_' + tag] = np.quantile(x_roll_mean, 0.95)

        feats['ave_roll_abs_mean_' + tag] = x_roll_abs_mean.mean()
        feats['std_roll_abs_mean_' + tag] = x_roll_abs_mean.std()
        feats['q05_roll_abs_mean_' + tag] = np.quantile(x_roll_abs_mean, 0.05)
        feats['q95_roll_abs_mean_' + tag] = np.quantile(x_roll_abs_mean, 0.95)

        feats['std_roll_min_' + tag] = x_roll_min.std()
        feats['max_roll_min_' + tag] = x_roll_min.max()
        feats['q05_roll_min_' + tag] = np.quantile(x_roll_min, 0.05)
        feats['q95_roll_min_' + tag] = np.quantile(x_roll_min, 0.95)

        feats['std_roll_max_' + tag] = x_roll_max.std()
        feats['min_roll_max_' + tag] = x_roll_max.min()
        feats['q05_roll_max_' + tag] = np.quantile(x_roll_max, 0.05)
        feats['q95_roll_max_' + tag] = np.quantile(x_roll_max, 0.95)

    return feats


def feature_generate(df, x, seg):
    """Write the features of one segment into row ``seg`` of ``df``.

    Args:
        df: DataFrame that collects the features; it is modified in place and
            columns that do not exist yet are created on first write.
        x: pandas Series with the acoustic samples of the segment.
        seg: row label in ``df`` that receives the values.

    Returns:
        The same ``df`` object that was passed in.
    """
    for name, value in _segment_features(x).items():
        df.loc[seg, name] = value
    return df

# Build one feature row and one target value per non-overlapping training
# segment of `rows` samples.
for s in range(segments):
    seg = train_df.iloc[s * rows:(s + 1) * rows]
    x = pd.Series(seg['acoustic_data'].values)
    # The target of a segment is the time to failure at its last sample.
    y = seg['time_to_failure'].values[-1]
    y_train.loc[s, 'time_to_failure'] = y
    X_train = feature_generate(X_train, x, s)


def plot_feature(feature, X=None):
    """Scatter one feature column against the training target.

    Args:
        feature: name of the column in ``X`` to put on the x-axis.
        X: feature DataFrame; when omitted, the module-level ``X_train`` as it
            exists at call time is used.
    """
    if X is None:
        X = X_train
    fig, ax = plt.subplots(figsize=(20, 8))
    ax.set_xlabel(feature)
    ax.set_ylabel('time_to_failure')
    ax.set_title('{} - time_to_falure correlation'.format(feature), color='r')
    ax.scatter(x=X[feature], y=y_train['time_to_failure'])

In [None]:
# One scatter figure per feature column currently present in X_train.
for feature in list(X_train.columns):
    plot_feature(feature=feature)


In [None]:
# Remove a fixed set of feature columns from the training matrix before it is
# scaled. The reason for excluding these particular columns is not recorded.
X_train = X_train.drop(
    labels=[
        'abs_min',
        'abs_q01',
        'q05_roll_abs_mean_10000',
        'q95_roll_mean_10000',
        'q05_roll_mean_10000',
        'ave_roll_mean_10000',
        'q05_roll_abs_mean_1000',
        'ave_roll_mean_1000',
        'q05_roll_abs_mean_100',
        'ave_roll_mean_100',
        'min_roll_std_100',
        'ave_roll_mean_10',
        'average_first_50000',
        'average_last_50000',
        'average_first_10000',
        'average_last_10000',
        'sum',
        'ave',
    ],
    axis=1,
)

In [None]:

    

    
# Fit the scaler on the training features and keep the column names.
scaler = StandardScaler()
scaled_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

submission = pd.read_csv(submission_path, index_col='seg_id')

# Test feature matrix: one row per seg_id listed in the sample submission.
X_test = pd.DataFrame(columns=X_train.columns, dtype=np.float64, index=submission.index)
for s in X_test.index:
    seg = pd.read_csv(test_path + '/' + s + '.csv')
    x = pd.Series(seg['acoustic_data'].values)
    X_test = feature_generate(X_test, x, s)

# feature_generate writes every feature, including the ones removed from the
# training matrix. Select exactly the columns the scaler was fitted on, in the
# same order, instead of repeating the drop list by hand.
X_test = X_test[X_train.columns]

scaled_X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    

In [None]:
# 5-fold cross-validation of an RBF-kernel SVR. Every fold trains its own
# model, reports the validation MAE and contributes 1/n_splits of its test
# prediction to the averaged `predictions_svr`.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
# Strip any character outside [A-Za-z0-9_] from the column names.
scaled_X_train = scaled_X_train.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))

predictions_svr = np.zeros(len(scaled_X_test))

for fold_, (train_idx, val_idx) in enumerate(folds.split(scaled_X_train, y_train.values)):
    strLog = "fold **{}**".format(fold_)
    print(strLog)

    # Everything below must run once per fold; outside the loop only the last
    # fold would be trained while still being divided by n_splits.
    X_tr, X_val = scaled_X_train.iloc[train_idx], scaled_X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = SVR(kernel='rbf', C=1.0, epsilon=0.1, gamma='scale')

    # Pass the target as a 1-D Series rather than a single-column frame.
    model.fit(X_tr, y_tr['time_to_failure'])

    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    print("MAE: ", mae)
    predictions_svr += model.predict(scaled_X_test) / folds.n_splits

In [None]:
# Hyper-parameter set stored under the name `params`; a later cell unpacks
# `params` into lgb.LGBMRegressor.
params = dict(
    num_leaves=51,
    min_data_in_leaf=10,
    objective='regression',
    max_depth=-1,
    learning_rate=0.001,
    boosting="gbdt",
    feature_fraction=0.9,
    bagging_freq=1,
    bagging_fraction=0.9,
    bagging_seed=42,
    metric='mae',
    lambda_l1=0.1,
    verbosity=-1,
    random_state=42,
)

In [None]:
# 5-fold cross-validation of an RBF-kernel SVR. Every fold trains its own
# model, reports the validation MAE and contributes 1/n_splits of its test
# prediction to the averaged `predictions_svr`.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
# Strip any character outside [A-Za-z0-9_] from the column names.
scaled_X_train = scaled_X_train.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))

predictions_svr = np.zeros(len(scaled_X_test))

for fold_, (train_idx, val_idx) in enumerate(folds.split(scaled_X_train, y_train.values)):
    strLog = "fold {}".format(fold_)
    print(strLog)

    X_tr, X_val = scaled_X_train.iloc[train_idx], scaled_X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = SVR(kernel='rbf', C=1.0, epsilon=0.1, gamma='scale')

    # Pass the target as a 1-D Series; a single-column frame makes
    # scikit-learn emit a DataConversionWarning on every fold.
    model.fit(X_tr, y_tr['time_to_failure'])

    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    print("MAE: ", mae)
    predictions_svr += model.predict(scaled_X_test) / folds.n_splits

In [None]:
# Score of whichever `model` was fitted last, evaluated on the training split
# (`X_tr`, `y_tr`) left over from the final fold, so this is an in-sample figure.
model.score(X=X_tr, y=y_tr)

In [None]:
# Hyper-parameters that the cross-validation cell below unpacks into
# lgb.LGBMRegressor.
params = dict(
    num_leaves=51,
    min_data_in_leaf=10,
    objective='regression',
    max_depth=-1,
    learning_rate=0.001,
    boosting="gbdt",
    feature_fraction=0.9,
    bagging_freq=1,
    bagging_fraction=0.9,
    bagging_seed=42,
    metric='mae',
    lambda_l1=0.1,
    verbosity=-1,
    random_state=42,
)

In [None]:
from sklearn.model_selection import KFold
import re

# 5-fold cross-validation with LightGBM. `oof` receives the out-of-fold
# predictions for the training rows; `predictions` is the test prediction
# averaged over the folds.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
# Strip any character outside [A-Za-z0-9_] from the column names.
scaled_X_train = scaled_X_train.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

oof = np.zeros(scaled_X_train.shape[0])
predictions = np.zeros(scaled_X_test.shape[0])

for fold_, (train_idx, val_idx) in enumerate(folds.split(scaled_X_train, y_train.values)):
    strLog = "fold " + str(fold_)
    print(strLog)

    X_tr = scaled_X_train.iloc[train_idx]
    X_val = scaled_X_train.iloc[val_idx]
    y_tr = y_train.iloc[train_idx]
    y_val = y_train.iloc[val_idx]

    model = lgb.LGBMRegressor(n_estimators=10000, n_jobs=-1, **params)

    # Both splits are monitored on MAE, with early stopping after 500 rounds
    # and a log line every 1000 rounds.
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        eval_metric='mae',
        verbose=1000,
        early_stopping_rounds=500,
    )

    # Predict with the iteration count selected by early stopping.
    oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration_)
    predictions += model.predict(scaled_X_test, num_iteration=model.best_iteration_) / folds.n_splits