In [None]:
import os
import gc
import numpy as np
import pandas as pd

from time import time
from time import ctime

import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm_notebook
from tqdm import tqdm
import lightgbm as lgb
import joblib
from joblib import Parallel, delayed
import multiprocessing
# One less than the CPU count, floored at 1 so that a single-CPU machine does not
# yield 0 (joblib.Parallel rejects n_jobs=0).
num_cores = max(1, multiprocessing.cpu_count() - 1)

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

def plotfig(ypred, yactual, strtitle, y_max):
    """Scatter predicted against actual values with a y = x reference line.

    Parameters
    ----------
    ypred : array-like
        Predicted values, drawn on the x axis.
    yactual : object exposing ``.values`` (e.g. a pandas Series or DataFrame)
        Actual values; flattened with ``.values.ravel()`` and drawn on the y axis.
    strtitle : str
        Title of the figure.
    y_max : number
        Upper limit applied to both axes (the lower limit is 0) and end point
        of the reference diagonal.
    """
    plt.scatter(ypred, yactual.values.ravel())
    plt.title(strtitle)
    # One diagonal from (0, 0) to (y_max, y_max). Passing lists of (x, y) tuples
    # hands matplotlib 2-D data, which it plots column by column, i.e. as two
    # overlapping lines rather than a single reference line.
    plt.plot([0, y_max], [0, y_max])
    plt.xlim(0, y_max)
    plt.ylim(0, y_max)
    plt.xlabel('Predicted', fontsize=12)
    plt.ylabel('Actual', fontsize=12)
    plt.show()

In [None]:
# Both files are semicolon-separated; their 'Unnamed: 0' column becomes the row index.
train = pd.read_csv('../input/ingv-tsfresh-7730/train.csv', sep=';').set_index('Unnamed: 0')
test = pd.read_csv('../input/ingv-tsfresh-7730/test.csv', sep=';').set_index('Unnamed: 0')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Keep the test frame's index labels: `test` is rebound to the PCA output later,
# and the submission cell writes these labels into its 'segment_id' column.
test_index = test.index

In [None]:
# Work on a copy so the zero-filling done for the random forest does not touch
# `train`, which is reused as loaded when train and test are concatenated.
train_rf = train.copy()

In [None]:
# Replace every missing value with 0 before this frame is used to fit the random forest.
train_rf = train_rf.fillna(0)

In [None]:
# Preview the zero-filled copy of the training frame.
train_rf.head()

In [None]:
# Split the zero-filled frame into the regression target and the feature matrix.
y = train_rf['time_to_eruption']
x = train_rf.drop(columns=['time_to_eruption'])

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Depth-limited forest with a fixed seed; its feature_importances_ are read in the
# following cell to rank the features.
model = RandomForestRegressor(max_depth=7, random_state=1)
model.fit(x, y)

In [None]:
# Importance of each feature according to the fitted forest, highest first.
feature_scores = (
    pd.Series(model.feature_importances_, index=x.columns)
    .sort_values(ascending=False)
)
feature_scores

In [None]:
# Labels of the first 350 entries of the ranking (feature_scores is sorted descending).
selected_feature = feature_scores.iloc[:350].index

In [None]:
# Hold on to the training target before train and test are stacked into one frame.
target = train['time_to_eruption']

# Stack train on top of test and renumber the rows 0..n-1.
# NOTE(review): presumably test.csv has no 'time_to_eruption' column, so those rows
# get NaN in it — confirm against the data.
all_data = pd.concat([train, test], axis=0, ignore_index=True)
all_data.head()

In [None]:
# Narrow the stacked frame to the selected features, keeping the target as the last column.
all_data = all_data[[*selected_feature, 'time_to_eruption']]
all_data.head()

In [None]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints a two-line summary (number of columns in ``df`` and how many of them
    contain missing values) and returns the per-column breakdown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, indexed by the
        column name, with the columns 'Missing Values' (count) and
        '% of Total Values' (share of rows in percent). Rows are sorted by the
        percentage in descending order and values are rounded to one decimal.
    """
    # Count the nulls once and derive the percentage from those counts.
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # Side-by-side table; `keys` names the two columns directly.
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Select on the count rather than on the percentage: for a frame with zero
    # rows the percentage is 0/0 = NaN, and `NaN != 0` would report every column
    # as having missing values.
    mis_val_table_ren_columns = (
        mis_val_table[mis_val_table['Missing Values'] != 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {mis_val_table_ren_columns.shape[0]} columns that have missing values."
    )

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [None]:
# Report which columns of the stacked, feature-selected frame contain missing values.
missing_values = missing_values_table(all_data)
missing_values

In [None]:
# Fill missing values with each column's mode.
# DataFrame.mode() returns a frame (one row per tied mode), and fillna() with a
# frame aligns on row labels, so passing it directly only fills rows whose index
# happens to match a mode-row index. Take the first mode row instead: a Series
# keyed by column name, which fillna() applies column-wise to every row.
# This also fills the target column for rows that lack it; that column is
# dropped again before the PCA step.
column_modes = all_data.mode()
if not column_modes.empty:
    all_data = all_data.fillna(column_modes.iloc[0])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of the stacked frame (the target column is still present
# at this point) and rebuild the frame with the original column labels and index.
header = all_data.columns
scaler = StandardScaler()
all_data = pd.DataFrame(
    scaler.fit_transform(all_data),
    columns=header,
    index=all_data.index,
)
all_data.head()

In [None]:
# Remove the target so that only feature columns remain, then show each column's variance.
all_data = all_data.drop(columns=['time_to_eruption'])
all_data.var()

In [None]:
# Pairwise correlation between the remaining feature columns.
all_data.corr()

In [None]:
# Re-check for missing values now that the frame has been mode-filled, scaled and
# stripped of the target column.
missing_values = missing_values_table(all_data)
missing_values

In [None]:
# Fill whatever is still missing with the minimum of the corresponding column.
all_data = all_data.fillna(value=all_data.min())

In [None]:
# from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA

# PCA is created without arguments and fitted on the stacked train+test features;
# the transformed array is wrapped in a DataFrame with default integer labels.
pca = PCA()
all_data_pca = pd.DataFrame(pca.fit_transform(all_data))
all_data_pca.head()

In [None]:
# The stacked frame was built as [train rows, test rows], so cut the PCA output
# back apart at the length of the training frame.
n_train_rows = train.shape[0]
train = all_data_pca.iloc[:n_train_rows]
test = all_data_pca.iloc[n_train_rows:]

In [None]:
# K-fold cross-validated CatBoost regression on the PCA features. Each fold model is
# scored on its held-out part and also predicts the test set; the test predictions
# are averaged over the folds.
Y = target
X = train
X_test = test

n_fold = 5
cv = KFold(n_splits=n_fold, shuffle=True, random_state=123)

# oof[i] receives the prediction for training row i from the one fold model that
# did not train on that row.
oof = np.zeros(len(X))
cat_prediction = np.zeros(len(X_test))
mae, r2 = [], []

# NOTE(review): no loss_function is given, so CatBoost's default objective is
# optimised while MAE is only the evaluation metric — confirm that is intended.
PARAMS = {
    'random_seed': 123,
    'eval_metric': 'MAE',
    'task_type': 'GPU',  # requires a GPU-enabled runtime
    'bagging_temperature': 0.41010395885331385,
    'border_count': 186,
    'depth': 10,
    'iterations': 700,
    'l2_leaf_reg': 30,
    'learning_rate': 0.067,
    'random_strength': 3.230824361824754e-06,
}

for fold_n, (train_index, valid_index) in enumerate(cv.split(X)):
    print('\nFold', fold_n, 'started at', ctime())

    X_train = X.iloc[train_index, :]
    X_valid = X.iloc[valid_index, :]

    Y_train = Y.iloc[train_index]
    Y_valid = Y.iloc[valid_index]

    best_model = CatBoostRegressor(**PARAMS, thread_count=-1)

    train_dataset = Pool(data=X_train, label=Y_train)
    eval_dataset = Pool(data=X_valid, label=Y_valid)

    # The held-out fold doubles as the evaluation set for early stopping and
    # best-iteration selection.
    best_model.fit(train_dataset,
                   use_best_model=True,
                   verbose=False,
                   plot=True,
                   eval_set=eval_dataset,
                   early_stopping_rounds=70)

    y_pred = best_model.predict(Pool(data=X_valid))

    # Record the out-of-fold predictions at the positions of the held-out rows.
    oof[valid_index] = y_pred

    # Score the fold once and reuse the values for both the log and the summary.
    fold_mae = mean_absolute_error(Y_valid, y_pred)
    fold_r2 = r2_score(Y_valid, y_pred)
    mae.append(fold_mae)
    r2.append(fold_r2)

    print('MAE: ', fold_mae)
    print('R2: ', fold_r2)

    # Accumulate this fold's test predictions; divided by n_fold after the loop.
    cat_prediction += best_model.predict(Pool(data=X_test))

cat_prediction /= n_fold

print('='*45)
print('CV mean MAE: {0:.4f}, std: {1:.4f}.'.format(np.mean(mae), np.std(mae)))
print('CV mean R2:  {0:.4f}, std: {1:.4f}.'.format(np.mean(r2), np.std(r2)))

In [None]:
# plotfig(best_model.predict(X), Y, 'Predicted vs. Actual responses for Catboost', max(Y) + 0.1*max(Y))

In [None]:
# train_features = train
# train_targets = pd.DataFrame({'target':target})
# test_features = test

In [None]:
# hyper_params = {
#     'task': 'train',
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metric': ['mae'],
#     'learning_rate': 0.067,
#     'feature_fraction': 0.9,
#     'subsample': 0.85,
#     'subsample_freq': 2,
#     'verbose': 500,
#     "max_depth": -1,
#     "num_leaves": 31,  
#     "max_bin": 128,
#     "num_iterations": 10000,
#     'device': 'gpu',
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0
# }

In [None]:
# from sklearn.metrics import mean_absolute_error
# import lightgbm as lgb
# import math  
# from sklearn.model_selection import KFold, StratifiedKFold

# score = []

# skf = KFold(n_splits = 5, shuffle=True, random_state=123)
# skf.get_n_splits(train_features, train_targets)
# oof_lgbm_df = pd.DataFrame()
# predictions = pd.DataFrame(test_index)
# x_test = test_features


# for fold, (trn_idx, val_idx) in enumerate(skf.split(train_features, train_targets)):
#     x_train, y_train = train_features.iloc[trn_idx], train_targets.iloc[trn_idx]['target']
#     x_valid, y_valid = train_features.iloc[val_idx], train_targets.iloc[val_idx]['target']
#     index = x_valid.index
#     p_valid = 0
#     yp = 0
#     gbm = lgb.LGBMRegressor(**hyper_params)
#     gbm.fit(x_train, y_train,
#         eval_set=[(x_valid, y_valid)],
#         eval_metric='mae',
#         verbose = 500,
#         early_stopping_rounds=100)
#     score.append(mean_absolute_error(gbm.predict(x_valid), y_valid))
#     yp += gbm.predict(x_test)
#     fold_pred = pd.DataFrame({'ID': index,
#                               'label':gbm.predict(x_valid)})
#     oof_rfr_df = pd.concat([oof_lgbm_df, fold_pred], axis=0)
#     predictions['fold{}'.format(fold+1)] = yp

In [None]:
# pred = (predictions['fold1'] + predictions['fold2'] + predictions['fold3'] + predictions['fold4'] + predictions['fold5'])/5

In [None]:
# Pair the saved test index labels with the fold-averaged predictions and write
# them to submission.csv without the row index.
submission = pd.DataFrame({
    'segment_id': test_index,
    'time_to_eruption': cat_prediction,
})
submission.to_csv('submission.csv', index=False, header=True)