In [None]:
!/opt/conda/bin/python3.7 -m pip install --upgrade pip -q
!pip install --upgrade xgboost

Functions copied from: https://www.kaggle.com/obougacha/ingv-xgboost-baseline/comments?select=Train.csv

In [None]:
from xgboost import XGBRegressor, plot_tree
import tensorflow as tf
import numpy as np 
import pandas as pd
import os
import random
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import xgboost as xgb

In [None]:
ls /kaggle/input/volcano-pca

In [None]:
# Submission template; submit_prediction() fills its 'time_to_eruption' column.
sample_submission = pd.read_csv(
    '/kaggle/input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv'
)

# Targets. An earlier variant (disabled) read them from the 'time_to_eruption'
# column of /kaggle/input/predict-volcanic-eruptions-ingv-oe/train.csv.
y = np.load('/kaggle/input/volcano-pca/y_aug2.npy')

# Features. An earlier variant (disabled) used
# /kaggle/input/volcanobench/train_p.csv and .../test_p.csv instead.
# Only the leading rows that have a matching target are kept.
X = (
    pd.read_csv('/kaggle/input/volcano-pca/reduced_X_norm.csv')
    .iloc[:len(y)]
    .to_numpy()
)
X_test = np.load('/kaggle/input/volcano-pca/reduced_X_test_norm.csv.npy')

# Fixed 70/30 hold-out split of the training data.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=16, test_size=0.3
)

# xgb.DMatrix wrappers for these splits (dtrain / dval / dtest) used to be
# created here; that code is currently disabled.

In [None]:
def submit_prediction(pred, write=True, file_name="", Return=True):
    """
        Removes negative values from a prediction vector and stores the
        result in the global submission frame.

        Negative entries are replaced with the mean of ``pred``. If that mean
        is itself negative it is clamped to 0, so the returned values are
        never negative.

        input:
            pred: the predicted values (array-like, one value per row of
                sample_submission)
            write: boolean value indicating if the predictions should be
                written to a csv file.
            file_name: name of the csv file to write (without the '.csv'
                extension)
            Return: if the edited predictions should be returned

        return:
            the edited predictions as a numpy array if Return is truthy,
            otherwise None

        side effects:
            overwrites the 'time_to_eruption' column of the module-level
            sample_submission frame, even when write is False.
    """
    pred = np.asarray(pred)

    # Replacement value for negative entries: the mean of the predictions,
    # but never a negative number (a negative mean would put negatives back).
    fill_value = np.mean(pred)
    if fill_value < 0:
        fill_value = 0.0
    pred = np.where(pred < 0, fill_value, pred)

    sample_submission['time_to_eruption'] = pred  # Formatting
    if write:
        sample_submission.to_csv(f'{file_name}.csv', index=False)

    if Return:
        return pred
    return None
    

def kfold_validation(X, y, X_test,n_fold=5, seeds=(0, 1, 2, 3, 4, 5, 6)):
    """
        Trains XGBoost (DART) boosters with a repeated KFold and averages
        their predictions on the test set.

        For every seed a shuffled KFold with n_fold splits is created. The
        first fold of a seed trains from scratch; every later fold of the
        same seed continues training from the booster file written by the
        previous fold.

        input:
            X, y: training features and targets (numpy arrays, indexed with
                the integer index arrays produced by KFold)
            X_test: features to predict on
            n_fold: number of KFold splits per seed
            seeds: random states; one full KFold pass is run per entry

        return:
            preds: array of shape (len(seeds) * n_fold, X_test.shape[0])
                holding the post-processed prediction of every fold
            avg_pred: mean over all rows of preds, post-processed by
                submit_prediction

        side effects:
            writes one booster file 'xg{seed}' per seed into the current
            working directory and writes the averaged submission to
            'S_avg.csv' (via submit_prediction).
    """
    preds = np.empty((len(seeds) * n_fold, X_test.shape[0])) # Saving all the predictions
    index = 0  # Row of preds that the next fold fills

    # DART booster configuration (a plain 'gbtree' booster with eta 0.05 and
    # otherwise the same tree settings was used in an earlier experiment).
    param = {
        'booster': 'dart',
        'sample_type': 'weighted',
        'rate_drop': 0.6,
        'one_drop': 1,
        'max_depth': 20,
        'gamma': 1e4,
        'min_child_weight': 5,
        'tree_method': 'gpu_hist',
        'objective':'reg:squarederror',
        'n_jobs':-1,
        'reg_lambda':1e-3,
        'eta':0.3,
        'eval_metric': 'mae',
        'verbosity': 1,
        'predictor': 'gpu_predictor'
    }

    dtest = xgb.DMatrix(X_test)

    for seed in seeds:
        kf = KFold(n_splits=n_fold, random_state=seed, shuffle=True)

        # One checkpoint file per seed. The same path is used for saving and
        # for re-loading, so the warm start does not depend on the working
        # directory being /kaggle/working.
        model_path = f'xg{seed}'
        warm_start = False  # the first fold of every seed starts from scratch

        for fold, (train_indices, val_indices) in enumerate(kf.split(X, y), start=1):
            # Data divided into Train and Validation splits
            dtrain = xgb.DMatrix(X[train_indices, :], label=y[train_indices])
            dval = xgb.DMatrix(X[val_indices, :], label=y[val_indices])

            print(f'Seed {seed}, fold {fold}/{n_fold}')

            # NOTE: from the second fold on, training continues from the
            # booster of the previous fold, whose training split contained
            # the rows that now form the validation split. The reported eval
            # MAE of those folds is therefore optimistic.
            bst = xgb.train(
                param,
                dtrain,
                num_boost_round=1000,
                evals=[(dtrain, 'train'), (dval, 'eval')],
                verbose_eval=True,
                early_stopping_rounds=50,
                xgb_model=model_path if warm_start else None,
            )
            bst.save_model(model_path)
            warm_start = True

            #------------------ Predictions -------------------
            # NOTE(review): the booster is used as returned by xgb.train;
            # confirm whether bst.best_iteration should limit the prediction.
            model_prediction = bst.predict(dtest)

            # Saving the (negative-free) predictions for each fold
            preds[index] = submit_prediction(model_prediction,
                                             write=False,
                                             file_name=f"sub{index}",
                                             Return=True)
            index += 1

            # Starting different fold or end of folding
            print('#----------------#----------------#----------------#----------------#----------------#')

    # Averaging the predictions over every fold of every seed
    avg_pred = submit_prediction(preds.mean(axis=0),
                                 write=True,
                                 file_name="S_avg",
                                 Return=True)

    return preds, avg_pred

In [None]:
# One repeated-KFold pass: 5 folds with the single seed 11.
preds, avg_pred = kfold_validation(X, y, X_test, n_fold=5, seeds=[11])

In [None]:
# Persist the per-fold test predictions (one row per trained fold).
pd.DataFrame(data=preds).to_csv(path_or_buf='preds_xg10.csv', index=False)

In [None]:
# Quick look at the averaged predictions at positions 1-3, scaled by 100.
100 * avg_pred[1:4]