If you want to run this notebook on Colab, you need to install Optuna (for hyper-parameter tuning) and update the XGBoost package.

In [None]:
!pip install -U xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/35/cc/fd3d5fc6b6616a03385a0f6492cc77a253940d1026406ecc07597095e381/xgboost-1.2.1-py3-none-manylinux2010_x86_64.whl (148.9MB)
[K     |████████████████████████████████| 148.9MB 78kB/s 
Installing collected packages: xgboost
  Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.2.1


In [None]:
# Optuna supplies `create_study` / `study.optimize`, used for the hyper-parameter search further down.
!pip install optuna

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
import xgboost
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
import optuna

I processed the sensor signals and extracted a number of features from them, including FFT components, mean, median, quantiles, etc. All of them are saved in the `out.csv` file, which I load below.

In [None]:
# Folder that holds the pre-computed feature tables.
path = 'drive/My Drive/Soheil/volcan/new3/'

# Read the training table as a plain array. The first column is skipped, the
# last column is the regression target, and everything in between is a feature.
train_df = pd.read_csv(path + 'out.csv').to_numpy()
X, y = train_df[:, 1:-1], train_df[:, -1]

Some of my features are useless because their variance is zero. I remove them, along with every feature whose correlation with another feature is greater than 0.99.

In [None]:
# Columns whose variance is exactly zero carry no information.
sel = VarianceThreshold(threshold=0.0)
sel.fit(X)
drop_var = np.where(sel.variances_ == 0)[0]

# Pairwise Pearson correlation between feature columns.
corr_mat = np.corrcoef(X.T)
# A column is redundant when it correlates above 0.99 with any *earlier*
# column; only the strict lower triangle is inspected so the first column of
# every correlated group is kept. NaN entries compare as False and are ignored.
drop_corr = np.flatnonzero(np.any(np.tril(corr_mat, k=-1) > 0.99, axis=1))

# Union of both removal lists (integer indices), applied along the column axis.
all_drop = np.unique(np.hstack((drop_corr, drop_var))).astype(int)
X_ = np.delete(X, all_drop, 1)


  self.variances_ = np.nanvar(X, axis=0)
  self.variances_ = np.nanmin(compare_arr, axis=0)
  (self.variances_ <= self.threshold)):


VarianceThreshold(threshold=0.0)

To get a similar target distribution in every fold, I bin the continuous target into classes and use `StratifiedKFold` instead of `KFold`.

In [None]:
def reg2class(series_, bins=50):
    """Bin a continuous target into integer class labels.

    Parameters
    ----------
    series_ : array-like of shape (n_samples,)
        Continuous values, e.g. a pandas Series or a 1-D NumPy array.
    bins : int, default 50
        Number of equal-width bins spanning the range of ``series_``.

    Returns
    -------
    class_ : numpy.ndarray of shape (n_samples,)
        Zero-based bin index of every sample, in input order. There is
        exactly one label per sample, including samples that lie exactly on
        a bin edge.
    min_count : int
        Number of samples in the least populated bin.
    """
    values = np.asarray(series_, dtype=float).ravel()
    count, division = np.histogram(values, bins=bins)
    # Digitize against the interior edges only: bins are half-open [lo, hi),
    # values below the first interior edge fall in bin 0 and values at or
    # above the last interior edge (including the maximum) fall in the last
    # bin -- the same convention np.histogram uses for `count`.
    class_ = np.digitize(values, division[1:-1])
    return class_, np.min(count)
# Stratification labels for the regression target; the second return value is
# the sample count of the least populated bin.
y_class, max_fold_ = reg2class(pd.Series(y))

In the cell below, I find the most important features using XGBoost.

In [None]:

n_folds = 10
# One column of per-feature importances for every CV fold.
importance_ = np.zeros((X_.shape[1], n_folds))
# NOTE: early stopping in this cell monitors RMSE, so the values collected
# here are RMSE scores; the `mae` / `mae_` names are kept only so the
# notebook's global names stay unchanged.
mae = []
splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1370)
for i, (tr_, ts_) in enumerate(splitter.split(X_, y_class)):
    # `eta` is XGBoost's alias for `learning_rate`; it was previously passed
    # as well with the same value, so only `learning_rate` is kept.
    model1 = xgboost.XGBRegressor(
        n_estimators=3000,
        tree_method='gpu_hist',
        max_depth=10,
        learning_rate=0.005,
        min_child_weight=7,
        subsample=0.8,
        colsample_bytree=0.7,
        reg_alpha=1e-05,
        gamma=0.4,
    )

    # The held-out fold doubles as the early-stopping validation set.
    eval_set = [(X_[ts_, :], y[ts_])]
    model1.fit(X_[tr_, :], y[tr_], early_stopping_rounds=5, eval_metric='rmse',
               eval_set=eval_set, verbose=1)
    importance_[:, i] = model1.feature_importances_
    mae_ = model1.best_score
    print(mae_)
    mae.append(mae_)

I drop the features whose mean importance score is below the 0.75 quantile.


In [None]:
# Average every feature's importance over the folds, then discard the
# features whose mean importance lies below the 0.75 quantile.
val_ = importance_.mean(axis=1)
importance_cutoff = np.quantile(val_, 0.75)
drop_imp = np.flatnonzero(val_ < importance_cutoff)
X__ = np.delete(X_, drop_imp, axis=1)

In [None]:
# Load the test features (first column skipped) and remove the same columns
# that were removed from the training matrix: first `all_drop`, then `drop_imp`.
test_df = pd.read_csv(path + 'out_test.csv').to_numpy()
X_test = test_df[:, 1:]
X_test_ = np.delete(X_test, all_drop, axis=1)
X_test__ = np.delete(X_test_, drop_imp, axis=1)


In [None]:
# Show both shapes side by side; the column counts are expected to agree.
(X_test__.shape, X__.shape)

Hyper-parameter tuning!

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated MAE of an XGBoost regressor.

    The hyper-parameters are sampled once per trial and shared by all folds.
    Reads the notebook globals ``X__``, ``y`` and ``y_class``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean over the folds of the best validation MAE recorded by early
        stopping.
    """
    # Only `learning_rate` is tuned: `eta` is XGBoost's alias for the same
    # parameter, so sampling both spent a search dimension on a duplicate and
    # handed the model two conflicting values.
    params = {
        "gamma": trial.suggest_loguniform("gamma", 0.3, 0.5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.0005, 0.5),
        "subsample": trial.suggest_loguniform("subsample", 0.1, 1),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.1, 1),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 0.001, 50),
    }

    n_folds = 10
    mae = []
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1370)
    for i, (tr_, ts_) in enumerate(splitter.split(X__, y_class)):
        model1 = xgboost.XGBRegressor(n_estimators=3000, tree_method='gpu_hist', **params)

        # The held-out fold doubles as the early-stopping validation set.
        eval_set = [(X__[ts_, :], y[ts_])]
        model1.fit(X__[tr_, :], y[tr_], early_stopping_rounds=5, eval_metric='mae',
                   eval_set=eval_set, verbose=False)

        mae_ = model1.best_score
        print(f'{mae_} {i}')
        mae.append(mae_)

    return np.mean(np.array(mae))

In [None]:
# Search for the hyper-parameters that minimise the value returned by
# `objective`, using 200 trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=200)

Using the optimal hyper-parameters, I make the predictions and write the submission file.

In [None]:
n_folds = 10
repeat_ = 3
# One column of test-set predictions per (repeat, fold) model.
res = np.zeros((X_test__.shape[0], n_folds * repeat_))
mae = []

# Fixed hyper-parameters for the final models (presumably the best trial of
# the Optuna study -- verify).
# NOTE(review): `eta` and `learning_rate` are both set, to different values.
# XGBoost documents `eta` as an alias of `learning_rate`; confirm which of the
# two actually takes effect before relying on either number.
best_params = dict(
    n_estimators=3000,
    tree_method='gpu_hist',
    max_depth=19,
    eta=0.009747282892152175,
    learning_rate=0.002369858098148533,
    gamma=0.39999429394579983,
    subsample=0.7990451509767214,
    colsample_bytree=0.3476157291201921,
    reg_alpha=1.7168175901019114,
    min_child_weight=5,
)

for j in range(repeat_):
    # A different shuffle seed (0, 1, 4) for each repetition.
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=j**2)
    for i, (tr_, ts_) in enumerate(folds.split(X__, y_class)):
        model1 = xgboost.XGBRegressor(**best_params)

        # The held-out fold doubles as the early-stopping validation set.
        eval_set = [(X__[ts_, :], y[ts_])]
        model1.fit(X__[tr_, :], y[tr_], early_stopping_rounds=5, eval_metric='mae',
                   eval_set=eval_set, verbose=False)

        mae_ = model1.best_score
        print(mae_)
        mae.append(mae_)
        res[:, j * n_folds + i] = model1.predict(X_test__)

In [None]:
sample_submission_df = pd.read_csv(path + 'sample_submission.csv')
# Ensemble the fold models by taking the per-row median of their predictions.
# `res` is a NumPy array, which has no `.median` method, so np.median is used;
# the result is 1-D and is assigned directly as the column.
sample_submission_df['time_to_eruption'] = np.median(res, axis=1)
sample_submission_df.to_csv('7folds_optimized_dim_red4.csv', index=False)