In [None]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import XGBRegressor

In [None]:
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [None]:
# pd.set_option('display.max_columns',None) 
# pd.set_option('display.max_rows', None)

**Loading Data**

In [None]:
# Input files: the pre-folded training set (its Kfold column drives the
# cross-validation split further down), the competition test set, and the
# sample submission that provides the row ids for the exported predictions.
TRAIN_FOLDS_CSV = '../input/tps-aug-2021-train-with-folds/train_folds.csv'
TEST_CSV = '/kaggle/input/tabular-playground-series-aug-2021/test.csv'
SAMPLE_SUBMISSION_CSV = '/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv'

# Read the three frames in the same order as the paths are listed above.
df, df_test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_FOLDS_CSV, TEST_CSV, SAMPLE_SUBMISSION_CSV)
)

In [None]:
# Columns of the training frame that are not model inputs: the row
# identifier, the regression target and the fold assignment.
NON_FEATURE_COLUMNS = ('id', 'loss', 'Kfold')

# Every remaining training column is a feature, kept in its original order.
features = [name for name in df.columns if name not in NON_FEATURE_COLUMNS]

# Restrict the test frame to exactly those feature columns (same order).
df_test = df_test[features]

In [None]:
# Shared training constants.
SEED = 27             # random seed passed to the LightGBM / CatBoost configs below
N_ESTIMATORS = 10000  # boosting-round budget for the LightGBM / CatBoost configs
VERBOSE = 1000        # logging interval; only the disabled LightGBM cell references it

# LightGBM configuration (consumed by the disabled LGBMRegressor cell).
lgb_params = dict(
    objective='regression',
    metric='rmse',
    n_estimators=N_ESTIMATORS,
    random_state=SEED,
    learning_rate=0.005,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.6,
    reg_alpha=6.4,
    reg_lambda=1.8,
    min_child_weight=256,
    min_child_samples=20,
    importance_type='gain',
)

# CatBoost configuration. Not referenced by any cell visible in this
# notebook; note that it requests GPU training via task_type.
ctb_params = dict(
    bootstrap_type='Poisson',
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=SEED,
    task_type='GPU',
    max_depth=8,
    learning_rate=0.005,
    n_estimators=N_ESTIMATORS,
    max_bin=280,
    min_data_in_leaf=64,
    l2_leaf_reg=0.01,
    subsample=0.8,
)

# xgb_params = {
#     'objective': 'reg:squarederror',
#     'learning_rate': 5e-3,
#     'seed': SEED,
#     'subsample': 0.8,
#     'colsample_bytree': 0.6,
#     'n_estimators': N_ESTIMATORS,
#     'max_depth': 11,
#     'alpha': 20,
#     'lambda': 9,
#     'min_child_weight': 256,
#     'importance_type': 'total_gain',
    
# }



In [None]:
# Model hyperparameters
#  xgb_params = {'n_estimators': 10000,
#               'learning_rate': 0.35,
#               'subsample': 0.926,
#               'colsample_bytree': 0.84,
#               'max_depth': 2,
#               'booster': 'gbtree', 
#               'reg_lambda': 35.1,
#               'reg_alpha': 34.9, 'random_state':27,
#               'n_jobs': 4}
# Active XGBoost configuration; it is unpacked into XGBRegressor(**xgb_params)
# by the cross-validation cell. The commented-out dicts around it are earlier
# candidate settings.
xgb_params = dict(
    n_estimators=2000,
    subsample=0.6,
    colsample_bytree=0.9,
    eta=0.007939812697028495,
    reg_alpha=46,
    reg_lambda=64,
    max_depth=12,
    min_child_weight=20,
    random_state=27,
    n_jobs=4,
)
# xgb_params = {'colsample_bytree': 0.8413485408956082,
#               'gamma': 7.169901458931625,
#               'learning_rate': 0.41588056022137915,
#               'max_depth': 12,'min_child_weight': 7.0,
#               'n_estimators': 10000,'random_state': 21,
#               'reg_alpha': 72,'reg_lambda': 55,'subsample': 0.8772846596931277}

In [None]:
# Cross-validated XGBoost run.
#
# For every fold id in df.Kfold the model is trained on the other folds and
# then used to predict (a) the held-out fold, giving out-of-fold predictions
# keyed by row id, and (b) the full test set. Afterwards the cell writes:
#   * train_pred_1.csv - out-of-fold predictions, columns id / pred_1
#   * test_pred_1.csv  - fold-averaged test predictions, columns id / pred_1
#
# The cell is safe to re-run: every accumulator is reset here and the shared
# `sample_sub` frame is only read, never modified.
N_FOLDS = 10  # fold ids 0 .. N_FOLDS-1 are looked up in df.Kfold

scaler = StandardScaler()

preds = []           # one test-set prediction vector per fold
scores = []          # validation RMSE per fold
oof_pred_by_id = {}  # row id -> out-of-fold prediction

for fold in range(N_FOLDS):
    train_rows = df[df.Kfold != fold].reset_index(drop=True)
    valid_rows = df[df.Kfold == fold].reset_index(drop=True)

    valid_ids = valid_rows.id.values.tolist()
    y_train, y_valid = train_rows.loss, valid_rows.loss

    # The scaler is re-fitted on this fold's training rows only, so no
    # statistics from the validation fold or the test set leak into it.
    X_train = scaler.fit_transform(train_rows[features])
    X_valid = scaler.transform(valid_rows[features])
    X_test = scaler.transform(df_test[features])

    model = XGBRegressor(**xgb_params)
#                          tree_method='gpu_hist',
#                          gpu_id=0, predictor="gpu_predictor")

    # NOTE(review): recent XGBoost releases expect eval_metric and
    # early_stopping_rounds on the estimator constructor rather than on
    # fit(); confirm against the installed version before upgrading.
    model.fit(X_train, y_train, verbose=False,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric='rmse', early_stopping_rounds=100)

    pred_valid = model.predict(X_valid)
    pred_test = model.predict(X_test)

    preds.append(pred_test)
    oof_pred_by_id.update(dict(zip(valid_ids, pred_valid)))

    # RMSE as sqrt(MSE) with arguments in (y_true, y_pred) order. This avoids
    # the `squared=False` keyword, which newer scikit-learn no longer accepts,
    # and yields the same number.
    rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# Out-of-fold predictions: one row per training id, in fold order.
final_pred_valid = pd.DataFrame.from_dict(oof_pred_by_id, orient='index').reset_index()
final_pred_valid.columns = ['id', 'pred_1']
final_pred_valid.to_csv('train_pred_1.csv', index=False)

# Fold-averaged test predictions go into a fresh frame. The previous version
# assigned `sample_sub.loss` and then renamed sample_sub's columns in place;
# on a second run there is no `loss` column any more, so the assignment only
# set a Python attribute (the pandas warning is suppressed in this notebook)
# and stale predictions were written out.
test_pred = pd.DataFrame({
    "id": sample_sub.iloc[:, 0].values,  # first column holds the row ids
    "pred_1": np.mean(np.column_stack(preds), axis=1),
})
test_pred.to_csv("test_pred_1.csv", index=False)

In [None]:
# --- Disabled cell: LightGBM variant of the cross-validation loop ----------
# Everything below is commented out and does not run. It follows the same
# steps as the XGBoost cell but fits LGBMRegressor(**lgb_params) and writes
# the out-of-fold predictions as column pred_2 to train_pred_2.csv.
# NOTE(review): the last three lines still use the "pred_1" column name and
# the "test_pred_1.csv" file name, while the validation output uses
# "pred_2" / "train_pred_2.csv". This looks like a copy-paste leftover; if
# the cell is re-enabled as is, it would overwrite the XGBoost test
# predictions. Confirm and rename to pred_2 / test_pred_2.csv.
# NOTE(review): it also assigns sample_sub.loss and then renames the columns
# of sample_sub in place, so it is not safe to run more than once per kernel.
# ---------------------------------------------------------------------------
# preds = []
# scores = []
# final_pred_valid ={}
# for fold in range(10):
#     X_train = df[df.Kfold != fold].reset_index(drop=True)
#     X_valid = df[df.Kfold == fold].reset_index(drop=True)
#     X_test = df_test.copy()
    
#     valid_ids = X_valid.id.values.tolist()
    
#     y_train, y_valid = X_train.loss, X_valid.loss
#     X_train, X_valid = X_train[features],X_valid[features]
    
#     X_train = scaler.fit_transform(X_train)
#     X_valid = scaler.transform(X_valid)
#     X_test  = scaler.transform(X_test)
    
#     model = LGBMRegressor(**lgb_params)
        
#     model.fit(X_train, y_train, verbose=VERBOSE,
#              eval_set=[(X_train,y_train),(X_valid, y_valid)],
#              eval_metric='rmse', early_stopping_rounds=100)
    
#     pred_valid = model.predict(X_valid)
#     pred_test = model.predict(X_test)
    
#     preds.append(pred_test)
#     final_pred_valid.update(dict(zip(valid_ids, pred_valid)))
    
#     rmse = mean_squared_error(pred_valid, y_valid, squared=False)
#     print(fold, rmse)
#     scores.append(rmse)
    
# print(np.mean(scores), np.std(scores))
# final_pred_valid = pd.DataFrame.from_dict(final_pred_valid, orient='index').reset_index()
# final_pred_valid.columns = ['id','pred_2']
# final_pred_valid.to_csv('train_pred_2.csv', index=False)

# sample_sub.loss = np.mean(np.column_stack(preds), axis=1)
# sample_sub.columns = ["id", "pred_1"]
# sample_sub.to_csv("test_pred_1.csv", index=False)

In [None]:
# --- Disabled cell: export of second-model predictions ---------------------
# These commented-out lines repeat the export statements that close the
# disabled LightGBM cell; nothing here runs.
# NOTE(review): validation output is named pred_2 / train_pred_2.csv but the
# test output is still pred_1 / test_pred_1.csv -- likely a copy-paste
# leftover; confirm the intended names before re-enabling.
# ---------------------------------------------------------------------------
# final_pred_valid = pd.DataFrame.from_dict(final_pred_valid, orient='index').reset_index()
# final_pred_valid.columns = ['id','pred_2']
# final_pred_valid.to_csv('train_pred_2.csv', index=False)

# sample_sub.loss = np.mean(np.column_stack(preds), axis=1)
# sample_sub.columns = ["id", "pred_1"]
# sample_sub.to_csv("test_pred_1.csv", index=False)

In [None]:
# final_prediction = np.mean(np.column_stack(preds),axis=1)

In [None]:
# sample_sub.loss = final_prediction
# sample_sub.to_csv("submission.csv", index=False)