In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
import catboost as cb
import optuna
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

In [None]:
# Read the competition's training rows, test rows and sample submission.
train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-feb-2021/train.csv'
)
test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-feb-2021/test.csv'
)
# The sample submission uses its 'id' column as the index.
submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv',
    index_col='id',
)

In [None]:
# Stack train on top of test (row-wise) so both go through the same preprocessing.
data = pd.concat(objs=[train, test], axis=0)

In [None]:
# scaler = StandardScaler()
# data.iloc[:,11:-1]= scaler.fit_transform(data.iloc[:,11:-1])

In [None]:

# One-hot encode the combined frame, then cut it back into its train and test parts.
# `train` still refers to the frame read from train.csv at this point, so its length
# gives the split position instead of a hard-coded row count.
n_train_rows = len(train)
data = pd.get_dummies(data)
train = data.iloc[:n_train_rows]   # first n_train_rows rows came from the training file
test = data.iloc[n_train_rows:]    # the remaining rows came from the test file
del data  # the combined frame is not used after this point

In [None]:
# Target vector and feature matrix for training; 'id' and 'target' are not used as features.
y = train['target']
X = train.drop(columns=['id', 'target'])
# Give the test frame the same feature columns as X.
test = test.drop(columns=['id', 'target'])

In [None]:
# Seeded hold-out split: 30% of the rows go to the validation part.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=23,
)

In [None]:
# Column labels from position 14 onward are the ones handed to CatBoost as categorical.
# NOTE(review): presumably the first 14 columns of X are the continuous features — confirm.
categ_features = X.columns[14:]

In [None]:
def objective(trial):
    """Optuna objective: fit CatBoost with sampled hyperparameters and score it on a hold-out split.

    Args:
        trial: the Optuna trial object used to sample hyperparameters.

    Returns:
        float: root-mean-squared error of the fitted model on the validation
        split (lower is better).
    """
    # Seed the split so every trial is scored on the same hold-out rows; with an
    # unseeded split, trial-to-trial differences mix split noise with parameter effects.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.3, random_state=23
    )
    params = {
        'cat_features': categ_features,
        'eval_metric': 'RMSE',
        'loss_function': 'RMSE',
        'random_state': 23,
        'use_best_model': True,
        'task_type': 'GPU',
        'iterations': trial.suggest_int('iterations', 100, 10000),
        'depth': trial.suggest_int('depth', 2, 16),
        # suggest_float(..., log=True) samples the same log-uniform range as the
        # deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1.0, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 100, log=True),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 1e-2, 10.0, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    model = cb.CatBoostRegressor(**params)

    # The validation split doubles as the early-stopping / best-model evaluation set.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0, early_stopping_rounds=200)
    predictions = model.predict(X_valid)
    # mean_squared_error yields the MSE; take the square root so the returned score
    # is an RMSE, matching eval_metric/loss_function above.
    score = float(np.sqrt(mean_squared_error(y_valid, predictions)))
    return score

In [None]:
%%time
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=40)

In [None]:
# Report the best trial: its objective value, then each tuned hyperparameter.
# NOTE(review): the label says RMSE; that is only accurate if `objective` returns a
# root-mean-squared error — confirm.
trial = study.best_trial
print(f" best RMSE: {trial.value}")
print("best params: ")
if trial.params:
    print("\n".join(f"    {key}: {value}" for key, value in trial.params.items()))

In [None]:
# %%time
# params1 = { 
#     'iterations': 7827,
#     'depth': 6, 
#     'learning_rate': 0.06115296942277834, 
#     'l2_leaf_reg': 0.00018537808841101856, 
#     'bagging_temperature': 7.2769130916283125, 
#     'min_child_samples': 95
# }


# model_cb = cb.CatBoostRegressor(**params1, cat_features=categ_features ,
#                                 loss_function='RMSE', eval_metric='RMSE')

# model_cb.fit(X, y, cat_features=categ_features)
# submission['target'] = model_cb.predict(test)
# submission.to_csv('catboost4.csv')

In [None]:
# %%time
# params2 = {
#     'iterations': 6961, 
#     'depth': 13, 
#     'learning_rate': 0.1420713844582006, 
#     'l2_leaf_reg': 0.0019005856806734837, 
#     'bagging_temperature': 5.060337163809838, 
#     'min_child_samples': 49
# }


# model_cb = cb.CatBoostRegressor(**params2, cat_features=categ_features ,
#                                 loss_function='RMSE', eval_metric='RMSE')

# model_cb.fit(X, y, cat_features=categ_features)
# submission['target'] = model_cb.predict(test)
# submission.to_csv('catboost3.csv')

In [None]:
# K-fold cross-validation with fixed CatBoost hyperparameters.
# Each fold's model predicts its held-out rows (stored in `train_oof`) and the
# test frame (averaged over the folds into `test_preds`).
n_folds = 10
train_oof = np.zeros(len(X))  # sized from the data rather than a hard-coded row count
test_preds = 0

# The hyperparameters are identical for every fold, so the dict is built once.
# NOTE(review): unlike `objective`, no cat_features / task_type are passed here —
# confirm this is intended.
cat_params = {
    'iterations': 7827,
    'depth': 6,
    'learning_rate': 0.06115296942277834,
    'l2_leaf_reg': 0.00018537808841101856,
    'bagging_temperature': 7.2769130916283125,
    'min_child_samples': 95,
}

# Plain (non-stratified) shuffled K-fold, seeded so fold assignment is repeatable.
skf = KFold(n_splits=n_folds, random_state=23, shuffle=True)

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):

    print(f'FOLD    {fold+1}')

    # `test_index` holds this fold's validation rows (not rows of the `test` frame).
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = cb.CatBoostRegressor(**cat_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=500,
        early_stopping_rounds=200,
    )
    preds = model.predict(X_valid)
    train_oof[test_index] = preds
    # Each fold contributes an equal share to the averaged test prediction.
    test_preds += model.predict(test) / n_folds
    print("")


# mean_squared_error returns the MSE; take the square root so the printed value
# really is the RMSE the label promises.
print(f": RMSE = {np.sqrt(mean_squared_error(y, train_oof))}")

submission['target'] = test_preds
submission.to_csv('catboost_optuna_scaled.csv')