In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for input_file in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the competition CSV files, relative to the working directory.
TRAIN_PATH, TEST_PATH = (
    "../input/tabular-playground-series-may-2021/train.csv",
    "../input/tabular-playground-series-may-2021/test.csv",
)

In [None]:
def load_csv(file_path):
    """Read a CSV into a pandas DataFrame.

    Args:
        file_path: Anything ``pd.read_csv`` accepts as its first argument.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(file_path)

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import log_loss
import time


def calculate_score_and_time(model,X,y,scoring=None):
    """Cross-validate ``model`` and report how long the evaluation took.

    Uses repeated stratified k-fold cross-validation (5 splits, 2 repeats,
    ``random_state=42``) and runs the folds on all available cores.

    Args:
        model: Estimator to evaluate.
        X: Feature matrix.
        y: Target labels.
        scoring: Optional scorer forwarded to ``cross_val_score``. The default
            ``None`` keeps the previous behaviour (the estimator's default
            scorer); pass e.g. ``"neg_log_loss"`` to score with the metric
            this notebook uses for its hold-out evaluation.

    Returns:
        Tuple ``(elapsed_seconds, mean_score)``.
    """
    cv = RepeatedStratifiedKFold(n_splits=5,n_repeats=2,random_state=42)
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    scores = cross_val_score(model,X,y,scoring=scoring,cv=cv,n_jobs=-1)
    elapsed = time.perf_counter() - start
    return elapsed,np.mean(scores)

In [None]:
# Integer codes for the string class labels. They are zero-based because
# recent XGBoost releases reject classification targets that do not start at 0.
# The classes keep their relative order, so predict_proba columns still run
# Class_1 .. Class_4.
CLASS_TO_INT = {"Class_1":0,"Class_2":1,"Class_3":2,"Class_4":3}

train = pd.read_csv(TRAIN_PATH)
train["target"] = train["target"].map(CLASS_TO_INT)
# .map() silently turns unknown labels into NaN; fail here rather than let
# NaN targets reach the models.
assert train["target"].notna().all(), "unexpected class label in train['target']"

# Features are every column except the row identifier and the label.
X = train.drop(columns=["id","target"])
y = train.target

In [None]:
# Candidate classifiers keyed by a short label, in the order they are benchmarked.
candidates = {
    "hgbc": HistGradientBoostingClassifier(),
    "xgbc": XGBClassifier(),
    "cbc": CatBoostClassifier(verbose=False),
    "lgbm": LGBMClassifier(),
}
model_names = list(candidates)
models = list(candidates.values())

# scores maps each label to the (elapsed time, mean score) tuple returned by
# calculate_score_and_time.
scores = {}
for model_name, model in candidates.items():
    print("Running for {}".format(model_name))
    scores[model_name] = calculate_score_and_time(model, X, y)
    delta_time, mean_score = scores[model_name]
    print("Time:{} | Mean score: {}".format(delta_time, mean_score))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
def return_log_loss_catboost(params,x_train,x_test,y_train,y_test):
    """Fit a CatBoost classifier and score it with log loss on ``x_test``.

    Args:
        params: Keyword arguments used to construct ``CatBoostClassifier``.
        x_train: Features the classifier is fitted on.
        x_test: Features passed to ``fit`` in the evaluation set and then used
            for the scored predictions.
        y_train: Labels the classifier is fitted on.
        y_test: Labels paired with ``x_test``.

    Returns:
        ``log_loss`` of the predicted class probabilities against ``y_test``.
    """
    classifier = CatBoostClassifier(**params)
    evaluation_pairs = [(x_test, y_test)]
    classifier.fit(x_train, y_train, eval_set=evaluation_pairs)
    probabilities = classifier.predict_proba(x_test)
    return log_loss(y_test, probabilities)

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: hold-out log loss of a CatBoost model for one trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The value returned by ``return_log_loss_catboost`` for the sampled
        parameters on the module-level ``x_train``/``x_test``/``y_train``/
        ``y_test`` split.
    """
    params = {
        # suggest_float replaces the deprecated suggest_uniform and
        # suggest_loguniform; ranges and parameter names are unchanged.
        "learning_rate": trial.suggest_float("learning_rate",0.03,0.3),
        "iterations": trial.suggest_int("iterations",500,1200),
        "depth": trial.suggest_int("depth",3,10),
        "verbose": False,
        "random_seed": 42,
        #"subsample": trial.suggest_float("subsample",0.2,1.0),
        # Sampled on a log scale. The Optuna parameter is still called
        # 'reg_lambda', so best_trial.params reports it under that name.
        "l2_leaf_reg": trial.suggest_float('reg_lambda' , 1e-8 , 30, log=True)
    }
    return return_log_loss_catboost(params,x_train,x_test,y_train,y_test)

In [None]:
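# NOTE: the Optuna search below is commented out (presumably to avoid
# re-running 300 trials on every execution). Uncomment it to repeat the search.
# study = optuna.create_study(direction="minimize")
# study.optimize(objective,n_trials=300)
# best = study.best_trial
# best.params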

In [None]:
# Hyperparameters for the final CatBoost model.
params = {
    "learning_rate": 0.12259970474275227,
    "verbose": False,
    "iterations": 800,
    "depth": 4,
    "reg_lambda": 29.369429883581308,
}

# Fit on the training split, then report log loss on the held-out split.
cbc = CatBoostClassifier(**params)
cbc.fit(x_train, y_train)
preds = cbc.predict_proba(x_test)
holdout_loss = log_loss(y_test, preds)
print(holdout_loss)

In [None]:
# Load the competition test set and preview its first five rows.
test = pd.read_csv(filepath_or_buffer=TEST_PATH)
test.head(n=5)

In [None]:
# Predict class probabilities for the test set; "id" is dropped before
# prediction because it is not a feature column.
preds = cbc.predict_proba(test.drop(columns=["id"]))
# Reuse test's index so the join below lines up row for row even if test does
# not carry a default RangeIndex.
preds_df = pd.DataFrame(preds,columns=["Class_1","Class_2","Class_3","Class_4"],index=test.index)

# Only "id" is kept from the test frame. Deriving the dropped columns from the
# frame avoids hard-coding the number of feature columns.
drop_cols = [col for col in test.columns if col != "id"]
sub = test.drop(columns=drop_cols)

submission = sub.join(preds_df)
submission.to_csv("./submission.csv",index=False)

# Preview a few rows instead of printing the entire frame.
submission.head()