In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of each file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Maximum number of Optuna trials to run in the hyper-parameter search below
# (passed as `n_trials` to `study.optimize`).
n_Trials=5

In [None]:
# Load the training data with pre-assigned folds. Later cells read its `id`,
# `target` and `kfold` columns and use every remaining column as a model feature.
df=pd.read_csv('../input/train-folds/train_folds.csv')

In [None]:
import catboost as cb
import joblib
import lightgbm as lgb
import optuna
import sklearn
import xgboost as xg
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
def objective(trial, n_eval_folds=1):
    """Optuna objective: score one sampled CatBoost configuration by validation AUC.

    Samples a hyper-parameter set from ``trial``. For each evaluated fold it
    trains a ``CatBoostClassifier`` on the rows of the global ``df`` whose
    ``kfold`` differs from that fold and computes ROC AUC on the held-out rows.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.
    n_eval_folds : int, default 1
        Number of folds (``0 .. n_eval_folds - 1``) to evaluate on. The default
        scores on fold 0 only, which is what Optuna gets when it calls
        ``objective(trial)``.

    Returns
    -------
    float
        Mean ROC AUC over the evaluated folds (higher is better).

    Raises
    ------
    ValueError
        If ``n_eval_folds`` is smaller than 1.
    """
    if n_eval_folds < 1:
        raise ValueError("n_eval_folds must be at least 1")

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        'eval_metric' : 'AUC',
        'verbose' : 1000,
    }

    # These two parameters are only sampled for the bootstrap type they belong to.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    # Columns that are bookkeeping / label, not model inputs.
    non_feature_cols = ['target', 'id', 'kfold']

    fold_scores = []
    for fold in range(n_eval_folds):
        df_train = df[df.kfold != fold]
        df_valid = df[df.kfold == fold]

        x_train = df_train.drop(non_feature_cols, axis=1).values
        y_train = df_train.target.values

        x_valid = df_valid.drop(non_feature_cols, axis=1).values
        y_valid = df_valid.target.values

        # A fresh model per fold so no fitted state is carried between folds.
        clf = cb.CatBoostClassifier(**param)
        clf.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=100)

        # Probability of the positive class (second column of predict_proba).
        y_pred = clf.predict_proba(x_valid)[:, 1]
        fold_scores.append(roc_auc_score(y_valid, y_pred))

    # Averaging keeps the returned value an AUC regardless of how many folds ran.
    return sum(fold_scores) / len(fold_scores)

In [None]:
# Run the hyper-parameter search. `objective` returns a ROC AUC, so the study
# maximizes it. The search ends after `n_Trials` trials or once the 600 timeout
# (seconds, per Optuna's `Study.optimize` docs) is reached, whichever comes first.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=n_Trials, timeout=600)

In [None]:
# Display the best hyper-parameters found by the study; `run_folds` below reuses them.
study.best_params

In [None]:
# Count the distinct fold labels present in the `kfold` column.
# NOTE(review): the loop over range(n_folds) below presumably relies on the
# labels being 0..n_folds-1 — confirm against how train_folds.csv was built.
n_folds = df['kfold'].nunique()
print(n_folds)

In [None]:
def run_folds(df, fold, params=None):
    """Train CatBoost on all folds except ``fold`` and predict the held-out fold.

    The fitted model is saved with ``joblib.dump`` to a file named
    ``'model_catbclf' + str(fold)`` (relative to the current working directory)
    and the fold's ROC AUC is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``id``, ``target`` and ``kfold`` columns; every other column
        is used as a feature.
    fold : int
        Value of ``kfold`` to hold out for validation and prediction.
    params : dict, optional
        CatBoost parameters. When omitted, the global ``study.best_params`` is
        used. The mapping is copied before ``eval_metric`` and ``verbose`` are
        added, so the caller's dict is never modified.

    Returns
    -------
    pandas.DataFrame
        Held-out rows with columns ``id``, ``target``, ``kfold`` and
        ``catbclf_pred`` (predicted probability of the positive class).
    """
    # Columns that are bookkeeping / label, not model inputs.
    non_feature_cols = ['target', 'id', 'kfold']

    df_train = df[df.kfold != fold]
    df_test = df[df.kfold == fold]

    x_train = df_train.drop(non_feature_cols, axis=1).values
    y_train = df_train.target.values

    x_test = df_test.drop(non_feature_cols, axis=1).values
    y_test = df_test.target.values

    param = dict(study.best_params if params is None else params)
    param['eval_metric'] = 'AUC'
    param['verbose'] = 1000

    clf = cb.CatBoostClassifier(**param)
    # NOTE(review): verbose is given both here (0) and in the constructor (1000);
    # presumably the fit-time value wins — confirm against the CatBoost docs.
    clf.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=0, early_stopping_rounds=100)

    # Probability of the positive class (second column of predict_proba).
    y_pred = clf.predict_proba(x_test)[:, 1]
    fold_auc = roc_auc_score(y_test, y_pred)
    print(f"Fold={fold}, roc_auc_score={fold_auc}")

    File_name = 'model_catbclf' + str(fold)
    joblib.dump(clf, File_name)

    # Build the result as a new frame instead of assigning into the filtered
    # slice of `df`, which is what triggered pandas' SettingWithCopyWarning.
    return df_test[['id', 'target', 'kfold']].assign(catbclf_pred=y_pred)


# Train one model per fold and stack the per-fold held-out predictions
# into a single frame.
dfs = [run_folds(df, fold_id) for fold_id in range(n_folds)]
df_pred = pd.concat(dfs)

print(df_pred.shape)

In [None]:
# Save the per-fold held-out predictions as CSV (relative path, index omitted).
df_pred.to_csv("catb_clf_predictions.csv",index=False)

In [None]:
# Preview the first rows of the stacked held-out predictions.
df_pred.head()

In [None]:
# Preview the first rows of the training frame loaded from train_folds.csv.
df.head()