In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier

import optuna

import warnings
warnings.filterwarnings("ignore")

Read all the required datasets

In [None]:
# Load the three competition tables in order: training data, test data and the
# sample submission that is later used as the template for the output file.
df, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-may-2021/train.csv",
        "/kaggle/input/tabular-playground-series-may-2021/test.csv",
        "/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv",
    )
)

Creating folds using stratified k-fold

In [None]:
# Assign every training row to one of five stratified folds (column "kfold").
# The shuffle is seeded so that fold membership, and therefore every downstream
# cross-validation score, is the same on each Restart & Run All. The seed matches
# the random_state=42 used for the models in this notebook.
df["kfold"] = -1
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
y = df.target
kf = StratifiedKFold(n_splits=5)
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    # v_ holds the positional indices of fold f's validation rows; because the
    # index was just reset to 0..n-1, positions and labels coincide and .loc is safe.
    df.loc[v_, "kfold"] = f

In [None]:
# Display (rows, columns) for the training frame and then the test frame.
tuple(frame.shape for frame in (df, test))

# CATBOOST

In [None]:
# Display the column index of the training frame (it now also contains the
# "kfold" column added when the folds were created).
df.columns

In [None]:
# Column names feature_0 ... feature_49; these are passed to the CatBoost Pools
# as the categorical features.
cat_features = ["feature_{}".format(idx) for idx in range(50)]

In [None]:
# CatBoost multiclass classifier; this single instance is refit on every fold
# in the cross-validation loop.
cat = CatBoostClassifier(
    loss_function="MultiClass",
    task_type="GPU",            # requires a GPU-enabled session
    iterations=3000,
    early_stopping_rounds=500,
    random_state=42,
    verbose=100,
)

In [None]:
# Cross-validate CatBoost over the folds stored in df["kfold"]: fit on the other
# folds, score the held-out fold with log loss, and accumulate the mean of the
# per-fold test-set probabilities in cat_pred.

# Fold ids come from the data, so the loop and the averaging divisor cannot
# drift apart from the number of folds that were actually created.
fold_ids = sorted(df["kfold"].unique())

# The test features are identical for every fold, so build them (and their Pool) once.
X_test = test.drop(["id"], axis=1)
test_pool = Pool(data=X_test, cat_features=cat_features)

logloss = []
cat_pred = 0
for f in fold_ids:

    # Split the data into this fold's training and validation rows
    train = df[df.kfold != f].reset_index(drop=True)
    valid = df[df.kfold == f].reset_index(drop=True)

    # Separate the features from the target; id and kfold are bookkeeping columns
    X_train = train.drop(["id", "target", "kfold"], axis=1)
    y_train = train.target
    X_valid = valid.drop(["id", "target", "kfold"], axis=1)
    y_valid = valid.target

    # Wrap both splits in Pools so the categorical columns are declared explicitly
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features)

    # Refit the shared model, using the validation pool as eval_set.
    # NOTE(review): the same fold serves as eval_set and is scored below, so the
    # reported loss may be slightly optimistic - confirm this is acceptable.
    cat.fit(train_pool, eval_set=valid_pool, verbose=100)

    # Predict class probabilities for the held-out fold and for the test set;
    # each fold contributes an equal share to the averaged test prediction.
    valid_preds = cat.predict_proba(valid_pool)
    cat_pred += cat.predict_proba(test_pool) / len(fold_ids)

    # Pass the model's class list so the probability columns are matched to the
    # labels even if a validation fold happened to miss a class.
    logloss.append(log_loss(y_valid, valid_preds, labels=cat.classes_))

print(logloss)
print(sum(logloss)/len(logloss))

# LGBM

In [None]:
# Cross-validate a default LightGBM classifier over the folds in df["kfold"]:
# fit on the other folds, score the held-out fold with log loss, and accumulate
# the mean of the per-fold test-set probabilities in lgbm_pred.
lgbm = LGBMClassifier(random_state=42)

# Fold ids come from the data, so the loop and the averaging divisor cannot
# drift apart from the number of folds that were actually created.
fold_ids = sorted(df["kfold"].unique())

# The test features are identical for every fold, so build them once.
X_test = test.drop(["id"], axis=1)

logloss = []
lgbm_pred = 0
for f in fold_ids:

    # Split the data into this fold's training and validation rows
    train = df[df.kfold != f].reset_index(drop=True)
    valid = df[df.kfold == f].reset_index(drop=True)

    # Separate the features from the target; id and kfold are bookkeeping columns
    X_train = train.drop(["id", "target", "kfold"], axis=1)
    y_train = train.target
    X_valid = valid.drop(["id", "target", "kfold"], axis=1)
    y_valid = valid.target

    # Refit the shared model on this fold's training rows
    lgbm.fit(X_train, y_train)

    # Predict class probabilities for the held-out fold and for the test set;
    # each fold contributes an equal share to the averaged test prediction.
    valid_preds = lgbm.predict_proba(X_valid)
    lgbm_pred += lgbm.predict_proba(X_test) / len(fold_ids)

    # Pass the model's class list so the probability columns are matched to the
    # labels even if a validation fold happened to miss a class.
    logloss.append(log_loss(y_valid, valid_preds, labels=lgbm.classes_))

print(logloss)
print(sum(logloss)/len(logloss))

# Blending

Taking the average of the predictions from both models

In [None]:
# Equal-weight blend of the two models: element-wise mean of their 2-D test
# probability arrays (indexed [row, class]). Transposing and converting to a
# list keeps the original layout - avg_pred[k] is the 1-D array of blended
# probabilities for class column k - without repeating one line per class.
avg_pred = list(((cat_pred + lgbm_pred) / 2).T)

In [None]:
# Copy the blended probabilities into the sample-submission frame, one class
# column at a time, then write the submission file.
for class_col, class_probs in zip(("Class_1", "Class_2", "Class_3", "Class_4"), avg_pred):
    ss[class_col] = class_probs
ss.to_csv("/kaggle/working/cat_lgbm_pred.csv", index=False)

Thank you