In [None]:
import copy
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import log_loss # Evaluation metrics
from sklearn.model_selection import StratifiedKFold # For creating folds

In [None]:
# Walk the Kaggle input mount and print the full path of every file found,
# so the dataset file names used by the loading cell can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Reading the train, test and sample submission files

In [None]:
# Single place to change when the notebook is pointed at another copy of the data.
DATA_DIR = "/kaggle/input/tabular-playground-series-may-2021"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
ss = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")  # sample submission frame

In [None]:
# Report (rows, columns) for each of the three loaded frames, one per line.
shape_report = [
    f"Shape of train : {train.shape}",
    f"Shape of test : {test.shape}",
    f"Shape of sample submission : {ss.shape}",
]
print("\n".join(shape_report))

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Fraction of missing values per training column (mean of the boolean NA mask).
train.isna().mean()

In [None]:
# Fraction of missing values per test column (mean of the boolean NA mask).
test.isna().mean()

There are **no missing values** in either the train or the test dataset.

In [None]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

**Since this is a baseline/starter model, I am skipping EDA and moving directly on to the model-building part.**

# Baseline model

Defining a variable with all the categorical features to pass to catboost classifier

In [None]:
# Column names feature_0 ... feature_49, passed to the CatBoost pools as
# the categorical feature list.
cat_features = [f"feature_{idx}" for idx in range(50)]

Creating folds for the train dataset so that we can train and validate the model across the n folds and guard against overfitting to a single train/validation split.

In [None]:
# Shuffle the rows once with a fixed seed so that fold membership (and the
# cross-validation scores that depend on it) is the same on every run.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

# -1 marks "not yet assigned"; every row gets a fold id in the loop below.
train["kfold"] = -1
y = train.target
kf = StratifiedKFold(n_splits=5)

# Only the validation indices of each split are needed: a row's fold id is
# the split in which it is held out.
for f, (_, v_) in enumerate(kf.split(X=train, y=y)):
    train.loc[v_, "kfold"] = f

assert (train["kfold"] >= 0).all(), "some rows were not assigned to a fold"

In [None]:
# Settings for the baseline multi-class CatBoost model, collected in one
# place and unpacked into the constructor.
catboost_params = {
    "task_type": 'GPU',
    "iterations": 1000,
    "loss_function": 'MultiClass',
    "random_state": 42,
    "verbose": 100,
}
cat = CatBoostClassifier(**catboost_params)

Training and evaluating the model with 5-fold cross-validation.

In [None]:
# Cross-validated training: for every fold id in `kfold`, fit on the other
# folds, score on the held-out fold, and predict the test set.
df = train.copy()  # independent copy of the fold-annotated training frame
logloss = []       # validation log loss, one entry per fold
preds = []         # test-set class probabilities, one array per fold

# The test features are identical for every fold, so build them once.
X_test = test.drop(["id"], axis=1)

# Columns that are identifiers / labels / bookkeeping rather than features.
non_feature_cols = ["id", "target", "kfold"]

n_folds = df["kfold"].nunique()
for f in range(n_folds):  # Looping over the folds

    # Fold-local names on purpose: rebinding `train` here would leave the
    # global frame holding only part of the rows after the loop, and a
    # second run of this cell would then work on the wrong data.
    fold_train = df[df.kfold != f].reset_index(drop=True)
    fold_valid = df[df.kfold == f].reset_index(drop=True)

    # Feature matrices and targets for this fold
    X_train = fold_train.drop(non_feature_cols, axis=1)
    y_train = fold_train.target
    X_valid = fold_valid.drop(non_feature_cols, axis=1)
    y_valid = fold_valid.target

    # CatBoost pools carry the data together with the categorical column list
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features)

    # NOTE(review): the same `cat` instance is fitted again on every fold;
    # presumably each fit() call restarts training from scratch - confirm.
    cat.fit(train_pool, eval_set=valid_pool, verbose=100)

    # Class probabilities for the held-out fold and for the test set
    valid_preds = cat.predict_proba(X_valid)
    preds.append(cat.predict_proba(X_test))

    # Score this fold
    logloss.append(log_loss(y_valid, valid_preds))

print(logloss)
print(sum(logloss)/len(logloss))

**The average log loss is 1.0926706588358275**

Now we have predictions from 5 models, one from each fold. We are going to take the average of these. The result will be more reliable since we have predicted in a cross-validated manner.

Let's check the shape of preds.

In [None]:
# Shape of the stacked per-fold test predictions.
np.asarray(preds).shape

We have five two-dimensional arrays in the shape of the sample submission. Let's take their average and write it to the sample submission to create a submission file.

In [None]:
# Average the per-fold prediction arrays element-wise over the fold axis
# (axis 0 of the stacked array). The result keeps every row of the
# predictions and works for any number of folds or test rows.
avg_pred = np.mean(np.asarray(preds), axis=0)

`avg_pred` holds the average of the 5 predictions. Let's write it to the sample submission (`ss`).

In [None]:
# Mean of the per-fold test probabilities over the fold axis, computed
# straight from `preds` so every test row gets its own averaged value.
avg_pred = np.mean(np.asarray(preds), axis=0)

# One probability column per class.
# NOTE(review): assumes the predict_proba columns are ordered
# Class_1..Class_4 - confirm against `cat.classes_`.
for col_idx, class_col in enumerate(["Class_1", "Class_2", "Class_3", "Class_4"]):
    ss[class_col] = avg_pred[:, col_idx]

In [None]:
# Save the submission frame as CSV, leaving out the pandas row index.
ss.to_csv(path_or_buf="base_line_sub.csv", index=False)

# Next steps

This is very basic code and has a lot of room for improvement.
1. We can check for outliers or do feature engineering to adjust the data.
2. We can tune the hyper parameters for the CatBoostClassifier to get better scores.

Thank you.