# Load Packages

In [None]:
import os
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from scipy.stats import norm
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier


%matplotlib inline
pd.set_option('display.max_columns', None)   # for showing all columns of dataset
#pd.set_option('display.max_rows', None)     # for showing all rows of dataset

# Load Dataset

In [None]:
# Read the competition's train and test tables (in that order) from the Kaggle input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-may-2021/train.csv',
        "/kaggle/input/tabular-playground-series-may-2021/test.csv",
    )
)

In [None]:
# Dimensions of each frame as (rows, columns): train on the first line, test on the second
print(train.shape, test.shape, sep='\n')

In [None]:
# Preview the first rows of the train dataset
train.head()     

In [None]:
# Summary of the train dataset: columns, non-null counts and dtypes
train.info()    # Information about train dataset

As we can see, the train dataset has 100000 rows and 52 columns (`id`, 50 features and `target`). All columns have the `int64` datatype except the target variable, whose datatype is `object`.




In [None]:
# The id column is not useful as a feature, so we drop it.
# Rebinding the names (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell safe to re-run: a second execution is a no-op, not a KeyError.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

# EDA

In [None]:
# Summary statistics of the train dataset, one row per column:
# a bar for the mean, colour gradients for std, median (50%) and max
(
    train.describe()
    .transpose()
    .style
    .bar(subset=['mean'])
    .background_gradient(subset=['std'])
    .background_gradient(subset=['50%'])
    .background_gradient(subset=['max'])
)

- **count** : Number of non-null values of the feature.
- **mean**  : The average value of the feature.
- **std**   : Standard deviation. It measures the spread of a data distribution: the more spread out the distribution is, the greater its standard deviation.
- **min**   : Minimum value of the feature.
- **25%**   : 25th percentile (first quartile): 25% of the values are at or below it.
- **50%**   : 50th percentile (median): half of the values are at or below it.
- **75%**   : 75th percentile (third quartile): 75% of the values are at or below it.
- **max**   : Maximum value of the feature.

You may wonder why we need these. Together they give us the basic statistical information about each feature, which will be very helpful in our EDA.


In [None]:
# Same styled summary statistics, this time for the test dataset
(
    test.describe()
    .transpose()
    .style
    .bar(subset=['mean'])
    .background_gradient(subset=['std'])
    .background_gradient(subset=['50%'])
    .background_gradient(subset=['max'])
)

In [None]:
# compare between train and test dataset

def diff_color(x):
    """Return a CSS colour rule for a signed difference.

    Negative values map to red, positive values to green, and everything else
    (zero, or a value that compares as neither) to black.
    """
    if x < 0:
        return 'color: red'
    if x > 0:
        return 'color: green'
    return 'color: black'

# Train-minus-test difference of the summary statistics, restricted to the
# columns of `test`; after transposing, the first statistic column is skipped.
# Mean/std differences are drawn as diverging bars, min/max are coloured by sign.
(
    train.describe()
    .sub(test.describe())
    .loc[:, test.columns]
    .transpose()
    .iloc[:, 1:]
    .style
    .bar(subset=['mean', 'std'], align='mid', color=['#d65f5f', '#5fba7d'])
    .applymap(diff_color, subset=['min', 'max'])
)

In [None]:
# Number of null values in each column of the train dataset
train.isnull().sum()

As we can see, there are no null values.




**Let's look at the distribution of the target.**

In [None]:
# Class balance of the target variable: a bar per class, then the exact counts.
# The column is passed by keyword because newer seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x='target', data=train)
train['target'].value_counts()

In [None]:
# Distribution of every feature with respect to the target variable:
# one panel per feature, one density curve per target class.
feature_columns = train.columns.drop('target')
class_labels = sorted(train['target'].unique())   # derived from the data, not hard-coded

num_cols = 5
# Ceiling division so the grid always has room for every feature.
num_rows = max(1, -(-len(feature_columns) // num_cols))
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols,
                       figsize=(20, 3 * num_rows), squeeze=False)

for ax, column in zip(axes.flat, feature_columns):
    for class_label in class_labels:
        sns.kdeplot(x=train.loc[train['target'] == class_label, column],
                    fill=True,   # `shade=` is the deprecated spelling of this option
                    label=class_label, ax=ax)

# Classes are drawn in the same order (hence the same colours) on every panel,
# so a single legend on the first panel is enough.
axes.flat[0].legend()

# Hide panels left over when the features do not fill the last row.
for ax in axes.flat[len(feature_columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

**Correlation matrix**

In [None]:
# Pairwise correlation of the numeric columns, shown as a lower-triangle heatmap.
# The matrix is computed once and reused. Non-numeric columns (such as a target
# that is still string-valued) are excluded explicitly, because recent pandas
# versions raise on them instead of silently skipping them.
corr = train.select_dtypes(include='number').corr()

# True cells are hidden: mask the upper triangle including the diagonal.
# The builtin `bool` replaces the `np.bool` alias that was removed from NumPy.
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(10 , 10))

sns.heatmap(corr,
        square=True, center=0, linewidth=0.2,
        cmap='Reds',
        mask=mask, ax=ax)

ax.set_title('Feature Correlation', loc='center', fontweight='bold')
plt.show()

As we can see, the features are not strongly correlated with each other, so all of them can be kept as candidates for feature selection. Strongly inter-correlated features carry redundant signal, which can make other relationships appear underrated.

# Feature Engineering

In [None]:
# Encode the target's class labels as integers; `le` keeps the fitted mapping
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])


In [None]:
# Separate the label vector from the feature matrix (every column except the label)
y = train['target']
X = train.drop(columns=['target'])

In [None]:
# Display the feature matrix as the cell output
X

# Models
- XGBOOST
- Catboost
- LGBM


In [None]:
# Settings shared by every model cell below
folds, SEED = 5, 24    # number of CV splits, random seed
basic_model = list()   # filled with one {'model', 'logloss'} record per model

**XGBOOST**

In [None]:
# Stratified `folds`-fold cross-validation for XGBoost.
# Out-of-fold (OOF) probabilities score every training row with a model that did
# not train on it; test probabilities are averaged over the per-fold models.
n_classes = y.nunique()   # number of probability columns, instead of a hard-coded 4
y_oof_pred = np.zeros((len(X), n_classes))
y_test_pred_xgb = np.zeros((len(test), n_classes))

sf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
for fold, (train_idx, val_idx) in enumerate(sf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    xgb = XGBClassifier(objective='multi:softprob', random_state=SEED)
    xgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            verbose=50)

    y_val_pred = xgb.predict_proba(X_val)
    print(f"Fold {fold + 1} Logloss: {log_loss(y_val, y_val_pred)}")

    y_oof_pred[val_idx] = y_val_pred
    y_test_pred_xgb += xgb.predict_proba(test)

# Turn the accumulated sum of fold predictions into their mean.
y_test_pred_xgb = y_test_pred_xgb / folds

# Compute the overall score once and reuse it for both the log and the record.
oof_logloss = log_loss(y, y_oof_pred)
print(f"Overall OOF Logloss: {oof_logloss}")

# Drop any earlier entry for this model first, so re-running the cell does not
# add a duplicate row to the comparison table.
basic_model[:] = [entry for entry in basic_model if entry['model'] != 'xgboost']
basic_model.append({'model': 'xgboost', 'logloss': oof_logloss})
    
    

**CatBoost**

In [None]:
# Stratified `folds`-fold cross-validation for CatBoost.
# Out-of-fold (OOF) probabilities score every training row with a model that did
# not train on it; test probabilities are averaged over the per-fold models.
n_classes = y.nunique()   # number of probability columns, instead of a hard-coded 4
y_oof_pred = np.zeros((len(X), n_classes))
y_test_pred_catb = np.zeros((len(test), n_classes))

kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    catb = CatBoostClassifier(random_state=SEED)
    catb.fit(X_train, y_train,
             eval_set=[(X_train, y_train), (X_val, y_val)],
             verbose=200)

    y_val_pred = catb.predict_proba(X_val)
    print(f"Fold {fold + 1} Logloss: {log_loss(y_val, y_val_pred)}")

    y_oof_pred[val_idx] = y_val_pred
    y_test_pred_catb += catb.predict_proba(test)

# Turn the accumulated sum of fold predictions into their mean.
y_test_pred_catb = y_test_pred_catb / folds

# Compute the overall score once and reuse it for both the log and the record.
oof_logloss = log_loss(y, y_oof_pred)
print(f"Overall OOF Logloss: {oof_logloss}")

# Drop any earlier entry for this model first, so re-running the cell does not
# add a duplicate row to the comparison table.
basic_model[:] = [entry for entry in basic_model if entry['model'] != 'catboost']
basic_model.append({'model': 'catboost', 'logloss': oof_logloss})

**LGBM**

In [None]:
# Stratified `folds`-fold cross-validation for LightGBM.
# Out-of-fold (OOF) probabilities score every training row with a model that did
# not train on it; test probabilities are averaged over the per-fold models.
n_classes = y.nunique()   # number of probability columns, instead of a hard-coded 4
y_oof_pred = np.zeros((len(X), n_classes))
y_test_pred_lgbm = np.zeros((len(test), n_classes))

kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    lgbm = LGBMClassifier(random_state=SEED)

    # Early stopping and logging go through callbacks: the `early_stopping_rounds`
    # and `verbose` keyword arguments of `fit` were removed in LightGBM 4.0.
    # NOTE(review): no `n_estimators` is set here; if the library default is below
    # the 150-round patience, training cannot stop early - confirm the intent.
    lgbm.fit(X_train, y_train,
             eval_set=[(X_train, y_train), (X_val, y_val)],
             callbacks=[early_stopping(stopping_rounds=150),
                        log_evaluation(period=200)])

    y_val_pred = lgbm.predict_proba(X_val)
    print(f"Fold {fold + 1} Logloss: {log_loss(y_val, y_val_pred)}")

    y_oof_pred[val_idx] = y_val_pred
    y_test_pred_lgbm += lgbm.predict_proba(test)

# Turn the accumulated sum of fold predictions into their mean.
y_test_pred_lgbm = y_test_pred_lgbm / folds

# Compute the overall score once and reuse it for both the log and the record.
oof_logloss = log_loss(y, y_oof_pred)
print(f"-- Overall OOF Logloss: {oof_logloss}")

# Drop any earlier entry for this model first, so re-running the cell does not
# add a duplicate row to the comparison table.
basic_model[:] = [entry for entry in basic_model if entry['model'] != 'lgbm']
basic_model.append({'model': 'lgbm', 'logloss': oof_logloss})

In [None]:
# Collect the per-model records into a table (one row per model) and show it
df = pd.DataFrame(data=basic_model)
df

In [None]:

# Each model contributes a single OOF log loss, so compare them as bars.
# A violin needs a distribution of values per category and degenerates to a
# flat line when there is only one observation.
sns.catplot(y="model", x="logloss", data=df, kind='bar')
plt.show()

As the plot shows, CatBoost achieves a lower log loss (lower is better) than LightGBM and XGBoost.

In [None]:
# Build the submission file from the fold-averaged CatBoost test probabilities.
sample_submission = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv')

# One probability column per class, in the column order of the prediction array.
class_columns = ['Class_1', 'Class_2','Class_3','Class_4']
submission = pd.DataFrame(y_test_pred_catb, columns=class_columns)

# Row ids come from the sample submission; the row counts must line up.
assert len(submission) == len(sample_submission), "prediction/sample row count mismatch"
submission['id'] = sample_submission['id']
subm = submission[['id'] + class_columns]

# index=False: the row index is not part of the submission format and would
# otherwise be written as an extra unnamed first column.
subm.to_csv("submission.csv", index=False)

Planned future updates for this notebook:
1. Hyperparameter optimization for the above models.
2. Try other ML algorithms as well.
3. Build an ANN and then tune its hyperparameters.