# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
import optuna

# Load train and test datasets

In [None]:
# Read the training CSV (path is relative to the notebook's working directory)
# and preview its first rows.
train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
train.head()

In [None]:
# Read the test CSV (path is relative to the notebook's working directory)
# and preview its first rows.
test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
test.head()

In [None]:
# Read the sample submission file; it is reused later as the template that the
# predicted probabilities are written into.
sample_submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')
sample_submission.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Remove the 'id' column from the training frame.
# Rebinding the name (rather than inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it again once 'id' is already gone is a
# no-op instead of a KeyError.
train = train.drop(columns='id', errors='ignore')

In [None]:
# Remove the 'id' column from the test frame.
# Rebinding the name (rather than inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it again once 'id' is already gone is a
# no-op instead of a KeyError.
test = test.drop(columns='id', errors='ignore')

In [None]:
# Summary statistics with one row per described column, decorated with a bar
# on 'mean' and colour gradients on 'std' and on the median ('50%').
summary_stats = train.describe().T
styled_summary = (
    summary_stats.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)
styled_summary

In [None]:
# Re-type every column of the training frame as a pandas categorical.
for column_name in train.columns:
    train[column_name] = train[column_name].astype('category')

In [None]:
# Re-type every column of the test frame as a pandas categorical.
# NOTE(review): categories are inferred from the test values alone, so they may
# differ from the training frame's categories — confirm the models tolerate it.
for column_name in test.columns:
    test[column_name] = test[column_name].astype('category')

In [None]:
# Dimensions of the training frame as (rows, columns).
train.shape

In [None]:
# Dimensions of the test frame as (rows, columns).
test.shape

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Number of missing values in each column of the test frame.
test.isnull().sum()

In [None]:
# Draw one count plot per training column, laid out on an 11 x 5 subplot grid.
n_rows, n_cols = 11, 5
features = train.columns.values

fig = plt.figure(figsize=(100, 90))

# `init` is the 1-based subplot position of the current column.
for init, column_name in enumerate(features, start=1):
    plt.subplot(n_rows, n_cols, init)
    plt.xlabel(column_name)
    sns.countplot(data=train, x=train[column_name])
plt.show()

In [None]:
# Draw one count plot per test column, laid out on a 10 x 5 subplot grid.
# The plots read from `test` (not `train`), so the figure shows the test-set
# value distributions and can be compared against the training figure.
n_rows = 10
n_cols = 5
features = test.columns.values

fig = plt.figure(figsize=(100, 90))

# `init` is the 1-based subplot position of the current column.
for init, column_name in enumerate(features, start=1):
    plt.subplot(n_rows, n_cols, init)
    plt.xlabel(column_name)
    sns.countplot(data=test, x=test[column_name])
plt.show()

In [None]:
# Split the training frame by position: the first 50 columns are the model
# inputs and column 50 is the label.
# NOTE(review): assumes the frame has exactly 51 columns at this point — confirm.
X = train.iloc[:, 0:50]
# Select the label with an integer position (not a slice) so `y` is a 1-D
# Series; a one-column DataFrame makes scikit-learn's label encoder emit a
# DataConversionWarning about a column-vector y.
y = train.iloc[:, 50]

In [None]:
# Encode the class labels as integers; the fitted encoder is kept in `label`.
label = LabelEncoder()
# NOTE(review): this overwrites `y` with its encoded form, so re-running the
# cell would re-fit the encoder on already-encoded values.
y = label.fit_transform(y)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Model Building

## LGBM

In [None]:
# LightGBM multiclass classifier with fixed hyperparameters, fitted on the
# training split.
lgbm_params = {
    'objective': 'multiclass',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 125,
    'max_depth': 17,
    'reg_alpha': 7.704678722351078,
    'reg_lambda': 24.86952078162094,
    'colsample_bytree': 0.07893216649715043,
    # NOTE(review): no bagging frequency is set alongside `subsample`;
    # presumably row subsampling is therefore inactive — verify against the
    # LightGBM parameter docs.
    'subsample': 0.1052205406887246,
    'cat_smooth': 30.802384732840533,
}
lgbm = LGBMClassifier(**lgbm_params)
lgbm.fit(X_train, y_train)

In [None]:
# Score of the fitted LightGBM model on the held-out split.
lgbm.score(X_test,y_test)

## BernoulliNB

In [None]:
# Baseline Bernoulli naive Bayes classifier with default settings, fitted on
# the training split.
bnb = BernoulliNB()
bnb.fit(X_train,y_train)

In [None]:
# Score of the baseline BernoulliNB model on the held-out split.
bnb.score(X_test,y_test)

In [None]:
# Candidate `alpha` values for the BernoulliNB grid search: 1.0 through 8.0
# in steps of 1.
params_bnb = {'alpha': [float(step) for step in range(1, 9)]}

In [None]:
# Exhaustive search over `params_bnb`, using GridSearchCV's default
# cross-validation and scoring, fitted on the training split.
bnb_search = GridSearchCV(estimator=bnb, param_grid=params_bnb)
bnb_search.fit(X_train, y_train)

In [None]:
# Replace the baseline model with the best estimator found by the grid search.
bnb = bnb_search.best_estimator_

In [None]:
# Display the selected estimator.
bnb

In [None]:
# Best score recorded by the grid search.
bnb_search.best_score_

In [None]:
# Fit the selected estimator on the training split.
# NOTE(review): GridSearchCV presumably already refit the best estimator on
# this same data (refit defaults to True), making this call redundant — confirm.
bnb.fit(X_train,y_train)

In [None]:
# Score of the tuned BernoulliNB model on the held-out split.
bnb.score(X_test,y_test)

## CatBoost

In [None]:
# Column names handed to CatBoost as categorical features:
# 'feature_0' through 'feature_49'.
cat_features = [f'feature_{index}' for index in range(50)]

In [None]:
# CatBoost multiclass classifier configured for GPU training, with every name
# in `cat_features` treated as categorical.
# NOTE(review): task_type='GPU' presumably requires a GPU runtime — confirm the
# target environment.
# NOTE(review): od_type/od_wait configure the overfitting detector, which
# presumably needs an eval_set at fit time; the active fit call passes none,
# so all 50000 iterations may run — verify.
cat = CatBoostClassifier(task_type='GPU', cat_features= cat_features, verbose=False, depth=4,
            max_ctr_complexity=15,
            iterations=50000,
            od_wait=1000, 
            od_type='Iter',
            learning_rate=0.01,
            min_data_in_leaf=1,
            #use_best_model=True,
            loss_function='MultiClass')

In [None]:
# Alternative CatBoost configuration kept for reference; this cell is disabled and not executed.
# cat = CatBoostClassifier(task_type='GPU', 
#             cat_features= cat_features, 
#             verbose=False, 
#            # depth=4,
#             max_ctr_complexity=15,
#             iterations=50000,
#             od_wait=1000, 
#             od_type='Iter',
#             learning_rate= 0.03748196035681244,
#             reg_lambda = 13.796757401995501,
#             subsample =  0.374946416768779,
#             random_strength =  0.5160756387634352,
#             depth= 6,
#             min_data_in_leaf= 48,
#             num_leaves = 44,
#             leaf_estimation_iterations=1,             
#             #learning_rate=0.01,
#            # min_data_in_leaf=1,
#             #use_best_model=True,
#             loss_function='MultiClass')

In [None]:
# Train CatBoost on the training split with the training plot enabled.
# The commented-out variant additionally passes the held-out split as eval_set.
#cat.fit(X_train,y_train, eval_set=(X_test,y_test), plot=True)
cat.fit(X_train,y_train, plot=True)

In [None]:
# Score of the fitted CatBoost model on the held-out split.
cat.score(X_test,y_test)

In [None]:
# Show the full parameter set of the trained CatBoost model.
cat.get_all_params()

# Submission

In [None]:
# Soft-voting ensemble that combines the predicted class probabilities of the
# LightGBM, BernoulliNB and CatBoost models.
ensemble_members = [
    ('lgbm', lgbm),
    ('bnb', bnb),
    ('cat', cat),
]
vote = VotingClassifier(estimators=ensemble_members, voting='soft')

In [None]:
# Fit the ensemble on the training split.
# NOTE(review): VotingClassifier presumably fits fresh clones of its members,
# so each base model is trained again here — confirm the expected run time.
vote.fit(X_train,y_train)

In [None]:
# Score of the fitted ensemble on the held-out split.
vote.score(X_test,y_test)

In [None]:
# Predict class probabilities for the test frame with the ensemble and show them.
probability_predictions= vote.predict_proba(test)
probability_predictions

In [None]:
# Write the predicted probabilities into columns 1-4 (by position) of the
# submission template; column 0 is left untouched.
# NOTE(review): assumes predict_proba's column order matches the template's
# class columns — confirm.
sample_submission.iloc[:,1:5] = probability_predictions
sample_submission

In [None]:
# Save the filled-in submission to 'submission.csv' without the index column.
sample_submission.to_csv('submission.csv',index=False)