In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h2 style=color:green align='left'> Table of Contents </h2>

##### 1) Load Required Libraries
##### 2) Read Data
##### 3) EDA (Exploratory Data Analysis)

>    3.1) Drop Unwanted Columns

>    3.2) Missing Values

>    3.3) Variable Analysis

>    3.4) Outliers

>    3.5) Relation between Features 

>    3.6) Skewness and Kurtosis 

##### 4) Model Building and Evaluation

>    4.1) XGBoost

>    4.2) LightAutoML

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 1) Load Required Libraries </h1>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/numpy/xgboost
# that would flag version incompatibilities - consider narrowing the filter.
warnings.filterwarnings("ignore")

# Global plot appearance: matplotlib style sheet first, then the seaborn style
# applied on top of it (the order of these two calls matters).
plt.style.use("fivethirtyeight")
sns.set_style("darkgrid")

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, auc, roc_curve, roc_auc_score

# Both xgboost and lightgbm export `plot_importance`; the later import wins, so
# the bare name refers to LightGBM's version. XGBoost's plotter is therefore
# also bound to an explicit alias.
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier, plot_importance
from xgboost import plot_importance as xgb_plot_importance

from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 2) Read Data </h1>

In [None]:
# Load the training set, the test set and the sample submission in one pass.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-may-2021/train.csv",
        "/kaggle/input/tabular-playground-series-may-2021/test.csv",
        "/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv",
    )
)

In [None]:
# Keep untouched copies of the raw frames (they still contain the `id` column).
trainoriginal, testoriginal = train.copy(), test.copy()

In [None]:
# Preview the first rows of each frame; display() renders every argument in turn.
display(train.head(), test.head(), sub.head())

In [None]:
# Report (rows, columns) for each loaded frame.
for label, frame in (
    ("Size of the train:", train),
    ("\nSize of the test:", test),
    ("\nSize of the Submission:", sub),
):
    print(label, frame.shape)

In [None]:
# List the column names of both data sets, separated by a horizontal rule.
separator = '-' * 80
train_columns, test_columns = train.columns, test.columns

print('Columns in Train data:\n\n', train_columns)
print(separator)
print('\n\nColumns in Test data:\n\n', test_columns)

In [None]:
# DataFrame.info() writes its report directly to stdout and returns None, so it
# must not be passed to print() (that would append a stray "None").
print('Datatypes of Train dataset:')
train.info()
print('-'*50)
print('\n\nDatatypes of Test dataset:')
test.info()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 3) EDA (Exploratory Data Analysis) </h1>

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 3.1) Drop Unwanted Columns </h1>

In [None]:
# The row identifier carries no predictive signal; remove it from both frames.
train, test = (frame.drop(columns=['id']) for frame in (train, test))

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 3.2) Missing Values </h1>

In [None]:
# Number of missing values per column in the training set.
train.isna().sum()

In [None]:
# Number of missing values per column in the test set.
test.isna().sum()

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 3.3) Variable Analysis </h1>

In [None]:
# Class frequencies of the target, ordered by class label (ascending is the default).
train['target'].value_counts().sort_index()

In [None]:
# Bar chart of the target classes, most frequent class first.
classes_by_frequency = train['target'].value_counts().index

plt.figure(figsize=(8, 6))
sns.countplot(x='target', data=train, order=classes_by_frequency);

In [None]:
# Integer-encoded copy of the target, used for the correlation analysis below.
class_to_index = {
    'Class_1': 0,
    'Class_2': 1,
    'Class_3': 2,
    'Class_4': 3,
}
train['target1'] = train['target'].map(class_to_index)

In [None]:
# Distribution of the distinct values taken by feature_0.
train.loc[:, 'feature_0'].value_counts()

In [None]:
# Distribution of the distinct values taken by feature_2.
train.loc[:, 'feature_2'].value_counts()

In [None]:
# Value frequencies of feature_38, listed from the largest value down to the smallest.
feature_38_counts = train.loc[:, 'feature_38'].value_counts()
feature_38_counts.sort_index(ascending=False)

### The features below contain negative values
> feature_19 (-1, -2)

> feature_30 (-1)

> feature_31 (-1)

> feature_32 (-1, -2)

> feature_35 (-2)

> feature_38 (-1, -2, -3, -5, -8)

> feature_39 (-1, -2, -3, -5)

> feature_42 (-1, -2)

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 3.4) Outliers </h1>

In [None]:
# Horizontal box plots of the training columns, used to eyeball outliers.
box_fig, box_ax = plt.subplots(figsize=(18, 25))
sns.boxplot(data=train, orient="h", ax=box_ax);

In [None]:
# Horizontal box plots of every test feature, used to eyeball outliers.
# The `id` column was already dropped from `test`, so the whole frame is
# plotted; slicing with iloc[:, 1:] would silently leave out feature_0.
plt.figure(figsize=(18,25))
sns.boxplot(data=test, orient="h");

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 3.5) Relation between Features </h1>

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:180%; text-align:left;"> 3.5.1) The correlation between the continuous variables </h1>

a. Pearson Correlation

b. Spearman Correlation

c. Kendall Correlation

In [None]:
# Pearson Correlation
# `train` still holds the string-valued `target` column; correlations are only
# defined for numeric data (and recent pandas raises on non-numeric columns),
# so restrict the frame to numeric columns first.
numeric_train = train.select_dtypes(include='number')

plt.figure(figsize=(18,10))
sns.heatmap(numeric_train.corr(method='pearson'), cbar=False, annot=True, fmt='.1f', linewidth=0.2, cmap='coolwarm');

In [None]:
# Spearman Correlation
# `train` still holds the string-valued `target` column; correlations are only
# defined for numeric data (and recent pandas raises on non-numeric columns),
# so restrict the frame to numeric columns first.
numeric_train = train.select_dtypes(include='number')

plt.figure(figsize=(18,12))
sns.heatmap(numeric_train.corr(method='spearman'), cbar=False, annot=True, fmt='.1f', linewidth=0.2, cmap='coolwarm');

In [None]:
# Lower-triangle heatmap of the (default, Pearson) correlation matrix.
fig, ax = plt.subplots(figsize=(18, 12))

# Only numeric columns can be correlated; the string `target` column is excluded.
corr = train.select_dtypes(include='number').corr()

# Boolean mask hiding the upper triangle including the diagonal. The builtin
# `bool` is used because the `np.bool` alias was removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

ax.text(-1.1, -0.7, 'Correlation between the Features', fontsize=20, fontweight='bold', fontfamily='serif')
sns.heatmap(corr, mask=mask, annot=False, fmt='.2f', linewidth=0.2, cbar=True, cmap='coolwarm', ax=ax);

In [None]:
# Side-by-side comparison of Pearson, Kendall and Spearman correlation
# for the first ten features.
fig, ax = plt.subplots(1, 3, figsize=(17 , 5))

feature_lst = ['feature_0', 'feature_1', 'feature_2','feature_3','feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']

corr = train[feature_lst].corr()

# Boolean mask hiding the upper triangle including the diagonal. The builtin
# `bool` is used because the `np.bool` alias was removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

for idx, method in enumerate(['pearson', 'kendall', 'spearman']):
    sns.heatmap(train[feature_lst].corr(method=method), ax=ax[idx],
            square=True, annot=True, fmt='.1f', center=0, linewidth=2,
            cbar=False, cmap=sns.diverging_palette(240, 10, as_cmap=True),
            mask=mask
           )
    ax[idx].set_title(f'{method.capitalize()} Correlation', loc='left', fontweight='bold')

plt.show()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:180%; text-align:left;"> 3.5.2) The correlation between the continuous features and the target </h1>

In [None]:
# Correlation of every numeric column with the integer-encoded target, strongest
# positive first. The string `target` column is excluded because it cannot be
# correlated (recent pandas raises instead of silently skipping it).
train.select_dtypes(include='number').corr()['target1'].sort_values(ascending=False)

In [None]:
# Bar chart of each feature's correlation with the integer-encoded target.
feature_frame = train.drop(columns=['target', 'target1'])
target_correlation = feature_frame.corrwith(train['target1'])

target_correlation.plot(kind='bar', figsize=(18,11), color=['salmon'])
plt.title('Correlation b/n target and Independant features')
for set_tick_size in (plt.xticks, plt.yticks):
    set_tick_size(size=15)
plt.show()

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:180%; text-align:left;"> 3.6) Skewness and Kurtosis </h1>

In [None]:
# Skewness of each numeric training column. `numeric_only=True` skips the
# string-valued `target` column, which recent pandas would otherwise raise on.
train.skew(numeric_only=True)

In [None]:
# Column-wise skewness of the test features.
test.skew(axis=0)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 4) Model Building and Evaluation </h1>

In [None]:
# Feature matrix: every column except the two target columns, which are the
# last two columns of `train` at this point.
target_columns = ['target', 'target1']
X = train.drop(columns=target_columns)

# Label vector: the original class labels.
y = train['target']

In [None]:
# Hold out 20% of the rows for evaluation (80:20 split);
# the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

In [None]:
# Report the number of rows in each part of the split.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Length of {split_name} is: {len(split_data)}")

<h1 style="background-color:orange; font-family:newtimeroman; font-size:160%; text-align:left;"> 4.1) XGBoost </h1>

In [None]:
# Fit XGBoost on all training rows using the integer-encoded target
# (`target1`: Class_1..Class_4 -> 0..3). Current XGBoost releases require class
# labels to be 0..n_classes-1 and have dropped the `use_label_encoder` option,
# so the raw string labels in `y` are not passed here. The predict_proba
# columns therefore follow the Class_1..Class_4 order.
model = XGBClassifier(random_state=42)
model.fit(X, train['target1'])

In [None]:
# fig, ax = plt.subplots(figsize=(10,10))

# plot_importance(model,
#                height=0.5,
#                max_num_features=None,
#                title='Feature importance',
#                xlabel='F score', 
#                ylabel='Features',
#                ax=ax)

In [None]:
# Feature importance of the fitted XGBoost model.
# The bare name `plot_importance` is bound to LightGBM's implementation (it is
# imported after XGBoost's and shadows it) and rejects an XGBClassifier, so the
# XGBoost plotter is called through its explicit alias.
main_colors = ['#f03aa5', '#40c2f3', '#c489ce', '#bb3ca9']

f, ax = plt.subplots(1, 1, figsize=(18, 18))

xgb_plot_importance(model,
                    max_num_features=None,  # None = show every feature
                    color=main_colors[0],
                    ax=ax)
ax.set_title('Feature Importance', fontsize=20)
plt.show()

In [None]:
# Per-class probabilities from the XGBoost model for every row of the test set
# (one column per class).
y_pred_xgb = model.predict_proba(test)

In [None]:
# Assemble the XGBoost probabilities into a frame with one column per class and
# append the row ids taken from the sample submission.
class_columns = ['Class_1','Class_2','Class_3','Class_4']
submission_xgb = pd.DataFrame(y_pred_xgb, columns=class_columns).assign(id=sub['id'])
submission_xgb

<h1 style="background-color:orange; font-family:newtimeroman; font-size:160%; text-align:left;"> 4.2) LGBM </h1>

In [None]:
# Fit LightGBM on all training rows. `use_label_encoder` is an XGBoost option,
# not a LightGBM parameter (LightGBM only reports it as unknown), so it is not
# passed here.
LGB = LGBMClassifier(random_state=42)
LGB.fit(X, y)

In [None]:
# Feature importance of the fitted LightGBM model. The bare `plot_importance`
# is LightGBM's implementation here, because the lightgbm import comes after
# (and shadows) the xgboost one.
plot_importance(LGB, figsize=(18, 15));

<h1 style="background-color:orange; font-family:newtimeroman; font-size:160%; text-align:left;"> 4.2) LightAutoML </h1>

In [None]:
pip install -U lightautoml

In [None]:
# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from sklearn.metrics import log_loss

In [None]:
# Settings for the LightAutoML run.

# Wall-clock budget for the AutoML run, in seconds.
TIMEOUT = 60 * 60
# Number of threads for the LightGBM and linear models.
N_THREADS = 4
# Number of cross-validation folds for AutoML.
N_FOLDS = 5
# Fraction of the data reserved for a metric check.
TEST_SIZE = 0.2
# Fixed seed for reproducibility.
RANDOM_STATE = 2021

In [None]:
# Replace the class labels in `target` with integer codes; the fitted encoder
# is kept in `le`.
le = LabelEncoder().fit(train['target'])
train['target'] = le.transform(train['target'])

In [None]:
%%time

automl = TabularUtilizedAutoML(task = Task('multiclass',), 
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               verbose=0,
                               reader_params = {'n_jobs': N_THREADS},
)

In [None]:
# Column roles handed to LightAutoML.
# NOTE(review): `id` was already dropped from `train` earlier in the notebook, so
# the 'drop' role presumably has no effect here - confirm.
target_column = 'target'
roles = {'target': target_column, 'drop': ['id']}

# Train on every column except the last one (`target1`, the duplicate encoded
# target) and keep the predictions returned by fit_predict.
automl_train = train.drop(columns=['target1'])
lightml_pred = automl.fit_predict(automl_train, roles=roles)

preview, pred_shape = lightml_pred[:10], lightml_pred.shape
print('lightml_pred:\n{}\nShape = {}'.format(preview, pred_shape))

In [None]:
%%time

test_pred = automl.predict(test)
print('Prediction for test set:\n{}\nShape = {}'.format(test_pred[:5], test_pred.shape))

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> Submission </h1>

submission = pd.DataFrame(y_pred_xgb_test, columns=xgb.classes_)
submission

submission.insert(0, 'id', testoriginal['id'])
submission

submission.to_csv("submission.csv", index = False)

In [None]:
# Overwrite every column after the first one of the sample submission with the
# AutoML test predictions, then write the result to disk without the index.
# NOTE(review): assumes the columns of `test_pred.data` are in the same order as
# the class columns of `sub` - confirm against the LightAutoML class mapping.
sub.iloc[:, 1:] = test_pred.data
sub.to_csv('Submission1.csv', index = False)