# Importing Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Global plot styling: apply the matplotlib "fivethirtyeight" style sheet first,
# then seaborn's "darkgrid" axes style on top of it (the order is kept as-is).
plt.style.use("fivethirtyeight")
sns.set_style("darkgrid")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory
# and print them one per line, in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import warnings
# NOTE(review): this installs an "ignore" filter for every warning category, so
# deprecation notices raised by the libraries used below are hidden as well.
warnings.filterwarnings("ignore")

# Reading Dataset

# Reading the datasets

In [None]:
# Load the training set, the test set and the sample submission in one pass;
# the three frames are unpacked in the same order as the paths below.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-may-2021/train.csv",
        "/kaggle/input/tabular-playground-series-may-2021/test.csv",
        "/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv",
    )
)

In [None]:
# Keep independent copies of the frames as loaded, before later cells drop or
# add columns on `train` / `test`.
trainoriginal, testoriginal = train.copy(), test.copy()

In [None]:
# Show the first rows of the train, test and sample-submission frames, in that order.
for frame in (train, test, sub):
    display(frame.head())

In [None]:
# Print the (rows, columns) shape of each frame next to its label.
for label, frame in (
    ("Size of the train:", train),
    ("\nSize of the test:", test),
    ("\nSize of the Submission:", sub),
):
    print(label, frame.shape)

In [None]:
# List the column index of each frame, separated by a dashed rule.
train_columns, test_columns = train.columns, test.columns
separator = '-'*80
print('Columns in Train data:\n\n', train_columns)
print(separator)
print('\n\nColumns in Test data:\n\n', test_columns)

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own line. Wrapping it in print() emitted the report *before*
# the label and then printed the label followed by "None".
print('Datatypes of Train dataset:\n')
train.info()
print('-'*50)
# The second report describes the test frame (the label previously said "Train").
print('\n\nDatatypes of Test dataset:\n')
test.info()

# Exploratory Data Analysis

## Dropping Unwanted Columns 

In [None]:
# Remove the `id` column from both frames. errors='ignore' keeps the cell
# re-runnable: a second execution (when `id` is already gone) is a no-op
# instead of raising KeyError.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

## Missing Values

### Finding missing values of train & test dataset

In [None]:
# Count of missing entries per column of the training frame.
train.isna().sum()

In [None]:
# Count of missing entries per column of the test frame.
test.isna().sum()

## Analysing the Variables

In [None]:
# Row count per target class, listed in ascending label order.
target_counts = train['target'].value_counts()
target_counts.sort_index()


In [None]:
# Bar chart of class frequencies; bars follow value_counts() order
# (most frequent class first).
target_order = train['target'].value_counts().index
plt.figure(figsize=(8,6))
sns.countplot(data=train, x='target', order=target_order);

In [None]:
# Integer-encode the class labels into a new `target1` column; labels missing
# from the lookup would map to NaN.
class_to_code = {'Class_1':0, 'Class_2':1, 'Class_3':2, 'Class_4':3}
train['target1'] = train['target'].map(class_to_code)

# Frequency of each distinct value of feature_0.
train['feature_0'].value_counts()

In [None]:
# Frequency of each distinct value of feature_2, most common first.
feature_2_counts = train['feature_2'].value_counts()
feature_2_counts

In [None]:
# Frequency of each distinct value of feature_38, listed from the largest
# value down to the smallest.
feature_38_counts = train['feature_38'].value_counts()
feature_38_counts.sort_index(ascending=False)

### The features below contain negative values:
* feature_19 (-1, -2)
* feature_30 (-1)
* feature_31 (-1)
* feature_32 (-1, -2)
* feature_35 (-2)
* feature_38 (-1, -2, -3, -5, -8)
* feature_39 (-1, -2, -3, -5)
* feature_42 (-1, -2)



## Outliers

In [None]:
# Horizontal box plots of the training frame's columns, to eyeball outliers.
fig, ax = plt.subplots(figsize=(18,25))
sns.boxplot(data=train, orient="h", ax=ax);

In [None]:
# Horizontal box plots of every test-set feature.
# `id` has already been dropped from `test` in the "Dropping Unwanted Columns"
# cell, so the frame is plotted whole; the former `test.iloc[:, 1:]` slice
# silently left feature_0 out of the figure.
plt.figure(figsize=(18,25))
sns.boxplot(data=test, orient="h");

## Relation between Features

In [None]:
# Pearson Correlation
# `train` still holds the string `target` column; correlate the numeric columns
# only, because DataFrame.corr() on a frame with non-numeric columns raises on
# pandas >= 2.0 instead of silently skipping them.
numeric_train = train.select_dtypes(include='number')
plt.figure(figsize=(18,10))
# `linewidths` is the keyword documented by seaborn.heatmap for cell borders.
sns.heatmap(numeric_train.corr(method='pearson'), cbar=False, annot=True, fmt='.1f', linewidths=0.2, cmap='coolwarm');

In [None]:
# Spearman Correlation
# Restrict to numeric columns: the string `target` column makes
# DataFrame.corr() raise on pandas >= 2.0.
numeric_train = train.select_dtypes(include='number')
plt.figure(figsize=(18,12))
# `linewidths` is the keyword documented by seaborn.heatmap for cell borders.
sns.heatmap(numeric_train.corr(method='spearman'), cbar=False, annot=True, fmt='.1f', linewidths=0.2, cmap='coolwarm');

In [None]:
# Lower-triangle correlation heatmap of the numeric training columns.
fig, ax = plt.subplots(figsize=(18, 12))
# Numeric columns only: the string `target` column makes corr() raise on pandas >= 2.0.
corr = train.select_dtypes(include='number').corr()
# Hide the upper triangle (diagonal included). The builtin `bool` replaces the
# `np.bool` alias, which was deprecated in NumPy 1.20 and removed in 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))
ax.text(-1.1, -0.7, 'Correlation between the Features', fontsize=20, fontweight='bold', fontfamily='serif')
# Draw on the axes created above; no `fmt` is needed because annotations are off.
sns.heatmap(corr, mask=mask, annot=False, linewidths=0.2, cbar=True, cmap='coolwarm', ax=ax);


In [None]:
# Side-by-side Pearson / Kendall / Spearman correlation heatmaps for the first
# ten features.
fig, ax = plt.subplots(1, 3, figsize=(17 , 5))

feature_lst = ['feature_0', 'feature_1', 'feature_2','feature_3','feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']

corr = train[feature_lst].corr()

# Upper-triangle mask (diagonal included), shared by all three panels.
# The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# One color map for every panel, built once outside the loop.
diverging_cmap = sns.diverging_palette(240, 10, as_cmap=True)

for idx, method in enumerate(['pearson', 'kendall', 'spearman']):
    # `linewidths` is the keyword documented by seaborn.heatmap for cell borders.
    sns.heatmap(train[feature_lst].corr(method=method), ax=ax[idx],
            square=True, annot=True, fmt='.1f', center=0, linewidths=2,
            cbar=False, cmap=diverging_cmap,
            mask=mask
           )
    ax[idx].set_title(f'{method.capitalize()} Correlation', loc='left', fontweight='bold')

plt.show()

In [None]:
# Correlation of every numeric column with the integer-encoded target, strongest
# positive first. The string `target` column is excluded up front because
# DataFrame.corr() raises on non-numeric columns with pandas >= 2.0.
train.select_dtypes(include='number').corr()['target1'].sort_values(ascending=False)

In [None]:
# Bar chart of each feature's correlation with the integer-encoded target.
feature_frame = train.drop(columns=['target','target1'])
correlation_with_target = feature_frame.corrwith(train['target1'])
correlation_with_target.plot(kind='bar', figsize=(18,11), color=['salmon'])
plt.title('Correlation b/n target and Independant features')
for set_tick_style in (plt.xticks, plt.yticks):
    set_tick_style(size=15)
plt.show()

## Skewness

In [None]:
# Per-column skewness of the training frame. numeric_only=True skips the string
# `target` column, which otherwise makes skew() raise on pandas >= 2.0.
train.skew(numeric_only=True)

In [None]:
# Per-column skewness of the test frame (computed down the rows).
test.skew(axis=0)

# Data Modeling

In [None]:
# Independent variables: every column except the two label columns.
# Dropping by name (instead of the positional `iloc[:, :-2]`) fails loudly if
# `target1` has not been created yet, rather than silently discarding a real
# feature column.
X = train.drop(columns=['target', 'target1'])

# Dependent variable: the original string class label.
y = train['target']

In [None]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, random_state=1, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Report the number of rows in each part of the split.
split_lengths = {
    "X_train": len(X_train),
    "X_test": len(X_test),
    "y_train": len(y_train),
    "y_test": len(y_test),
}
print("Length of X_train is: {X_train}".format(**split_lengths))
print("Length of X_test is: {X_test}".format(**split_lengths))
print("Length of y_train is: {y_train}".format(**split_lengths))
print("Length of y_test is: {y_test}".format(**split_lengths))

## XGBoost

In [None]:
from xgboost import XGBClassifier, plot_importance

# Fit on the integer-encoded labels in `train['target1']` (Class_1..Class_4 ->
# 0..3) rather than the raw strings in `y`. Current XGBoost releases require
# class labels to be the integers 0..n_classes-1 and no longer act on
# `use_label_encoder`, so the flag is dropped. The probability columns keep the
# Class_1..Class_4 order either way.
model = XGBClassifier(random_state=42)
model.fit(X, train['target1'])

In [None]:
# Feature-importance bar chart for the fitted XGBoost model.
fig, ax = plt.subplots(figsize=(10,10))

importance_options = dict(
    height=0.5,
    max_num_features=None,
    title='Feature importance',
    xlabel='F score',
    ylabel='Features',
)
plot_importance(model, ax=ax, **importance_options)

In [None]:
# Feature Importance: larger figure, bars drawn in the first palette color.
main_colors = ['#f03aa5', '#40c2f3', '#c489ce', '#bb3ca9']
bar_color = main_colors[0]

f, ax = plt.subplots(figsize=(18, 18))
plot_importance(model, ax=ax, color=bar_color, max_num_features=None)
ax.set_title('Feature Importance', fontsize=20)
plt.show()

In [None]:
# Class-probability predictions of the fitted XGBoost model for the test frame
# (whose `id` column was dropped earlier).
y_pred_xgb = model.predict_proba(test)

## LGBM

In [None]:
# NOTE: this import rebinds `plot_importance` to LightGBM's function, shadowing
# the XGBoost one imported in the XGBoost section.
from lightgbm import LGBMClassifier, plot_importance

# `use_label_encoder` is an XGBoost option, not a LightGBM one; LGBMClassifier
# just forwards unknown keywords to the booster as unrecognized parameters, so
# it is removed here.
LGB = LGBMClassifier(random_state=42)
LGB.fit(X, y)

In [None]:
# Feature-importance chart for the fitted LightGBM model. `plot_importance` here
# is the name imported from lightgbm in the LGBM cell; the trailing semicolon
# suppresses the returned object's repr.
plot_importance(LGB, figsize=(18, 15));

# LightAutoML

In [None]:
pip install -U lightautoml

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task


In [None]:
# --- LightAutoML run configuration ---
TARGET_NAME = 'target'   # name of the label column
RANDOM_STATE = 42        # fixed seed value
TEST_SIZE = 0.2          # hold-out fraction for a metric check
N_FOLDS = 5              # number of folds for AutoML
N_THREADS = 4            # thread count for lgbm and linear models
TIMEOUT = 3 * 60 * 60    # AutoML time budget in seconds (3 hours)

In [None]:
# Reload the raw competition files for the LightAutoML pipeline.
train_data = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
test_data = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')

# Strip the first six characters of each label (the "Class_" prefix), parse the
# remaining digits and shift them to zero-based class ids.
class_numbers = train_data[TARGET_NAME].str[6:].astype(int)
train_data[TARGET_NAME] = class_numbers - 1

In [None]:
def create_gr_feats(data):
    """Placeholder feature hook: leaves `data` untouched and returns None."""
    return None
    

# Stack train and test so feature engineering sees both, then split them apart
# again at the original train length.
n_train_rows = len(train_data)
all_df = pd.concat([train_data, test_data]).reset_index(drop = True)
create_gr_feats(all_df)

# .copy() gives independent frames instead of slices of `all_df`.
train_data = all_df.iloc[:n_train_rows].copy()
test_data = all_df.iloc[n_train_rows:].copy()

# The concat pads the test rows' target with NaN, which upcasts the integer
# class ids to float: restore the integer dtype on the train part, and remove
# the all-NaN target column (if present) from the test part.
train_data[TARGET_NAME] = train_data[TARGET_NAME].astype(int)
test_data = test_data.drop(columns=[TARGET_NAME], errors='ignore').reset_index(drop = True)

print(train_data.shape, test_data.shape)

In [None]:
%%time

task = Task('multiclass',)

In [None]:
%%time

roles = {
    'target': TARGET_NAME,
    'drop': ['id'],
}

In [None]:
%%time 

automl = TabularUtilizedAutoML(task = task, 
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               reader_params = {'n_jobs': N_THREADS},
                               configs_list=[
                                   '../input/lightautoml-configs/conf_0_sel_type_0.yml',
                                   '../input/lightautoml-configs/conf_1_sel_type_1.yml'
                               ])
oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:5], oof_pred.shape))

In [None]:
%%time

# Fast feature importances calculation
fast_fi = automl.get_feature_scores('fast', silent = False)
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (20, 10), grid = True)

# Final Submission

In [None]:
# One probability column per class, in the order the classifier emits them.
submission_xgb = pd.DataFrame(y_pred_xgb, columns=['Class_1','Class_2','Class_3','Class_4'])
# Put `id` first rather than appending it as the last column, so the layout is
# id, Class_1..Class_4 — assumed to match sample_submission.csv; TODO confirm.
submission_xgb.insert(0, 'id', sub['id'])
# Preview a few rows rather than dumping the whole frame.
submission_xgb.head()

In [None]:

# Write the XGBoost submission without the row index.
submission_xgb.to_csv('submission.csv', index=False)

In [None]:
%%time

test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

In [None]:
# Start from the sample submission, overwrite every column after the first with
# the AutoML predictions, and save the result.
sample_submission_path = '../input/tabular-playground-series-may-2021/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission.iloc[:, 1:] = test_pred.data
submission.to_csv('lightautoml.csv', index=False)