In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> Table of Contents </h1>

* [1) Load Required Libraries](#1)

* [2) Read Data](#2)

* [3) EDA (Exploratory Data Analysis)](#3)

  * [3.1) DataPrep (AutoEDA)](#3.1)
  
    * [3.1.1) Analyze distributions with plot()](#3.1.1)
    
    * [3.1.2) Analyze correlations with plot_correlation()](#3.1.2)
    
    * [3.1.3) Analyze missing values with plot_missing()](#3.1.3)
    
    * [3.1.4) Create a profile report with create_report()](#3.1.4)
  
  * [3.2) Missing Values](#3.2)
  
  * [3.3) Outliers](#3.3)
  
  * [3.4) Relation between Features](#3.4)

* [4) Data Preprocessing](#4)

* [5) Model Building and Evaluation](#5)

  * [5.1) Principal Component Analysis (PCA)](#5.1)
  
  * [5.2) XGBoost Classifier](#5.2)
  
  * [5.3) LGBM Classifier](#5.3)
  
  * [5.4) Random Forest Classifier](#5.4)
  
  * [5.5) CatBoost Classifier](#5.5)

* [6) LightAutoML](#6)

### Data Description

- The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using a CTGAN. The original dataset deals with predicting the **category of an eCommerce product** given various attributes about the listing. Although the features are anonymized, they have properties relating to real-world features.

- This competition dataset is **similar** to the **Tabular Playground Series - May 2021 dataset**, but with **increased observations, increased features, and increased class labels.**

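- Submissions are **evaluated using multi-class logarithmic loss**. Each row in the dataset has been labeled with one true Class. For each row, you must submit the predicted probabilities that the product belongs to each class label. The formula is:

$$\text{log loss} = -\frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{M} y_{ij}\,\log(p_{ij})$$

  where $N$ is the number of rows in the test set, $M$ is the number of class labels, $y_{ij}$ is 1 if observation $i$ belongs to class $j$ and 0 otherwise, and $p_{ij}$ is the predicted probability that observation $i$ belongs to class $j$.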

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 1) Load Required Libraries </h1>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')

from termcolor import cprint      # For making colorful printing texts

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier, plot_importance

from catboost import CatBoostClassifier

from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss

from sklearn.preprocessing import StandardScaler

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 2) Read Data </h1>

In [None]:
train = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/test.csv")
submission = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/sample_submission.csv")

In [None]:
display(train.head())
display(test.head())
display(submission.head())

In [None]:
# Print a green banner followed by the shape of each loaded dataset.
# Every banner except the first is preceded by a blank line.
for position, (label, frame) in enumerate(
    [("train", train), ("test", test), ("Submission", submission)]
):
    lead = "" if position == 0 else "\n"
    cprint(f"{lead}Shape of the {label} set :", 'green')
    cprint('*'*25, 'green')
    print(f'\nShape of the {label} set:', frame.shape)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" after
# each report.
cprint("Dtypes in train :", 'green')
cprint('*'*25, 'green')
train.info()
cprint("Dtypes in test :", 'green')
cprint('*'*25, 'green')
test.info()

In [None]:
# Remove the row identifier, which carries no predictive signal.
# errors='ignore' keeps this cell re-runnable: executing it a second time
# (when `id` is already gone) no longer raises a KeyError.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

In [None]:
train.columns

In [None]:
train.target.unique()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 3) EDA (Exploratory Data Analysis) </h1>

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.1) DataPrep (AutoEDA) </h1>

In [None]:
!pip install dataprep

In [None]:
from dataprep.eda import *
from dataprep.eda import plot

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.1.1) Analyze distributions with plot() </h1>

- a) **The function plot()** explores the distributions and statistics of the dataset. The following describes the functionality of plot() for a given dataframe df.

- b) **plot(df):** plots the distribution of each column and calculates dataset statistics (“I want to see an overview of the dataset” )

- c) **plot(df, x):** plots the distribution of column x in various ways and calculates column statistics (“I want to understand the column x”)

- d) **plot(df, x, y):** generates plots depicting the relationship between columns x and y. (“I want to understand the relationship between x and y”)

In [None]:
# plots the distribution of each column and calculates dataset statistics
plot(train)

In [None]:
# plots the distribution of column x in various ways and calculates column statistics
plot(train, 'target')

In [None]:
# generates plots depicting the relationship between columns x and y
plot(train, 'feature_72', 'feature_73')

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.1.2) Analyze correlations with plot_correlation() </h1>

- The function **plot_correlation()** explores the correlation between columns in various ways and using multiple correlation metrics. It generates correlation matrices using Pearson, Spearman, and KendallTau correlation coefficients

- **plot_correlation(df):** plots correlation matrices (correlations between all pairs of columns)

- **plot_correlation(df, x):** plots the most correlated columns to column x

- **plot_correlation(df, x, y):** plots the joint distribution of column x and column y and computes a regression line

In [None]:
from dataprep.eda import plot_correlation

In [None]:
plot_correlation(train)

In [None]:
# plots the columns most correlated with column "feature_1"
plot_correlation(train, 'feature_1')

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.1.3) Analyze missing values with plot_missing() </h1>

- The function **plot_missing()** enables thorough analysis of the missing values and their impact on the dataset. The following describes the functionality of plot_missing() for a given dataframe df.

- **plot_missing(df):** plots the amount and position of missing values, and their relationship between columns (“I want to understand the missing values of the dataset”)

- **plot_missing(df, x):** plots the impact of the missing values in column x on all other columns

- **plot_missing(df, x, y):** plots the impact of the missing values from column x on column y in various ways.

In [None]:
from dataprep.eda import plot_missing

In [None]:
plot_missing(train)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.1.4) Create a profile report with create_report() </h1>

- The function **create_report()** generates a comprehensive profile report of the dataset. create_report() combines the individual components of the dataprep.eda package and outputs them into a nicely formatted HTML document. The document contains the following information:

- **Overview:** detect the types of columns in a dataframe

- **Variables:** variable type, unique values, distinct count, missing values

- **Quantile statistics** like minimum value, Q1, median, Q3, maximum, range, interquartile range

- **Descriptive statistics** like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness

- **Text analysis** for length, sample and letter

- **Correlations:** highlighting of highly correlated variables, Spearman, Pearson and Kendall matrices

- **Missing Values:** bar chart, heatmap and spectrum of missing values

In [None]:
create_report(train)

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.2) Missing Values </h1>

In [None]:
cprint("Missing Values in train :", 'green')
cprint('*'*25, 'green')
print(train.isnull().sum().sort_values(ascending=False))
cprint("\nMissing Values in test :", 'green')
cprint('*'*25, 'green')
print(test.isnull().sum().sort_values(ascending=False))

In [None]:
# Total number of missing cells in each dataset (summed over all columns).
print('\nMissing values in train:', train.isnull().sum().sum())
print('\nMissing values in test:', test.isnull().sum().sum())

In [None]:
train.describe().T.style.bar(subset=['mean'], color='#20c8f2')\
                   .background_gradient(subset=['std'], cmap='YlGn')

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.3) Outliers </h1>

In [None]:
plt.figure(figsize=(18,25))
sns.boxplot(data=train, orient="h");

In [None]:
# `id` has already been dropped from `test` earlier in the notebook, so every
# remaining column is a feature; slicing with iloc[:, 1:] would silently
# leave feature_0 out of the plot.
plt.figure(figsize=(18,25))
sns.boxplot(data=test, orient="h");

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.4) Relation between Features </h1>

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.4.1) The correlation between the continuous variables </h1>

a. Pearson Correlation

b. Spearman Correlation

c. kendall

In [None]:
# Pearson Correlation
# `target` still holds string labels at this point, so the matrix is built
# from the numeric columns only (recent pandas raises on non-numeric columns
# instead of silently skipping them).
plt.figure(figsize=(18,10))
sns.heatmap(train.select_dtypes(include='number').corr(method='pearson'), cbar=False, annot=True, fmt='.1f', linewidth=0.2, cmap='coolwarm');

In [None]:
# Spearman Correlation
# `target` still holds string labels at this point, so the matrix is built
# from the numeric columns only (recent pandas raises on non-numeric columns
# instead of silently skipping them).
plt.figure(figsize=(24,15))
sns.heatmap(train.select_dtypes(include='number').corr(method='spearman'), cbar=False, annot=True, fmt='.1f', linewidth=0.2, cmap='coolwarm');

In [None]:
# Lower-triangle Pearson correlation heatmap of the numeric features.
fig, ax = plt.subplots(figsize=(18, 12))
# Restrict to numeric columns: `target` is still a string column here.
corr = train.select_dtypes(include='number').corr()
# The `np.bool` alias was removed from NumPy; the builtin `bool` is the
# supported spelling. The mask hides the upper triangle and the diagonal.
mask = np.triu(np.ones_like(corr, dtype=bool))
ax.text(-1.1, -0.7, 'Correlation between the Features', fontsize=20, fontweight='bold', fontfamily='serif')
sns.heatmap(corr, mask=mask, annot=False, fmt='.2f', linewidth=0.2, cbar=True, cmap='coolwarm', ax=ax);

In [None]:
# Pearson, Kendall and Spearman correlation matrices, side by side, for the
# first ten features.
fig, ax = plt.subplots(1, 3, figsize=(17 , 5))

feature_lst = [f'feature_{i}' for i in range(10)]

corr = train[feature_lst].corr()

# Hide the upper triangle (diagonal included). The `np.bool` alias was
# removed from NumPy, so the builtin `bool` is used as dtype.
mask = np.triu(np.ones_like(corr, dtype=bool))


for idx, method in enumerate(['pearson', 'kendall', 'spearman']):
    sns.heatmap(train[feature_lst].corr(method=method), ax=ax[idx],
            square=True, annot=True, fmt='.1f', center=0, linewidth=2,
            cbar=False, cmap=sns.diverging_palette(240, 10, as_cmap=True),
            mask=mask
           )
    ax[idx].set_title(f'{method.capitalize()} Correlation', loc='left', fontweight='bold')

plt.show()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.4.3) Skewness and Kurtosis </h1>

In [None]:
# `target` is still a string column here; restrict the skewness to numeric
# columns explicitly instead of relying on pandas dropping it silently.
train.skew(numeric_only=True)

In [None]:
test.skew()

In [None]:
features = [feature for feature in train.columns if feature not in ['id', 'target']]
features = features[:36]

In [None]:
plt.figure(figsize=(16,6))
plt.title("Distribution of skew ")
sns.distplot(train[features].skew(),color="red", kde=True,bins=120, label='train')
sns.distplot(test[features].skew(),color="orange", kde=True,bins=120, label='test')
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(16,6))
plt.title("Distribution of kurtosis ")
sns.distplot(train[features].kurtosis(),color="darkblue", kde=True,bins=120, label='train')
sns.distplot(test[features].kurtosis(),color="yellow", kde=True,bins=120, label='test')
plt.legend()
plt.show()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 4) Data Preprocessing </h1>

<h1 style="background-color:orange; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 4.1) target </h1>

In [None]:
plt.figure(figsize=(10,7))
sns.countplot(x="target", data=train);

In [None]:
import plotly.graph_objects as go
# Aggregate first so Plotly receives one (label, count) pair per class
# instead of one label per training row.
class_counts = train['target'].value_counts().sort_index()
# Use `hole` to create a donut-like pie chart
fig = go.Figure(data=[go.Pie(labels=class_counts.index, values=class_counts.values, hole=.3)])
fig.show()

In [None]:
feature_cols = [col for col in train.columns if col != "target"]
target_cat = train["target"]
df = train.drop("target", axis=1)

In [None]:
df.head()

In [None]:
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])
#target = le.fit_transform(target_cat)

In [None]:
print("-"*30)
print("Before label encoding, ")
print(target_cat[:10])
print("-"*30)
print("After label encoding, ")
print(train['target'][:10])
print("-"*30)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 5) Model Building and Evaluation </h1>

In [None]:
X = train.drop('target', axis = 1)
y = train['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#### How to deal with Skewness in data?

For skewness the best way is to handle it by standardizing data by applying **Transformation, Scaling or both Transformation and Scaling**.

##### I have used a logarithmic transformation (log(1 + x)) in combination with StandardScaler (zero mean, unit variance) to standardize the data.

In [None]:
# log(1 + x) on every feature to reduce right skew. np.log1p is vectorised
# over the whole frame (and returns a DataFrame), replacing a per-cell Python
# lambda through the deprecated DataFrame.applymap.
X_train = np.log1p(X_train)
X_test = np.log1p(X_test)

In [None]:
# Learn the scaling statistics (mean / std) on the training split only and
# re-use them for the hold-out split. Calling fit_transform on X_test would
# refit the scaler on hold-out data, so the two splits would be scaled with
# different statistics.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

<h1 style="background-color:orange; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 5.1) Principal Component Analysis (PCA) </h1>

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Fit a PCA that keeps every component on the scaled training features.
pca = PCA().fit(X_train)

plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
# Cumulative share of variance explained by the first k components. The
# x-axis is derived from the fitted model instead of a hard-coded 75, so the
# plot keeps working if the number of features changes.
yi = np.cumsum(pca.explained_variance_ratio_)
xi = np.arange(1, len(yi) + 1)

plt.ylim(0.0,1.1)
plt.plot(xi, yi, marker='o', linestyle='--', color='b')

plt.xlabel('Number of Components')
plt.xticks(np.arange(1, len(yi) + 1, step=2)) # 1-based component counts, aligned with xi
# The plotted values are fractions in [0, 1], not percentages.
plt.ylabel('Cumulative explained variance (fraction)')
plt.title('The number of components needed to explain variance')

plt.axhline(y=0.95, color='r', linestyle='-')
plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

ax.grid(axis='x')
plt.show()

<h1 style="background-color:orange; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 5.2) XGBoost Classifier </h1>

In [None]:
xgb_params= {'n_estimators': 25000, 
             'max_depth': 8, 
             'learning_rate': 0.0320, 
             'reg_lambda': 29.326, 
             'subsample': 0.918, 
             'colsample_bytree': 0.235, 
             'colsample_bynode': 0.820, 
             'colsample_bylevel': 0.453}

In [None]:
def cross_val(X, y, model, params, folds=9):
    """Train ``model`` with stratified K-fold CV and report the log loss.

    For every fold a fresh ``model(**params)`` is fitted on the training part
    and early-stopped on the held-out part of that fold; the log loss on the
    held-out part is printed. After the last fold the mean and standard
    deviation of the per-fold losses are printed as the overall CV estimate.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels aligned with ``X``; also used for stratification.
    model : callable
        Estimator class (e.g. ``XGBClassifier``) whose ``fit`` accepts
        ``eval_set``, ``early_stopping_rounds``, ``verbose`` and
        ``eval_metric``.
    params : dict
        Keyword arguments used to build the estimator for each fold.
    folds : int, default 9
        Number of stratified folds.

    Returns
    -------
    estimator
        The estimator fitted on the LAST fold only (it has seen
        ``(folds - 1) / folds`` of the rows); earlier fold models are
        discarded.
    """
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)
    fold_losses = []
    alg = None
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"Fold: {fold}")
        x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        x_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        alg = model(**params)
        # NOTE(review): newer xgboost releases expect early_stopping_rounds /
        # eval_metric on the constructor rather than on fit() — confirm
        # against the installed version.
        alg.fit(x_train, y_train,
                eval_set=[(x_test, y_test)],
                early_stopping_rounds=100,
                verbose=400,                    # log the eval metric every 400 iterations
                eval_metric='mlogloss')

        # The fold is scored on the same rows that drove early stopping, so
        # the reported loss is slightly optimistic.
        pred = alg.predict_proba(x_test)
        loss = log_loss(y_test, pred)
        fold_losses.append(loss)
        print(f"Log loss: {loss}")
        print("-"*50)

    if fold_losses:
        print(f"Mean log loss over {len(fold_losses)} folds: "
              f"{np.mean(fold_losses)} (std {np.std(fold_losses)})")

    return alg

In [None]:
xgb_model = cross_val(X, y, XGBClassifier, xgb_params)

In [None]:
# xgb_model was fitted by cross_val() on the raw feature frame X, whereas
# X_test has since been log-transformed and standardised. Rebuild the raw
# hold-out rows with the same split parameters (test_size=0.2, random_state=0)
# so the model is evaluated on data in the scale it was trained on; y_test
# still matches these rows.
# Note: these rows are a subset of X, which cross_val() trained on, so the
# resulting scores are optimistic.
_, X_test_raw, _, _ = train_test_split(X, y, test_size=0.2, random_state=0)

y_pred_xgb1 = xgb_model.predict_proba(X_test_raw)
y_pred_xgb  = xgb_model.predict(X_test_raw)
xgb_acc     = accuracy_score(y_test, y_pred_xgb)

# Class probabilities for the competition test set (raw features, like X).
y_pred_xgb_test = xgb_model.predict_proba(test)

In [None]:
# Fill the nine class-probability columns of the sample submission with the
# XGBoost test-set probabilities and write the file.
class_columns = [f'Class_{k}' for k in range(1, 10)]
submission[class_columns] = y_pred_xgb_test
submission.to_csv('xgb.csv', index=False)

In [None]:
# xgb_model was fitted on the raw feature frame X, while X_train / X_test were
# log-transformed and standardised afterwards. Rebuild the raw splits with the
# same split parameters so every metric below is computed on inputs in the
# scale the model was trained on (y_train / y_test still match these rows).
X_train_raw, X_test_raw, _, _ = train_test_split(X, y, test_size=0.2, random_state=0)
xgb_holdout_proba = xgb_model.predict_proba(X_test_raw)
xgb_holdout_pred = xgb_model.predict(X_test_raw)

print("Accuracy : Train Score {:.2f} & Test Score {:.2f}".format(xgb_model.score(X_train_raw, y_train), accuracy_score(y_test, xgb_holdout_pred)))
cprint('-'*70, 'green')
print("ROC_AUC_Score: ", roc_auc_score(y_test, xgb_holdout_proba, multi_class = 'ovr'))
print("\n\nConfusion matrix: \n\n",confusion_matrix(y_test, xgb_holdout_pred))
cprint('-'*70, 'green')
print("\n\nClassification Report:\n\n",classification_report(y_test, xgb_holdout_pred))

In [None]:
plt.figure(figsize=(9,7))
sns.heatmap(confusion_matrix(y_test, y_pred_xgb), fmt='.1f', annot=True, square=True, linewidth=0.2, cbar=False);

In [None]:
xgb_model.get_params(deep=True)

In [None]:
xgb_model.get_xgb_params()

In [None]:
xgb_model.feature_importances_

In [None]:
# To have even better plot, let’s sort the features based on importance value:

plt.figure(figsize=(12,18))
sorted_idx = xgb_model.feature_importances_.argsort()
plt.barh(X.columns[sorted_idx], xgb_model.feature_importances_[sorted_idx]);
plt.title("Xgboost Feature Importance", size=20)
plt.show()

import shap

shap_tree = shap.TreeExplainer(xgb_model)

# Explain the model on raw hold-out rows: xgb_model was trained on the raw
# frame X, while X_test is a log-transformed, standardised array. Re-running
# the split with the same parameters reproduces the hold-out rows unscaled
# (and, being a DataFrame, keeps the feature names for the plot).
_, X_test_raw, _, _ = train_test_split(X, y, test_size=0.2, random_state=0)

shap_values = shap_tree.shap_values(X_test_raw)

shap.summary_plot(shap_values, X_test_raw)

In [None]:
# Plot_tree allows to visualize the trees that were built by XGBoost
plt.figure(figsize=(35,40))
xgb.plot_tree(xgb_model, ax=plt.gca());

<h1 style="background-color:orange; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 5.3) LGBM Classifier </h1>

In [None]:
# Baseline LightGBM classifier on the scaled training split.
# `use_label_encoder` is an XGBClassifier option, not a LightGBM one, so it
# is no longer passed here.
LGB = LGBMClassifier(random_state=42)
LGB.fit(X_train, y_train)

In [None]:
y_pred_LGB = LGB.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_pred_LGB))

In [None]:
# Accuracy on both splits plus confusion matrix and per-class report for the
# LightGBM model (the first line was previously mislabelled "XGBoost").
print("LightGBM : Train Score {:.2f} & Test Score {:.2f}".format(LGB.score(X_train, y_train), LGB.score(X_test, y_test)))
print('-'*70)
print("\n\nConfusion matrix \n\n",confusion_matrix(y_test, y_pred_LGB))
print('-'*70)
print("\n\nClassification Report\n\n",classification_report(y_test, y_pred_LGB))

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(confusion_matrix(y_test, y_pred_LGB), fmt='.1f', annot=True, linewidth=0.2, square=True, cbar=False);

In [None]:
# plt.rcParams["figure.figsize"] = (12, 22)
# lightgbm.plot_importance(lgbm_model, max_num_features = 60, height=.9)

plot_importance(LGB, figsize=(15, 19));

In [None]:
# NOTE(review): despite the `_test` suffix this predicts class labels for the
# hold-out split X_test, not for the competition `test` frame, and the result
# is not used anywhere below — confirm the intent.
y_pred_LGB_test = LGB.predict(X_test)

<h1 style="background-color:orange; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 5.4) Random Forest Classifier </h1>

In [None]:
# Random forest baseline on the scaled training split.
# random_state pins the bootstrap / feature sampling so the scores below are
# reproducible across runs; n_jobs=-1 only parallelises tree building.
rfc = RandomForestClassifier(max_depth=10, min_samples_split=9, random_state=42, n_jobs=-1)
rfc.fit(X_train, y_train)

In [None]:
y_pred_rfc = rfc.predict(X_test)
rf_acc = accuracy_score(y_test, y_pred_rfc)

In [None]:
print("Random Forest : Train Score {:.2f} & Test Score {:.2f}".format(rfc.score(X_train, y_train), rfc.score(X_test, y_test)))
print('-'*70)
print("\n\nConfusion matrix \n\n",confusion_matrix(y_test, y_pred_rfc))
print('-'*70)
print("\n\nClassification Report\n\n",classification_report(y_test, y_pred_rfc))

In [None]:
# Horizontal bar chart of the random-forest feature importances, most
# important feature first.
plt.figure(figsize=(15,18))
feature_imp = pd.Series(rfc.feature_importances_, index=X.columns).sort_values(ascending=False)

# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)

# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features", size=20)
# No plt.legend(): the plot has no labelled artists, so the call drew nothing.
plt.show()

<h1 style="background-color:orange; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 5.5) CatBoost Classifier </h1>

model = CatBoostClassifier(iterations = 4000, reg_lambda=100, learning_rate = 0.02,          # task_type = 'GPU'
                           bootstrap_type='Bernoulli', random_strength = 5, depth = 8,
                           loss_function='MultiClass')

model.fit(X_train, y_train)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 6) LightAutoML </h1>

pip install -U lightautoml

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from sklearn.metrics import log_loss
from lightautoml.dataset.roles import NumericRole

N_THREADS = 4 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 2021 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TIMEOUT = 8 * 3600 # Time in seconds for automl run
TARGET_NAME = 'target'

%%time

train_data = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv')
train_data[TARGET_NAME] = train_data[TARGET_NAME].str.slice(start=6).astype(int) - 1
train_data.head()

test_data = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv')
test_data.head()

submission = pd.read_csv('../input/tabular-playground-series-jun-2021/sample_submission.csv')
submission.head()

%%time

task = Task('multiclass',)

columns = ['PREDS_Linear_' + str(i) for i in range(1, 10)] + \
          ['PREDS_LGBM_' + str(i) for i in range(1, 10)] + \
          ['PREDS_CB_' + str(i) for i in range(1, 10)] + \
          ['PREDS_NN_' + str(i) for i in range(1, 10)]
columns

%%time 

# Column roles passed to LightAutoML: the target column, columns to drop, and
# a numeric "probability" role for the names listed in `columns`.
# NOTE(review): none of the 'PREDS_*' names in `columns` is created anywhere
# in this notebook and train_data is read straight from train.csv —
# presumably they are out-of-fold predictions from a first-level run; confirm
# they exist in train_data or remove this role.
roles = {
    'target': TARGET_NAME,
    'drop': ['id'],
    NumericRole(np.float32, prob = True): columns
}

# AutoML preset restricted to tuned LightGBM and CatBoost, bounded by TIMEOUT
# seconds overall and 1800 seconds of hyper-parameter tuning.
automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       general_params = {
                           'use_algos': [['lgb_tuned', 'cb_tuned']],
                       },
                       tuning_params = {'max_tuning_time': 1800},
                       reader_params = {'n_jobs': N_THREADS},
                       #configs_list = ['../input/lightautoml-configs/conf_1_sel_type_1.yml'],
                       max_runs_per_config=1
                       )
# fit_predict trains on train_data and returns predictions for those same
# rows; the first ten rows and the shape are printed as a sanity check.
oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

%%time

# Predict on `test_data`, the frame loaded in this section alongside
# `train_data`, rather than on `test` from section 2 (which had its `id`
# column removed and belongs to the earlier pipeline). This keeps the
# LightAutoML section self-contained.
test_pred = automl.predict(test_data)
print('Prediction for test set:\n{}\nShape = {}'.format(test_pred[:5], test_pred.shape))

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> Submission </h1>

submission.iloc[:, 1:] = test_pred.data
submission.to_csv('lightautoml_2lvl_8hours_with_nn_oofs.csv', index = False)

submission