# Data and Library Imports 📚

Let's start this interesting competition by first importing data and libraries!

In [None]:
! pip install -q rich dabl

In [None]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from rich import print as _pprint
from rich.progress import track
from colorama import Fore, Style

import dabl

import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from IPython.display import HTML
import tensorflow as tf

# Let pandas render up to 500 columns / rows before truncating the display.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Inject the notebook's custom stylesheet. The context manager closes the file
# handle as soon as the CSS text has been read instead of leaving it open.
with open("../input/notebookassets/blue.css") as css_file:
    f = css_file.read()
HTML(f"<style>{f}</style>")

In [None]:
def cprint(string):
    """Print `string` through rich's print, wrapped in a ``[black]`` markup tag."""
    markup = f"[black]{string}[/black]"
    _pprint(markup)
    
def cout(string: str, color=Fore.RED):
    """
    Print `string` prefixed with the colorama code `color` and followed by a
    style reset, so the colour does not bleed into later output.
    """
    message = color + string + Style.RESET_ALL
    print(message)

def stats(scol, col):
    """
    Print colour-coded summary statistics for one numeric column.

    NOTE: this function shadows the `stats` module imported from scipy at the
    top of the notebook; the name is kept so existing calls keep working.

    Parameters
    ----------
    scol : str
        Column name, used only inside the printed messages.
    col : array-like
        Values to summarise; passed straight to the numpy reducers.
    """
    cout(f"Average Value in the Column: {scol} is: {np.mean(col):.4f}", Fore.RED)
    cout(f"Median Value in the Column: {scol} is: {np.median(col):.4f}", Fore.BLUE)
    cout(f"Maximum Value in the Column: {scol} is: {np.max(col):.4f}", Fore.GREEN)
    cout(f"Minimum Value in the Column: {scol} is: {np.min(col):.4f}", Fore.YELLOW)
    # The median above already is the 50th quantile, so report the 25th here
    # to pair with the 75th instead of printing the same number twice.
    cout(f"25th Quantile of the Column: {scol} is: {np.quantile(col, 0.25):.4f}", Fore.CYAN)
    cout(f"75th Quantile of the Column: {scol} is: {np.quantile(col, 0.75):.4f}", Fore.MAGENTA)

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_file, test_file = (
    pd.read_csv("../input/tabular-playground-series-mar-2021/train.csv"),
    pd.read_csv("../input/tabular-playground-series-mar-2021/test.csv"),
)

# Preview the first rows of the training table as the cell output.
train_file.head()

# Peeking the Data 👁

Let's start by taking a peek at the data and getting some basic insights from it!
<!-- <div class="alert alert-block alert-info"></div> -->

In [None]:
# Summary statistics of the training table, rendered as the cell output.
train_file.describe()

## A brief look at Target Column

<div class="alert alert-block alert-info">
This Competition (unlike its predecessors) wants us to classify the data points into either of 2 categories (0 or 1).
As you can probably see from below, the dataset is not very balanced when looking from the Target Column's perspective.
</div>

In [None]:
# Count the rows per target class once, then split into slice labels and sizes.
target_counts = train_file['target'].value_counts()
names = target_counts.index.tolist()
values = target_counts.tolist()

# Pie chart of the class balance; the second slice is pulled out slightly.
plt.figure(figsize=(9, 9))
plt.pie(
    x=values,
    labels=names,
    autopct="%1.2f%%",
    colors=["cyan", "blue"],
    explode=[0, 0.05],
)
plt.title("Target Value Pie-Chart")
plt.show()

In [None]:
# Names of the categorical (cat0..cat18) and continuous (cont0..cont10) columns
catCols = ["cat" + str(i) for i in range(19)]
conCols = ["cont" + str(i) for i in range(11)]

In [None]:
# Show the number of unique categories in each categorical feature column
cprint("[bold magenta] Categorical Features in the dataset [/bold magenta]")
for col in catCols:
    # The label is printed without a newline so the coloured count follows on
    # the same line. The printed number is `nunique()` of the column, i.e. a
    # count of distinct categories (the old label called them "features").
    print(f"Number of unique categories in {col}: ", end='')
    cout(f"{train_file[col].nunique()}", color=Fore.MAGENTA)

In [None]:
# # Show some quick stats of Continuous Features
# cprint("[bold green] Continuous Features and their Basic Statistics [/bold green]")
# for col in conCols:
#     print(f"\n{'-'*20} Column: {col} {'-'*20}\n")
#     stats(col, train_file[col])

# Advanced Exploratory Data Analysis 📊

Let's now start with detailed Exploratory Data Analysis and take a look at different combinations of features, one by one!

## Continuous Features

Let's begin by seeing this month's data's continuous features and how they are distributed.

In [None]:
# One KDE panel per continuous feature on a 3x4 grid; with 11 features the
# last panel stays empty and is removed below.
fig, ax = plt.subplots(nrows=3, ncols=4, figsize=(15, 15))
fig.suptitle('Distribution of Continuous Features (cont0-cont10)', fontsize=16)

for idx, col in enumerate(conCols):
    i, j = divmod(idx, 4)
    # `fill` is the current name of the deprecated `shade` keyword of kdeplot.
    sns.kdeplot(train_file[col], color="blue", fill=True, label="%1.1f" % (train_file[col].skew()), ax=ax[i, j])
    # The curve's label holds the column's skewness; without a legend it was
    # never shown on the figure.
    ax[i, j].legend(title="Skew", loc="best")

fig.delaxes(ax[2, 3])
plt.tight_layout()
plt.show()

Let's now see how Continuous features behave based on what target variable they represent.

In [None]:
# KDE of every continuous feature, drawn separately for target 0 and target 1
# on the same panel (3x4 grid, last panel unused).
fig, ax = plt.subplots(nrows=3, ncols=4, figsize=(15, 15))
fig.suptitle('Distribution of Continuous Features with Targets (cont0-cont10)', fontsize=16)

# Split the rows by class once instead of re-filtering the frame per column.
target_as_int = train_file['target'].astype(int)
target_1_rows = train_file.loc[target_as_int == 1]
target_0_rows = train_file.loc[target_as_int == 0]

for idx, col in enumerate(conCols):
    i, j = divmod(idx, 4)
    current_plot_1 = target_1_rows[col]
    current_plot_0 = target_0_rows[col]
    # `fill` is the current name of the deprecated `shade` keyword of kdeplot.
    sns.kdeplot(current_plot_0, color="blue", fill=True, ax=ax[i, j], label="Target-0")
    sns.kdeplot(current_plot_1, color="purple", fill=True, ax=ax[i, j], label="Target-1")
    # Both curves live on the same axes, so a single legend call covers them.
    ax[i, j].legend(loc="best")

fig.delaxes(ax[2, 3])
plt.tight_layout()
plt.show()

## Categorical Features

Let's now take a look at the Categorical features in this Dataset and the distribution of their various categories.

In [None]:
# Histogram of each selected categorical feature on a 4x4 grid. Fifteen
# columns are plotted, so the sixteenth panel is removed afterwards.
fig, ax = plt.subplots(nrows=4, ncols=4, figsize=(15, 15))
fig.suptitle('Distribution of Categorical Features (cat0-cat18)', fontsize=16)

catCols_s = [
    'cat0', 'cat1', 'cat2', 'cat3', 'cat4',
    'cat6', 'cat9', 'cat11', 'cat12', 'cat13',
    'cat14', 'cat15', 'cat16', 'cat17', 'cat18',
]
# Walk the panels in row-major order, pairing each with one column name.
for col, panel in zip(catCols_s, ax.ravel()):
    sns.histplot(x=col, data=train_file, ax=panel, color='green')

fig.delaxes(ax[3, 3])
plt.tight_layout()
plt.show()

Just like the Continuous Features, let's see how the selected categorical features change with Targets. 

In [None]:
# Histogram of each selected categorical feature, drawn once for target 0 and
# once for target 1 on the same panel (4x4 grid, last panel unused).
fig, ax = plt.subplots(nrows=4, ncols=4, figsize=(15, 15))
fig.suptitle('Distribution of Categorical Features with respect to Targets(cat0-cat18)', fontsize=16)

catCols_s = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat6', 'cat9', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18']

# Filter the frame by class once; the original re-filtered it for every column.
target_as_int = train_file['target'].astype(int)
target_0_rows = train_file[target_as_int == 0]
target_1_rows = train_file[target_as_int == 1]

for idx, col in enumerate(catCols_s):
    i, j = divmod(idx, 4)
    sns.histplot(x=col, data=target_0_rows, ax=ax[i, j], color='green', label='Target-0')
    sns.histplot(x=col, data=target_1_rows, ax=ax[i, j], color='yellow', label='Target-1')
    # Both histograms share the axes, so a single legend call covers them.
    ax[i, j].legend(loc="best")

fig.delaxes(ax[3, 3])
plt.tight_layout()
plt.show()

<div class="alert alert-block alert-info">
I have not plotted the following Categorical Columns in the Above Plot:
    
<code>cat5</code>,
<code>cat7</code>,
<code>cat8</code>,
<code>cat10</code>.

The Reason behind this was that since there were so many categories, it wouldn't fit into subplots.
We shall plot them separately down below.
</div>

## A closer look at some Categorical features.

Let's now take a look at the aforementioned 4 categorical features, which have too many categories to fit in the subplot grid.

### Feature: cat5

Since the feature `cat5` has *over 80* different categories, I will be plotting only the top-10 most occurring ones.

In [None]:
# Keep only the rows whose cat5 value is among the ten most frequent ones.
ixs = train_file['cat5'].value_counts().index[:10].tolist()
cat5_fl = train_file.loc[train_file['cat5'].isin(ixs), 'cat5']

plt.figure(figsize=(8, 7))
ax = sns.histplot(cat5_fl, color='orange')
ax.set(xlabel="Category", ylabel="Count", title="Top-10 Categories of feature: cat5")

# Label every bar with its share of ALL rows (not only the top-10 subset).
total = train_file.shape[0]
for bar in ax.patches:
    share = 100 * bar.get_height() / total
    anchor = (
        bar.get_x() + bar.get_width() / 5,
        bar.get_y() + bar.get_height() + 0.02,
    )
    ax.annotate('{:.2f}%'.format(share), anchor)

plt.show()

As you can see above, the categories `BI`, `AB` and `BU` dominate the count for this feature.

### Feature: cat7

Same as cat5, I am only showing the top-10 most occurring values here.

In [None]:
# Keep only the rows whose cat7 value is among the ten most frequent ones.
ixs = train_file['cat7'].value_counts().index[:10].tolist()
cat7_fl = train_file.loc[train_file['cat7'].isin(ixs), 'cat7'].reset_index(drop=True)

plt.figure(figsize=(8, 7))
ax = sns.histplot(cat7_fl, color='orange')
ax.set(xlabel="Category", ylabel="Count", title="Top-10 Categories of feature: cat7")

# Label every bar with its share of ALL rows (not only the top-10 subset).
total = train_file.shape[0]
for bar in ax.patches:
    share = 100 * bar.get_height() / total
    anchor = (
        bar.get_x() + bar.get_width() / 7,
        bar.get_y() + bar.get_height() + 0.02,
    )
    ax.annotate('{:.2f}%'.format(share), anchor)

plt.show()

### Feature: cat8

Same as cat5 and cat7, I am only showing the top-10 most occurring values here.

In [None]:
# Keep only the rows whose cat8 value is among the ten most frequent ones.
ixs = train_file['cat8'].value_counts().index[:10].tolist()
cat8_fl = train_file.loc[train_file['cat8'].isin(ixs), 'cat8'].reset_index(drop=True)

plt.figure(figsize=(8, 7))
ax = sns.histplot(cat8_fl, color='orange')
ax.set(xlabel="Category", ylabel="Count", title="Top-10 Categories of feature: cat8")

# Label every bar with its share of ALL rows (not only the top-10 subset).
total = train_file.shape[0]
for bar in ax.patches:
    share = 100 * bar.get_height() / total
    anchor = (
        bar.get_x() + bar.get_width() / 7,
        bar.get_y() + bar.get_height() + 0.02,
    )
    ax.annotate('{:.2f}%'.format(share), anchor)

plt.show()

### Feature: cat10

Same as cat5, cat7 and cat8, I am only showing the top-10 most occurring values here.

In [None]:
# Keep only the rows whose cat10 value is among the ten most frequent ones.
ixs = train_file['cat10'].value_counts().index[:10].tolist()
cat10_fl = train_file.loc[train_file['cat10'].isin(ixs), 'cat10'].reset_index(drop=True)

plt.figure(figsize=(8, 7))
ax = sns.histplot(cat10_fl, color='orange')
ax.set(xlabel="Category", ylabel="Count", title="Top-10 Categories of feature: cat10")

# Label every bar with its share of ALL rows (not only the top-10 subset).
total = train_file.shape[0]
for bar in ax.patches:
    share = 100 * bar.get_height() / total
    anchor = (
        bar.get_x() + bar.get_width() / 7,
        bar.get_y() + bar.get_height() + 0.02,
    )
    ax.annotate('{:.2f}%'.format(share), anchor)

plt.show()

## Correlation Between Continuous features and Target

Let us now plot the correlation between continuous features and the target and see how things change.

In [None]:
# Pairwise correlation matrix of the continuous columns plus the target.
corr = train_file[[*conCols, "target"]].corr()

# Render the matrix as an interactive plotly heatmap.
fig = px.imshow(
    corr,
    title="Correlation Heatmap Between Continuous Features and Target",
    color_continuous_scale=px.colors.sequential.Hot_r,
)
fig.show()

# DABL Plot

In [None]:
# Hand the whole training frame to dabl's plot helper, with 'target' as the target column.
dabl.plot(train_file, target_col='target')

# Multi Input Embeddings Model - Tensorflow 🌆

Let's start modelling by using one of the model types that I experimented with in the last TPS Competition (February).

For Categorical features, I am using Entity Embeddings for every feature then concatenating them and then passing them to a few final layers where they are combined with the features learned from the Dense network built for continuous data.

First get the TPU for faster training.

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set.
    # On Kaggle this is always the case.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised, so continue without a TPU.
    tpu = None

if not tpu:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the resolved TPU, initialise it and build a TPU strategy on top.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Then we encode categorical columns in both training and testing sets to make them ready for modelling.

In [None]:
# Encode the categorical features
def encodeCategoricalColumns(train, test):
    """
    Label-encode every column named in `catCols`, in place, on both frames.

    For each column one LabelEncoder is fitted on the train and test values
    together, then used to overwrite that column in `train` and in `test`.
    Nothing is returned; the two frames passed in are modified.
    """
    for col in track(catCols, description="[red]Encoding...[/red]"):
        encoder = LabelEncoder().fit(pd.concat([train[col], test[col]]))
        train[col], test[col] = encoder.transform(train[col]), encoder.transform(test[col])


# Apply the encoding to the loaded frames (this mutates train_file and test_file).
encodeCategoricalColumns(train_file, test_file)

In [None]:
# Shuffle the data and then split features and targets
# A fixed seed makes the shuffle repeatable; the KFold below does not shuffle,
# so the fold membership is decided entirely by this row order.
SEED = 42
train_csv = train_file.sample(frac=1, random_state=SEED).reset_index(drop=True)
data = train_csv.drop(['id', 'target'], axis=1)
target = train_csv[['target']]
# One-hot encode the target column for the softmax / categorical-crossentropy model.
target_train = tf.keras.utils.to_categorical(target)

print(f"Data Shape: {data.shape}, Target shape: {target_train.shape}")

In [None]:
# Function to get the number of input & output dimensions for the embeddings 
def get_emb_dims(categorical_column):
    """
    Return ``(in_dim, out_dim)`` for an embedding of `categorical_column`.

    `in_dim` is the number of distinct values in the column and `out_dim` is
    the square root of that number, truncated to an int.
    """
    n_categories = categorical_column.nunique()
    return n_categories, int(np.sqrt(n_categories))

# for c in catCols:
#     ind, oud = get_emb_dims(data[c])
#     print(f"{c}: in-{ind}, out-{oud}")

In [None]:
def build_categorical_model(cat_data=data[catCols]):
    """
    Build the categorical branch: one entity embedding per column cat0..cat18,
    concatenated and passed through a BatchNorm / Dropout / Dense head.

    Parameters
    ----------
    cat_data : DataFrame, optional
        Not read by this function; kept so existing calls keep working.

    Returns
    -------
    (input_layers, cat_output)
        `input_layers` is the list of 19 single-value Keras inputs, in column
        order cat0..cat18; `cat_output` is the 64-unit output tensor of the
        branch.
    """
    # (input_dim, output_dim) of the embedding for each column, in order.
    # NOTE(review): these sizes are hard-coded; every encoded label fed to an
    # embedding must be < its input_dim - confirm against the label encoding
    # of both train and test.
    embedding_dims = [
        (2, 1),     # cat0
        (15, 3),    # cat1
        (19, 4),    # cat2
        (13, 3),    # cat3
        (20, 4),    # cat4
        (84, 9),    # cat5
        (16, 4),    # cat6
        (51, 7),    # cat7
        (61, 7),    # cat8
        (19, 4),    # cat9
        (299, 17),  # cat10
        (2, 1),     # cat11
        (2, 1),     # cat12
        (2, 1),     # cat13
        (2, 1),     # cat14
        (4, 2),     # cat15
        (4, 2),     # cat16
        (4, 2),     # cat17
        (4, 2),     # cat18
    ]

    input_layers = []
    output_layers = []
    for idx, (in_dim, out_dim) in enumerate(embedding_dims):
        col_inp = tf.keras.Input(shape=(1,))
        col_out = tf.keras.layers.Embedding(input_dim=in_dim, output_dim=out_dim, name=f"cat{idx}")(col_inp)
        # Reshape the embedding output to a flat (out_dim,) vector per sample.
        col_out = tf.keras.layers.Reshape(target_shape=(out_dim,))(col_out)
        input_layers.append(col_inp)
        output_layers.append(col_out)

    # Concatenate all entity embedding layers.
    concat = tf.keras.layers.Concatenate(name="combine-embd")(output_layers)

    # Add Dense, Dropout and Batchnorm
    bn = tf.keras.layers.BatchNormalization(name="embd-bn")(concat)
    dropout_1 = tf.keras.layers.Dropout(0.3, name="embd-drop1")(bn)
    fc1 = tf.keras.layers.Dense(1024, activation='relu', name="embd-fc1")(dropout_1)
    dropout_2 = tf.keras.layers.Dropout(0.5, name="embd-drop2")(fc1)
    fc2 = tf.keras.layers.Dense(512, activation='relu', name="embd-fc2")(dropout_2)
    dropout_3 = tf.keras.layers.Dropout(0.5, name="embd-drop3")(fc2)
    cat_output = tf.keras.layers.Dense(64, activation='relu', name="embd-out")(dropout_3)

    # Return the Last Layer output
    return input_layers, cat_output

def build_continuous_model(con_data=data[conCols]):
    """
    Build the continuous branch: a Dense / Dropout stack over all continuous
    columns at once.

    Parameters
    ----------
    con_data : DataFrame
        Only its number of columns (`shape[1]`) is used, to size the input.

    Returns
    -------
    (con_inputs, con_output)
        The Keras input tensor and the 64-unit output tensor of the branch.
    """
    # Model for Continuous Data
    # `shape` must be a tuple: `(n)` is just the int n, `(n,)` is the 1-tuple.
    con_inputs = tf.keras.Input(shape=(con_data.shape[1],))

    fc1 = tf.keras.layers.Dense(512, activation='relu', name="cont_fc1")(con_inputs)
    drp1 = tf.keras.layers.Dropout(0.5, name="cont_drop1")(fc1)
    fc2 = tf.keras.layers.Dense(256, activation='relu', name="cont_fc2")(drp1)
    drp2 = tf.keras.layers.Dropout(0.3, name="cont_drop2")(fc2)
    fc3 = tf.keras.layers.Dense(128, activation='relu', name="cont_fc3")(drp2)
    drp3 = tf.keras.layers.Dropout(0.3, name="cont_drop3")(fc3)
    con_output = tf.keras.layers.Dense(64, activation='relu', name="cont_out")(drp3)

    return con_inputs, con_output

In [None]:
def build_model():
    """
    Assemble the full multi-input model.

    The categorical and continuous branches are built from the global `data`
    frame, their outputs are concatenated and passed through a Dense / Dropout
    head that ends in a 2-unit softmax. The model's inputs are the categorical
    inputs followed by the single continuous input.
    """
    cat_inputs, cat_outputs = build_categorical_model(data[catCols])
    con_inputs, con_outputs = build_continuous_model(data[conCols])

    # Concatenate the outputs
    hidden = tf.keras.layers.Concatenate()([cat_outputs, con_outputs])

    # (dense name, units, dropout name, dropout rate) for the first three stages.
    head_spec = (
        ("all_fc1", 256, "all_drop1", 0.3),
        ("all_fc2", 128, "all_drop2", 0.3),
        ("all_fc3", 64, "all_drop3", 0.5),
    )
    for dense_name, units, drop_name, rate in head_spec:
        hidden = tf.keras.layers.Dense(units, activation='relu', name=dense_name)(hidden)
        hidden = tf.keras.layers.Dropout(rate, name=drop_name)(hidden)

    hidden = tf.keras.layers.Dense(16, activation='relu', name="all_fc4")(hidden)
    output = tf.keras.layers.Dense(2, activation='softmax', name="all_out")(hidden)

    return tf.keras.Model(inputs=cat_inputs + [con_inputs], outputs=output)

Use the above defined functions to build this huge model.

In [None]:
# Initialize a temporary model so that we can see the model architecture plot.
# This instance is only used for the diagram; the training loop builds a new model per fold.
temp_model = build_model()
# Last expression of the cell, so the returned plot is shown as the cell output.
tf.keras.utils.plot_model(temp_model)

Now, let's train this model and see how it performs!

In [None]:
# Init KFolds training
# No shuffling here: the rows of `data` were already shuffled when it was built.
kfold = KFold(5, shuffle=False)

for fold_, (train_idx, valid_idx) in enumerate(kfold.split(data, target)):
    print(f"{'='*20} FOLD: {fold_+1} {'='*20}")
    # Get this fold's training and validation splits
    trainX, trainY = data.iloc[train_idx], target_train[train_idx]
    validX, validY = data.iloc[valid_idx], target_train[valid_idx]

    # The model expects one input per categorical column followed by a single
    # input holding all continuous columns.
    training_data = ([trainX[f"cat{i}"] for i in range(0, 19)] + [trainX[conCols]], trainY)
    validation_data = ([validX[f"cat{i}"] for i in range(0, 19)] + [validX[conCols]], validY)

    # Initalize this fold's model
    with strategy.scope():
        model = build_model()
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # The metric is compiled as 'accuracy', so its validation value is logged
    # as 'val_accuracy'. Monitoring 'val_acc' never finds a value, which
    # disables both early stopping and best-only checkpointing.

    # Init early stopping so that we don't train the model more than we have to
    early_stopper = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=2,
        mode='max',
        restore_best_weights=True
    )

    # Also have a model checkpoint
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=f"fold_{fold_}_model.h5",
        monitor='val_accuracy',
        mode='max',
        save_best_only=True
    )

    # Fit this fold's model
    model.fit(
        training_data[0],
        training_data[1],
        epochs=5,
        validation_data=validation_data,
        callbacks=[early_stopper, checkpoint]
    )

<div class="alert alert-block alert-info">
Keep in mind that I am training this model for at most 5 epochs per fold (the reason being its high time consumption).
I encourage you to fork the notebook (if you do, leaving an upvote would be nice 😁) and then increasing the number of folds and playing around with other numbers to see how the results change!
</div>

# Inference 🌌

Now let's do quick inference based on the model(s) trained above.

In [None]:
# Get the testing data in the right shape for inference
test_ids = test_file['id']
test = test_file.drop(columns=['id'])

# Same input layout as in training: one entry per categorical column
# (cat0..cat18) followed by one frame with all continuous columns.
test_data = [test["cat" + str(i)] for i in range(19)]
test_data.append(test[conCols])

In [None]:
# Do the inference
# `model` is whatever the training loop bound last, i.e. the final fold's model.
class_probabilities = model.predict(test_data)
# Pick the class with the highest predicted probability for every row.
preds = np.argmax(class_probabilities, axis=1)
print(preds)  # Printing some predictions for sanity check

In [None]:
# Save the inference into a submission file
submission_file = pd.DataFrame({
    'id': test_ids.tolist(),
    'target': preds,
})
# Write only the two columns; the frame's index is left out of the CSV.
submission_file.to_csv("submission.csv", index=False)
submission_file.head()

<div class="alert alert-block alert-info">
    Thanks for taking your time to look at my work!
    <br><br>
    <b>If you like this notebook, you can consider giving an upvote! It will help me make more useful content!</b>
</div>

In [None]:
# Status banner for readers, printed through cprint with rich "bold green" markup.
cprint("[bold green]UNDER WORK! MORE CELLS ARE BEING ADDED DAILY![/bold green]")