# 1. Importing packages + data

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastai.tabular.all import *

In [None]:
# Read the competition's training and test CSVs (in that order) into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-feb-2022/train.csv",
        "/kaggle/input/tabular-playground-series-feb-2022/test.csv",
    )
)

# 2. Preliminary data

In [None]:
# Distinct class labels present in the target column.
train["target"].unique()

The target has 10 distinct labels, so this is a 10-class classification problem.

In [None]:
# Share of training rows per class (fractions rather than raw counts).
train["target"].value_counts(normalize=True)

Classes are balanced!

In [None]:
# Correlate the feature columns only: the label and the row identifier are dropped first.
train_df = train.drop(columns=['target', 'row_id'])
corr_matrix = train_df.corr()

# Draw the matrix into an explicitly sized figure and attach a colour scale.
fig = plt.figure(figsize=(12, 12))
plt.matshow(corr_matrix, fignum=fig.number)
colorbar = plt.colorbar()
colorbar.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

Many of the features are highly correlated with each other. This redundancy could hurt generalization, so feature selection is left for future work.

# 3. Loading data to fastai TabularDataLoaders

In [None]:
dep_var = 'target'   # column the model is trained to predict

# Every column except the row identifier and the target is a continuous feature.
# Selecting by name (Index.drop raises KeyError if a label is missing) does not
# depend on where those two columns happen to sit in the CSV.
cont_names = train.columns.drop(['row_id', dep_var]).tolist()
cat_names = []       # no categorical feature columns are declared
procs = [Normalize]  # Normalizing the numeric columns

path = "/kaggle/input/tabular-playground-series-feb-2022/"

In [None]:
# Hold out a random 20% of the training rows for validation. The seed is fixed
# so the same rows are held out on every fresh run of the notebook.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train))

# Build the preprocessed tabular dataset from the DataFrame already in memory
# (no second read of train.csv), reusing the preprocessing steps and target
# name declared in the configuration cell.
to = TabularPandas(train,
                   procs=procs,
                   cat_names=cat_names,
                   cont_names=cont_names,
                   y_names=dep_var,
                   y_block=CategoryBlock(),
                   splits=splits)

# Mini-batches of 64 rows for both the training and validation subsets.
dls = to.dataloaders(bs=64)

In [None]:
# Checking if data is loaded successfully
# by displaying a sample batch from the DataLoaders built above.
dls.show_batch()

# 4. Training fastai model

In [None]:
# Cross-entropy loss for the multi-class target; accuracy is the tracked metric.
loss_func = CrossEntropyLossFlat()
learn = tabular_learner(
    dls,
    loss_func=loss_func,
    metrics=accuracy,
)

In [None]:
# Finding the optimal learning rate
# (the suggested value is read off this cell's output and used for training below).
learn.lr_find()

In [None]:
# Maximum learning rate: the valley suggestion from the LR finder above, rounded.
# Raise the epoch count for a longer training run.
learn.fit_one_cycle(n_epoch=10, lr_max=0.0015)

With a different learning rate and more epochs, I have reached 0.95+ validation accuracy.

# 5.  Interpretation of training results

In [None]:
# Build an interpretation object from the trained learner and plot its
# confusion matrix to see which classes get mistaken for which.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix(figsize=(12,12), dpi=60)

- `coli`,`enterica` and `pyogenes` give a lot of misclassified predicted labels.
- Accuracy is a bad metric for assessing the leaderboard in the competition. It is possible that the leaderboard is being calculated on categories which are not misclassified a lot (`jejuni` and `aureus`).
- Two things would be important going forward:
    1. Validation scores
    2. The metric used to arrive at that validation score.

# 6. Making predictions on test set

In [None]:
# Reload the test split, keeping `test` intact because its row_id column is
# needed later for the submission file.
test = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2022/test.csv")

# Feature-only view of the test rows (identifier removed), wrapped in a test
# DataLoader derived from the learner's DataLoaders.
test_df = test.drop(columns=['row_id'])
dl = learn.dls.test_dl(test_df)

In [None]:
# Extracting the labels for the test examples
# Three values are unpacked; only the last one (`preds`) is used further down.
# NOTE(review): presumably these are class probabilities, targets (unused here,
# hence `dummy`) and decoded class indices -- confirm against the fastai docs.
preds_probs, dummy, preds  = learn.get_preds(dl=dl, with_decoded=True)

The code below maps each predicted class index to its label name.
For example, a predicted index of `0` is mapped to the first entry of `learn.dls.vocab`.

In [None]:
# One row per test example: its identifier and the predicted class index.
# np.asarray gives pandas a plain numeric column, so the index merges cleanly
# with the integer keys of the lookup table below.
new_df = pd.DataFrame(
    data = {
        'row_id': test.row_id,
        'target_value': np.asarray(preds)
    })

# Lookup table from class index to label name. It is sized from the vocab
# itself, so it stays correct if the number of classes changes.
class_labels = list(learn.dls.vocab)
dict_pd = pd.DataFrame(
    data = {
        'target_value': np.arange(len(class_labels)),
        'target': class_labels
    }
)

In [None]:
# Translate each predicted class index into its label, keep only the columns
# the submission needs, and order the rows by row_id.
submission = (
    new_df
    .merge(dict_pd, on='target_value')
    .drop(columns=['target_value'])
    .sort_values(by='row_id')
)
submission.head(5)

In [None]:
# Write the submission to submission.csv (relative path), omitting the DataFrame index.
submission.to_csv("submission.csv", index=False)

# Future work
- Use ROC AUC as an additional evaluation metric alongside accuracy
- Blend this model with other NNs or tree-based methods to improve the validation score
- Experiment with the loss function (the learner currently uses `CrossEntropyLossFlat`, a standard choice for multiclass classification)
- Add & engineer features

- I'm grateful that you spent your time reading/skimming all the way through. 
- Comments/suggestions/criticisms on the notebook would be highly appreciated.
- Check out my other work on [Kaggle](https://www.kaggle.com/rrrohit).