## It is from Futurama, isn't it?
![futurama](https://external-preview.redd.it/sSxBOd7IXnKScyKbp6WqAaGbKDvuYTUI7B8QmoB7ovU.jpg?auto=webp&s=42096f7841c12d798971cb241b7c9760ffd862e0)

In [None]:
import fastai

# Show which fastai release this notebook ran against
print(f"Fastai version : {fastai.__version__}")

In [None]:
import pandas as pd
import numpy as np

from fastai import *
from fastai.tabular.all import *

import seaborn as sns
from matplotlib import pyplot as plt
from IPython.display import Markdown, display
pd.options.mode.chained_assignment = None  # default='warn'

from sklearn.model_selection import StratifiedKFold, train_test_split

## Dataset

In [None]:
# Read the three competition CSVs: training rows, test rows and the
# sample submission file.
train, test, subm = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
        '../input/spaceship-titanic/sample_submission.csv',
    )
)

# (rows, columns) of the train and test frames
train.shape, test.shape

In [None]:
# Preview the first five rows of the training frame
train.head(n=5)

In [None]:
# Preview the first five rows of the test frame
test.head(n=5)

In [None]:
# Percentage of missing values per column, largest first
print("--Null distribution--")
null_pct = train.isnull().sum() * 100 / train.shape[0]
display(null_pct.sort_values(ascending=False).to_frame().T)

print('\n')

# Number of distinct values per column, largest first
print("--Number of Uniques--")
unique_counts = train.nunique()
display(unique_counts.sort_values(ascending=False).to_frame().T)

## Preprocessing

In [None]:
# Keep every column except the ones listed here
excluded_cols = ['PassengerId', 'Name', 'Cabin']
req_cols = train.columns.difference(excluded_cols).tolist()
print("Required features :", req_cols)

In [None]:
# 'Transported' is skipped when subsetting the test frame
feature_cols = [name for name in req_cols if name != "Transported"]
train = train[req_cols]
test = test[feature_cols]

In [None]:
# Split the feature names by dtype: object-typed columns are treated as
# categorical; every other column except 'Transported' as continuous.

cat_feat = train.select_dtypes(include="O").columns.tolist()
non_continuous = [*cat_feat, "Transported"]
cont_feat = train.columns.difference(non_continuous).tolist()

print("categorical data:")
display(train[cat_feat].head(2))

print("\ncontinuos data:")
display(train[cont_feat].head(2))

In [None]:
# Hold out a random 20% of the rows for validation (seeded for repeatability)
row_ids = range_of(train)
splits = RandomSplitter(valid_pct=0.2, seed=13)(row_ids)

# Preprocessing: encode categorical columns, fill missing values and
# normalize the continuous columns
preprocessing = [Categorify, FillMissing, Normalize]

# Wrap the frame together with the column roles, the target and the split
tp = TabularPandas(
    train,
    procs=preprocessing,
    cat_names=cat_feat,
    cont_names=cont_feat,
    y_names='Transported',
    y_block=CategoryBlock(),
    splits=splits,
)

tp.show(max_n=3)

In [None]:
# Report the strategy the preprocessing uses for missing values
print(f"filling strategy: {tp.fill_strategy}")

In [None]:
# Statistics stored by the Normalize step, shown as one row per statistic
norms = tp.procs.normalize
print("Mean and STD for each continuos col")
stat_rows = [norms.means, norms.stds]
cont_col_stat = pd.DataFrame(stat_rows, index=["mean", "std"])
cont_col_stat

In [None]:
# Compare the stored (encoded) form of the first row with its decoded form.
# label_encoding_dict holds, per categorical column, the o2i lookup of the
# Categorify step.
cat_encoder = tp.procs.categorify
label_encoding_dict = {col: cat_encoder[col].o2i for col in tp.cat_names}
row = tp.items.iloc[0]

print("encoded data")
display(tp.items.iloc[[0]])

print("\noriginal data - categorical values")
tp.decode_row(row).to_frame().T

In [None]:
# Build the train / validation TabDataLoaders with a shared batch size.
# Only the training loader shuffles and drops the last incomplete batch.
batch_size = 64
trn_dl = TabDataLoader(tp.train, bs=batch_size, shuffle=True, drop_last=True)
val_dl = TabDataLoader(tp.valid, bs=batch_size)

In [None]:
# Bundle the training and validation loaders into a single DataLoaders object
dls = DataLoaders(trn_dl, val_dl)

## Model and training

In [None]:
# Neural network with three hidden layers of 32, 16 and 8 neurons,
# tracking accuracy as the metric
hidden_sizes = [32, 16, 8]
learn = tabular_learner(dls, layers=hidden_sizes, metrics=accuracy)
learn.summary()

In [None]:
# List the embedding modules of the model under a heading line
print("size of embeddings", learn.embeds, sep="\n")

In [None]:
# Run the learning-rate finder to help choose an effective learning rate
learn.lr_find()

In [None]:
# fit_one_cycle is an approach that varies the learning rate while training
# the model; lr_max is the highest learning rate it uses.
learn.fit_one_cycle(n_epoch=20, lr_max=6e-2)

In [None]:
# Learning rates kept by the recorder, plotted against their position
lr_history = learn.recorder.lrs
lr_steps = range(len(lr_history))

plt.title("cyclic learning rate")
plt.xlabel("training step")
plt.ylabel("learning rate")
plt.plot(lr_steps, lr_history)

In [None]:
# Losses kept by the recorder, plotted against their position
loss_history = learn.recorder.losses
loss_steps = range(len(loss_history))

plt.title("Loss trend")
plt.xlabel("training step")
plt.ylabel("cross-entropy loss")
plt.plot(loss_steps, loss_history)

In [None]:
# Check a few results on the validation set.
# show_results is a Learner method, so call it on the learner itself instead
# of reaching it indirectly through the recorder callback.
learn.show_results(max_n=3)

In [None]:
# Sum of the embedding widths over all categorical embedding layers
# (second dimension of each embedding weight matrix).
concat_embed_size = sum(emb.weight.shape[1] for emb in learn.embeds)

# Count the continuous columns from the processed data instead of
# hard-coding the number, so the total stays correct when the selected
# features change.
n_cont_feat = len(tp.cont_names)

print("total size of concatenated categorical embeddings and continuos features equals ::",  concat_embed_size + n_cont_feat)

## Inference

In [None]:
# Make predictions: wrap the test frame like the training dataset and run
# the same preprocessing on it.
t = learn.dls.train_ds.new(test)
t.process()

dl = TabDataLoader(t)

# First element of get_preds holds the per-class outputs; take the index
# of the largest value in each row as the predicted class.
class_scores = learn.get_preds(dl=dl)[0]
preds = class_scores.argmax(dim=1).numpy()
preds[:5]

In [None]:
# Pair the PassengerId column of the sample submission with the predictions
# cast to booleans, then write the file.
submission_cols = {
    'PassengerId': subm['PassengerId'],
    'Transported': preds.astype(bool),
}
sub = pd.DataFrame(submission_cols)
sub.to_csv('submission.csv', index=False)
sub.head()