# Before

In [None]:
from IPython.display import clear_output
# Install Lightning Flash with its "tabular" extras from the current tip of the
# project's master branch.
# NOTE(review): this install is unpinned, so re-running the notebook later may
# pull a different Flash version - consider pinning a release or commit.
! pip install -q 'https://github.com/PyTorchLightning/lightning-flash/archive/refs/heads/master.zip#egg=lightning-flash[tabular]'
# Force-reinstall matplotlib at exactly 3.1.1 together with pandas (pandas is
# left unpinned).
! pip install -q "matplotlib==3.1.1" "pandas" --force-reinstall
# Clear this cell's output once both installs have finished.
clear_output()

import numpy as np
import pandas as pd
import os,random

from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import accuracy_score

import torch
import flash
from flash.tabular import TabularClassificationData, TabularClassifier

# Competition input files: the train/test tables are loaded with pd.read_csv,
# and the sample submission is used as the template for the final submission.
TRAIN_PATH = "../input/spaceship-titanic/train.csv"
TEST_PATH = "../input/spaceship-titanic/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/spaceship-titanic/sample_submission.csv"
# Where the finished submission CSV is written.
SUBMISSION_PATH = "submission.csv"

# Cleaned copies of the train/test tables; they are written to disk after
# preprocessing and then handed to Flash's from_csv loaders by path.
NEW_TRAIN_PATH = "new_train.csv"
NEW_TEST_PATH = "new_test.csv"

# Names of the identifier column and the label column. Both are excluded from
# the feature lists built later in the notebook.
ID = "PassengerId"
TARGET = "Transported"

# Columns dropped from both tables during preprocessing.
DELETE_COL = ["Name"]
# Columns cast to int during preprocessing.
BOOL_COL = ["CryoSleep","VIP"]

SEED = 2022


def seed_everything(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), and configures cuDNN for reproducible
    kernel selection.

    Args:
        seed: Integer seed applied to all generators. Defaults to ``SEED``.

    Returns:
        None.
    """
    random.seed(seed)
    # The running interpreter fixed its hash seed at start-up, so this only
    # takes effect in subprocesses launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one: the trainer may be given every
    # visible device. The call is safe when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may choose different convolution algorithms from run to run,
    # which would defeat the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything()

# Passed as val_split when the training datamodule is built
# (presumably the fraction of training rows held out for validation - confirm).
VAL_SPLIT = 0.2
# Batch size of the training datamodule.
BATCH_SIZE = 16

# Backbone name handed to TabularClassifier.from_data.
MODEL_NAME = "fttransformer"
# Upper bound on training epochs given to flash.Trainer.
MAX_EPOCHS = 5

# Passed as the output= argument of trainer.predict.
# NOTE(review): despite the name, "classes" looks like an output format rather
# than a folder - confirm and consider renaming.
OUTPUT_FOLDER = "classes"
# File the trained model checkpoint is saved to.
MODEL_SAVE_PATH = "tabular_classification_model.pt"

# Preprocessing

In [None]:
# Load the raw competition tables from the configured CSV locations.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))
    
def checkNull_fillData(df):
    """Fill the missing values of ``df`` in place, column by column.

    Numeric (non-boolean) columns are filled with their median; every other
    column is filled with its most frequent value. Columns without missing
    values are left untouched, and so are columns that offer nothing to
    derive a fill value from (every entry missing).

    Args:
        df: pandas DataFrame to modify in place.

    Returns:
        None; ``df`` itself is mutated.
    """
    for col in df.columns:
        missing = df[col].isnull()
        if not missing.any():
            continue

        # Any numeric dtype qualifies for the median (not only float64/int64);
        # booleans are treated as categories and get the mode instead.
        is_numeric = (
            pd.api.types.is_numeric_dtype(df[col])
            and not pd.api.types.is_bool_dtype(df[col])
        )
        if is_numeric:
            fill_value = df[col].median()
            if pd.isna(fill_value):
                # Column is entirely missing: there is no median to use.
                continue
        else:
            modes = df[col].mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to use.
                continue
            fill_value = modes.iloc[0]

        df.loc[missing, col] = fill_value

# Impute missing values; each frame is filled in place from its own columns.
checkNull_fillData(train)
checkNull_fillData(test)

# Remove the columns listed in DELETE_COL from both frames.
train = train.drop(columns=DELETE_COL)
test = test.drop(columns=DELETE_COL)

# Cast the label (train only) and the boolean feature columns to integers.
train = train.astype({name: int for name in [TARGET, *BOOL_COL]})
test = test.astype({name: int for name in BOOL_COL})

In [None]:
# train,test = toLabelEncode(train,test)
# train,test = autoPreprocessDevide(train,test,TARGET)
    
# Persist the cleaned frames; the Flash datamodules load them back by path.
train.to_csv(NEW_TRAIN_PATH, index=False)
test.to_csv(NEW_TEST_PATH, index=False)

# Split the columns by dtype. select_dtypes only inspects dtypes, so no summary
# statistics are computed and an empty list (rather than an exception) results
# when no column matches.
str_col = train.select_dtypes(include="object").columns.tolist()
num_col = train.select_dtypes(exclude="object").columns.tolist()

# The identifier and the label are not model features.
STR_COL = [col for col in str_col if col not in (ID, TARGET)]
NUM_COL = [col for col in num_col if col not in (ID, TARGET)]

print("STR_COL = ",STR_COL)
print("NUM_COL = ",NUM_COL)

# Build 

In [None]:
# prepare data: build the Flash datamodule from the cleaned training CSV,
# declaring which columns are categorical features, numerical features and
# the target; validation split and batch size come from the config constants
train_data = TabularClassificationData.from_csv(
    categorical_fields=STR_COL,
    numerical_fields=NUM_COL,
    target_fields=TARGET,
    train_file=NEW_TRAIN_PATH,
    val_split=VAL_SPLIT,
    batch_size=BATCH_SIZE,
)

# define model: classifier configured from the datamodule, using the backbone
# named by MODEL_NAME
model = TabularClassifier.from_data(train_data, backbone=MODEL_NAME)

# train model: fit for at most MAX_EPOCHS epochs; gpus is set to the number of
# CUDA devices torch reports (0 when none are available)
trainer = flash.Trainer(max_epochs=MAX_EPOCHS, gpus=torch.cuda.device_count())
trainer.fit(model, datamodule=train_data)

# After

In [None]:
# Build a predict-only datamodule over the cleaned test CSV, reusing the
# parameters of the training datamodule, then run inference.
test_data = TabularClassificationData.from_csv(
    predict_file=NEW_TEST_PATH,
    parameters=train_data.parameters,
    batch_size=1,
)
pred_test = trainer.predict(model, datamodule=test_data, output=OUTPUT_FOLDER)

# Persist the trained model.
trainer.save_checkpoint(MODEL_SAVE_PATH)

# pred_test is a sequence of per-batch sequences; flatten it to one entry per row.
pred_result = [prediction for batch in pred_test for prediction in batch]

# Drop the predictions into the sample-submission template as booleans.
sub = (
    pd.read_csv(SAMPLE_SUBMISSION_PATH)
    .assign(**{TARGET: pred_result})
    .astype({TARGET: bool})
)
sub.to_csv(SUBMISSION_PATH, index=False)
sub.head()