In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test tables; pandas reads the zipped CSVs directly.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ghouls-goblins-and-ghosts-boo/train.csv.zip",
        "../input/ghouls-goblins-and-ghosts-boo/test.csv.zip",
    )
)

In [None]:
# Preview the first rows of the training table (rendered as the cell's output).
df_train.head()

In [None]:
# Summary statistics of the training table, transposed so each described column is a row.
df_train.describe().T

In [None]:
# Same transposed summary for the test table, for side-by-side comparison with the training one.
df_test.describe().T

# Preprocessing for NN

In [None]:
# Drop the row identifier 'id' and the 'color' column (which the author considers noisy).
# The frames are rebound instead of mutated in place, and columns that are already
# gone are ignored, so re-running this cell no longer fails with a KeyError.
df_train = df_train.drop(columns=['id', 'color'], errors='ignore')
df_test = df_test.drop(columns=['id', 'color'], errors='ignore')

In [None]:
# OneHotEncode 'type'
from sklearn.preprocessing import OneHotEncoder

# Fit a one-hot encoder on the 'type' label column. The fitted `encoder` is kept
# because the submission step uses it to map predictions back to label values.
encoder = OneHotEncoder()
# fit_transform returns a SciPy sparse result with the default encoder settings;
# .toarray() is the documented way to densify it, whereas the `.A` shorthand is
# not available on every SciPy sparse container.
y = encoder.fit_transform(df_train['type'].to_numpy().reshape(-1, 1)).toarray()
# The features are every column except the label.
X = df_train.drop(columns=['type'])

In [None]:
# Preview the feature matrix that remains after removing the 'type' label column.
X.head()

# Network Initialization

In [None]:
import torch
import torch.nn.functional as F

from torch import nn  
from torch import optim
from torch.utils.data import DataLoader, TensorDataset  
from tqdm import tqdm  

In [None]:
class NN(nn.Module):
    """Small fully connected classifier with a single hidden layer.

    The forward pass is ``Linear -> ReLU -> BatchNorm1d -> Linear`` and returns
    raw class scores (logits); no softmax is applied here.

    Args:
        input_size: Number of features in each input row.
        num_classes: Number of scores produced for each row.
        hidden_size: Width of the hidden layer. Defaults to 5, the value that
            was previously hard-coded, so existing ``NN(input_size, num_classes)``
            calls build exactly the same architecture.

    Note:
        ``dr1`` is a ``BatchNorm1d`` layer despite its dropout-like name. The
        attribute name is kept so that ``state_dict`` keys do not change.
        In training mode PyTorch's BatchNorm1d needs more than one row per
        batch; switch to ``eval()`` before scoring single rows.
    """

    def __init__(self, input_size, num_classes, hidden_size=5):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

        # Normalises the hidden activations (applied after the ReLU in forward()).
        self.dr1 = nn.BatchNorm1d(hidden_size)

    def forward(self, x):
        """Return one row of ``num_classes`` scores for every row of ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = self.dr1(hidden)
        return self.fc2(hidden)

In [None]:
# Run configuration: every name below is a module-level global read by later cells.

# Model dimensions
input_size = 4   # in-features of the first Linear layer; presumably the number of columns in X (confirm)
num_classes = 3  # scores per sample; presumably one per distinct 'type' label (confirm)

# Optimisation settings
learning_rate = 0.1  # step size handed to the SGD optimizer
batch_size = 32      # samples per DataLoader batch
num_epochs = 75      # full passes over the training loader

# Everything (model and batches) is placed on the CPU.
device = torch.device("cpu")

> **Utility functions**

In [None]:
def check_accuracy(loader, model):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    Args:
        loader: Iterable of ``(features, one_hot_labels)`` batches. The true
            class of a row is the argmax of its one-hot label row.
        model: Module mapping a float batch to per-class scores.

    Returns:
        A 0-dim tensor holding ``num_correct / num_samples``.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.

    The model is evaluated in eval mode under ``torch.no_grad()``. Afterwards it
    is put back into whichever mode it was in on entry, rather than being forced
    into training mode. Batches are moved to the module-level ``device``.
    """
    was_training = model.training
    num_correct = 0
    num_samples = 0
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                # Flatten every sample to a vector and match the model's float dtype.
                x = x.to(device=device).float()
                x = x.reshape(x.shape[0], -1)
                labels = y.to(device=device).argmax(dim=1)

                predictions = model(x).argmax(dim=1)
                num_correct += (predictions == labels).sum()
                num_samples += predictions.size(0)
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)
    return num_correct / num_samples

In [None]:
def cross_entropy_one_hot(out, target):
    """Cross-entropy loss for targets given as one-hot rows.

    ``nn.CrossEntropyLoss`` is fed class indices here, so each target row is
    first reduced to the position of its largest entry.

    Args:
        out: Raw class scores, one row per sample.
        target: One-hot rows with the same leading dimension as ``out``.

    Returns:
        The loss tensor produced by ``nn.CrossEntropyLoss`` with default settings.
    """
    class_indices = target.max(dim=1).indices
    criterion = nn.CrossEntropyLoss()
    return criterion(out, class_indices)

In [None]:
def get_prediction(model, X):
    """Return the predicted class index for every row of ``X``.

    Args:
        model: Module mapping a float batch to per-class scores.
        X: Feature tensor; it is moved to the module-level ``device`` and cast
            to float before being passed to the model.

    Returns:
        NumPy array of argmax class indices (the trailing ``.T`` is a no-op for
        the usual 1-D result and is kept for compatibility).

    The forward pass runs in eval mode so that layers such as BatchNorm use
    their running statistics instead of normalising with, and learning from,
    the batch being scored. The model's previous mode is restored afterwards.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            scores = model(X.to(device=device).float())
            predictions = scores.argmax(dim=1).cpu().numpy()
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)
    return predictions.T

In [None]:
def from_prediction_to_onehot_encoding(Y, n_classes=None):
    """Convert class indices into one-hot rows.

    Args:
        Y: Sequence or array of integer class indices.
        n_classes: Width of each one-hot row. When omitted, the module-level
            ``num_classes`` is used, which matches the previous behaviour.

    Returns:
        Integer ndarray with one row per entry of ``Y``; each row is all zeros
        except for a 1 at the given index. An empty ``Y`` yields an array of
        shape ``(0, n_classes)``, so downstream 2-D consumers still work.

    Raises:
        IndexError: If an index is out of range for ``n_classes`` or the
            indices are not integers.
    """
    if n_classes is None:
        n_classes = num_classes
    indices = np.asarray(Y)
    if indices.size == 0:
        # An empty list becomes a float array by default, which cannot be used as an index.
        indices = indices.astype(np.intp)
    # Row i of the identity matrix is the one-hot vector for class i.
    return np.eye(n_classes, dtype=int)[indices]

# Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the labelled rows for evaluation; the fixed random_state
# makes the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=17,
)

In [None]:
# Feature frames -> float64 tensors. to_numpy(dtype=...) performs the conversion
# directly; the previous np.vstack(...[:, :]) round-trip rebuilt the same 2-D
# array row by row and raised on an empty frame. copy=True gives each tensor its
# own writable buffer instead of a possible view onto the DataFrame's data.
X_train_tensor = torch.from_numpy(X_train.to_numpy(dtype=np.float64, copy=True))
X_test_tensor = torch.from_numpy(X_test.to_numpy(dtype=np.float64, copy=True))

# The label arrays are already NumPy arrays, so they are wrapped as they are.
y_train_tensor = torch.from_numpy(y_train)
y_test_tensor = torch.from_numpy(y_test)

# Pair features with labels so a DataLoader can serve (x, y) batches.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [None]:
# Serve both splits in mini-batches of `batch_size`. No shuffle argument is
# passed, so batches come in dataset order on every pass.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
)

In [None]:
# Seed PyTorch's RNG before the layers are created so the random weight
# initialisation is the same after every kernel restart (17 mirrors the
# random_state used for the train/test split).
torch.manual_seed(17)
model = NN(input_size=input_size, num_classes=num_classes).to(device)
# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
# Train with mini-batch SGD and record the mean batch loss of every epoch in `losses`.
model = model.float()
# Enter training mode explicitly so the run does not depend on whatever mode an
# earlier cell (or a previous execution of this one) left the model in.
model.train()
losses = []
for epoch in range(num_epochs):
    local_losses = []
    # tqdm is kept but silenced; set disable=False to get a per-batch progress bar.
    for data, targets in tqdm(train_loader, disable=True):
        data = data.to(device=device)
        targets = targets.to(device=device)
        # Flatten each sample to a vector and cast to the float dtype of the model.
        data = data.reshape(data.shape[0], -1).float()
        targets = targets.float()

        scores = model(data)
        loss = cross_entropy_one_hot(scores, targets)
        # Clear gradients left over from the previous step before back-propagating.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        local_losses.append(loss.item())
    losses.append(np.mean(local_losses))

In [None]:
# Mean training loss per epoch. The x values follow the number of losses that
# were actually recorded, so the plot stays valid if num_epochs is edited
# without re-running the training cell. The trailing ';' hides the text repr.
sns.lineplot(x=range(len(losses)), y=losses).set(
    xlabel="Epoch",
    ylabel="Mean training loss",
    title="Training loss per epoch",
);

In [None]:
# Predicted class indices (argmax over the model's scores) for the held-out rows...
y_test_score = get_prediction(model, X_test_tensor)
# ...and for the training rows. Neither array is read by the later cells shown here.
y_train_score = get_prediction(model, X_train_tensor)

In [None]:
# Report accuracy as a percentage with two decimals, training split first.
train_accuracy_pct = check_accuracy(train_loader, model) * 100
print(f"Accuracy on training set: {train_accuracy_pct:.2f}")

test_accuracy_pct = check_accuracy(test_loader, model) * 100
print(f"Accuracy on test set: {test_accuracy_pct:.2f}")

# Create submission

In [None]:
# Score the competition rows in eval mode so BatchNorm normalises with its
# running statistics, whatever mode earlier cells left the model in.
model.eval()

# Direct float64 conversion; copy=True gives the tensor its own writable buffer.
X_submission_tensor = torch.from_numpy(df_test.to_numpy(dtype=np.float64, copy=True))

# Each stage gets its own name instead of overwriting `submission` three times.
submission_class_idx = get_prediction(model, X_submission_tensor)
submission_onehot = from_prediction_to_onehot_encoding(submission_class_idx)

# inverse_transform returns one label per row in a single column; take that
# column as a flat array of 'type' labels.
submission = encoder.inverse_transform(submission_onehot)[:, 0]

In [None]:
# Take the row ids from the sample submission file and pair them with the
# predicted labels.
# NOTE(review): this relies on the sample submission listing ids in the same
# order as the test rows that were scored. Confirm.
sample_submission = pd.read_csv("../input/ghouls-goblins-and-ghosts-boo/sample_submission.csv.zip")
ids = sample_submission['id']

predictions_frame = pd.DataFrame({"id": ids, "type": submission})
# 'id' becomes the index so the CSV has exactly the two columns id,type.
predictions_frame.set_index("id").to_csv("predictions.csv")