In [None]:
# Reference: https://stackabuse.com/introduction-to-pytorch-for-classification/
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the training table from 'train.csv' (path is relative to the kernel's
# working directory) into a pandas DataFrame.
dataset = pd.read_csv('train.csv')
# Bare last expression: the cell displays the DataFrame's column index.
dataset.columns

In [None]:
# Column roles: categorical columns are integer-encoded (for embedding lookup),
# numerical columns are used as floats, and `output_columns` names the label.
categorical_columns = ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'sex','native-country']
numerical_columns = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
output_columns = ['exceeds50K']

# Convert to pandas 'category' dtype so each value maps to a stable integer code.
for category in categorical_columns:
    dataset[category] = dataset[category].astype('category')


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One integer code column per categorical feature -> array of shape (rows, n_categorical).
categorical_codes = np.stack([dataset[cat].cat.codes.values for cat in categorical_columns], 1)
# pandas encodes missing values as code -1, which is not a valid nn.Embedding
# index; fail here with a readable message instead of deep inside the model.
if (categorical_codes < 0).any():
    columns_with_missing = [cat for cat in categorical_columns if (dataset[cat].cat.codes < 0).any()]
    raise ValueError(f'Missing values found in categorical columns: {columns_with_missing}')
categorical_data = torch.tensor(categorical_codes, dtype=torch.int64).to(device)

numerical_data = np.stack([dataset[col].values for col in numerical_columns], 1)
numerical_data = torch.tensor(numerical_data, dtype=torch.float).to(device)

# nn.CrossEntropyLoss expects class indices as int64, so pin the dtype rather
# than relying on whatever dtype pandas inferred for the label column.
outputs = torch.tensor(dataset[output_columns].values, dtype=torch.int64).flatten().to(device)

print(categorical_data.shape)
print(numerical_data.shape)
print(outputs.shape)

# Embedding table spec per categorical column: (number of categories, embedding width),
# where the width is half the category count (rounded up), capped at 50.
categorical_column_sizes = [len(dataset[column].cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_column_sizes]
print(categorical_embedding_sizes)

# Hold out the LAST 10% of rows as the test split (rows are not shuffled here).
total_records = outputs.shape[0]
test_records = int(total_records * .1)
train_records = total_records - test_records

categorical_train_data = categorical_data[:train_records]
categorical_test_data = categorical_data[train_records:total_records]
numerical_train_data = numerical_data[:train_records]
numerical_test_data = numerical_data[train_records:total_records]
train_outputs = outputs[:train_records]
test_outputs = outputs[train_records:total_records]

In [None]:
class Model(nn.Module):
    """Feed-forward network for tabular data with categorical and numerical inputs.

    Every categorical column gets its own ``nn.Embedding``. The embedded
    vectors are concatenated and passed through dropout, joined with the
    batch-normalised numerical features, and fed through a stack of
    ``Linear -> ReLU -> BatchNorm1d -> Dropout`` blocks followed by a final
    ``Linear`` layer. The output is raw scores; no softmax is applied.

    Args:
        embedding_size: iterable of ``(num_categories, embedding_dim)`` pairs,
            one per categorical column, in the column order ``forward`` receives.
        num_numerical_cols: number of numerical feature columns.
        output_size: width of the final linear layer.
        layers: hidden layer widths, in order. May be empty, in which case the
            concatenated features feed the output layer directly.
        p: dropout probability for the embedding dropout and each hidden block.
    """

    def __init__(self, embedding_size, num_numerical_cols, output_size, layers, p=0.4):
        super().__init__()
        # Materialise once: the pairs are read twice below, which would
        # silently yield nothing the second time for a generator argument.
        embedding_size = list(embedding_size)

        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p)
        self.batch_norm_num = nn.BatchNorm1d(num_numerical_cols)

        # Width of the concatenated embedding vectors plus the numerical columns.
        num_categorical_cols = sum(nf for _, nf in embedding_size)
        input_size = num_categorical_cols + num_numerical_cols

        all_layers = []
        for width in layers:
            all_layers.append(nn.Linear(input_size, width))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(width))
            all_layers.append(nn.Dropout(p))
            input_size = width

        # `input_size` now equals the last hidden width, or the raw feature
        # width when `layers` is empty (indexing layers[-1] would raise there).
        all_layers.append(nn.Linear(input_size, output_size))

        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_categorical, x_numerical):
        """Compute output scores for a batch.

        Args:
            x_categorical: 2-D integer tensor; column ``i`` holds the category
                indices looked up in the ``i``-th embedding table.
            x_numerical: 2-D float tensor with ``num_numerical_cols`` columns.

        Returns:
            Tensor of shape ``(batch, output_size)`` with raw scores.
        """
        embeddings = [
            embedding(x_categorical[:, idx])
            for idx, embedding in enumerate(self.all_embeddings)
        ]
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)

        x_numerical = self.batch_norm_num(x_numerical)
        x = torch.cat([x, x_numerical], 1)
        x = self.layers(x)
        return x

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import f1_score

# Place the network on the same device as the data tensors (`device`), so the
# notebook also runs on machines without CUDA instead of failing in `.cuda()`.
model = Model(categorical_embedding_sizes, numerical_data.shape[1], 2, [200,100,50], p=0.4).to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 1000
aggregated_losses = []

model.train()
for i in range(1, epochs + 1):
    # Full-batch training: one forward pass over the whole training split per epoch.
    y_pred = model(categorical_train_data, numerical_train_data)
    single_loss = loss_function(y_pred, train_outputs)
    # Keep a detached CPU copy: storing the live loss tensor would retain each
    # epoch's autograd graph (and its device memory) for the whole run.
    aggregated_losses.append(single_loss.detach().cpu())

    if i%100 == 1:
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')
        # Evaluate in eval mode and without autograd: dropout is disabled and
        # BatchNorm uses (and does not update) its running statistics, so the
        # held-out rows neither get noisy predictions nor leak into the model.
        model.eval()
        with torch.no_grad():
            y_val = model(categorical_test_data, numerical_test_data)
        model.train()
        y_val = np.argmax(y_val.cpu().numpy(), axis=1)
        y_true = test_outputs.cpu().numpy()

        acc = accuracy_score(y_true, y_val)
        f1=f1_score(y_true, y_val, average='weighted')
        print("acc: %.2f%%" % (acc * 100.0))
        print("f1: %.2f%%" % (f1 * 100.0))


    # Backward pass and parameter update for the training loss computed above.
    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')
