# To run the notebook

Use Google Colab (no GPU is needed for this notebook). Before running any code, upload *train.csv* and *test.csv* to the Colab filesystem. Then run all the cells.

After the last cell runs, two files called *categorical_train_pred.csv* and *categorical_test_pred.csv* should be saved to the filesystem. Download these files since they're required for catboost.ipynb.

# Summary of techniques

For this notebook I only used the categorical data. 

To process the data, I first created a mapping from the categorical string values to numerical values. Using that mapping, I encoded each entry as the concatenation of one one-hot vector per categorical feature. Then I split my data into training and validation data, and I created datasets for each.

I created a simple deep neural network with a few linear layers. I used ReLU activation functions after each hidden layer to prevent a vanishing gradient (and also to add non-linearity); the output layer uses log-softmax. I added dropout layers after some of the linear layers to prevent overfitting.

I then trained the neural network on the training data and observed the validation accuracies. Then I ran the network on the entire training data as well as the test data (to generate predictions). Those predictions would go to catboost.ipynb.

I experimented with many different hyperparameters.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [None]:
# Load the labelled training data.
df = pd.read_csv('train.csv')

# Two-way lookup between category labels and integer class ids, numbered in
# the order the labels are returned by unique().
c_to_i = {c: i for i, c in enumerate(df['category'].unique())}
i_to_c = {i: c for c, i in c_to_i.items()}

# For every column of the frame (not only the feature columns below), map each
# distinct value to an integer index.
maps = {
    col: {val: i for i, val in enumerate(df[col].unique())}
    for col in df.columns
}

# Categorical columns fed to the network.
features = ['gender', 'baseColour', 'season', 'usage']

# Shuffle the rows, then keep 80% for training and the rest for validation.
df = df.sample(frac=1).reset_index(drop=True)
df_train, df_val = train_test_split(df, train_size=0.8)

In [None]:
# Sanity check of F.one_hot: index 1 out of 3 classes.
print(F.one_hot(torch.tensor(1), num_classes=3))

tensor([0, 1, 0])


In [None]:
class CategoricalData(Dataset):
    """Dataset that one-hot encodes the categorical columns of a DataFrame.

    Each item is the concatenation of one one-hot vector per feature column,
    returned as a float tensor. Unless ``test`` is true, the item is a
    ``(one_hot, label)`` pair where ``label`` is the integer class index of
    the row's 'category' value.

    Args:
        df: DataFrame holding the feature columns (and a 'category' column
            unless ``test`` is true).
        features: Names of the columns to encode, in concatenation order.
        test: When true, items are the encoded features only (no label).
        value_maps: Optional ``{column: {value: index}}`` lookup. Defaults to
            the module-level ``maps``.
        label_map: Optional ``{category: class index}`` lookup. Defaults to
            the module-level ``c_to_i``; not needed when ``test`` is true.
    """

    def __init__(self, df, features, test=False, value_maps=None, label_map=None):
        self.df = df
        self.len = len(df)
        self.features = features
        self.test = test
        self.value_maps = maps if value_maps is None else value_maps
        # Only fall back to the global label lookup when labels are needed.
        if label_map is None and not test:
            label_map = c_to_i
        self.label_map = label_map

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        encoded = [
            F.one_hot(
                torch.tensor(self.value_maps[name][row[name]]),
                num_classes=len(self.value_maps[name]),
            )
            for name in self.features
        ]
        # Samples are deliberately left on the default (CPU) device: the
        # consumers of the DataLoader move each collated batch themselves, and
        # returning CUDA tensors from a Dataset is discouraged when the
        # DataLoader uses worker processes.
        one_hot = torch.cat(encoded).type(torch.float)

        if self.test:
            return one_hot
        return one_hot, self.label_map[row['category']]

    def __len__(self):
        return self.len

# Datasets over the training split, the validation split, and the full
# labelled frame (training and validation rows together).
train_dataset = CategoricalData(df=df_train, features=features)
val_dataset = CategoricalData(df=df_val, features=features)
total_dataset = CategoricalData(df=df, features=features)

# Every loader yields shuffled batches of 64 items.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=True)
total_dataloader = DataLoader(dataset=total_dataset, batch_size=64, shuffle=True)

In [None]:
class CategoricalNN(nn.Module):
    """Fully connected classifier over one-hot encoded categorical features.

    Layer widths are input_size -> 128 -> 512 -> 64 -> output_size. ReLU
    follows each of the three hidden layers, dropout follows the second and
    third, and the output is a log-softmax over dimension 1.
    """

    def __init__(self, input_size=62, output_size=27, dropout=0.2):
        super().__init__()
        # Hidden stack; dropout modules are named after the layer they follow.
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 512)
        self.d2 = nn.Dropout(dropout)
        self.fc3 = nn.Linear(512, 64)
        self.d3 = nn.Dropout(dropout)
        # Output projection to one score per class.
        self.fc4 = nn.Linear(64, output_size)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of inputs ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.d2(self.fc2(hidden).relu())
        hidden = self.d3(self.fc3(hidden).relu())
        scores = self.fc4(hidden)
        return scores.log_softmax(dim=1)


In [None]:
def train_epoch(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Yields ``(X, y)`` batches.
        model: Network to train; switched to training mode.
        loss_fn: Callable taking ``(predictions, targets)``.
        optimizer: Optimizer stepping the model's parameters.

    Returns:
        ``(accuracy, loss)``: the fraction of samples classified correctly
        (each batch is scored before its own weight update) and the mean of
        the per-batch losses.
    """
    model.train()

    loss_total = 0
    hits = 0
    for X, y in dataloader:
        X = X.to(device)
        y = y.to(device)

        pred = model(X)
        loss = loss_fn(pred, y)

        # Accumulate the running statistics for this batch.
        loss_total += loss.item()
        matched = pred.argmax(1) == y
        hits += matched.type(torch.float).sum().item()

        # Backpropagate and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    accuracy = hits / len(dataloader.dataset)
    mean_loss = loss_total / len(dataloader)
    return accuracy, mean_loss

def test_epoch(dataloader, model, loss_fn):
    """Score ``model`` on ``dataloader`` without updating it.

    Args:
        dataloader: Yields ``(X, y)`` batches.
        model: Network to evaluate; switched to evaluation mode.
        loss_fn: Callable taking ``(predictions, targets)``.

    Returns:
        ``(accuracy, loss)``: the fraction of samples classified correctly
        and the mean of the per-batch losses.
    """
    model.eval()

    loss_total = 0
    hits = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            pred = model(X)
            loss_total += loss_fn(pred, y).item()
            matched = pred.argmax(1) == y
            hits += matched.type(torch.float).sum().item()

    accuracy = hits / len(dataloader.dataset)
    mean_loss = loss_total / len(dataloader)
    return accuracy, mean_loss

def train_full(train_dataloader, val_dataloader, model, loss_fn, optimizer, epochs=10, save_weights=False):
    """Train for ``epochs`` epochs, validating after each one.

    After every epoch the train/validation statistics are printed. Whenever
    the validation accuracy beats the best value seen so far and
    ``save_weights`` is true, the whole model is saved to
    'categorical_model.pth'.

    Returns:
        ``(train_accuracies, val_accuracies)``: per-epoch accuracy lists.
    """
    history_train = []
    history_val = []
    top_val_accuracy = 0.0

    for t in tqdm(range(epochs)):
        train_accuracy, train_loss = train_epoch(train_dataloader, model, loss_fn, optimizer)
        val_accuracy, val_loss = test_epoch(val_dataloader, model, loss_fn)
        history_train.append(train_accuracy)
        history_val.append(val_accuracy)

        # Track the best validation score and checkpoint on improvement.
        improved = val_accuracy > top_val_accuracy
        if improved:
            top_val_accuracy = val_accuracy
        if improved and save_weights:
            torch.save(model, 'categorical_model.pth')

        print(f"Epoch {t+1}:\t Train accuracy: {100*train_accuracy:0.1f}%\t Avg train loss: {train_loss:>6f}\t Val accuracy: {100*val_accuracy:0.1f}%\t Avg val loss: {val_loss:>6f}")

    print(f"Top val accuracy: {top_val_accuracy}")
    return history_train, history_val

In [None]:
%%time

save_weights = True
load_weights = False
num_epochs = 15

if load_weights:
    categorical_model = torch.load('categorical_model.pth')
else:
    categorical_model = CategoricalNN(dropout=0.1).to(device)

loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(categorical_model.parameters())

# train_accuracies, val_accuracies = train_full(train_dataloader, val_dataloader, categorical_model, loss_fn, optimizer, epochs=num_epochs, save_weights=save_weights)
train_accuracies, val_accuracies = train_full(total_dataloader, val_dataloader, categorical_model, loss_fn, optimizer, epochs=num_epochs, save_weights=save_weights)

  7%|▋         | 1/15 [00:06<01:28,  6.35s/it]

Epoch 1:	 Train accuracy: 44.8%	 Avg train loss: 1.970973	 Val accuracy: 51.4%	 Avg val loss: 1.622104


 13%|█▎        | 2/15 [00:12<01:24,  6.48s/it]

Epoch 2:	 Train accuracy: 51.6%	 Avg train loss: 1.600449	 Val accuracy: 53.3%	 Avg val loss: 1.506680


 20%|██        | 3/15 [00:19<01:17,  6.45s/it]

Epoch 3:	 Train accuracy: 52.7%	 Avg train loss: 1.536772	 Val accuracy: 54.3%	 Avg val loss: 1.467613


 27%|██▋       | 4/15 [00:25<01:10,  6.40s/it]

Epoch 4:	 Train accuracy: 53.2%	 Avg train loss: 1.507805	 Val accuracy: 54.7%	 Avg val loss: 1.445739


 33%|███▎      | 5/15 [00:32<01:05,  6.53s/it]

Epoch 5:	 Train accuracy: 53.7%	 Avg train loss: 1.487188	 Val accuracy: 54.6%	 Avg val loss: 1.452398


 40%|████      | 6/15 [00:38<00:58,  6.54s/it]

Epoch 6:	 Train accuracy: 54.1%	 Avg train loss: 1.470702	 Val accuracy: 54.8%	 Avg val loss: 1.428895


 47%|████▋     | 7/15 [00:45<00:52,  6.56s/it]

Epoch 7:	 Train accuracy: 54.3%	 Avg train loss: 1.458236	 Val accuracy: 54.7%	 Avg val loss: 1.425951


 53%|█████▎    | 8/15 [00:51<00:44,  6.42s/it]

Epoch 8:	 Train accuracy: 54.1%	 Avg train loss: 1.455774	 Val accuracy: 55.3%	 Avg val loss: 1.405540


 60%|██████    | 9/15 [00:58<00:38,  6.47s/it]

Epoch 9:	 Train accuracy: 54.5%	 Avg train loss: 1.444660	 Val accuracy: 54.7%	 Avg val loss: 1.403958


 67%|██████▋   | 10/15 [01:04<00:31,  6.37s/it]

Epoch 10:	 Train accuracy: 54.4%	 Avg train loss: 1.439505	 Val accuracy: 55.2%	 Avg val loss: 1.403515


 73%|███████▎  | 11/15 [01:11<00:25,  6.44s/it]

Epoch 11:	 Train accuracy: 54.7%	 Avg train loss: 1.435642	 Val accuracy: 55.3%	 Avg val loss: 1.390384


 80%|████████  | 12/15 [01:17<00:19,  6.41s/it]

Epoch 12:	 Train accuracy: 54.7%	 Avg train loss: 1.431934	 Val accuracy: 55.2%	 Avg val loss: 1.391523


 87%|████████▋ | 13/15 [01:23<00:12,  6.37s/it]

Epoch 13:	 Train accuracy: 54.8%	 Avg train loss: 1.428985	 Val accuracy: 55.6%	 Avg val loss: 1.391827


 93%|█████████▎| 14/15 [01:30<00:06,  6.45s/it]

Epoch 14:	 Train accuracy: 54.9%	 Avg train loss: 1.424503	 Val accuracy: 55.6%	 Avg val loss: 1.382280


100%|██████████| 15/15 [01:36<00:00,  6.43s/it]

Epoch 15:	 Train accuracy: 54.9%	 Avg train loss: 1.421885	 Val accuracy: 55.4%	 Avg val loss: 1.382384
Top val accuracy: 0.5564031437817846
CPU times: user 1min 35s, sys: 236 ms, total: 1min 35s
Wall time: 1min 36s





In [None]:
if save_weights:
    # Reload the checkpoint written by train_full (the epoch with the best
    # validation accuracy). It is a whole pickled module, so weights_only
    # must be disabled explicitly: PyTorch >= 2.6 defaults to
    # weights_only=True and refuses to unpickle it. The file is produced by
    # this notebook. map_location places the model on the current device.
    categorical_model = torch.load('categorical_model.pth', map_location=device, weights_only=False)

# Switch to evaluation mode (disables dropout) before generating predictions.
categorical_model.eval()

def make_predictions(df):
    """Predict a category label for every row of ``df``.

    Uses the module-level ``categorical_model``, ``test_features``,
    ``i_to_c`` and ``device``. The model is used in whatever mode the caller
    left it in; this function does not call ``eval()`` itself.

    Args:
        df: DataFrame containing the columns named in ``test_features``.

    Returns:
        List of predicted category labels, one per row, in row order.
    """
    eval_dataset = CategoricalData(df, test_features, test=True)
    # shuffle=False keeps the predictions aligned with the rows of df.
    eval_dataloader = DataLoader(eval_dataset, batch_size=128, shuffle=False)

    predictions = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for X in eval_dataloader:
            X = X.to(device)
            labels = categorical_model(X).argmax(1)
            predictions.extend(i_to_c[label] for label in labels.tolist())

    return predictions

def eval_pred(base, pred):
    """Return the percentage of rows whose predicted category is correct.

    Args:
        base: DataFrame with 'id' and ground-truth 'category' columns.
        pred: DataFrame with 'id' and predicted 'category' columns, in the
            same row order as ``base``.

    Returns:
        Accuracy as a percentage (0.0 when nothing matches).

    Raises:
        AssertionError: If the 'id' columns of the two frames differ.
    """
    assert(base['id'].equals(pred['id']))
    print('ids match')
    # Count matches directly: indexing value_counts() with True raises
    # KeyError when no prediction is correct.
    match_count = (base['category'] == pred['category']).sum()

    return (100.0*match_count)/len(base)


In [None]:
%%time

test_features = ['gender', 'baseColour', 'season', 'usage']

df = pd.read_csv('train.csv')

train_pred = df[['id']]
train_pred['category'] = make_predictions(df)

print(f'train accuracy: {eval_pred(df, train_pred)}')

df_test = pd.read_csv('test.csv')
test_pred = df_test[['id']]
test_pred['category'] = make_predictions(df_test)
assert(df_test['id'].equals(test_pred['id']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


ids match
train accuracy: 55.64340870208536
CPU times: user 9.11 s, sys: 22.8 ms, total: 9.13 s
Wall time: 9.2 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Write both prediction tables to CSV without the index column.
train_pred.to_csv(path_or_buf='categorical_train_pred.csv', index=False)
test_pred.to_csv(path_or_buf='categorical_test_pred.csv', index=False)