# PetFinder: MLP on categorical (and numerical) features

In this kernel I demonstrate a simple NN baseline implemented in PyTorch. I use only the categorical and numerical features from train/test.csv. To encode categorical features I use embedding layers as described in https://arxiv.org/abs/1604.06737 (this work is also mentioned in lesson 4 of the fast.ai DL course https://course.fast.ai/videos/?lesson=4 and in this discussion https://www.kaggle.com/c/petfinder-adoption-prediction/discussion/76100; however, I only stumbled upon that thread after I had created this kernel). <br><br>

I borrowed a lot of code from this blog post https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/ as well as from this PyTorch tutorial https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html. I also used some code from this public kernel https://www.kaggle.com/peterhurford/pets-lightgbm-baseline-with-all-the-data and took a small snippet of code from https://www.kaggle.com/bminixhofer/deterministic-neural-networks-using-pytorch/data <br><br>


In [None]:
import json
import time
import copy

import scipy as sp
import pandas as pd
import numpy as np

import random
import os

from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix as sk_cmatrix

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from collections import Counter

In [None]:
# https://www.kaggle.com/bminixhofer/deterministic-neural-networks-using-pytorch/data
def seed_everything(seed=1234):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), and configures cuDNN for deterministic behaviour.

    Parameters
    ----------
    seed: Integer
      The seed value applied to every generator.
    """
    random.seed(seed)
    # NOTE: this only affects interpreters started after this point (e.g.
    # worker subprocesses); the hash seed of the running process is fixed
    # at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different algorithms from run to run,
    # so pin it off explicitly.
    torch.backends.cudnn.benchmark = False
seed_everything()

In [None]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings.

    rater_a and rater_b are equal-length sequences of integer ratings.
    min_rating / max_rating bound the rating scale; when omitted they are
    taken from the smallest / largest rating present in either sequence.
    The result is a list of lists where entry [i][j] counts the items rated
    (min_rating + i) by rater_a and (min_rating + j) by rater_b.
    """
    assert(len(rater_a) == len(rater_b))
    # Take the extremes per rater instead of `rater_a + rater_b`: for NumPy
    # arrays `+` is element-wise addition, not concatenation, which would
    # yield wrong bounds.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Counts how often each rating value occurs.

    Returns a list whose entry k is the number of ratings equal to
    (min_rating + k). Missing bounds default to the smallest / largest
    value found in `ratings`.
    """
    low = min(ratings) if min_rating is None else min_rating
    high = max(ratings) if max_rating is None else max_rating
    bucket_count = int(high - low + 1)
    counts = [0] * bucket_count
    for rating in ratings:
        counts[rating - low] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa.

    The quadratic weighted kappa is a measure of inter-rater agreement
    between two raters that provide discrete numeric ratings. Potential
    values range from -1 (representing complete disagreement) to 1
    (representing complete agreement). A kappa value of 0 is expected if
    all agreement is due to chance.

    y and y_pred each correspond to a sequence of integer ratings (they are
    cast to int). These sequences must have the same length. The rating
    scale is taken to span from the smallest to the largest rating that
    appears in either sequence.

    Returns the kappa value as a float. When both sequences consist of one
    single, identical rating value the result is 1.0 (complete agreement).
    """
    rater_a = np.asarray(y, dtype=int)
    rater_b = np.asarray(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))
    min_rating = min(rater_a.min(), rater_b.min())
    max_rating = max(rater_a.max(), rater_b.max())
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    if num_ratings == 1:
        # Only one rating value occurs at all, so both raters agree on every
        # item. The weights below would divide by (num_ratings - 1) == 0.
        return 1.0
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0
    max_sq_distance = float((num_ratings - 1) ** 2)

    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected for cell (i, j) if the raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic penalty, normalised to [0, 1].
            d = (i - j) ** 2 / max_sq_distance
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

In [None]:
def _read_and_report(label, path):
    """Print `label`, read the CSV at `path`, print its shape and return it."""
    print(label)
    frame = pd.read_csv(path)
    print(frame.shape)
    return frame


# Competition tables: train/test rows plus the label lookup tables.
train = _read_and_report('Train', "../input/train/train.csv")

test = _read_and_report('Test', "../input/test/test.csv")

breeds = _read_and_report('Breeds', "../input/breed_labels.csv")

colors = _read_and_report('Colors', "../input/color_labels.csv")

states = _read_and_report('States', "../input/state_labels.csv")

In [None]:
# Pull the label and the pet identifiers out of the feature frames;
# `pop` returns the column and removes it from the frame in place.
target = train.pop('AdoptionSpeed')
train_id = train.pop('PetID')
test_id = test.pop('PetID')

#### Drop useless columns

In [None]:
# Free-text and identifier columns are not used by this model.
for frame in (train, test):
    frame.drop(columns=['Name', 'RescuerID', 'Description'], inplace=True)

In [None]:
# Ages below 12 are kept as-is; ages of 12 and above are rounded down
# to a multiple of 12.
for frame in (train, test):
    age = frame['Age']
    frame['Age'] = age.where(age < 12, (age // 12) * 12)

In [None]:
# Any quantity that is not below 10 is replaced by 10.
for frame in (train, test):
    quantity = frame['Quantity']
    frame['Quantity'] = quantity.where(quantity < 10, 10)

### Define column types

In [None]:
# Columns fed to the network as continuous inputs.
numeric_cols = [
    'Type',
    'Age',
    'Fee',
    'VideoAmt',
    'PhotoAmt',
]

# Every remaining column is treated as categorical. The DataFrame's own column
# order is kept (instead of going through a set difference, whose order depends
# on string hashing) so that the order of the embedding layers built from
# cat_cols is the same on every run.
cat_cols = [col for col in train.columns if col not in numeric_cols]

#### Standard-scale numerical columns

In [None]:
# Standardise with statistics estimated on the training set only and apply the
# very same shift/scale to the test set, so that a given raw value maps to the
# same model input in both frames.
for col in ['Age', 'Fee', 'VideoAmt', 'PhotoAmt']:
    col_mean = train[col].mean()
    col_std = train[col].std()
    train[col] = (train[col] - col_mean) / col_std
    test[col] = (test[col] - col_mean) / col_std

In [None]:
# transform some cat columns to 0..N-1 range
# Shift categorical columns whose smallest training value is 1 down by one,
# so their codes start at 0. The same shift is applied to the test frame.
for col in cat_cols:
    if train[col].min() != 1:
        continue
    train[col] = train[col] - 1
    test[col] = test[col] - 1

In [None]:
# label encode States columns
# Replace State values by integer codes learned from the training frame.
state_codes, States_indexer = pd.factorize(train['State'])
train['State'] = state_codes

# NOTE(review): get_indexer yields -1 for values absent from the training
# frame - confirm that the test set has no unseen State.
test['State'] = States_indexer.get_indexer(test['State'])

In [None]:
# Number of embedding rows per categorical column, looked up by column name
# when emb_dims_tuples is built. Each value must be larger than the biggest
# code that occurs in the corresponding column.
cat_dims = {
    #'Type': 2,  # disabled: 'Type' is listed in numeric_cols
    'Breed1': 308,
    'Breed2': 308,
    'Gender': 3,
    'Color1': 8,
    'Color2': 8,
    'Color3': 8,
    'MaturitySize': 4, # some values are not present
    'FurLength': 3,  # some values are not present
    'Vaccinated': 3,
    'Dewormed': 3,
    'Sterilized': 3,
    'Health': 3,
    'Quantity': 10,      # Quantity is capped at 10 earlier; presumably shifted to 0..9 - TODO confirm
    'State': 14
}

In [None]:
# Embedding vector length per categorical column, looked up by column name
# when emb_dims_tuples is built.
emb_dims = {
    #'Type': 2,  # disabled: 'Type' is listed in numeric_cols
    'Breed1': 200,
    'Breed2': 200,
    'Gender': 2,
    'Color1': 4,
    'Color2': 4,
    'Color3': 4,
    'MaturitySize': 2, # some values are not present
    'FurLength': 2,  # some values are not present
    'Vaccinated': 2,
    'Dewormed': 2,
    'Sterilized': 2,
    'Health': 2,
    'Quantity': 5,      # check num of categories, default 10
    'State': 7
}

In [None]:
# One (number of categories, embedding size) pair per categorical column,
# in the same order as cat_cols.
emb_dims_tuples = []
for col_name in cat_cols:
    emb_dims_tuples.append((cat_dims[col_name], emb_dims[col_name]))

#### Define Tabular Dataset

In [None]:
class TabularDataset(Dataset):
    """Dataset yielding (continuous, categorical[, target]) samples.

    Parameters
    ----------
    data: pandas DataFrame
      The table holding the feature columns.

    num_cols: List of column names or None
      Continuous columns; stored as a float32 array. When omitted, a single
      all-zero float32 column is used as a placeholder.

    cat_cols: List of column names or None
      Categorical columns; stored as an int64 array. When omitted, a single
      all-zero int64 column is used as a placeholder.

    target: pandas Series, array-like or None
      Labels. When None, samples do not contain a target element.
    """

    def __init__(self, data, num_cols=None, cat_cols=None, target=None):

        self.n = data.shape[0]

        # Store plain arrays so that __getitem__ indexes by position.
        self.y = target.values if isinstance(target, pd.Series) else target

        self.cat_cols = cat_cols
        self.cont_cols = num_cols

        if num_cols is not None and len(num_cols) > 0:
            self.cont_X = data[self.cont_cols].values.astype(np.float32)
        else:
            # Placeholder with the same dtype as real continuous data.
            self.cont_X = np.zeros((self.n, 1), dtype=np.float32)

        if cat_cols is not None and len(cat_cols) > 0:
            # np.int64 instead of np.long: the alias is missing from some
            # NumPy releases and is not 64 bits wide on every platform.
            self.cat_X = data[self.cat_cols].values.astype(np.int64)
        else:
            # Placeholder with the same dtype as real categorical data.
            self.cat_X = np.zeros((self.n, 1), dtype=np.int64)

    def __len__(self):
        """
        Denotes the total number of samples.
        """
        return self.n

    def __getitem__(self, idx):
        """
        Generates one sample of data.

        Returns [continuous, categorical] when no target was given and
        [continuous, categorical, target] otherwise.
        """
        if self.y is None:
            return [self.cont_X[idx], self.cat_X[idx]]
        return [self.cont_X[idx], self.cat_X[idx], self.y[idx]]

### Define MLP

In [None]:
class FeedForwardNN(nn.Module):
    """MLP over concatenated categorical embeddings and continuous features."""

    def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
               output_size, emb_dropout, lin_layer_dropouts):
        """
        Parameters
        ----------

        emb_dims: List of two element tuples
          This list will contain a two element tuple for each
          categorical feature. The first element of a tuple will
          denote the number of unique values of the categorical
          feature. The second element will denote the embedding
          dimension to be used for that feature.

        no_of_cont: Integer
          The number of continuous features in the data.

        lin_layer_sizes: List of integers.
          The size of each linear layer. The length will be equal
          to the total number
          of linear layers in the network.

        output_size: Integer
          The size of the final output.

        emb_dropout: Float
          The dropout to be used after the embedding layers.

        lin_layer_dropouts: List of floats
          The dropouts to be used after each linear layer. Must provide
          at least one value per linear layer.

        Raises
        ------
        ValueError
          If fewer dropout values than linear layers are given.
        """

        super().__init__()

        # forward() zips layers with dropouts; a short dropout list would
        # silently skip the trailing linear layers.
        if len(lin_layer_dropouts) < len(lin_layer_sizes):
            raise ValueError(
                "lin_layer_dropouts needs one value per linear layer: got "
                "{} dropouts for {} layers".format(len(lin_layer_dropouts),
                                                   len(lin_layer_sizes)))

        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(n_cat, dim) for n_cat, dim in emb_dims])

        self.no_of_embs = sum(dim for _, dim in emb_dims)
        self.no_of_cont = no_of_cont

        # Linear Layers: the first one consumes embeddings + continuous features
        layer_inputs = [self.no_of_embs + self.no_of_cont] + list(lin_layer_sizes[:-1])
        self.lin_layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(layer_inputs, lin_layer_sizes)])

        # initialize weights in linear layers
        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight.data)

        # Output Layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

        # Batch Norm Layers
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes])

        # Dropout Layers (attribute name kept as-is for existing users)
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.droput_layers = nn.ModuleList([nn.Dropout(p) for p in lin_layer_dropouts])

    def forward(self, cont_data, cat_data):
        """
        Computes the network output for one batch.

        cont_data holds the continuous features and cat_data the integer
        category codes, one column per embedding layer (column i is fed to
        embedding layer i). Returns the raw output of the final linear layer.
        """

        if self.no_of_embs != 0:
            x = [emb_layer(cat_data[:, i]) for i, emb_layer in enumerate(self.emb_layers)]
            x = torch.cat(x, 1)
            x = self.emb_dropout_layer(x)

        if self.no_of_cont != 0:
            normalized_cont_data = self.first_bn_layer(cont_data)

            if self.no_of_embs != 0:
                x = torch.cat([x, normalized_cont_data], 1)
            else:
                x = normalized_cont_data

        for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers, self.droput_layers, self.bn_layers):

            # NB activation func; torch.tanh is the same op as F.tanh without
            # the deprecation warning some PyTorch releases emit for F.tanh.
            x = torch.tanh(lin_layer(x))

            x = bn_layer(x)
            x = dropout_layer(x)

        x = self.output_layer(x)

        return x

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25):
    """
    Trains `model` and restores the weights of its best validation epoch.

    Parameters
    ----------
    model: the network; called as model(cont_data, cat_data).
    dataloaders: dict with 'train' and 'val' loaders yielding
      (cont_data, cat_data, labels) batches.
    criterion: loss called as criterion(outputs, labels).
    optimizer: optimizer stepped once per training batch.
    num_epochs: number of passes over both loaders.

    Returns
    -------
    (model, val_acc_history): the model loaded with the weights that gave
    the highest validation accuracy, and the per-epoch validation accuracy.
    """
    start_time = time.time()
    seed_everything()
    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))

        # A training pass followed by a validation pass.
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            # train(True) enables training mode, train(False) is eval mode.
            model.train(is_training)

            loader = dataloaders[phase]
            loss_sum = 0.0
            n_correct = 0

            for cont_data, cat_data, labels in loader:
                cont_data = cont_data.to(device)
                cat_data = cat_data.to(device)
                labels = labels.to(device)

                # Clear gradients left over from the previous batch.
                optimizer.zero_grad()

                # Gradients are only tracked during the training pass.
                with torch.set_grad_enabled(is_training):
                    outputs = model(cont_data, cat_data)
                    loss = criterion(outputs, labels)

                    preds = torch.max(outputs, 1).indices

                    if is_training:
                        loss.backward()
                        optimizer.step()

                # The loss is a batch mean, so weight it by the batch size.
                loss_sum += loss.item() * cont_data.size(0)
                n_correct += torch.sum(preds == labels.data)

            n_samples = len(loader.dataset)
            epoch_loss = loss_sum / n_samples
            epoch_acc = n_correct.double() / n_samples

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if not is_training:
                # Snapshot the weights whenever validation accuracy improves.
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                val_acc_history.append(epoch_acc)


    time_elapsed = time.time() - start_time
    print('Training complete in {:.0f}m {:.0f}s; Best val Acc: {:4f}'.format(time_elapsed // 60, time_elapsed % 60, best_acc))

    # Hand back the best epoch rather than the last one.
    model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [None]:
# Number of samples per batch for the data loaders.
batch_size = 64
# Number of cross-validation folds.
N_SPLITS = 10
# Number of training epochs per fold.
N_EPOCHS = 50

In [None]:
# The test set has no labels; keep its rows in order for prediction.
test_ds = TabularDataset(data=test, num_cols=numeric_cols, cat_cols=cat_cols)
test_dl = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)

In [None]:
def predict(model, dataloader, output_size=5):
    """
    Runs `model` over `dataloader` and collects its raw outputs.

    Parameters
    ----------
    model: the network; called as model(cont_data, cat_data).
    dataloader: loader yielding batches whose first two elements are the
      continuous and categorical inputs (further elements are ignored).
      Rows are written in iteration order, so the loader must not shuffle
      if the result is to line up with the dataset.
    output_size: number of output columns of the model.

    Returns
    -------
    numpy array of shape (len(dataloader.dataset), output_size).
    """
    preds = np.zeros((len(dataloader.dataset), output_size))
    model = model.to(device)
    model.eval()
    # Track the write position by the actual batch lengths instead of a
    # global batch size, so any loader batch size is handled correctly.
    offset = 0
    with torch.no_grad():
        for cont_data, cat_data, *_ in dataloader:
            cont_data, cat_data = cont_data.to(device), cat_data.to(device)
            batch_out = model(cont_data, cat_data).cpu().numpy()
            preds[offset:offset + len(batch_out)] = batch_out
            offset += len(batch_out)

    return preds

In [None]:
# random_state only has an effect together with shuffle=True; current
# scikit-learn versions raise a ValueError when it is set without shuffling.
fold_splits = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42).split(train, target)

# Test predictions are summed over folds; train predictions are out-of-fold.
test_preds = np.zeros((test.shape[0], 5))
train_preds = np.zeros((train.shape[0], 5))
for i, (train_index, val_index) in enumerate(fold_splits):
    print(f'Fold {i+1}/{N_SPLITS}')

    # The split yields positions, so index positionally on both objects.
    train_X, val_X = train.iloc[train_index], train.iloc[val_index]
    train_y, val_y = target.iloc[train_index], target.iloc[val_index]

    train_ds = TabularDataset(train_X, num_cols=numeric_cols, cat_cols=cat_cols, target=train_y)
    valid_ds = TabularDataset(val_X, num_cols=numeric_cols, cat_cols=cat_cols, target=val_y)

    dataloaders = {
        'train': DataLoader(train_ds, batch_size=batch_size, shuffle=True),
        # Not shuffled: predict() relies on row order to fill train_preds.
        'val': DataLoader(valid_ds, batch_size=batch_size, shuffle=False),
    }

    # A fresh model and optimizer for every fold.
    model = FeedForwardNN(emb_dims_tuples, len(numeric_cols), lin_layer_sizes=[384, 256, 128],
                          output_size=5, emb_dropout=0.01,
                          lin_layer_dropouts=[0.1,0.1,0.001]).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer_ft = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    # train
    model, val_history = train_model(model, dataloaders, criterion, optimizer_ft, num_epochs=N_EPOCHS)

    # predict
    test_preds += predict(model, test_dl, output_size=5)
    train_preds[val_index] = predict(model, dataloaders['val'], output_size=5)

In [None]:
# The predicted class is the column with the largest score. The scores are
# used raw: they are neither divided by the number of folds nor passed
# through any normalising function.
test_preds_class = test_preds.argmax(axis=1)
train_preds_class = train_preds.argmax(axis=1)

In [None]:
# Quadratic weighted kappa of the out-of-fold class predictions.
quadratic_weighted_kappa(y=target, y_pred=train_preds_class)

In [None]:
# Confusion matrix of the out-of-fold predictions, labelled 0..4 on both axes.
pd.DataFrame(
    data=sk_cmatrix(target, train_preds_class),
    index=list(range(5)),
    columns=list(range(5)),
)

In [None]:
# One row per test pet: its id and the predicted class.
submission_columns = {'PetID': test_id, 'AdoptionSpeed': test_preds_class}
submission = pd.DataFrame(submission_columns)
submission.head(5)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)