# Deep learning Modelling for Tabular Dataset

### Problem Statement: Based on certain features of a shelter animal (age, sex, color, breed), predict its outcome.

### 5 Possible outcomes:
    
    * Return_to_owner
    * Euthanasia
    * Adoption
    * Transfer
    * Died

In [1]:
# Import Libraries

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import models

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from collections import Counter

In [2]:
# Training data: read the labelled CSV, print its shape and preview the first rows

train = pd.read_csv('./data/kaggle_animal/train.csv')
print('Shape: ', train.shape)
train.head()

Shape:  (26729, 10)


Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [3]:
# Test data: read the unlabelled CSV, print its shape and preview the first rows

test = pd.read_csv('./data/kaggle_animal/test.csv')
print('Shape: ', test.shape)
test.head()

Shape:  (11456, 8)


Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


In [4]:
# Sample submission file:
# for each row, each outcome's probability needs to be filled into the columns

sample = pd.read_csv('./data/kaggle_animal/sample_submission.csv')
print('Shape: ', sample.shape)
sample.head()

Shape:  (11456, 6)


Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,1,0,0,0,0
1,2,1,0,0,0,0
2,3,1,0,0,0,0
3,4,1,0,0,0,0
4,5,1,0,0,0,0


In [5]:
# Count how many training rows fall into each OutcomeType

Counter(train['OutcomeType'])

Counter({'Return_to_owner': 4786,
         'Euthanasia': 1555,
         'Adoption': 10769,
         'Transfer': 9422,
         'Died': 197})

In [6]:
# The ten most common values of the Name column and their counts
# (missing names are counted too, as NaN)

Counter(train['Name']).most_common(10)

[(nan, 7691),
 ('Max', 136),
 ('Bella', 135),
 ('Charlie', 107),
 ('Daisy', 106),
 ('Lucy', 94),
 ('Buddy', 87),
 ('Princess', 86),
 ('Rocky', 85),
 ('Luna', 68)]

## Data Preprocessing

In [7]:
# Separate features from the target: the two outcome columns and the AnimalID
# identifier are dropped from the training features; OutcomeType becomes the target Y.

train_X = train.drop(columns=['OutcomeType', 'OutcomeSubtype', 'AnimalID'])
Y = train['OutcomeType']
test_X = test  # same object as `test`, not a copy

### Stacking test and train dataset so that they go through same pre-processing

In [8]:
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat is the
# supported equivalent. Row labels are kept as they are (no ignore_index), exactly
# as append did, so the train rows come first followed by the test rows.
stacked_df = pd.concat([train_X, test_X.drop(columns=['ID'])])

  stacked_df = train_X.append(test_X.drop(columns=['ID']))


In [9]:
# Preview the first rows of the training feature frame
train_X.head()

Unnamed: 0,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [10]:
# Drop the DateTime column: it is not used as a feature in this notebook
stacked_df = stacked_df.drop(columns=['DateTime'])
stacked_df.head()

Unnamed: 0,Name,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [11]:
# Drop every column that has more than 10000 missing values in the stacked frame

for col in list(stacked_df.columns):
    n_missing = stacked_df[col].isnull().sum()
    if n_missing <= 10000:
        continue
    print('Dropping: ', col, n_missing)
    stacked_df = stacked_df.drop(columns=col)

Dropping:  Name 10916


In [12]:
# Preview the stacked frame after the sparse columns have been dropped
stacked_df.head()

Unnamed: 0,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [13]:
# Label encoding: fill missing values first ('NA' for object columns, 0 otherwise),
# then replace every distinct value in the column by an integer code.
# A fresh encoder is fitted per column and is not kept afterwards.

for col in stacked_df.columns:
    is_object_col = stacked_df.dtypes[col] == 'object'
    filled = stacked_df[col].fillna('NA' if is_object_col else 0)
    stacked_df[col] = LabelEncoder().fit_transform(filled)

In [14]:
# Preview the stacked frame after label encoding (every column now holds integer codes)
stacked_df.head()

Unnamed: 0,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,3,5,1482,146
1,0,4,5,775,184
2,1,3,21,1293,97
3,0,1,26,775,47
4,1,3,21,1101,311


In [15]:
# Make all variables categorical (each column is converted on its own, so its
# categories are exactly the codes that occur in that column)

stacked_df = stacked_df.astype({col: 'category' for col in stacked_df.columns})

In [16]:
# Splitting the train and test sets again.
# The boundary is taken from the training feature frame instead of a hard-coded row
# count, so the split stays correct if the input files change.

n_train = len(train_X)
X = stacked_df.iloc[:n_train]
test_processed = stacked_df.iloc[n_train:]

# Check if shape[0] matches original
print('Train shape after processing: ', X.shape, 'Original: ', train.shape)
print('Test shape after processing: ', test_processed.shape, 'Original: ', test.shape)
assert len(X) == len(train), 'train rows lost or gained during preprocessing'
assert len(test_processed) == len(test), 'test rows lost or gained during preprocessing'

Train shape after processing:  (26729, 5) Original:  (26729, 10)
Test shape after processing:  (11456, 5) Original:  (11456, 8)


In [17]:
# Encoding the targets: replace the OutcomeType strings held in Y by integer class codes.
# The fitted encoder is not stored, so the code -> label mapping cannot be read back from it.

Y = LabelEncoder().fit_transform(Y)



In [18]:
# Sanity check: the per-class counts before and after encoding should line up

print(Counter(train['OutcomeType']))
print(Counter(Y))

# Build the label -> code mapping from the data itself rather than transcribing it by
# hand from the two counters, so it cannot drift from what the encoder produced.
# Keys appear in order of first occurrence in the training data.
target_dict = {label: int(code) for label, code in zip(train['OutcomeType'], Y)}

Counter({'Adoption': 10769, 'Transfer': 9422, 'Return_to_owner': 4786, 'Euthanasia': 1555, 'Died': 197})
Counter({0: 10769, 4: 9422, 3: 4786, 2: 1555, 1: 197})


In [19]:
# Display the outcome label -> integer code mapping
target_dict

{'Return_to_owner': 3,
 'Euthanasia': 2,
 'Adoption': 0,
 'Transfer': 4,
 'Died': 1}

In [20]:
# Training/validation random split: 10% of the rows are held out for validation;
# random_state is fixed so the split is the same on every run

X_train, X_val,y_train, y_val = train_test_split(X, Y, test_size=0.10, random_state=0)
X_train.head()

Unnamed: 0,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
6917,1,3,5,1293,146
13225,0,4,33,1515,231
2697,1,4,5,1353,43
21905,1,3,31,245,40
17071,0,4,37,775,156


In [21]:
# Choosing columns for embedding:
# every categorical column with more than two categories gets an embedding.
# The dict maps column name -> number of categories.

embedded_cols = {
    name: len(col.cat.categories)
    for name, col in X.items()
    if len(col.cat.categories) > 2
}

embedded_cols


{'SexuponOutcome': 6, 'AgeuponOutcome': 46, 'Breed': 1678, 'Color': 411}

In [22]:
# Materialise the column names as a list: a dict_keys view stays tied to the dict and
# is not a sequence, whereas a list is the conventional selector for pandas columns
embedded_cols_names = list(embedded_cols)
# Number of remaining columns that are not embedded
len(X.columns) - len(embedded_cols)

1

In [23]:
# Determine size of embeddings: one (n_categories, embedding_dim) pair per embedded
# column, where embedding_dim is half the category count, capped at 50

embedding_sizes = [(n_cat, min(50, n_cat // 2)) for n_cat in embedded_cols.values()]
embedding_sizes

[(6, 3), (46, 23), (1678, 50), (411, 50)]

## PyTorch dataset 

In [24]:
class ShelterOutcomeDataset(Dataset):
    """Dataset yielding (embedded columns, remaining columns, target) per row.

    Args:
        X: DataFrame holding all feature columns.
        Y: targets, one per row of X, in the same row order (array, list or Series).
        embedded_col_names: names of the columns of X that are fed to embeddings;
            any iterable of column names is accepted.
    """

    def __init__(self, X, Y, embedded_col_names):
        # A concrete list is a safe column selector whatever iterable was passed in
        cat_cols = list(embedded_col_names)
        # astype returns fresh arrays, so the caller's frame is never modified
        self.X1 = X.loc[:, cat_cols].values.astype(np.int64)   # categorical columns
        self.X2 = X.drop(columns=cat_cols).values.astype(np.float32) # numerical columns
        # X1/X2 are indexed by position; converting Y to an ndarray makes the targets
        # positional as well. A pandas Series would otherwise be looked up by label
        # and could silently pair a row with the wrong target.
        self.y = np.asarray(Y)

    def __getitem__(self, idx):
        """Return (categorical codes, numerical values, target) for row `idx`."""
        return self.X1[idx], self.X2[idx], self.y[idx]

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.y)

In [25]:
# Creating the train and validation datasets in torch; both use the same list of
# embedded column names

train_ds = ShelterOutcomeDataset(X_train, y_train, embedded_cols_names)
valid_ds = ShelterOutcomeDataset(X_val, y_val, embedded_cols_names)

# Choosing the device (CPU/GPU compatibility)

In [26]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
    
    
def to_device(data, device):
    """Move `data` to `device`.

    Lists and tuples are processed element by element (recursively) and always come
    back as a list; anything else is moved with its own `.to(...)` method.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

In [27]:
class DeviceDataLoader():
    """Wrap a dataloader so that every batch it yields is moved to a device."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __len__(self):
        """Number of batches in the wrapped dataloader."""
        return len(self.dl)

    def __iter__(self):
        """Yield the wrapped dataloader's batches, each moved to the device."""
        yield from (to_device(batch, self.device) for batch in self.dl)

In [28]:
# Pick the device used for the model and the batches, and show which one was chosen
device = get_default_device()

print(device)

cuda


## Model

In [29]:
class ShelterOutcomeModel(nn.Module):
    """Feed-forward classifier over embedded categorical columns plus extra inputs.

    Args:
        embedding_sizes: (n_categories, embedding_dim) pairs, one per embedded column.
        n_cont: number of input columns that are not embedded.

    The network ends in a 5-unit linear layer and returns raw logits.
    """

    def __init__(self, embedding_sizes, n_cont):
        super().__init__()
        # Layers are created in a fixed order: it determines both the parameter
        # order and how the random initialisation is consumed.
        self.embeddings = nn.ModuleList(
            [nn.Embedding(n_categories, dim) for n_categories, dim in embedding_sizes]
        )
        # Width of all embedding outputs concatenated together
        self.n_emb = sum(layer.embedding_dim for layer in self.embeddings)
        self.n_cont = n_cont
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 200)
        self.lin2 = nn.Linear(200, 70)
        self.lin3 = nn.Linear(70, 5)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(200)
        self.bn3 = nn.BatchNorm1d(70)
        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)

    def forward(self, x_cat, x_cont):
        """Return logits for a batch.

        Column i of `x_cat` is looked up in the i-th embedding; `x_cont` is
        batch-normalised and appended to the (dropped-out) embedding outputs.
        """
        embedded = torch.cat(
            [emb(x_cat[:, col]) for col, emb in enumerate(self.embeddings)], 1
        )
        embedded = self.emb_drop(embedded)
        features = torch.cat([embedded, self.bn1(x_cont)], 1)

        # Two hidden stages, each: linear -> ReLU -> dropout -> batch norm
        hidden = self.bn2(self.drops(F.relu(self.lin1(features))))
        hidden = self.bn3(self.drops(F.relu(self.lin2(hidden))))

        return self.lin3(hidden)

In [30]:
# Model construction and definition.
# The number of non-embedded inputs is derived from the data instead of being
# hard-coded, so it stays in sync with the columns chosen for embedding.

n_cont = len(X.columns) - len(embedded_cols)
model = ShelterOutcomeModel(embedding_sizes, n_cont)
to_device(model, device)

ShelterOutcomeModel(
  (embeddings): ModuleList(
    (0): Embedding(6, 3)
    (1): Embedding(46, 23)
    (2): Embedding(1678, 50)
    (3): Embedding(411, 50)
  )
  (lin1): Linear(in_features=127, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=70, bias=True)
  (lin3): Linear(in_features=70, out_features=5, bias=True)
  (bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [31]:
# Function to input Optimizer 

def get_optimizer(model, lr=0.001, wd=0.0):
    """Build an Adam optimizer over the parameters of `model` that require gradients.

    Args:
        model: module whose trainable parameters are optimised.
        lr: learning rate.
        wd: weight decay passed to Adam.
    """
    trainable = (param for param in model.parameters() if param.requires_grad)
    return torch.optim.Adam(trainable, lr=lr, weight_decay=wd)

In [32]:
# Training Function

def train_model(model, optim, train_dl):
    """Train `model` for one pass over `train_dl` and return the mean loss per sample.

    Args:
        model: module called as model(x1, x2) that returns class logits.
        optim: optimizer stepped once per batch.
        train_dl: iterable of (x1, x2, y) batches.
    """
    model.train()
    n_seen, loss_total = 0, 0
    for x_cat, x_cont, target in train_dl:
        optim.zero_grad()
        logits = model(x_cat, x_cont)
        batch_loss = F.cross_entropy(logits, target)
        batch_loss.backward()
        optim.step()

        # cross_entropy averages over the batch, so weight it by the batch size
        n_batch = target.shape[0]
        loss_total += n_batch * batch_loss.item()
        n_seen += n_batch
    return loss_total / n_seen

In [33]:
# Evaluation Function

def val_loss(model, valid_dl):
    """Evaluate `model` on `valid_dl`; print and return (mean loss, accuracy).

    Args:
        model: module called as model(x1, x2) that returns class logits.
        valid_dl: iterable of (x1, x2, y) batches.

    Returns:
        Tuple of the mean cross-entropy per sample and the fraction of samples
        whose highest-scoring class equals the target.
    """
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    # Evaluation never calls backward(), so gradient tracking is switched off to
    # avoid building an autograd graph for every validation batch.
    with torch.no_grad():
        for x1, x2, y in valid_dl:
            current_batch_size = y.shape[0]
            out = model(x1, x2)
            loss = F.cross_entropy(out, y)
            # cross_entropy averages over the batch, so weight it by the batch size
            sum_loss += current_batch_size*(loss.item())
            total += current_batch_size
            pred = torch.max(out, 1)[1]  # index of the highest logit per row
            correct += (pred == y).float().sum().item()
    print('valid loss %.3f and accuracy %.3f' % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total

In [34]:
def train_loop(model, epochs, lr=0.01, wd=0.0):
    """Train `model` for `epochs` epochs, validating after each one.

    Reads the module-level `train_dl` and `valid_dl`, which must be defined before
    this function is called. Prints the training loss of every epoch; the
    validation metrics are printed by `val_loss`.
    """
    optimizer = get_optimizer(model, lr=lr, wd=wd)
    for _ in range(epochs):
        epoch_loss = train_model(model, optimizer, train_dl)
        print('training loss: ', epoch_loss)
        val_loss(model, valid_dl)

# Training

In [35]:
# Number of rows per batch, shared by the training and validation loaders
batch_size = 1000

# Both loaders reshuffle their rows on every pass.
# NOTE(review): shuffling the validation loader does not look necessary for the
# metrics computed in val_loss - confirm whether shuffle=True is intended here.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)

In [36]:
# Wrap both loaders so that each batch is moved to the chosen device when iterated
train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(valid_dl, device)

In [37]:
# Train for 8 epochs; the loss is printed after each epoch, followed by the
# validation loss and accuracy
train_loop(model, epochs=8, lr=0.05, wd=0.00001)

training loss:  1.1738186110322721
valid loss 0.932 and accuracy 0.594
training loss:  1.0004362812619452
valid loss 0.896 and accuracy 0.616
training loss:  0.9708535089459497
valid loss 0.881 and accuracy 0.637
training loss:  0.960785511803064
valid loss 0.878 and accuracy 0.639
training loss:  0.9522645581311707
valid loss 0.877 and accuracy 0.638
training loss:  0.9464072160915873
valid loss 0.867 and accuracy 0.646
training loss:  0.9460821108917957
valid loss 0.883 and accuracy 0.632
training loss:  0.9430020642233006
valid loss 0.888 and accuracy 0.635


# Test Output

In [41]:
# The test set has no labels, so an array of zeros is passed as a placeholder target;
# it only gives the dataset its length and is ignored at prediction time.
# No shuffling: predictions must stay in the row order of the test file.
test_ds = ShelterOutcomeDataset(test_processed, np.zeros(len(test_processed)), embedded_cols_names)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [42]:
# Wrap the test loader so that each batch is moved to the chosen device when iterated
test_dl = DeviceDataLoader(test_dl, device)

In [44]:
# Collect one tensor of class probabilities per test batch
preds = []
# Set evaluation mode explicitly (dropout off, batch norm on running statistics)
# instead of relying on whichever mode an earlier cell left the model in
model.eval()
with torch.no_grad():
    for x1, x2, y  in test_dl:
        out = model(x1, x2)
        prob = F.softmax(out, dim=1)  # logits -> probabilities over the 5 classes
        # Keep the results on the CPU so that reading individual values later does
        # not trigger one device-to-host copy per element
        preds.append(prob.cpu())

In [45]:
# Flatten the per-batch predictions into one list with an entry per test row
final_probs = []
for batch_probs in preds:
    final_probs.extend(batch_probs)

In [46]:
# Number of predicted rows
len(final_probs)

11456

In [47]:
# Fill each outcome column of the submission frame with the matching class
# probability; a name's position in this list is the class index it is read from
outcome_columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
for class_idx, column in enumerate(outcome_columns):
    sample[column] = [float(t[class_idx]) for t in final_probs]

In [48]:
# Preview the submission frame now that the probabilities are filled in
sample.head()

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.148847,0.013911,0.076281,0.200107,0.560854
1,2,0.548305,0.002238,0.020077,0.254301,0.17508
2,3,0.541172,0.006338,0.030607,0.104736,0.317146
3,4,0.152622,0.02024,0.060009,0.068289,0.69884
4,5,0.599264,0.002156,0.015615,0.202934,0.180032


In [49]:
# Write the submission to disk without the DataFrame index column
sample.to_csv('./data/kaggle_animal/sample_final.csv', index=False)