In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import _LRScheduler

# Let pandas show up to 100 columns and 100 rows before truncating output
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

%matplotlib inline

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# (Kept on a single line: a conditional expression cannot be split after
# `else` without parentheses or a continuation.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Load data

In [None]:
# Directory holding the input file (presumably a placeholder — set to the real location)
IN_PATH = '/data-directory/'
# IN_PATH ends with '/', so plain string concatenation yields the full file path
df = pd.read_csv(IN_PATH + 'file.csv')

#### Inspect raw data

In [None]:
# A notebook cell only renders the value of its final expression, so every
# inspection is passed to display() explicitly (display() is made available
# as a builtin by the IPython kernel).

# Number of rows
display(len(df))

# Sample
display(df.head(10))

# Summary stats
display(df.describe(include='all'))

# Categories
display(df['catvar'].astype('category').cat.categories)

# Missings
display(df.isnull().sum())

#### Set index

In [None]:
# Use the 'id' column as the row index. set_index returns a new frame and
# removes 'id' from the columns, so this cell cannot be run twice in a row.
df = df.set_index('id')

#### Feature engineering

In [None]:
# Convert to datetime
# (format is day, abbreviated month name, 4-digit year, e.g. '01Jan2020')
df['date'] = pd.to_datetime(df['date_string'], format='%d%b%Y')

# Days between
# (whole days from date2 to date1; assumes both columns already hold datetimes — TODO confirm)
df['days_between'] = (df['date1'] - df['date2']).dt.days

# Substring before first digit
# (split on any digit and keep the first piece)
df['substring'] = df['string'].str.split(r'\d').str[0]

#### Assign categorical vs continuous features

In [None]:
# Columns fed through embedding layers (order fixes each embedding's position in the model)
cat_vars = ['cat_var1', 'cat_var2', 'cat_var3']

# Columns standardised and passed to the network as numeric inputs
con_vars = ['con_var1', 'con_var2', 'con_var3']

#### Scale continuous values

If there is a separate test set, remember to save the fitted scaler so that exactly the same transformation can be applied to it.

In [None]:
# Calculate mean and variance of each continuous variable
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/validation/test split made later in this notebook.
scaler = StandardScaler()
scaler.fit(df[con_vars])

In [None]:
# Per-variable means learned by the scaler (same order as con_vars)
scaler.mean_

In [None]:
# Per-variable variances learned by the scaler (same order as con_vars)
scaler.var_

In [None]:
# Standardise each continuous variable by subtracting mean and dividing by stddev
# NOTE: the columns are overwritten in place, so re-running this cell would
# apply the scaling a second time.
df[con_vars] = scaler.transform(df[con_vars])

In [None]:
# Preview the standardised continuous columns
df[con_vars].head()

#### Map categorical values to integers

In [None]:
# Cast categorical variables to the correct type
# (done column by column so each column gets its own set of categories)
for c in cat_vars:
    df[c] = df[c].astype('category')

In [None]:
# Map categorical values to integers (leave 0 for missing)
# cat.codes is -1 for missing values and 0..k-1 otherwise, so adding 1 moves
# missing to 0 and the real categories to 1..k. After this cell the columns
# hold plain integers and are no longer of categorical dtype.
for col_name, col_val in df[cat_vars].items():
    df[col_name] = col_val.cat.codes+1

In [None]:
# Summary statistics after scaling and integer-encoding
df.describe(include='all')

## Set up entity embeddings for categorical variables

In [None]:
# (variable name, number of embedding rows) for each categorical variable.
# The categorical columns were replaced by integer codes 1..k (0 reserved for
# missing) in the encoding step, so the `.cat` accessor is no longer available
# on them. The largest code equals the number of categories, and one extra row
# is needed for the reserved 0.
cat_sz = [(c, int(df[c].max()) + 1) for c in cat_vars]
cat_sz

In [None]:
# (number of embedding rows, embedding width) per categorical variable.
# The width is half the row count, rounded up, and capped at 50.
emb_szs = [(n_rows, min(50, (n_rows + 1) // 2)) for _, n_rows in cat_sz]
emb_szs

## Split out data for training, validation, and test

In [None]:
# Shuffle the row positions once (fixed seed for reproducibility) and cut the
# shuffled sequence into validation, test and training segments
np.random.seed(123)
valid_pct = 0.2
test_pct = 0.2

n = len(df)
n_valid = int(valid_pct * n)
n_test = int(test_pct * n)

row_nums = np.random.permutation(n)

# Cut points: [0, n_valid) -> validation, [n_valid, n_valid+n_test) -> test, rest -> training
valid_rows, test_rows, train_rows = np.split(row_nums, [n_valid, n_valid + n_test])

In [None]:
# Split data
# (row numbers are positions, hence iloc rather than loc)
df_train = df.iloc[train_rows]
df_valid = df.iloc[valid_rows]
df_test  = df.iloc[test_rows]

In [None]:
# Separate the 'target' column (y) from the remaining columns (x) in each split
y_train, x_train = df_train['target'], df_train.drop(columns=['target'])

y_valid, x_valid = df_valid['target'], df_valid.drop(columns=['target'])

y_test, x_test = df_test['target'], df_test.drop(columns=['target'])

In [None]:
# Target log instead of raw value
# (natural log; assumes every target value is strictly positive — TODO confirm)
log_y_train = np.log(y_train)
log_y_valid = np.log(y_valid)
log_y_test  = np.log(y_test)

In [None]:
# Cap range of valid values
# The range runs from 0 to 20% above the largest training log-target and is
# passed to the model as its output range. A lower bound of 0 assumes the
# log-targets are non-negative — TODO confirm.
max_log_y_train = np.max(log_y_train)
log_y_train_range = (0, max_log_y_train*1.2)
log_y_train_range

## Set up embedding dataset and data loader

In [None]:
class EmbeddingDataset(Dataset):
    """Dataset yielding ``[categorical, continuous, target]`` for each row.

    Args:
        df_cat: DataFrame whose columns are the integer-coded categorical
            variables.
        df_con: DataFrame whose columns are the continuous variables.
        target: 1-D target values, one per row (pandas Series or array-like).

    Attributes:
        cats: int64 array of shape (n_rows, n_categorical).
        cons: float32 array of shape (n_rows, n_continuous).
        target: float32 array of shape (n_rows, 1).
    """

    def __init__(self, df_cat, df_con, target):

        # One 1-D value array per column
        cat_vals = [col.values for _, col in df_cat.items()]
        con_vals = [col.values for _, col in df_con.items()]

        # Stack the columns side by side
        self.cats = np.stack(cat_vals, 1).astype(np.int64)
        self.cons = np.stack(con_vals, 1).astype(np.float32)

        # Convert to a NumPy array before adding the trailing axis: a pandas
        # Series does not accept multi-dimensional indexing such as s[:, None]
        self.target = np.asarray(target)[:, None].astype(np.float32)

    def __getitem__(self, idx):
        """Return ``[cats, cons, target]`` for the row(s) selected by ``idx``."""
        return [self.cats[idx], self.cons[idx], self.target[idx]]

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.target)

In [None]:
# One dataset per split; the targets are the log-transformed values
train_data = EmbeddingDataset(x_train[cat_vars], x_train[con_vars], log_y_train)
valid_data = EmbeddingDataset(x_valid[cat_vars], x_valid[con_vars], log_y_valid)
test_data  = EmbeddingDataset(x_test[cat_vars], x_test[con_vars], log_y_test)

In [None]:
# Mini-batch size
bs = 128
# Reshuffle the training rows every epoch; num_workers=0 loads in the main process
train_data_loader = DataLoader(train_data, batch_size=bs, shuffle=True, num_workers=0)

## Set up model

In [None]:
class EmbeddingNet(nn.Module):
    """Feed-forward regressor over embedded categorical and continuous inputs.

    Args:
        emb_szs: iterable of (number of embedding rows, embedding width), one
            pair per categorical variable, in input-column order.
        n_con: number of continuous input variables.
        target_range: (low, high) bounds that the output is squashed into.
    """

    def __init__(self, emb_szs, n_con, target_range):
        super().__init__()

        # One embedding table per categorical variable
        self.embs = nn.ModuleList(
            [nn.Embedding(n_rows, width) for n_rows, width in emb_szs]
        )

        # Rule-of-thumb initialisation: uniform in +/- 2 / (width + 1)
        for table in self.embs:
            bound = 2 / (table.weight.data.size(1) + 1)
            table.weight.data.uniform_(-bound, bound)

        # Total width of the concatenated embeddings
        n_emb = sum(table.embedding_dim for table in self.embs)

        self.emb_drop = nn.Dropout(0.04)
        self.bn = nn.BatchNorm1d(n_con)

        # Hidden layer 1
        self.fc1 = nn.Linear(n_emb + n_con, 1000)
        nn.init.kaiming_normal_(self.fc1.weight.data)
        self.dr1 = nn.Dropout(0.001)

        # Hidden layer 2
        self.fc2 = nn.Linear(1000, 500)
        nn.init.kaiming_normal_(self.fc2.weight.data)
        self.dr2 = nn.Dropout(0.01)

        # Single-value output layer
        self.outp = nn.Linear(500, 1)
        nn.init.kaiming_normal_(self.outp.weight.data)

        self.target_range = target_range

    def forward(self, x_cat, x_con):
        """Predict one value per row, bounded to ``target_range``.

        Column ``i`` of ``x_cat`` is looked up in embedding ``i``; ``x_con`` is
        batch-normalised and concatenated after the embeddings.
        """
        # Embed each categorical column and join the results
        embedded = torch.cat(
            [table(x_cat[:, col]) for col, table in enumerate(self.embs)], 1
        )
        embedded = self.emb_drop(embedded)

        # Normalise the continuous inputs and append them
        hidden = torch.cat([embedded, self.bn(x_con)], 1)

        # Two ReLU hidden layers, each followed by dropout
        hidden = self.dr1(F.relu(self.fc1(hidden)))
        hidden = self.dr2(F.relu(self.fc2(hidden)))

        # Sigmoid output rescaled from (0, 1) to (low, high)
        low = self.target_range[0]
        high = self.target_range[1]
        squashed = torch.sigmoid(self.outp(hidden))
        return squashed * (high - low) + low

In [None]:
# Build the network, move it to the selected device and print its layers
model = EmbeddingNet(emb_szs, n_con=len(con_vars), target_range=log_y_train_range)
model.to(device)
print(model)

## Find good learning rate to use

In [None]:
class ExpRangeLR(_LRScheduler):
    """Scheduler that multiplies the learning rate by a constant every step.

    The multiplier is chosen so that the rate grows from ``lr_start`` to
    ``lr_end`` over ``batches_per_epoch`` steps. Every set of learning rates
    produced is appended to ``lr_hist``.
    """

    def __init__(self, optimizer, batches_per_epoch, lr_start=1e-5, lr_end=10, last_epoch=-1):
        # These attributes are set before the base initialiser runs so that
        # get_lr() can already use them if it is called during initialisation
        self.lr_start = lr_start
        self.lr_mult = (lr_end / lr_start) ** (1 / batches_per_epoch)
        self.lr_hist = []  # one entry (list of per-group rates) per get_lr() call

        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the current rate for every parameter group and record it."""
        current = self.lr_start * (self.lr_mult ** self.last_epoch)
        lrs = [current for _ in self.base_lrs]

        self.lr_hist.append(lrs)
        return lrs

In [None]:
def find_lr(model, data_loader, optimizer, scheduler, criterion):
    """Run a learning-rate range test over a single pass of ``data_loader``.

    The scheduler is stepped before every batch and the model is trained on
    that batch. The pass stops early once the smoothed loss exceeds four times
    its best value or stops being a finite number.

    Args:
        model: network called as ``model(x_cats, x_cons)``.
        data_loader: iterable of ``(x_cats, x_cons, y_act)`` batches.
        optimizer: optimiser updating ``model``'s parameters.
        scheduler: scheduler with ``step()`` and an ``lr_hist`` list holding
            one list of per-group learning rates per recorded step.
        criterion: callable ``criterion(y_pred, y_act)`` returning the loss.

    Returns:
        Tuple ``(lr_hist, raw_loss_hist, debias_loss_hist)``: the first
        parameter group's learning rates (skipping the scheduler's first
        recorded entry), the raw batch losses, and the bias-corrected
        exponentially smoothed losses.
    """
    smoothing = 0.98

    # Send each batch to wherever the model's parameters live
    model_device = next((p.device for p in model.parameters()), torch.device('cpu'))

    best_loss = None
    avg_loss = 0
    raw_loss_hist = []
    debias_loss_hist = []

    for i, (x_cats, x_cons, y_act) in enumerate(data_loader, start=1):

        # Set learning rate for this batch
        scheduler.step()

        # Move to device
        x_cats = x_cats.to(model_device)
        x_cons = x_cons.to(model_device)
        y_act = y_act.to(model_device)

        # Forward pass: compute predictions with current parameters and calculate loss
        y_pred = model(x_cats, x_cons)
        loss = criterion(y_pred, y_act)

        # Exponential moving average of the loss, corrected for its zero start
        raw_loss = loss.item()
        avg_loss = (avg_loss * smoothing) + (raw_loss * (1 - smoothing))
        debias_loss = avg_loss / (1 - (smoothing ** i))

        # Keep track of the losses
        raw_loss_hist.append(raw_loss)
        debias_loss_hist.append(debias_loss)

        if best_loss is None or debias_loss < best_loss:
            best_loss = debias_loss

        # Stop once the loss has blown up. NaN compares False with everything,
        # so it needs the explicit finiteness check.
        if not math.isfinite(debias_loss) or debias_loss > best_loss * 4:
            break

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Backward pass: gradient of loss with respect to model parameters
        loss.backward()

        # Update parameters
        optimizer.step()

    # Return only after the pass has finished (returning from inside the loop
    # would end the test after the first batch)
    lr_hist = [lr[0] for lr in scheduler.lr_hist[1:]]
    return lr_hist, raw_loss_hist, debias_loss_hist

In [None]:
# Starting learning rate for the range test
lr_start = 1e-5
opt = torch.optim.Adam(model.parameters(), lr=lr_start)
crit = F.mse_loss
# Number of mini-batches in one pass over the training data
batches_per_epoch = math.ceil(len(train_data)/bs)
# Grow the learning rate from lr_start to 10 over that many steps
sched = ExpRangeLR(optimizer=opt, batches_per_epoch=batches_per_epoch, lr_start=lr_start, lr_end=10)

In [None]:
# Run the range test; this updates the model's weights as a side effect
lr_hist, raw_loss_hist, debias_loss_hist = find_lr(model=model, data_loader=train_data_loader, optimizer=opt, scheduler=sched, criterion=crit)

In [None]:
start_batch = math.floor(batches_per_epoch*0.05)  # Skip first 5% as loss rapidly improves

# Smoothed loss against learning rate, with the learning rate on a log axis
plt.ylabel('Loss')
plt.xlabel('Learning Rate')
plt.xscale('log')
plt.plot(lr_hist[start_batch:], debias_loss_hist[start_batch:])
plt.show()

## Set up custom learning rate scheduler - Cosine Annealing with Restarts

In [None]:
# NOTE: _LRScheduler class assumes step is taken at each epoch, but we step at each minibatch. So imagine epoch==minibatch here.
# NOTE: _LRScheduler class takes a step upon init
class CosineAnnealingLRWithRestarts(_LRScheduler):
    """Cosine-annealed learning rate that restarts at the end of each cycle.

    Within a cycle the rate follows half a cosine wave from the optimiser's
    base rate towards ``eta_min``. When a cycle ends the position resets to
    zero and the cycle length is multiplied by ``cycle_mult``. Every set of
    rates produced is appended to ``lr_hist``.

    Args:
        optimizer: wrapped optimiser.
        T_max: number of steps in the first cycle.
        cycle_mult: factor applied to the cycle length at each restart.
        eta_min: lowest learning rate of the cosine curve.
        last_epoch: index of the last step already taken (-1 for a fresh start).
    """

    def __init__(self, optimizer, T_max, cycle_mult=1, eta_min=0, last_epoch=-1):
        # All state must exist before the base initialiser runs, because step()
        # and get_lr() read it
        self.T_max = T_max
        self.eta_min = eta_min
        self.cycle_mult = cycle_mult
        self.cycle_len = self.T_max  # length of the current cycle, in steps
        self.curr_epoch = last_epoch  # position inside the current cycle
        self.lr_hist = []  # one entry (list of per-group rates) per get_lr() call

        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the annealed rate for every parameter group and record it."""
        cos_term = 1 + math.cos(math.pi * self.curr_epoch / self.cycle_len)
        lrs = [
            self.eta_min + (base_lr - self.eta_min) * cos_term / 2
            for base_lr in self.base_lrs
        ]

        self.lr_hist.append(lrs)
        return lrs

    def step(self, epoch=None):
        """Apply the next learning rates, then advance the cycle position."""
        self.last_epoch = self.last_epoch + 1 if epoch is None else epoch

        new_lrs = self.get_lr()
        for group, lr in zip(self.optimizer.param_groups, new_lrs):
            group['lr'] = lr

        # Advance; on reaching the end of the cycle, restart and resize it
        self.curr_epoch += 1
        if self.curr_epoch == self.cycle_len:
            self.curr_epoch = 0
            self.cycle_len = int(self.cycle_len * self.cycle_mult)

## Fit model

In [None]:
def train_with_embeddings(model, train_dl, valid_data, optimizer, scheduler, criterion, num_cycles=1, cycle_mult=1):
    """Train ``model`` and print the training and validation loss after every epoch.

    Args:
        model: network called as ``model(x_cats, x_cons)``.
        train_dl: iterable of ``(x_cats, x_cons, y_act)`` training batches.
        valid_data: object exposing NumPy arrays ``cats``, ``cons`` and
            ``target`` for the validation rows.
        optimizer: optimiser updating ``model``'s parameters.
        scheduler: learning-rate scheduler; ``step()`` is called before every batch.
        criterion: callable ``criterion(y_pred, y_act)`` returning the loss.
        num_cycles: number of learning-rate cycles to train for.
        cycle_mult: factor by which each cycle is longer than the previous one.

    The printed training loss is that of the last batch of the epoch. The
    model's train/eval mode on entry is restored before returning.
    """
    # Print output heading
    heading = 'Epoch   Train_Loss  Valid_Loss'
    print(heading)
    print('-' * len(heading))

    # Send data to wherever the model's parameters live
    model_device = next((p.device for p in model.parameters()), torch.device('cpu'))

    # Load validation data
    xv_cats = torch.from_numpy(valid_data.cats).to(model_device)
    xv_cons = torch.from_numpy(valid_data.cons).to(model_device)
    yv_act = torch.from_numpy(valid_data.target).to(model_device)

    # Total epochs = 1 + m + m^2 + ... (num_cycles terms) when cycles grow by
    # a factor m > 1; otherwise every cycle lasts one epoch
    epochs = math.ceil((1-cycle_mult**num_cycles)/(1-cycle_mult)) if cycle_mult>1 else num_cycles

    was_training = model.training

    for e in range(epochs):
        model.train()
        train_loss = float('nan')  # reported as-is if the loader yields no batches

        for x_cats, x_cons, y_act in train_dl:

            # Set learning rate for this batch
            scheduler.step()

            # Move to device
            x_cats = x_cats.to(model_device)
            x_cons = x_cons.to(model_device)
            y_act = y_act.to(model_device)

            # Forward pass: compute predictions with current parameters and calculate loss
            y_pred = model(x_cats, x_cons)
            loss = criterion(y_pred, y_act)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Backward pass: gradient of loss with respect to model parameters
            loss.backward()

            # Update parameters
            optimizer.step()

            train_loss = loss.item()

        # Score validation data once per epoch, in eval mode and without
        # recording gradients
        model.eval()
        with torch.no_grad():
            yv_pred = model(xv_cats, xv_cons)
            valid_loss = criterion(yv_pred, yv_act)
        print('{:<8d}{:<12.4f}{:<12.4f}'.format(e+1, train_loss, valid_loss.item()))

    model.train(was_training)

In [None]:
# Re-initialise model after updating weights from finding the LR
# (a fresh instance discards the weights changed by the range test)
model = EmbeddingNet(emb_szs, n_con=len(con_vars), target_range=log_y_train_range)
model.to(device)

In [None]:
# Peak learning rate at the start of each cosine cycle
learning_rate = 1e-3
opt = torch.optim.Adam(model.parameters(), lr=learning_rate)
crit = F.mse_loss
# Number of mini-batches in one pass over the training data
batches_per_epoch = math.ceil(len(train_data)/bs)

# Five cycles of constant length (cycle_mult=1); the first cycle lasts one epoch of batches
num_cycles = 5
cycle_mult = 1
sched = CosineAnnealingLRWithRestarts(optimizer=opt, T_max=batches_per_epoch, cycle_mult=cycle_mult)

In [None]:
# Train the model; num_cycles and cycle_mult must match the values given to the scheduler
train_with_embeddings(model=model, train_dl=train_data_loader, valid_data=valid_data, optimizer=opt, scheduler=sched, criterion=crit, num_cycles=num_cycles, cycle_mult=cycle_mult)

## Examine embeddings

In [None]:
# The embeddings are stored in cat_vars order, so look the position up by name
# rather than hard-coding an index. Move the weights to the CPU before
# converting: NumPy cannot read a tensor that lives on the GPU.
cat_var1_embedding = model.embs[cat_vars.index('cat_var1')].weight.detach().cpu().numpy()

# One label per embedding row. Row 0 is the slot reserved for missing values;
# the remaining rows follow the column's sorted values (the integer codes,
# since the column was integer-encoded earlier in the notebook).
labels = ['N/A'] + list(df['cat_var1'].astype('category').cat.categories)
# OR
labels = ['N/A','Cat1','Cat2','Cat3']

#### Hierarchical

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Hierarchical clustering of the embedding rows (linkage's default settings)
Z = linkage(cat_var1_embedding)
# labels needs one entry per embedding row
dendrogram(Z, orientation='left', labels=labels)

#### KMeans

In [None]:
from sklearn.cluster import KMeans

# Group the embedding rows into 5 clusters (fixed random_state for repeatable results)
# NOTE(review): n_clusters must not exceed the number of embedding rows — confirm for the real data
kmeans = KMeans(n_clusters=5, random_state=0).fit(cat_var1_embedding)
# Pair every label with its assigned cluster; labels needs one entry per embedding row
df_cluster = pd.DataFrame({'Label': labels,
                          'Cluster': kmeans.labels_})

In [None]:
# Labels assigned to cluster 0
df_cluster[df_cluster['Cluster']==0]

In [None]:
# Labels assigned to cluster 1
df_cluster[df_cluster['Cluster']==1]

In [None]:
# Labels assigned to cluster 2
df_cluster[df_cluster['Cluster']==2]

#### t-SNE

In [None]:
from sklearn import manifold

# Project the embedding rows with t-SNE (PCA initialisation, exact method, fixed seed)
# NOTE(review): t-SNE's default perplexity may be too large for a variable with few categories — confirm
tsne = manifold.TSNE(init='pca', random_state=0, method='exact')
Y = tsne.fit_transform(cat_var1_embedding)

# Scatter the projected points and annotate each with its label.
# Both axes are negated, which mirrors the layout through the origin.
plt.figure(figsize=(16,16))
plt.scatter(-Y[:, 0], -Y[:, 1])
for i, txt in enumerate(labels):
    plt.annotate(txt, (-Y[i, 0],-Y[i, 1]), fontsize=14)
plt.show()