In [None]:
import torch
import torch.nn as nn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [None]:
# Load the income data set from a CSV file (relative path) into a DataFrame.
df = pd.read_csv('../data/ann_files/income.csv')
# Display the loaded frame as the cell output.
df

In [None]:
# Optionally, uncomment the following lines to shuffle the rows of the data
# df = shuffle(df, random_state=101)
# df.reset_index(drop=True, inplace=True)
# df.head()

In [None]:
# Count how many rows carry each distinct value of the 'label' column.
df['label'].value_counts()

In [None]:
# List the column names of the loaded frame.
df.columns

In [None]:
# Column roles used when building the model inputs.
cat_cols = [
    'sex',
    'education',
    'marital-status',
    'workclass',
    'occupation',
]  # categorical feature columns
cont_cols = [
    'age',
    'hours-per-week',
]  # continuous feature columns
y_col = ['label']  # target column

In [None]:
# Print the frame summary before the categorical columns are converted.
df.info()

In [None]:
# Convert every categorical feature column to pandas' 'category' dtype.
# The conversion is done for all columns at once, then written back column by column.
as_category = df[cat_cols].astype('category')
for col in cat_cols:
    df[col] = as_category[col]

In [None]:
# Print the frame summary again, now that the categorical columns were converted.
df.info()

In [None]:
# Build one 2-D array per feature group: one row per sample, one column per feature.
# Categorical columns contribute their integer category codes.
cats = np.column_stack([df[name].cat.codes.to_numpy() for name in cat_cols])
conts = np.column_stack([df[name].to_numpy() for name in cont_cols])

In [None]:
# Turn the NumPy arrays into tensors: integer codes for the embeddings,
# 32-bit floats for the continuous features, and a 1-D vector of labels.
cats = torch.tensor(cats, dtype=torch.long)
conts = torch.tensor(conts, dtype=torch.float)
y = torch.tensor(df[y_col].to_numpy()).reshape(-1)

In [None]:
# Display the tensor of categorical codes.
cats

In [None]:
# Display the tensor of continuous features.
conts

In [None]:
# Display the label tensor.
y

In [None]:
# Number of categories per categorical column, in cat_cols order.
cat_szs = []
for col in cat_cols:
    cat_szs.append(len(df[col].cat.categories))

# Embedding width per column: half the category count (rounded up), capped at 50.
emb_szs = [(n_cat, min(50, (n_cat + 1) // 2)) for n_cat in cat_szs]
emb_szs

In [None]:
# `batch_size` is simply the number of rows used for training (chosen to keep training time
# down); the training set is still fed to the model in one piece, so this is not a mini-batch size.
batch_size = 25000
test_size = 5000

# The first `batch_size` rows are used for training, the following `test_size` rows for testing.
train_rows = slice(None, batch_size)
test_rows = slice(batch_size, batch_size + test_size)

cat_train, cat_test = cats[train_rows], cats[test_rows]
cont_train, cont_test = conts[train_rows], conts[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

In [None]:
class TabularModel(nn.Module):
    """Feed-forward network for tabular data with categorical and continuous inputs.

    Every categorical column gets its own embedding. The embedded vectors are
    concatenated and passed through dropout, the continuous columns are batch
    normalised, and both parts are concatenated and fed through a stack of
    ``Linear -> ReLU -> BatchNorm1d -> Dropout`` blocks followed by a final
    ``Linear`` output layer.

    Args:
        emb_szs: sequence of ``(num_embeddings, embedding_dim)`` pairs, one per
            categorical column.
        n_cont: number of continuous columns.
        out_sz: number of output units.
        layers: widths of the hidden layers, in order. May be empty, in which
            case the output layer is connected directly to the inputs.
        p: dropout probability used after the embeddings and in every hidden block.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList(
            [nn.Embedding(num_embeddings, embedding_dim) for num_embeddings, embedding_dim in emb_szs]
        )
        self.embed_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the concatenated input: all embedding vectors plus the continuous columns.
        num_embeds = sum(embedding_dim for _, embedding_dim in emb_szs)
        num_in = num_embeds + n_cont

        layerslist = []
        for width in layers:
            layerslist.append(nn.Linear(num_in, width))
            layerslist.append(nn.ReLU(inplace=True))
            layerslist.append(nn.BatchNorm1d(width))
            layerslist.append(nn.Dropout(p))
            num_in = width

        # `num_in` is the width of the last hidden layer, or the raw input width when
        # `layers` is empty, so the output layer is valid in both cases.
        layerslist.append(nn.Linear(num_in, out_sz))

        self.layers = nn.Sequential(*layerslist)

    def forward(self, x_cat, x_cont):
        """Compute the network output for a batch.

        Args:
            x_cat: 2-D integer tensor; column ``i`` holds the indices looked up in
                the ``i``-th embedding.
            x_cont: 2-D float tensor with ``n_cont`` columns.

        Returns:
            Tensor with one row per sample and ``out_sz`` columns.
        """
        # Look up each categorical column in its own embedding table.
        embeddings = [embed(x_cat[:, i]) for i, embed in enumerate(self.embeds)]
        x_cat = torch.cat(embeddings, dim=1)
        x_cat = self.embed_drop(x_cat)

        x_cont = self.bn_cont(x_cont)

        x = torch.cat([x_cat, x_cont], dim=1)
        return self.layers(x)

In [None]:
# Fix the RNG seed so the parameter initialisation is reproducible.
torch.manual_seed(33)
# Two output units, a single hidden layer of 50 units, dropout probability 0.4.
model = TabularModel(
    emb_szs=emb_szs,
    n_cont=conts.shape[1],
    out_sz=2,
    layers=[50],
    p=0.4,
)
model

In [None]:
# Cross-entropy loss on the model outputs, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
import time

start_time = time.time()
epochs = 300
losses = []

# Make sure dropout and batch norm are in training mode, even if this cell is
# re-run after the model was switched to evaluation mode.
model.train()

# Epochs are numbered from 1; each epoch processes the whole training set in one pass.
for i in range(1, epochs + 1):
    y_pred = model(cat_train, cont_train)
    loss = criterion(y_pred, y_train)
    # Store a plain Python float: keeping the tensor itself would keep every epoch's
    # autograd graph alive and cannot be plotted directly.
    losses.append(loss.item())

    if i % 10 == 1:
        print(f'epoch: {i}, loss is {loss}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

duration = time.time() - start_time
print(f'training took {duration} seconds')

In [None]:
# float() accepts both plain numbers and 0-dim tensors (including ones that still
# require grad), so the recorded losses can be plotted in either form.
plt.plot(range(epochs), [float(loss_value) for loss_value in losses])
plt.xlabel('epoch')
plt.ylabel('loss')

In [None]:
# Switch to evaluation mode: otherwise dropout stays active and batch norm uses (and
# updates) batch statistics, which makes the validation result noisy and biased.
model.eval()
with torch.no_grad():
    y_val = model(cat_test, cont_test)
    loss = criterion(y_val, y_test)
loss

In [None]:
# Accuracy on the test rows: the predicted class is the index of the largest output
# per row; compare all rows at once instead of looping in Python.
length = len(y_val)
trues_cnt = (y_val.argmax(dim=1) == y_test).sum().item()
print(f"{trues_cnt} out of {length} = {(trues_cnt/length)*100}% correct")

In [None]:
# Persist only the learned parameters (state dict), not the whole model object.
# NOTE(review): the file name mentions "taxi" although this notebook loads income data - confirm the intended name.
trained_state = model.state_dict()
torch.save(trained_state, '../models/my_taxi_model_3.pt')