# A Neural Network

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
train = pd.read_csv("/ML/ML Projects/calculate calories/playground-series-s5e5/train.csv")
test = pd.read_csv("/ML/ML Projects/calculate calories/playground-series-s5e5/test.csv")

In [3]:
train.columns

Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp', 'Calories'],
      dtype='object')

In [4]:
test.columns

Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp'],
      dtype='object')

In [5]:
df =  pd.get_dummies(train, columns=["Sex"], dtype=int)
df.columns

Index(['id', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
       'Calories', 'Sex_female', 'Sex_male'],
      dtype='object')

In [6]:
df = df.drop(columns = ['id', 'Calories'])

In [7]:
df.head()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Sex_female,Sex_male
0,36,189.0,82.0,26.0,101.0,41.0,0,1
1,64,163.0,60.0,8.0,85.0,39.7,1,0
2,51,161.0,64.0,7.0,84.0,39.8,1,0
3,20,192.0,90.0,25.0,105.0,40.7,0,1
4,38,166.0,61.0,25.0,102.0,40.6,1,0


Now we can create our independent (predictors) and dependent (target) variables. They both need to be PyTorch tensors. Our dependent variable is Calories:

In [8]:
from torch import tensor

t_dep = tensor(train.Calories)

In [9]:
t_indep = tensor(df.values, dtype=torch.float)
t_indep

tensor([[ 36.0000, 189.0000,  82.0000,  ...,  41.0000,   0.0000,   1.0000],
        [ 64.0000, 163.0000,  60.0000,  ...,  39.7000,   1.0000,   0.0000],
        [ 51.0000, 161.0000,  64.0000,  ...,  39.8000,   1.0000,   0.0000],
        ...,
        [ 60.0000, 162.0000,  67.0000,  ...,  40.9000,   0.0000,   1.0000],
        [ 45.0000, 182.0000,  91.0000,  ...,  40.3000,   0.0000,   1.0000],
        [ 39.0000, 171.0000,  65.0000,  ...,  40.6000,   1.0000,   0.0000]])

In [10]:
t_indep.shape

torch.Size([750000, 8])

In [11]:
vals,indices = t_indep.max(dim=0)
t_indep = t_indep / vals

# Training the linear model

In [12]:
from fastai.data.transforms import RandomSplitter
trn_split,val_split=RandomSplitter(seed=42)(df)

In [13]:
trn_indep,val_indep = t_indep[trn_split],t_indep[val_split]
trn_dep,val_dep = t_dep[trn_split],t_dep[val_split]
len(trn_indep),len(val_indep), trn_dep.shape, val_dep.shape

(600000, 150000, torch.Size([600000]), torch.Size([150000]))

# Creating a neural network

In [14]:
n_coeff = t_indep.shape[1]

We'll also need to turn our dependent variable into a column vector, which we can do by indexing the column dimension with the special value None, which tells PyTorch to add a new dimension in this position:

In [15]:
trn_dep = trn_dep[:,None]
val_dep = val_dep[:,None]
trn_dep.shape, val_dep.shape

(torch.Size([600000, 1]), torch.Size([150000, 1]))

In [16]:
trn_indep.shape, trn_dep.shape

(torch.Size([600000, 8]), torch.Size([600000, 1]))

First, we'll need to create coefficients for each of our layers. Our first set of coefficients will take our n_coeff inputs, and create n_hidden outputs. We can choose whatever n_hidden we like -- a higher number gives our network more flexibility, but makes it slower and harder to train. So we need a matrix of size n_coeff by n_hidden. We'll divide these coefficients by n_hidden so that when we sum them up in the next layer we'll end up with similar magnitude numbers to what we started with.

Then our second layer will need to take the n_hidden inputs and create a single output, so that means we need a n_hidden by 1 matrix there. The second layer will also need a constant term added.

In [17]:
def init_coeffs(n_hidden=40):
    layer1 = (torch.rand(n_coeff, n_hidden)-0.5)/n_hidden
    layer2 = torch.rand(n_hidden, 1)-0.3
    const = torch.rand(1)[0]
    return layer1.requires_grad_(),layer2.requires_grad_(),const.requires_grad_()

Now we have our coefficients, we can create our neural net. The key steps are the two matrix products, indeps@l1 and res@l2 (where res is the output of the first layer). The first layer output is passed to F.relu (that's our non-linearity), and the second is passed without any activation (linear activation) as this allows unbounded real outputs for regression.

In [18]:
import torch.nn.functional as F
def calc_preds(coeffs, indeps):
    l1,l2,const = coeffs
    res = F.leaky_relu(indeps @ l1, negative_slope=0.01)
    res = F.relu(res@l2 + const)
    return res

Finally, now that we have more than one set of coefficients, we need to add a loop to update each one:

In [19]:
def update_coeffs(coeffs, lr):
    for layer in coeffs:
        layer.sub_(layer.grad * lr)
        layer.grad.zero_()

In [20]:
def train_model(epochs = 12, lr = 0.1):
    torch.manual_seed(442)
    coeffs = init_coeffs()
    for i in range(epochs):
        res = calc_preds(coeffs, trn_indep)
        loss = torch.abs((res) - trn_dep).mean()
        loss.backward()
        with torch.no_grad():
            update_coeffs(coeffs, lr)
        print(f"{loss:.3f}", end="; ")
    return coeffs

In [21]:
coeffs = train_model(epochs = 850, lr = 0.025)

88.238; 88.113; 87.919; 87.719; 87.514; 87.294; 87.044; 86.787; 86.519; 86.231; 85.921; 85.594; 85.246; 84.869; 84.471; 84.039; 83.583; 83.102; 82.589; 82.043; 81.461; 80.839; 80.161; 79.452; 78.707; 77.906; 77.059; 76.178; 75.267; 74.328; 73.359; 72.361; 71.335; 70.288; 69.223; 68.146; 67.062; 65.972; 64.886; 63.809; 62.744; 61.699; 60.673; 59.669; 58.695; 57.754; 56.847; 55.978; 55.158; 54.389; 53.669; 52.997; 52.370; 51.789; 51.255; 50.766; 50.318; 49.910; 49.541; 49.208; 48.906; 48.633; 48.385; 48.161; 47.957; 47.773; 47.605; 47.452; 47.313; 47.185; 47.066; 46.955; 46.851; 46.753; 46.661; 46.572; 46.488; 46.406; 46.328; 46.251; 46.177; 46.105; 46.034; 45.965; 45.896; 45.829; 45.762; 45.696; 45.630; 45.565; 45.500; 45.435; 45.371; 45.306; 45.242; 45.178; 45.113; 45.049; 44.984; 44.920; 44.855; 44.790; 44.725; 44.660; 44.594; 44.528; 44.462; 44.396; 44.330; 44.263; 44.196; 44.129; 44.061; 43.993; 43.925; 43.857; 43.788; 43.719; 43.650; 43.581; 43.511; 43.440; 43.370; 43.299; 43.227; 

# Measuring RMSLE

In [22]:
coeffs_detached = [c.detach() for c in coeffs] 

In [23]:
res_lr_re = calc_preds(coeffs_detached, val_indep)
res_lr_re[:20]

tensor([[ 47.8076],
        [ 22.3647],
        [ 15.1160],
        [ 62.8945],
        [145.9174],
        [ 73.9797],
        [ 91.7850],
        [103.2430],
        [183.2233],
        [ 43.7155],
        [ 96.8142],
        [131.8031],
        [192.7808],
        [ 69.4947],
        [ 81.3401],
        [ 94.7430],
        [ 61.4321],
        [ 76.5971],
        [ 28.1045],
        [  3.7485]])

In [24]:
def rmsle(pred, val_dep):
    rmsle = torch.abs((torch.log(1+pred) - torch.log(1+val_dep)).mean())
    return rmsle

In [25]:
rmsle(res_lr_re, val_dep)

tensor(0.0117, dtype=torch.float64)

In [26]:
test_df = test.copy()

In [27]:
test_df =  pd.get_dummies(test_df, columns=["Sex"], dtype=int)

In [28]:
test_df = test_df.drop(columns = ['id'])

In [29]:
tst_indep = tensor(test_df.values, dtype=torch.float)
tst_indep = tst_indep/vals

In [30]:
preds_test = calc_preds(coeffs_detached, tst_indep)

In [31]:
test['Calories'] = preds_test

In [32]:
sub_df = test[['id','Calories']]
sub_df.to_csv('sub_relu_bias.csv', index=False)

In [33]:
!head sub_relu.csv

id,Calories
750000,29.786493
750001,110.42535
750002,82.10322
750003,116.03846
750004,83.3504
750005,23.146364
750006,44.3384
750007,7.399134
750008,9.549758


# Deep learning

The neural net in the previous section only uses one hidden layer, so it doesn't count as "deep" learning. But we can use the exact same technique to make our neural net deep, by adding more matrix multiplications.

First, we'll need to create additional coefficients for each layer:

In [34]:
def init_coeffs():
    hiddens = [40, 20]  # <-- set this to the size of each hidden layer you want
    sizes = [n_coeff] + hiddens + [1]
    n = len(sizes)
    layers = [(torch.rand(sizes[i], sizes[i+1])-0.3)/sizes[i+1]*4 for i in range(n-1)]
    consts = [(torch.rand(1)[0]-0.5)*0.1 for i in range(n-1)]
    for l in layers+consts: l.requires_grad_()
    return layers,consts

While building and training the deep learning model, it as observed that even small tweaks to the initialization of the weights made a big difference — sometimes causing the model to completely fail to learn.

The deep learning calc_preds looks much the same as before, but now we loop through each layer, instead of listing them separately:

In [35]:
import torch.nn.functional as F

def calc_preds(coeffs, indeps):
    layers,consts = coeffs
    n = len(layers)
    res = indeps
    for i,l in enumerate(layers):
        res = res@l + consts[i]
        if i!=(n-1): 
            res = F.leaky_relu(res, negative_slope=0.01)
        else:
            res = F.relu(res)
    return res

We also need a minor update to update_coeffs since we've got layers and consts separated now:`m

In [36]:
def update_coeffs(coeffs, lr):
    layers,consts = coeffs
    for layer in layers+consts:
        layer.sub_(layer.grad * lr)
        layer.grad.zero_()

In [52]:
def train_model(epochs=30, lr=0.01):
    torch.manual_seed(442)
    coeffs = init_coeffs()
    for i in range(epochs): 
        res = calc_preds(coeffs, trn_indep)
        loss = torch.abs((res) - trn_dep).mean()
        loss.backward()
        with torch.no_grad():
            update_coeffs(coeffs, lr)
        print(f"{loss:.3f}", end="; ")
    return coeffs

In [64]:
coeffs = train_model(epochs = 550, lr = 0.002)

86.915; 84.740; 82.364; 79.981; 77.698; 75.469; 73.307; 71.193; 69.146; 67.187; 65.319; 63.566; 61.929; 60.410; 59.021; 57.767; 56.684; 55.740; 54.938; 54.261; 53.708; 53.254; 52.882; 52.591; 52.359; 52.175; 52.031; 51.916; 51.825; 51.754; 51.698; 51.653; 51.616; 51.586; 51.560; 51.538; 51.519; 51.502; 51.486; 51.472; 51.458; 51.445; 51.433; 51.421; 51.409; 51.398; 51.386; 51.375; 51.364; 51.353; 51.342; 51.330; 51.319; 51.308; 51.297; 51.286; 51.275; 51.264; 51.253; 51.242; 51.231; 51.220; 51.208; 51.197; 51.186; 51.175; 51.163; 51.152; 51.141; 51.129; 51.118; 51.107; 51.095; 51.084; 51.072; 51.061; 51.049; 51.038; 51.026; 51.015; 51.003; 50.991; 50.980; 50.968; 50.956; 50.944; 50.932; 50.921; 50.909; 50.897; 50.885; 50.873; 50.861; 50.849; 50.837; 50.824; 50.812; 50.800; 50.788; 50.775; 50.763; 50.751; 50.738; 50.726; 50.713; 50.701; 50.688; 50.676; 50.663; 50.650; 50.638; 50.625; 50.612; 50.599; 50.586; 50.573; 50.560; 50.547; 50.534; 50.521; 50.508; 50.495; 50.482; 50.468; 50.455; 

In [71]:
coeffs_trained = coeffs

Experimenting with lower learning rates.

In [123]:
for i in range(500): 
    res = calc_preds(coeffs_trained, trn_indep)
    loss = torch.abs((res) - trn_dep).mean()
    loss.backward()
    with torch.no_grad():
        update_coeffs(coeffs_trained, 0.00035)
    print(f"{loss:.3f}", end="; ")

7.058; 6.920; 6.879; 6.821; 6.800; 6.768; 6.756; 6.737; 6.729; 6.718; 6.712; 6.705; 6.702; 6.697; 6.695; 6.692; 6.690; 6.688; 6.686; 6.685; 6.684; 6.683; 6.682; 6.681; 6.680; 6.680; 6.679; 6.678; 6.678; 6.677; 6.677; 6.676; 6.676; 6.675; 6.675; 6.674; 6.674; 6.674; 6.673; 6.673; 6.672; 6.672; 6.671; 6.671; 6.670; 6.670; 6.669; 6.669; 6.668; 6.668; 6.667; 6.667; 6.667; 6.666; 6.666; 6.665; 6.665; 6.664; 6.664; 6.663; 6.663; 6.662; 6.662; 6.661; 6.661; 6.661; 6.660; 6.660; 6.659; 6.659; 6.658; 6.658; 6.657; 6.657; 6.656; 6.656; 6.656; 6.655; 6.655; 6.654; 6.654; 6.653; 6.653; 6.652; 6.652; 6.651; 6.651; 6.650; 6.650; 6.650; 6.649; 6.649; 6.648; 6.648; 6.647; 6.647; 6.646; 6.646; 6.645; 6.645; 6.645; 6.644; 6.644; 6.643; 6.643; 6.642; 6.642; 6.641; 6.641; 6.640; 6.640; 6.640; 6.639; 6.639; 6.638; 6.638; 6.637; 6.637; 6.636; 6.636; 6.636; 6.635; 6.635; 6.634; 6.634; 6.633; 6.633; 6.632; 6.632; 6.631; 6.631; 6.631; 6.630; 6.630; 6.629; 6.629; 6.628; 6.628; 6.627; 6.627; 6.627; 6.626; 6.626;

In [124]:
coeffs_prev = coeffs_trained

In [128]:
coeffs_trained = coeffs_prev

In [125]:
res_dnn = calc_preds(coeffs_prev, val_indep)
res_dnn[:20]

tensor([[ 48.8859],
        [ 22.5483],
        [ 16.1244],
        [ 62.2050],
        [143.6851],
        [ 74.4506],
        [ 90.9737],
        [101.5608],
        [180.7012],
        [ 45.0365],
        [ 96.8265],
        [134.1979],
        [192.6899],
        [ 70.7964],
        [ 77.4568],
        [ 89.8667],
        [ 59.8620],
        [ 82.0239],
        [ 27.4504],
        [  4.3156]], grad_fn=<SliceBackward0>)

In [126]:
rmsle(res_dnn, val_dep)

tensor(0.0141, dtype=torch.float64, grad_fn=<AbsBackward0>)

In [131]:
preds_test = calc_preds(coeffs_prev, tst_indep)
test['Calories'] = preds_test.detach().numpy()

In [132]:
sub_df = test[['id','Calories']]
sub_df.to_csv('sub_dnn.csv', index=False)

In [133]:
!head sub_dnn.csv

id,Calories
750000,27.496075
750001,110.738235
750002,82.47298
750003,116.671295
750004,77.61338
750005,22.979803
750006,46.875248
750007,7.8778753
750008,9.256768
