In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import csv

# Load Data

In [9]:
# Read the raw dataset (path is relative to the notebook's working directory).
df = pd.read_csv("Dataset.csv")
# Replace the file's header with the seven column names used throughout this notebook.
df.columns = ['ID','WBFP','Weight','Height',"BMI","Age","Sport"]

# Drop every row that contains a missing value.
df = df.dropna()

# Preview the first five rows.
df.head(5)

Unnamed: 0,ID,WBFP,Weight,Height,BMI,Age,Sport
0,1,12.18,84.364,192.0,22.885,25,basket
1,2,8.08,84.976,204.5,20.319,25,basket
2,3,14.031,90.785,190.5,25.016,31,basket
3,4,12.143,96.159,192.1,26.058,26,basket
4,5,12.81,97.425,198.2,24.801,34,basket


# Train a linear model to predict BMI based on age, height and weight

We are assuming that

$$\text{BMI} = w_1 \cdot \text{height} + w_2 \cdot \text{weight} + w_3 \cdot \text{age} + \text{bias}$$




Split into train and validation

In [11]:
# Fraction of the rows assigned to the training split.
train_pct = 0.8
# Positional cut-off: rows before it go to training, rows from it onward to validation.
train_index = int(len(df)*train_pct)

# .copy() gives each split its own data, so the in-place normalization done later
# does not write back into df.
train_data = df.iloc[:train_index].copy()
validation_data = df.iloc[train_index:].copy()
# NOTE(review): the split is positional and unshuffled. The displayed rows suggest the
# file is grouped by Sport, so the validation rows may come from different sports than
# the training rows - confirm whether a shuffled split is wanted.
print(f'train = {len(train_data)},\nvalidation = {len(validation_data)}')

train = 280,
validation = 70


# Random initialization of weights

The three weights and the bias are each drawn from the standard normal distribution.



In [12]:
def reset(seed=None):
    """Re-initialize the global model parameters w1, w2, w3 and bias.

    Each parameter is drawn independently from the standard normal
    distribution via ``np.random.randn()``.

    Parameters
    ----------
    seed : int, optional
        When given, NumPy's global random state is seeded with this value
        before drawing, so the same seed always yields the same parameters.
        When omitted (the default) the current global random state is used,
        exactly as before this parameter existed.
    """
    global w1,w2,w3,bias
    if seed is not None:
        # Seeding makes the "random" starting point reproducible across kernel restarts.
        np.random.seed(seed)
    w1  = np.random.randn()
    w2  = np.random.randn()
    w3  = np.random.randn()
    bias  = np.random.randn()

# Draw the initial random parameters.
reset()


def print_weight():
    """Print the current global weights and bias, one per line."""
    print('w1 = {},\nw2 = {},\nw3 = {},\nbias = {}'.format(w1,w2,w3,bias))


print_weight()

w1 = -0.6097386701435867,
w2 = -0.2883776611589175,
w3 = -0.0032166711256735223,
bias = -1.09114847976101


# Normalize features

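Standardize the input features (weight, height, age) and the target BMI so that each has

mean = 0

standard deviation = 1

The means and standard deviations are computed on the training split and reused for the validation split and for new data.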

In [13]:
def normalize(df, means, stds):
    """Standardize the model columns of ``df`` in place.

    ``Weight``, ``Height`` and ``Age`` are always rescaled to
    ``(value - mean) / std``; ``BMI`` is rescaled the same way only when the
    column is present, so frames without a target (new data to predict on)
    are accepted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    means, stds : objects exposing ``Weight``, ``Height``, ``Age`` and ``BMI``
        as attributes (e.g. the Series returned by ``DataFrame.mean()`` /
        ``DataFrame.std()``).

    Notes
    -----
    The operation is not idempotent: calling it twice on the same frame
    standardizes already-standardized values.
    """
    for column in ('Weight', 'Height', 'Age'):
        df[column] = (df[column] - getattr(means, column)) / getattr(stds, column)
    if 'BMI' in df.columns:
        df['BMI'] = (df['BMI'] - means.BMI)/stds.BMI

def de_normalize(df, means, stds):
    """Return a copy of ``df`` with standardized columns mapped back to original units.

    ``Weight``, ``Height`` and ``Age`` are restored with their own mean/std.
    ``BMI`` and ``predictionBMI`` are restored, when present, using the BMI
    mean/std. The input frame is left untouched.
    """
    restored = df.copy()

    # Features that must always be present.
    for feature in ('Weight', 'Height', 'Age'):
        restored[feature] = restored[feature] * getattr(stds, feature) + getattr(means, feature)

    # Target and prediction share the BMI scale; either may be absent.
    for target in ('BMI', 'predictionBMI'):
        if target in restored.columns:
            restored[target] = restored[target] * stds.BMI + means.BMI

    return restored

In [14]:
# Standardization statistics come from the training split only.
# numeric_only=True skips the string column 'Sport'; without it, pandas >= 2.0 raises
# a TypeError when asked for the mean/std of a non-numeric column.
# NOTE: run this cell once per fresh train_data - re-running it after normalization
# would compute the statistics of already-normalized values.
means = train_data.mean(numeric_only=True)
stds = train_data.std(numeric_only=True)
normalize(train_data,means,stds)
print("Normalized train data")
train_data.head()

Normalized train data


Unnamed: 0,ID,WBFP,Weight,Height,BMI,Age,Sport
0,1,12.18,0.439571,1.170018,-0.348537,-0.222895,basket
1,2,8.08,0.491021,2.618368,-1.256472,-0.222895,basket
2,3,14.031,0.979376,0.996216,0.405481,0.504218,basket
3,4,12.143,1.431162,1.181605,0.774175,-0.101709,basket
4,5,12.81,1.537593,1.8884,0.329407,0.867775,basket


In [15]:
# Scale the validation split with the statistics computed on the training split.
normalize(validation_data,means,stds)
# The frame shown below is the validation split, so label it as such.
print('Normalized validation data')
validation_data.head()

Normalized test data


Unnamed: 0,ID,WBFP,Weight,Height,BMI,Age,Sport
280,281,12.973,-0.372534,-0.869258,0.244487,-0.707637,handball
281,282,13.922,-0.116545,-1.888896,1.585514,-0.34408,handball
282,283,17.618,0.822168,-0.092943,1.146761,0.019476,handball
283,284,22.325,0.004683,-0.266745,0.240949,-0.465266,handball
284,285,23.202,0.916157,-12.421296,48.643871,0.261847,handball


# Predict BMI using the linear function


In [40]:

def predict_BMI(df):
    """Add a ``predictionBMI`` column computed from the current global parameters.

    The prediction is ``w1 * Height + w2 * Weight + w3 * Age + bias``. The
    column is written into ``df`` itself and the same frame is returned.
    """
    weighted_terms = w1 * df['Height'] + w2 * df['Weight'] + w3 * df['Age']
    df['predictionBMI'] = weighted_terms + bias
    return df


print('Random weights predictions')
# predict_BMI writes the predictionBMI column into the frame it is given and returns
# that same frame, so preddf and train_data refer to the same object.
preddf = predict_BMI(train_data)
preddf.head()

Random weights predictions


Unnamed: 0,ID,WBFP,Weight,Height,BMI,Age,Sport,predictionBMI
0,1,12.18,0.439571,1.170018,-0.348537,-0.222895,basket,-0.365826
1,2,8.08,0.491021,2.618368,-1.256472,-0.222895,basket,-1.42238
2,3,14.031,0.979376,0.996216,0.405481,0.504218,basket,0.453386
3,4,12.143,1.431162,1.181605,0.774175,-0.101709,basket,0.855581
4,5,12.81,1.537593,1.8884,0.329407,0.867775,basket,0.462643


# Loss Function

Mean squared error: the mean, over all rows, of the squared difference between predictionBMI and the actual BMI

In [17]:
def calculate_loss(df):
    """Return the per-row squared error between ``predictionBMI`` and ``BMI``.

    The result has one value per row; callers take ``.mean()`` of it to
    obtain the mean squared error.
    """
    residual = df['predictionBMI'].sub(df['BMI'])
    return np.square(residual)

# Refresh the predictions on the training split with the current weights.
preddf = predict_BMI(train_data)
# calculate_loss returns one squared error per row; .mean() turns that into the MSE.
print('loss = ', calculate_loss(preddf).mean())

loss =  3.271320218841834


The goal is to drive the loss as close to zero as possible.

# Gradient descent

Gradient descent minimizes the mean squared error by repeatedly moving the weights and the bias a small step against the gradient of the loss.

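$$\text{predictedBMI} = w_1 \cdot \text{height} + w_2 \cdot \text{weight} + w_3 \cdot \text{age} + \text{bias}$$

$$\text{loss} = (\text{predictedBMI} - \text{actualBMI})^2$$

$$\frac{\partial\,\text{loss}}{\partial w_1} = 2\,(\text{predictedBMI} - \text{actualBMI}) \cdot \text{height}$$

$$\frac{\partial\,\text{loss}}{\partial w_2} = 2\,(\text{predictedBMI} - \text{actualBMI}) \cdot \text{weight}$$

$$\frac{\partial\,\text{loss}}{\partial w_3} = 2\,(\text{predictedBMI} - \text{actualBMI}) \cdot \text{age}$$

$$\frac{\partial\,\text{loss}}{\partial\,\text{bias}} = 2\,(\text{predictedBMI} - \text{actualBMI})$$

Each gradient is averaged over all training rows, and every parameter $w$ (the three weights and the bias) is then updated with learning rate $\eta$:

$$w \leftarrow w - \eta \cdot \frac{\partial\,\text{loss}}{\partial w}$$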

In [18]:
def calculate_gradients(df):
    """Return the mean gradients of the squared error for (w1, w2, w3, bias).

    For each row the gradient with respect to a weight is
    ``2 * (predictionBMI - BMI) * feature`` and with respect to the bias is
    ``2 * (predictionBMI - BMI)``; each is averaged over all rows of ``df``.
    The weights correspond to Height, Weight and Age, in that order.
    """
    doubled_error = 2 * (df['predictionBMI'] - df['BMI'])
    weight_grads = [
        (doubled_error * df[feature]).values.mean()
        for feature in ('Height', 'Weight', 'Age')
    ]
    bias_grad = doubled_error.values.mean()
    return (*weight_grads, bias_grad)

In [19]:
def train(learning_rate = 0.01):
    """Run one gradient-descent step on the training data and return the new loss.

    Updates the global parameters ``w1``, ``w2``, ``w3`` and ``bias`` in
    place, and leaves the global ``preddf`` holding the training frame with
    predictions from the updated parameters.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to every gradient.

    Returns
    -------
    The mean squared error on ``train_data`` after the update.
    """
    global w1, w2, w3, bias, preddf
    # Recompute predictions with the *current* parameters first. Relying on whatever
    # preddf was left over from an earlier cell gives stale gradients after reset()
    # or out-of-order execution.
    preddf = predict_BMI(train_data)
    dw1, dw2, dw3, dbias = calculate_gradients(preddf)
    w1 = w1 - dw1 * learning_rate
    w2 = w2 - dw2 * learning_rate
    w3 = w3 - dw3 * learning_rate
    # dbias is already averaged over rows by calculate_gradients.
    bias = bias - dbias * learning_rate
    preddf = predict_BMI(train_data)
    return calculate_loss(preddf).mean()

In [20]:
print('\nPrediction on validation set before training')
# predict_BMI adds predictionBMI to validation_data in place; de_normalize then returns
# a copy converted back to original units, so the first ten rows are readable.
de_normalize(predict_BMI(validation_data),means,stds).head(10)


Prediction on validation set before training


Unnamed: 0,ID,WBFP,Weight,Height,BMI,Age,Sport,predictionBMI
280,281,12.973,74.704,174.4,24.561,21.0,handball,22.594228
281,282,13.922,77.749,165.6,28.351,24.0,handball,24.139369
282,283,17.618,88.915,181.1,27.111,27.0,handball,20.276146
283,284,22.325,79.191,179.6,24.551,23.0,handball,21.246314
284,285,23.202,90.033,74.7,161.347,29.0,handball,41.442042
285,286,24.969,102.508,187.7,29.096,28.0,handball,18.025882
286,287,10.503,59.103,182.3,17.784,23.0,handball,22.083579
287,288,22.594,95.179,186.0,27.512,22.0,handball,18.874089
288,289,16.867,86.437,186.8,24.771,21.0,handball,19.314432
289,290,13.942,87.547,181.7,26.518,20.0,rugby,20.257788


# Train

In [27]:
import time, math
from tqdm.notebook import  tqdm


# Step size passed to every train() call below.
learning_rate = 0.01

# Run 300 gradient-descent steps over the whole training split, with a progress bar.
for i in tqdm(range(300)):
    loss = train(learning_rate)
    # NOTE(review): this pause adds about 3 s over 300 steps and does not affect the
    # result; presumably it only keeps the progress bar visible - confirm it is wanted.
    time.sleep(0.01)
    # Report the training loss every 20th step.
    if i%20 ==0:
        print(f'epoch : {i} : loss = {loss}')


  0%|          | 0/300 [00:00<?, ?it/s]

epoch : 0 : loss = 0.010640205427402199
epoch : 20 : loss = 0.009456020186785243
epoch : 40 : loss = 0.008560456542169317
epoch : 60 : loss = 0.007883067266468494
epoch : 80 : loss = 0.007370656199671617
epoch : 100 : loss = 0.006983022737188933
epoch : 120 : loss = 0.006689773211191279
epoch : 140 : loss = 0.0064679223030130704
epoch : 160 : loss = 0.0063000845264394685
epoch : 180 : loss = 0.0061731087308892
epoch : 200 : loss = 0.006077046256584121
epoch : 220 : loss = 0.00600437083863431
epoch : 240 : loss = 0.005949388670317176
epoch : 260 : loss = 0.005907792196622419
epoch : 280 : loss = 0.00587632258665827


In [28]:
# Show the parameter values reached by the training loop.
print('after training')
print_weight()

after training
w1 = -0.7734672635162253,
w2 = 1.2380310113130453,
w3 = 0.022671726012017095,
bias = -3.2560544147266065e-06


In [29]:

print('\nPrediction on validation set after training')
# Overwrite predictionBMI on validation_data using the trained weights, then show the
# first ten rows converted back to original units.
de_normalize(predict_BMI(validation_data),means,stds).head(10)


Prediction on validation set after training


Unnamed: 0,ID,WBFP,Weight,Height,BMI,Age,Sport,predictionBMI
280,281,12.973,74.704,174.4,24.561,21.0,handball,24.421387
281,282,13.922,77.749,165.6,28.351,24.0,handball,27.569263
282,283,17.618,88.915,181.1,27.111,27.0,handball,26.951135
283,284,22.325,79.191,179.6,24.551,23.0,handball,24.439691
284,285,23.202,90.033,74.7,161.347,29.0,handball,54.244908
285,286,24.969,102.508,187.7,29.096,28.0,handball,29.285603
286,287,10.503,59.103,182.3,17.784,23.0,handball,17.846948
287,288,22.594,95.179,186.0,27.512,22.0,handball,27.513774
288,289,16.867,86.437,186.8,24.771,21.0,handball,24.731926
289,290,13.942,87.547,181.7,26.518,20.0,rugby,26.342414


In [30]:
def predictBMI_real(data):
    """Predict BMI for raw (un-normalized) records.

    ``data`` is anything ``pd.DataFrame`` accepts that yields ``Age``,
    ``Height`` and ``Weight`` columns in original units. The records are
    standardized with the global ``means``/``stds``, scored with the current
    weights, and returned as a frame converted back to original units with
    an added ``predictionBMI`` column.
    """
    frame = pd.DataFrame(data)
    # normalize works in place and returns nothing.
    normalize(frame, means, stds)
    scored = predict_BMI(frame)
    return de_normalize(scored, means, stds)

# Use the trained linear model to predict my BMI

In [31]:
# One record in original units; the extra 'name' key is carried through to the
# output untouched because only Age, Height and Weight are rescaled.
new_data = [{'name' :'Ebad', 'Age': 22, 'Height': 186, 'Weight': 75}]
predictBMI_real(new_data)

Unnamed: 0,name,Age,Height,Weight,predictionBMI
0,Ebad,22.0,186.0,75.0,21.578129
