# Linear regression
A simple linear regression model that predict a person's body mass index from its height and weight

In [41]:
# imports
import pandas as pd
import numpy as np


In [42]:
# read dataset
filename = "datasets/500_Person_Gender_Height_Weight_Index"
df = pd.read_csv(f"{filename}.csv", usecols=[1,2,3], header=0, names=["height", "weight", "index"])
print(df)


     height  weight  index
0       174      96      4
1       189      87      2
2       185     110      4
3       195     104      3
4       149      61      3
..      ...     ...    ...
495     150     153      5
496     184     121      4
497     141     136      5
498     150      95      5
499     173     131      5

[500 rows x 3 columns]


In [43]:
# useful functions
def split_train_test(df, p):
    """
    Takes a dataframe and the number of data in the train set.
    Returns a list of dataframes.
    """
    n = int(p*len(df))
    train = df.iloc[0:n, :]
    test = df.iloc[n:len(df), :]
    return train, test


In [44]:
# split dataset into train and test
train_df, test_df = split_train_test(df, 0.5)

print(train_df)
print(test_df)

# create array-like objects for train and test data
x_train = np.array(train_df["height"])
y_train = np.array(train_df["weight"])
z_train = np.array(train_df["index"])

x_test = np.array(test_df["height"])
y_test = np.array(test_df["weight"])
z_test = np.array(test_df["index"])


     height  weight  index
0       174      96      4
1       189      87      2
2       185     110      4
3       195     104      3
4       149      61      3
..      ...     ...    ...
245     151     114      5
246     182      98      3
247     142     159      5
248     188      90      3
249     161      89      4

[250 rows x 3 columns]
     height  weight  index
250     153      70      3
251     140     143      5
252     169     141      5
253     162     159      5
254     183     147      5
..      ...     ...    ...
495     150     153      5
496     184     121      4
497     141     136      5
498     150      95      5
499     173     131      5

[250 rows x 3 columns]


## Train the model

In [45]:
# set initial values for learnable parameters
a = 1
b = 1
c = 0

lr = 0.000005 # learning rate
epochs = 1000 # number of iterations

n = len(z_train)

for i in range(epochs):
    z_predicted = a*x_train + b*y_train + c # make a prediction
    error = z_predicted - z_train # calculate the error
    loss = np.sum(error**2)/n # calculate the loss 
    loss_a = 2*np.sum(error*x_train)/n # partial derivatives of the loss
    loss_b = 2*np.sum(error*y_train)/n
    loss_c = 2*np.sum(error)/n
    a = a - loss_a*lr # adjust the parameters 
    b = b - loss_b*lr
    c = c - loss_c*lr
    print(f"loss: {loss}  \t({i+1}/{epochs})")
    

loss: 76351.8  	(1/1000)
loss: 26472.69092866148  	(2/1000)
loss: 9202.63829055032  	(3/1000)
loss: 3222.7192950313724  	(4/1000)
loss: 1151.754334415275  	(5/1000)
loss: 434.1820979419113  	(6/1000)
loss: 185.19899085255236  	(7/1000)
loss: 98.46228708067387  	(8/1000)
loss: 67.90783811700328  	(9/1000)
loss: 56.81328077891344  	(10/1000)
loss: 52.46422870302127  	(11/1000)
loss: 50.45849913009371  	(12/1000)
loss: 49.27181771102934  	(13/1000)
loss: 48.376320593340935  	(14/1000)
loss: 47.589129650457544  	(15/1000)
loss: 46.84681182944302  	(16/1000)
loss: 46.12729093577447  	(17/1000)
loss: 45.422811383607836  	(18/1000)
loss: 44.730577498058004  	(19/1000)
loss: 44.049512565268664  	(20/1000)
loss: 43.37913670274464  	(21/1000)
loss: 42.719178322158534  	(22/1000)
loss: 42.069439581832256  	(23/1000)
loss: 41.42974977761149  	(24/1000)
loss: 40.799949180745436  	(25/1000)
loss: 40.17988341771501  	(26/1000)
loss: 39.56940150034191  	(27/1000)
loss: 38.96835512009377  	(28/1000)
lo

In [46]:
print(f"a: {a}")
print(f"b: {b}")
print(f"c: {c}")


a: -0.0026275756774368207
b: 0.03905622569813027
c: -0.005540027441873309


## Test the model

In [47]:
from sklearn.metrics import r2_score

In [48]:
z_prediction = a*x_test + b*y_test + c
print(f"R2 Score: {r2_score(z_test, z_prediction)}")

for i in range(len(z_prediction)):
    print(f"BMI: {z_test[i]}  Predicted: {z_prediction[i]}")


R2 Score: 0.6925494353846432
BMI: 3  Predicted: 2.326376692779412
BMI: 5  Predicted: 5.2116396525496
BMI: 5  Predicted: 5.057327506507672
BMI: 5  Predicted: 5.778732598816074
BMI: 5  Predicted: 5.254878801212338
BMI: 2  Predicted: 1.8340538033049174
BMI: 4  Predicted: 3.799645557135193
BMI: 5  Predicted: 4.248014479020106
BMI: 5  Predicted: 5.213552409300193
BMI: 4  Predicted: 4.666407840062947
BMI: 5  Predicted: 3.6385229124512204
BMI: 5  Predicted: 4.68253070359099
BMI: 5  Predicted: 4.9352610875218295
BMI: 2  Predicted: 1.6128543275033198
BMI: 4  Predicted: 2.888929306617857
BMI: 5  Predicted: 2.9828072120787383
BMI: 5  Predicted: 4.968704752401663
BMI: 5  Predicted: 5.252966044461745
BMI: 5  Predicted: 5.33334866207202
BMI: 1  Predicted: 1.5058385436552542
BMI: 5  Predicted: 4.6907708400867225
BMI: 5  Predicted: 3.6178597164951483
BMI: 4  Predicted: 4.2289066303512035
BMI: 0  Predicted: 1.7923700019293503
BMI: 3  Predicted: 2.768418234919185
BMI: 3  Predicted: 3.1743885365016866
BM