# Linear regression
A simple linear regression model that predicts a person's body mass index (the dataset's integer `index` column) from their height and weight.

In [5]:
# imports
import pandas as pd
import numpy as np


In [6]:
# read dataset
filename = "datasets/500_Person_Gender_Height_Weight_Index"

# Load only the columns at positions 1-3 of the CSV and give them short names;
# header=0 tells pandas the first line is a header row, which `names` replaces.
df = pd.read_csv(
    f"{filename}.csv",
    header=0,
    usecols=[1, 2, 3],
    names=["height", "weight", "index"],
)
print(df)


     height  weight  index
0       174      96      4
1       189      87      2
2       185     110      4
3       195     104      3
4       149      61      3
..      ...     ...    ...
495     150     153      5
496     184     121      4
497     141     136      5
498     150      95      5
499     173     131      5

[500 rows x 3 columns]


In [7]:
# useful functions
def split_train_test(df, n):
    """
    Cut a dataframe into two consecutive parts by row position.

    df: the dataframe to split.
    n:  number of leading rows that go into the train set.

    Returns a (train, test) tuple: the first n rows, then all remaining rows.
    Row order is kept as-is; nothing is shuffled.
    """
    leading_rows = df.iloc[:n]
    remaining_rows = df.iloc[n:]
    return leading_rows, remaining_rows

In [8]:
# split dataset into train and test
train_df, test_df = split_train_test(df, 300)

print(train_df)
print(test_df)

# create array-like objects for train and test data:
# one NumPy array per column, unpacked in the order height, weight, index
column_names = ("height", "weight", "index")
height_train, weight_train, z_train = (np.array(train_df[name]) for name in column_names)
height_test, weight_test, z_test = (np.array(test_df[name]) for name in column_names)


     height  weight  index
0       174      96      4
1       189      87      2
2       185     110      4
3       195     104      3
4       149      61      3
..      ...     ...    ...
295     160     156      5
296     169      88      2
297     140      76      4
298     187      92      3
299     151      82      4

[300 rows x 3 columns]
     height  weight  index
300     186     140      5
301     182     108      4
302     188      81      2
303     179     110      4
304     156     126      5
..      ...     ...    ...
495     150     153      5
496     184     121      4
497     141     136      5
498     150      95      5
499     173     131      5

[200 rows x 3 columns]


## Train the model

In [9]:
# set initial values for learnable parameters
a, b, c = 1, 1, 0

# step size and number of full passes over the training data
lr = 0.000005
epochs = 1000

n = len(z_train)

for i in range(epochs):
    # residual of the current fit a*height + b*weight + c against the targets
    z_predicted = a*height_train + b*weight_train + c
    error = z_predicted - z_train

    # mean squared error, measured before this epoch's parameter update
    loss = np.sum(error**2)/n

    # partial derivatives of the mean squared error w.r.t. a, b and c
    grad_a = 2*np.sum(error*height_train)/n
    grad_b = 2*np.sum(error*weight_train)/n
    grad_c = 2*np.sum(error)/n

    # gradient-descent step
    a -= grad_a*lr
    b -= grad_b*lr
    c -= grad_c*lr

    print(f"loss: {loss}  \t({i+1}/{epochs})")
    

loss: 75669.55333333333  	(1/1000)
loss: 26581.7353481346  	(2/1000)
loss: 9361.339899110204  	(3/1000)
loss: 3319.9278773495917  	(4/1000)
loss: 1200.0708182544802  	(5/1000)
loss: 455.88938115979255  	(6/1000)
loss: 194.29832654792713  	(7/1000)
loss: 102.00650707657003  	(8/1000)
loss: 69.11241192898677  	(9/1000)
loss: 57.06286851948007  	(10/1000)
loss: 52.33344079595091  	(11/1000)
loss: 50.17968334205904  	(12/1000)
loss: 48.93714106472276  	(13/1000)
loss: 48.02181002495408  	(14/1000)
loss: 47.22870642436813  	(15/1000)
loss: 46.48580715746107  	(16/1000)
loss: 45.76773303430958  	(17/1000)
loss: 45.06546914480554  	(18/1000)
loss: 44.37574284641467  	(19/1000)
loss: 43.697297574373394  	(20/1000)
loss: 43.02958568144642  	(21/1000)
loss: 42.37230986476701  	(22/1000)
loss: 41.72526227431024  	(23/1000)
loss: 41.08826804738732  	(24/1000)
loss: 40.46116547579786  	(25/1000)
loss: 39.84379902397082  	(26/1000)
loss: 39.236016855566675  	(27/1000)
loss: 38.63766994200652  	(28/1

In [10]:
# report the fitted coefficients, one per line
print(f"a: {a}", f"b: {b}", f"c: {c}", sep="\n")


a: -0.002295480476659097
b: 0.03870236114858246
c: -0.005551631453991097


## Test the model

In [11]:
from sklearn.metrics import r2_score

In [13]:
# apply the fitted coefficients to the held-out rows and score the result
z_prediction = a*height_test + b*weight_test + c
print(f"R2 Score: {r2_score(z_test, z_prediction)}")

# list every test target next to the value the model produced for it
for actual, predicted in zip(z_test, z_prediction):
    print(f"BMI: {actual}  Predicted: {predicted}")


R2 Score: 0.7178764419802066
BMI: 5  Predicted: 4.985819560688962
BMI: 4  Predicted: 3.7565259258409593
BMI: 2  Predicted: 2.6977892919692783
BMI: 4  Predicted: 3.8408170895681013
BMI: 5  Predicted: 4.512850918908581
BMI: 4  Predicted: 3.9749672098724997
BMI: 5  Predicted: 5.495836697050511
BMI: 5  Predicted: 3.069706960982356
BMI: 1  Predicted: 2.2149971143730163
BMI: 5  Predicted: 5.058954128987431
BMI: 5  Predicted: 5.291168295878926
BMI: 5  Predicted: 5.618830221926236
BMI: 5  Predicted: 5.206556325197162
BMI: 2  Predicted: 2.176615560179056
BMI: 2  Predicted: 1.9034035516623191
BMI: 5  Predicted: 4.749656046753393
BMI: 3  Predicted: 3.3944317926437626
BMI: 5  Predicted: 4.893308895919049
BMI: 4  Predicted: 3.2809409966292966
BMI: 5  Predicted: 5.102568258043954
BMI: 4  Predicted: 4.125185693513511
BMI: 4  Predicted: 3.874928489763366
BMI: 5  Predicted: 3.6682854150697435
BMI: 5  Predicted: 4.983524080212303
BMI: 4  Predicted: 2.2182550157135417
BMI: 2  Predicted: 1.744644760023915