### Multiple linear regression by hand vs sklearn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

%matplotlib inline

In [3]:
df_original = pd.read_csv('https://github.com/ravikrishnareddy/DL/blob/main/datasets/BodyFat.csv?raw=true', index_col='case_number')
df_original.head()

Unnamed: 0_level_0,body_fat_percent_brozek,body_fat_percent_siri,density,age(years),weight(lbs),height(inches),adiposity_index,fat_free_weight(lbs),neck_circumference(cm),chest_circumference(cm),abdoment_circumference(cm),hip_circumference(cm),thigh_circumference(cm),knee_circumference(cm),ankle_circumference(cm),bicep_circumference(cm),forearm_circumference(cm),wrist_circumference(cm)
case_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,12.6,12.3,1.0708,23,154.25,67.75,23.7,134.9,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,17.1
2,6.9,6.1,1.0853,22,173.25,72.25,23.4,161.3,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2
3,24.6,25.3,1.0414,22,154.0,66.25,24.7,116.0,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6
4,10.9,10.4,1.0751,26,184.75,72.25,24.9,164.7,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2
5,27.8,28.7,1.034,24,184.25,71.25,25.6,133.1,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,17.7


In [4]:
# subset data to create a multiple linear regression manually

df = df_original.loc[:,['age(years)','height(inches)','weight(lbs)']]
df.head(5)

Unnamed: 0_level_0,age(years),height(inches),weight(lbs)
case_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,23,67.75,154.25
2,22,72.25,173.25
3,22,66.25,154.0
4,26,72.25,184.75
5,24,71.25,184.25


#### Mathematical approach to calculate slopes and intercept

In [29]:
# create a function to calculate pearson's coefficient
def pearson_coef(x,y):
    numerator = ((x-x.mean())*(y-y.mean())).sum()
    denominator = ((x-x.mean())**2).sum() * ((y-y.mean()**2)).sum()
    if (denominator < 0):
        return numerator / (-1 * np.sqrt(denominator*-1))
    else:
        return numerator / np.sqrt(denominator)

age_coef = pearson_coef(df['age(years)'], df['weight(lbs)']) * (df['weight(lbs)'].std() / df['age(years)'].std())
height_coef = pearson_coef(df['height(inches)'], df['weight(lbs)']) * (df['weight(lbs)'].std() / df['height(inches)'].std())
y_intercept = df['weight(lbs)'].mean() - age_coef*df['age(years)'].mean() - height_coef*df['height(inches)'].mean()

print('age_coef: {}'.format(age_coef))
print('height_coef: {}'.format(height_coef))
print('intercept: {}'.format(y_intercept))

age_coef: 0.004886463554278197
height_coef: -0.4066135580279215
intercept: 207.22853326498472


#### sklearn approach to calculate slopes and intercept

In [30]:
linear_regression_model = LinearRegression()
linear_regression_model.fit(df[['age(years)','height(inches)']], df['weight(lbs)'])

age_coef = linear_regression_model.coef_[0]
height_coef = linear_regression_model.coef_[1]
y_intercept = linear_regression_model.intercept_

print('age_coef: {}'.format(age_coef))
print('height_coef: {}'.format(height_coef))
print('intercept: {}'.format(y_intercept))

age_coef: 0.09652027277465573
height_coef: 2.5304922719300413
intercept: -2.9189204063390264


#### Moral of the story

1. The way we calculated slope and intercept in the case of simple linear regression don't work for multiple linear regression
2. The formula used for slope calculation in multiple linear regression is different and it takes into account other explanatory variables also while calculating slope of a explanatory variable