# Multiple linear regression
#### Following the tutorial at https://codeburst.io/multiple-linear-regression-sklearn-and-statsmodels-798750747755

In [1]:
# Note: we are NOT doing a train/test split here; the models are fit on the full dataset, so all fit statistics are in-sample

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [3]:
# Read the dataset from a CSV file in the working directory and preview the first rows
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,129,90,7,326,19.6,0.582,60,0
1,12,92,62,7,258,27.6,0.926,44,1
2,1,90,68,8,0,24.5,1.138,36,0
3,1,109,60,8,182,25.4,0.947,21,0
4,1,73,50,10,0,23.0,0.248,21,0


In [4]:
# We want to use Age, BloodPressure, and BMI to predict Glucose.
# So the independent variables X are Age, BloodPressure, and BMI,
# and the dependent variable y is Glucose.

In [5]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

(539, 9)

In [6]:
# 539 rows and 9 columns

In [7]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0
mean,3.525046,119.858998,71.204082,29.176252,113.538033,32.895733,0.502451,31.597403,0.333952
std,3.324018,32.946653,13.027106,10.489706,122.861998,6.871851,0.343215,10.74438,0.472061
min,0.0,0.0,0.0,7.0,0.0,18.2,0.085,21.0,0.0
25%,1.0,97.0,64.0,22.0,0.0,27.85,0.259,23.0,0.0
50%,2.0,115.0,72.0,29.0,90.0,32.8,0.415,28.0,0.0
75%,5.0,140.5,80.0,36.0,165.0,36.9,0.659,38.0,1.0
max,17.0,199.0,110.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
# Note that if you had any categorical variables with answers like "yes" or "no", you would need to convert them to integers.
# See this: https://towardsdatascience.com/multiple-linear-regression-model-using-python-machine-learning-d00c78f1172a

# Setting up the dependent y variable and independent x variables

In [9]:
# Predictors (independent variables) and response (dependent variable).
# NOTE(review): df.describe() reports a minimum of 0 for Glucose and BloodPressure;
# these may be placeholders for missing readings - confirm before trusting the fit.
X = df.loc[:, ['Age','BloodPressure','BMI']]
Y = df.loc[:, 'Glucose']

# Regression

In [10]:
# Ordinary least squares fit of Glucose on the three predictors.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line keeps the cell's displayed output.
reg = LinearRegression().fit(X, Y)
reg

LinearRegression()

In [11]:
# Pull both fitted parameters off the estimator in one step.
Intercept, Coefficients = reg.intercept_, reg.coef_

In [12]:
# Intercept of the sklearn fit: predicted Glucose when Age, BloodPressure and BMI are all 0
print(Intercept)

53.11338884392636


In [13]:
# One coefficient per predictor, in the column order of X: Age, BloodPressure, BMI
print(Coefficients)

[0.70179891 0.20684168 0.90718802]


In [14]:
import statsmodels.api as sm #for detail description of linear coefficients, intercepts, deviations, and many more

In [15]:
# If statsmodels is not installed, run: %pip install -U statsmodels

In [16]:
# Fit the same three-predictor model with statsmodels to get standard errors,
# t-statistics, p-values and confidence intervals for each coefficient.
# sm.OLS does not add an intercept on its own, so a constant column is appended.
# The augmented design matrix gets its own name so that X keeps only the three
# predictor columns; overwriting X would make the earlier sklearn cell fit an
# extra constant column if it were re-run.
X_const = sm.add_constant(X)
model = sm.OLS(Y, X_const).fit()
# NOTE: despite its name, `predictions` holds the regression summary table,
# not predicted values.
predictions = model.summary()
predictions

0,1,2,3
Dep. Variable:,Glucose,R-squared:,0.123
Model:,OLS,Adj. R-squared:,0.118
Method:,Least Squares,F-statistic:,24.99
Date:,"Mon, 24 Jan 2022",Prob (F-statistic):,3.84e-15
Time:,11:19:34,Log-Likelihood:,-2612.7
No. Observations:,539,AIC:,5233.0
Df Residuals:,535,BIC:,5251.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,53.1134,8.830,6.015,0.000,35.768,70.458
Age,0.7018,0.131,5.351,0.000,0.444,0.959
BloodPressure,0.2068,0.113,1.838,0.067,-0.014,0.428
BMI,0.9072,0.203,4.476,0.000,0.509,1.305

0,1,2,3
Omnibus:,18.436,Durbin-Watson:,1.837
Prob(Omnibus):,0.0,Jarque-Bera (JB):,41.573
Skew:,-0.074,Prob(JB):,9.39e-10
Kurtosis:,4.353,Cond. No.,568.0


In [17]:
# Here only Age and BMI are statistically significant at the 5% level (p < 0.05); BloodPressure is not (p = 0.067).
# Since this is a multiple regression, we look at adjusted R-squared, which is pretty low (bad) at about 12% (0.118).

In [18]:
# Drop BloodPressure and keep only Age and BMI as predictors,
# to see whether the smaller model fits better.
X_new = df.loc[:, ['Age','BMI']]

In [19]:
# Refit ordinary least squares on the reduced predictor set (Age, BMI).
# fit() returns the estimator itself; the bare name on the last line keeps
# the cell's displayed output.
reg = LinearRegression().fit(X_new, Y)
reg

LinearRegression()

In [20]:
# Intercept of the most recent sklearn fit held in `reg`
Intercept=reg.intercept_
print(Intercept)


62.070830106456555


In [21]:
# Coefficients of the most recent sklearn fit held in `reg`, one per predictor column
Coefficients=reg.coef_
print(Coefficients)

[0.77713118 1.01024778]


In [22]:
# Fit the reduced (Age + BMI) model with statsmodels for the full inference table.
# sm.OLS does not add an intercept on its own, so a constant column is appended.
# The augmented design matrix gets its own name so that X_new keeps only the
# predictor columns; overwriting X_new would make the sklearn cell fit an extra
# constant column if it were re-run.
X_new_const = sm.add_constant(X_new)
model = sm.OLS(Y, X_new_const).fit()
# NOTE: despite its name, `predictions` holds the regression summary table,
# not predicted values.
predictions = model.summary()
predictions

0,1,2,3
Dep. Variable:,Glucose,R-squared:,0.117
Model:,OLS,Adj. R-squared:,0.114
Method:,Least Squares,F-statistic:,35.63
Date:,"Mon, 24 Jan 2022",Prob (F-statistic):,2.96e-15
Time:,11:19:40,Log-Likelihood:,-2614.4
No. Observations:,539,AIC:,5235.0
Df Residuals:,536,BIC:,5248.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,62.0708,7.379,8.412,0.000,47.576,76.566
Age,0.7771,0.125,6.224,0.000,0.532,1.022
BMI,1.0102,0.195,5.175,0.000,0.627,1.394

0,1,2,3
Omnibus:,17.726,Durbin-Watson:,1.831
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39.019
Skew:,-0.073,Prob(JB):,3.37e-09
Kurtosis:,4.31,Cond. No.,257.0


In [23]:
# The size of the coefficient for each independent variable gives you the size of the effect that variable is having 
# on your dependent variable, 
# and the sign on the coefficient (positive or negative) gives you the direction of the effect
# holding all the other independent variables constant.

In [24]:
#In our case, Y = Glucose readings, X = Age and BMI
#Glucose goes up by about 0.78 on average for each additional year of age, holding BMI constant
#Glucose goes up by about 1.01 on average for each one-unit increase in BMI, holding Age constant