# Multiple linear regression
#### following tutorial here https://codeburst.io/multiple-linear-regression-sklearn-and-statsmodels-798750747755

In [1]:
# Note: we are NOT doing a train/test split here. The models below are fit on the
# full dataset, so the fit statistics they report are in-sample.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [3]:
# Load the dataset from a CSV file; the path is relative, so it is resolved
# against the current working directory.
df = pd.read_csv('diabetes.csv')
# Preview the first rows to check the columns and value types.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# We want to use Age, BloodPressure, and BMI to predict Glucose,
# so the independent variables (X) are Age, BloodPressure, and BMI,
# and the dependent variable (y) they predict is Glucose.

In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(768, 9)

In [6]:
#768 rows and 9 columns

In [7]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
#note that if you had any categorical variables, you'll need to convert them to integers
# this is from another dataset
#varlist =  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

# Defining the map function
#def binary_map(x):
#    return x.map({'yes': 1, "no": 0})

# Applying the function to the housing list
#df[varlist] = df[varlist].apply(binary_map)

# Check the housing dataframe now
#df.head()

#there's more on converting categorical variables here https://towardsdatascience.com/multiple-linear-regression-model-using-python-machine-learning-d00c78f1172a

# Setting up the dependent y variable and independent x variables

In [9]:
# Predictors (independent variables): all rows, three feature columns.
# NOTE(review): describe() reports a minimum of 0.0 for Glucose, BloodPressure and
# BMI; those zeros look like missing-value placeholders rather than real
# measurements -- confirm, and consider excluding them before fitting.
X = df.loc[:, ['Age', 'BloodPressure', 'BMI']]
# Target (dependent variable): the Glucose column as a Series.
Y = df.loc[:, 'Glucose']

# Regression

In [10]:
# Create an unfitted scikit-learn linear regression estimator.
reg = LinearRegression()
# Fit it on every row of X and Y (no hold-out set); fit() returns the estimator,
# which is what this cell displays.
reg.fit(X, Y)

LinearRegression()

In [11]:
# Pull the fitted parameters off the estimator: the constant term and the
# per-feature slopes (one per column of X, in column order).
Intercept, Coefficients = reg.intercept_, reg.coef_

In [12]:
# Fitted constant term of the three-feature model.
print(Intercept)

68.18491721655327


In [13]:
# Fitted slopes, in the same order as the columns of X: Age, BloodPressure, BMI.
print(Coefficients)

[0.67280443 0.06018464 0.81849957]


In [14]:
import statsmodels.api as sm #for detail description of linear coefficients, intercepts, deviations, and many more

In [15]:
# If statsmodels is not installed, run this first: %pip install -U statsmodels

In [16]:
# Re-fit the same three-feature model with statsmodels to get standard errors,
# p-values and R-squared alongside the coefficients.
# sm.OLS does not add an intercept by itself, so a constant column is appended.
# The design matrix gets its own name instead of overwriting X: rebinding X here
# would leave a 'const' column in it, so re-running the earlier sklearn cell
# would silently fit a different model.
X_const = sm.add_constant(X)
model = sm.OLS(Y, X_const).fit()         # fit ordinary least squares
# NOTE: despite its name, `predictions` holds the summary table, not predicted values.
predictions = model.summary()
predictions

0,1,2,3
Dep. Variable:,Glucose,R-squared:,0.115
Model:,OLS,Adj. R-squared:,0.112
Method:,Least Squares,F-statistic:,33.22
Date:,"Fri, 17 Dec 2021",Prob (F-statistic):,3.45e-20
Time:,14:06:32,Log-Likelihood:,-3703.2
No. Observations:,768,AIC:,7414.0
Df Residuals:,764,BIC:,7433.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,68.1849,5.801,11.753,0.000,56.796,79.573
Age,0.6728,0.095,7.057,0.000,0.486,0.860
BloodPressure,0.0602,0.060,0.998,0.319,-0.058,0.179
BMI,0.8185,0.144,5.688,0.000,0.536,1.101

0,1,2,3
Omnibus:,18.953,Durbin-Watson:,1.836
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39.165
Skew:,-0.009,Prob(JB):,3.13e-09
Kurtosis:,4.106,Cond. No.,455.0


In [17]:
# Here only Age and BMI are statistically significant (p < 0.001); BloodPressure is not (p = 0.319).
# Since this is a multiple regression, we look at adjusted R-squared, which is pretty low at about 11%.

In [18]:
# BloodPressure was not significant above, so build a reduced predictor set
# without it to see whether the model fits any better.
X_new = df.loc[:, ['Age', 'BMI']]

In [19]:
# Start from a fresh estimator; rebinding `reg` replaces the three-feature model.
reg = LinearRegression()
# Fit on the reduced predictors (Age, BMI) against the same target; the fitted
# estimator returned by fit() is what this cell displays.
reg.fit(X_new, Y)

LinearRegression()

In [20]:
# Constant term of the reduced (Age, BMI) model; overwrites the earlier Intercept.
Intercept = reg.intercept_
print(Intercept)


70.29517231196893


In [21]:
# Slopes of the reduced model, in the column order of X_new: Age, then BMI.
Coefficients = reg.coef_
print(Coefficients)

[0.69554933 0.85890806]


In [22]:
# Re-fit the reduced (Age, BMI) model with statsmodels for the full summary.
# sm.OLS does not add an intercept by itself, so a constant column is appended.
# The design matrix gets its own name instead of overwriting X_new: rebinding
# X_new here would leave a 'const' column in it, so re-running the sklearn cell
# above would silently fit a different model.
X_new_const = sm.add_constant(X_new)
model = sm.OLS(Y, X_new_const).fit()     # fit ordinary least squares
# NOTE: despite its name, `predictions` holds the summary table, not predicted values.
predictions = model.summary()
predictions

0,1,2,3
Dep. Variable:,Glucose,R-squared:,0.114
Model:,OLS,Adj. R-squared:,0.112
Method:,Least Squares,F-statistic:,49.33
Date:,"Fri, 17 Dec 2021",Prob (F-statistic):,7.050000000000001e-21
Time:,14:07:06,Log-Likelihood:,-3703.7
No. Observations:,768,AIC:,7413.0
Df Residuals:,765,BIC:,7427.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,70.2952,5.402,13.013,0.000,59.691,80.899
Age,0.6955,0.093,7.514,0.000,0.514,0.877
BMI,0.8589,0.138,6.220,0.000,0.588,1.130

0,1,2,3
Omnibus:,18.855,Durbin-Watson:,1.836
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38.868
Skew:,-0.007,Prob(JB):,3.63e-09
Kurtosis:,4.102,Cond. No.,235.0


In [23]:
# The size of the coefficient for each independent variable gives you the size of the effect that variable is having 
# on your dependent variable, 
# and the sign on the coefficient (positive or negative) gives you the direction of the effect
# holding all the other independent variables constant.

In [24]:
#In our case, Y = Glucose readings, X = Age and BMI
#Predicted Glucose goes up by about 0.70 with each additional year of age, holding BMI constant
#Predicted Glucose goes up by about 0.86 with each one-unit increase in BMI, holding Age constant

In [28]:
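#TODO: predict the glucose level for a 65 year old. The model has two predictors,
# so a BMI value is needed as well, e.g. for the median BMI of 32:
#   reg.predict(pd.DataFrame({'Age': [65], 'BMI': [32.0]}))
# which works out to roughly 70.30 + 0.6955*65 + 0.8589*32, i.e. about 143.
# Keep in mind the model explains only about 11% of the variance in Glucose.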