In [1]:
# This program predicts diabetes progression based on one's age, gender, BMI, BP, and 6 blood serum test results
   # checks the suitability of LR algorithm without any feature scaling and then with scaling
   # Also shows the trained/tested model equation (LR algo basis - y = mx + c) at the end

# Abot the dataset - Diabetes data set (availabile as sklearn module) contains 442 diabetes patients' data, in which
# first 10 columns are input and last 11th column reflects a quantiative measure of diabetes progression one year later 
# More at https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

In [2]:
# Load the diabetes data from imported module (default is numpy arrays)
#X_features, Y_labels = load_diabetes(return_X_y=True) #This will return 2 numpy arrays
#print(X_features[:2]) #check the first 2 rows of the array
    

In [3]:
# Load the diabetes data from imported module into panda dataframe (instead of default numpy arrays)
X_features = pd.DataFrame(data=load_diabetes().data, columns=load_diabetes().feature_names)
Y_labels = pd.DataFrame(data=load_diabetes().target, columns=['BloodSugarLevel'])

#Check the size in input and output tables (dataframes)
print("No of rows, columns in Input table:", X_features.shape)
print("No of rows, columns in Output table:", Y_labels.shape)

No of rows, columns in Input table: (442, 10)
No of rows, columns in Output table: (442, 1)


In [4]:
#Look at the top 5 entries in the input table (DF), check the names
X_features.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [5]:
#Look at the top 5 entries in the output table (DF), check the names
Y_labels.head()

Unnamed: 0,BloodSugarLevel
0,151.0
1,75.0
2,141.0
3,206.0
4,135.0


In [6]:
#Split the data set in training set and test set with 80:20 mix and do so randomly 
# Don't normalize the features values in dataset for now, we will do it later and compare  

X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y_labels, test_size=0.2, random_state=45)

#Check the size in input and output tables (dataframes)
print("No of rows, columns in Input table:", X_train.shape)
print("No of rows, columns in Output table:", X_test.shape)
X_train.head()

No of rows, columns in Input table: (353, 10)
No of rows, columns in Output table: (89, 10)


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
63,-0.034575,-0.044642,-0.037463,-0.060756,0.020446,0.043466,-0.013948,-0.002592,-0.030748,-0.071494
101,0.016281,0.05068,-0.045007,0.063187,0.010815,-0.000374,0.063367,-0.039493,-0.030748,0.036201
347,0.038076,0.05068,-0.029918,-0.074527,-0.012577,-0.012587,0.00446,-0.002592,0.003709,-0.030072
256,-0.049105,-0.044642,0.160855,-0.046985,-0.029088,-0.01979,-0.047082,0.034309,0.02802,0.011349


In [7]:
#Choose an algorithm to create an ML model with default parameters
mymodel_linear = LinearRegression()

In [8]:
#train the model (find optimal coefficients) for the diabetes data
mymodel_linear.fit(X_train, Y_train)

In [9]:
# Evaluate the model (R2, MSE. Sqrt(MSE)) 
# Using training data, find the degree of relationship among input features and output label
# 1. r2 (squared error) measures the relationship between x and y (1=totally related and 0=unrelated).
# Print the model equation

Y_predicted = mymodel_linear.predict(X_train)
r2 = r2_score(Y_train, Y_predicted)
MSE = mean_squared_error(Y_train, Y_predicted)
if r2 < 0.80:
    print(f"Is using LinearRegression right? output and input are poorly related (since R2={r2:.2f})")
else: 
    print("Using LinearRegression is right, output and input are well related (since R2={r2:.2f})")

print("Mean Squared Error (MSE)", MSE)

Is using LinearRegression right? output and input are poorly related (since R2=0.52)
Mean Squared Error (MSE) 2991.423365209392


In [10]:
#Using test data, find the degree of relationship among input features and output.
#r2 (squared error) measures the relationship (1=totally related aka Perfect prediction and 0=unrelated).

Y_predicted = mymodel_linear.predict(X_test)
r2 = r2_score(Y_test, Y_predicted)
MSE = mean_squared_error(Y_test, Y_predicted)
if r2 < 0.80:
    print(f"Is using LinearRegression right? output and input are poorly related (since R2={r2:.2f})")
else: 
    print("Using LinearRegression seems valid, x and y are well related (since R2={r2:.2f})")
print("Mean Squared Error (MSE): \n\t", MSE)

Is using LinearRegression right? output and input are poorly related (since R2=0.52)
Mean Squared Error (MSE): 
	 2374.3339396183187


In [11]:
#Use the tested model for predicting Blood sugar level on new blood data

New_Blood_input = ([[0.028076,	0.050680,	0.061696,	0.091872,	0.044223,	0.034821,	-0.043401,	-0.002592,	0.019907,	-0.017646]])
new_Prediction = mymodel_linear.predict(New_Blood_input)

print("patient blood sugar level is"+ str(new_Prediction)+" God bless")

patient blood sugar level is[[183.56853024]] God bless




In [12]:
#Normalize the features values in training and test dataset for equal feature weightage and improve R2
scaler = StandardScaler()                              # standard scaler to experiment
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
#Check the size in input and output tables (dataframes)
print("No of rows, columns in Input table:", X_train_scaled.shape)
print("No of rows, columns in Output table:", X_test_scaled.shape)
#X_train_scaled.head()                                 # works for pandas DF, but scaler changed to a numpy array 
print(X_train_scaled[:5])

No of rows, columns in Input table: (353, 10)
No of rows, columns in Output table: (89, 10)
[[-1.92716137 -0.91050291 -0.26090929 -0.73767008  0.25966142  0.53948495
  -0.79826736  0.79530475  0.48917461 -0.19953557]
 [-0.76586406 -0.91050291 -0.79444138 -1.2365173   0.43414203  0.92832934
  -0.33218045 -0.01571494 -0.63291945 -1.46789193]
 [ 0.31801343  1.09829413 -0.95005491  1.3289827   0.23058132  0.00564774
   1.29912374 -0.82673464 -0.63291945  0.73059243]
 [ 0.78253235  1.09829413 -0.63882785 -1.52157285 -0.26378041 -0.25138499
   0.05622531 -0.01571494  0.09064172 -0.62232103]
 [-1.07554334 -0.91050291  3.29597135 -0.95146174 -0.61274164 -0.4029684
  -1.03131081  0.79530475  0.60115431  0.22324988]]


In [13]:
#train the model 
mymodel_linear2 = LinearRegression()
mymodel_linear2.fit(X_train_scaled, Y_train)

In [14]:
#Evaluate the accuracy of a model1 using the test data
Y_predicted = mymodel_linear2.predict(X_test_scaled)

r2 = r2_score(Y_test, Y_predicted)
if r2 < 0.80:
    print(f"Is using LinearRegression right? output and input are poorly related :-( (since R2={r2:.2f})")
else: 
    print("Using LinearRegression is right. output and input are well related :-) (since R2={r2:.2f})")

print("Mean Squared Error (MSE)", mean_squared_error(Y_test, Y_predicted))
print("Linear Regression Coefficients: \n", mymodel_linear2.coef_)

Is using LinearRegression right? output and input are poorly related :-( (since R2=0.52)
Mean Squared Error (MSE) 2370.542127180934
Linear Regression Coefficients: 
 [[  1.10049251 -11.74096769  23.85914217  15.91174945 -45.93420019
   27.25162519   8.64573107  11.64434963  37.82166281   4.37554448]]


In [15]:
#Use the tested model (post scaling) for predicting Blood sugar level on new blood data

New_Blood_input = ([[0.028076,	0.050680,	0.061696,	0.091872,	0.044223,	0.034821,	-0.043401,	-0.002592,	0.019907,	-0.017646]])
new_Prediction = mymodel_linear2.predict(New_Blood_input)

print("patient blood sugar level is", new_Prediction)

patient blood sugar level is [[153.39045442]]


In [16]:
#print the model equation y = c + mx (actually, m1x1 + m2x2 + m3x3.....)
equation = f"{mymodel_linear2.intercept_[0]:.4f}"            #save the intercept first
for i, (feature, coef) in enumerate(zip(X_features.columns, mymodel_linear2.coef_[0])):
    equation += f" + {coef:.4f} * {feature}"
print("Model's equation is:\n", equation)

Model's equation is:
 151.8329 + 1.1005 * age + -11.7410 * sex + 23.8591 * bmi + 15.9117 * bp + -45.9342 * s1 + 27.2516 * s2 + 8.6457 * s3 + 11.6443 * s4 + 37.8217 * s5 + 4.3755 * s6


In [17]:
#print the model equation y = c + mx (actually, m1x1 + m2x2 + m3x3.....)
equation = f"{mymodel_linear2.intercept_[0]:.4f}"            #save the intercept first
for feature, coef in zip(X_features.columns, mymodel_linear2.coef_):
    equation += f" + {coef[0]:.4f} * {feature}"  

print(f"Model's equation is:\n{equation}")

Model's equation is:
151.8329 + 1.1005 * age
