In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import load_diabetes

In [3]:
# Load scikit-learn's bundled diabetes dataset and print its built-in description.
diab = load_diabetes()
print(diab.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [4]:
# NOTE(review): displaying the whole loader object prints the full data/target arrays;
# consider showing diab.keys() or diab.data.shape instead to keep the output short.
diab

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [5]:
# Assemble a single frame: the feature matrix under its column names,
# with the response appended as a trailing 'target' column.
df = pd.DataFrame(diab.data, columns=diab.feature_names).assign(target=diab.target)

In [6]:
# The feature columns are already mean-centred and on a common scale as loaded
# (df.describe() at the end of the notebook reports mean ~0 and the same std for each),
# so no extra scaling step is applied; 'target' stays in its original units.
df.head()


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [7]:
# Separate the response from the predictors.
y = df['target']               # target variable
X = df.drop(columns='target')  # input features: every remaining column


In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state is a fixed seed, so the
# same rows land in the train and test sets on every run (reproducible split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error


In [10]:
lr = LinearRegression() # initialize the model (ordinary least squares, no arguments passed)

lr.fit(X_train, y_train)  # train the model on the training split only

In [11]:
# Predict on the held-out test features; the metrics are computed in later cells.
y_pred = lr.predict(X_test)

In [12]:
# Side-by-side view of true vs. predicted values for the test rows
# (the row labels come from y_test's index).
df2 = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))

In [13]:
# Show actual vs. predicted values for the test rows.
df2

Unnamed: 0,Actual,Predicted
287,219.0,139.547558
211,70.0,179.517208
72,202.0,134.038756
321,230.0,291.417029
73,111.0,123.789659
...,...,...
255,153.0,115.011800
90,98.0,78.955842
57,37.0,81.560873
391,63.0,54.379973


In [14]:
#add two more columns to the dataframe df2 abosulte error and squared error
df2['Absolute Error'] = abs(df2['Actual'] - df2['Predicted'])
df2['Squared Error'] = (df2['Actual'] - df2['Predicted'])**2
df2
#Delete Abs Err and Sq Err columns
df2.drop(['Absolute Error', 'Squared Error'], axis=1, inplace=True)

In [15]:
# df2 again: it should contain only the 'Actual' and 'Predicted' columns here.
df2

Unnamed: 0,Actual,Predicted
287,219.0,139.547558
211,70.0,179.517208
72,202.0,134.038756
321,230.0,291.417029
73,111.0,123.789659
...,...,...
255,153.0,115.011800
90,98.0,78.955842
57,37.0,81.560873
391,63.0,54.379973


In [16]:
# Manual MAE check (df2 has no 'Abs Error' column): np.mean(abs(df2['Actual'] - df2['Predicted']))

In [17]:
# Manual MSE check (df2 has no 'Sq Error' column): np.mean((df2['Actual'] - df2['Predicted'])**2)

In [18]:
# Mean squared error of the linear model on the test split (squared target units).
mse = mean_squared_error(y_test, y_pred)
mse

2900.193628493483

In [19]:
# Mean absolute error of the linear model on the test split (target units).
mae = mean_absolute_error(y_test, y_pred)
mae

42.794094679599944

In [20]:
# Coefficient of determination of the linear model on the test split.
r2 = r2_score(y_test, y_pred)
r2 # TODO(review): the original note said "fix this" with no details; the recorded value equals the manual 1 - SS_res/SS_tot result in the next cell - confirm what remains to fix

0.45260276297191915

In [21]:
# Manual R^2 as a cross-check of r2_score: 1 - SS_res / SS_tot, where
# SS_res = sum of squared residuals and SS_tot = sum of squared deviations of
# y_test from its own mean. Overwrites the r2 variable from the previous cell.
r2 = 1- (np.sum((y_test - y_pred)**2) / np.sum((y_test - np.mean(y_test))**2))
r2

0.45260276297191915

In [22]:
# Fitted parameters of the linear model: the coefficient array and the intercept.
lr.coef_,lr.intercept_

(array([  37.90402135, -241.96436231,  542.42875852,  347.70384391,
        -931.48884588,  518.06227698,  163.41998299,  275.31790158,
         736.1988589 ,   48.67065743]),
 151.34560453985995)

In [23]:
# Linear-regression metrics on the training split (in-sample fit).
y_pred_train = lr.predict(X_train)

# Compute all three scores first, then report them in a fixed order.
mean_squared_err_trained_data = mean_squared_error(y_train, y_pred_train)
mean_absolute_error_trained_data = mean_absolute_error(y_train, y_pred_train)
# R^2 by hand: 1 - SS_res / SS_tot
r2_trained_data = 1 - (
    np.sum((y_train - y_pred_train)**2)
    / np.sum((y_train - np.mean(y_train))**2)
)

print("Mean Squared Error (Training Data):", mean_squared_err_trained_data)
print("Mean Absolute Error (Training Data):", mean_absolute_error_trained_data)
print("R2 Score (Training Data):", r2_trained_data)

Mean Squared Error (Training Data): 2868.549702835577
Mean Absolute Error (Training Data): 43.483503523980396
R2 Score (Training Data): 0.5279193863361498


In [24]:
# Linear-regression metrics on the held-out test split.
# Compute all three scores first, then report them in a fixed order.
mean_squared_err_test_data = mean_squared_error(y_test, y_pred)
mean_absolute_error_test_data = mean_absolute_error(y_test, y_pred)
# R^2 by hand: 1 - SS_res / SS_tot
r2_test_data = 1 - (
    np.sum((y_test - y_pred)**2)
    / np.sum((y_test - np.mean(y_test))**2)
)

print("Mean Squared Error (Testing Data):", mean_squared_err_test_data)
print("Mean Absolute Error (Testing Data):", mean_absolute_error_test_data)
print("R2 Score (Testing Data):", r2_test_data)


Mean Squared Error (Testing Data): 2900.193628493483
Mean Absolute Error (Testing Data): 42.794094679599944
R2 Score (Testing Data): 0.45260276297191915


In [25]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet

# Ridge regression with default hyperparameters (no arguments passed),
# fitted on the same train split and scored on both splits.
# NOTE(review): in the recorded outputs, Ridge's test R2 (~0.419) is below plain
# LinearRegression's (~0.453); the regularization strength is untuned - TODO try tuning it.
ridge = Ridge() # initialize the model
ridge.fit(X_train, y_train) # train the model
y_pred_ridge = ridge.predict(X_test) # return the prediction for test data
y_pred_ridge_train = ridge.predict(X_train) # prediction for train data

# In-sample metrics; R2 is computed by hand as 1 - SS_res / SS_tot.
print("-- Metrics for Ridge Regression on training data--")
print("Mean Squared Error (Training Data):", mean_squared_error(y_train, y_pred_ridge_train))
print("Mean Absolute Error (Training Data):", mean_absolute_error(y_train, y_pred_ridge_train))
print("R2 Score (Training Data):", 1 - (np.sum((y_train - y_pred_ridge_train)**2) / np.sum((y_train - np.mean(y_train))**2)) )

# Held-out metrics, computed the same way on the test split.
print("-- Metrics for Ridge Regression on testing data--")
print("Mean Squared Error (Testing Data):", mean_squared_error(y_test, y_pred_ridge))
print("Mean Absolute Error (Testing Data):", mean_absolute_error(y_test, y_pred_ridge))
print("R2 Score (Testing Data):", 1 - (np.sum((y_test - y_pred_ridge)**2) / np.sum((y_test - np.mean(y_test))**2)))


-- Metrics for Ridge Regression on training data--
Mean Squared Error (Training Data): 3388.1826180801313
Mean Absolute Error (Training Data): 48.8051936622374
R2 Score (Training Data): 0.4424027835503952
-- Metrics for Ridge Regression on testing data--
Mean Squared Error (Testing Data): 3077.41593882723
Mean Absolute Error (Testing Data): 46.13885766697452
R2 Score (Testing Data): 0.41915292635986545


In [26]:
# Lasso regression with default hyperparameters (no arguments passed),
# fitted on the same train split and scored on both splits.
# NOTE(review): in the recorded outputs, Lasso's test R2 (~0.358) is below both
# LinearRegression and Ridge; the regularization strength is untuned - TODO try tuning it.
lasso = Lasso() # initialize the model

lasso.fit(X_train, y_train) # train the model
y_pred_lasso = lasso.predict(X_test) # return the prediction for test data
y_pred_lasso_train = lasso.predict(X_train) # prediction for train data

# In-sample metrics; R2 is computed by hand as 1 - SS_res / SS_tot.
print("-- Metrics for Lasso Regression on training data--")
print("Mean Squared Error (Training Data):", mean_squared_error(y_train, y_pred_lasso_train))
print("Mean Absolute Error (Training Data):", mean_absolute_error(y_train, y_pred_lasso_train))
print("R2 Score (Training Data):", 1 - (np.sum((y_train - y_pred_lasso_train)**2) / np.sum((y_train - np.mean(y_train))**2)))
print("-----------------------------------")
# Held-out metrics, computed the same way on the test split.
print("-- Metrics for Lasso Regression on testing data--")
print("Mean Squared Error (Testing Data):", mean_squared_error(y_test, y_pred_lasso))
print("Mean Absolute Error (Testing Data):", mean_absolute_error(y_test, y_pred_lasso))
print("R2 Score (Testing Data):", 1 - (np.sum((y_test - y_pred_lasso)**2) / np.sum((y_test - np.mean(y_test))**2)))
      

-- Metrics for Lasso Regression on training data--
Mean Squared Error (Training Data): 3860.7549830123576
Mean Absolute Error (Training Data): 52.95878032849505
R2 Score (Training Data): 0.3646309911295581
-----------------------------------
-- Metrics for Lasso Regression on testing data--
Mean Squared Error (Testing Data): 3403.5757216070747
Mean Absolute Error (Testing Data): 49.73032753662261
R2 Score (Testing Data): 0.3575918767219112


In [27]:
# Elastic Net with default hyperparameters (no arguments passed),
# fitted on the same train split and scored on both splits.
# NOTE(review): the recorded R2 is ~0.009 on train and ~-0.002 on test, i.e. no
# better than predicting the mean of y. Presumably the default penalty is far too
# strong for features on this small scale - TODO tune the hyperparameters before
# drawing conclusions about this model.
elastic = ElasticNet() # initialize the model

elastic.fit(X_train, y_train) # train the model
y_pred_elastic = elastic.predict(X_test) # return the prediction for test data
y_pred_elastic_train = elastic.predict(X_train) # prediction for train data

# In-sample metrics; R2 is computed by hand as 1 - SS_res / SS_tot.
print("-- Metrics for Elastic Net Regression on training data--")
print("Mean Squared Error (Training Data):", mean_squared_error(y_train, y_pred_elastic_train))
print("Mean Absolute Error (Training Data):", mean_absolute_error(y_train, y_pred_elastic_train))
print("R2 Score (Training Data):", 1 - (np.sum((y_train - y_pred_elastic_train)**2) / np.sum((y_train - np.mean(y_train))**2)))
print("-----------------------------------")
# Held-out metrics, computed the same way on the test split.
print("-- Metrics for Elastic Net Regression on testing data--")
print("Mean Squared Error (Testing Data):", mean_squared_error(y_test, y_pred_elastic))
print("Mean Absolute Error (Testing Data):", mean_absolute_error(y_test, y_pred_elastic))
print("R2 Score (Testing Data):", 1 - (np.sum((y_test - y_pred_elastic)**2) / np.sum((y_test - np.mean(y_test))**2)))

-- Metrics for Elastic Net Regression on training data--
Mean Squared Error (Training Data): 6022.3083478943745
Mean Absolute Error (Training Data): 66.16280406468873
R2 Score (Training Data): 0.008901600088515704
-----------------------------------
-- Metrics for Elastic Net Regression on testing data--
Mean Squared Error (Testing Data): 5311.21282167187
Mean Absolute Error (Testing Data): 63.70590076411911
R2 Score (Testing Data): -0.0024652131111431164


In [28]:
import pickle

# Save the fitted Ridge model to disk so it can be reloaded without retraining.
# NOTE: by the test metrics recorded in this notebook, Ridge is NOT the best model:
# plain LinearRegression scores better on the held-out split (MSE ~2900, MAE ~42.8,
# R2 ~0.453) than the default Ridge (MSE ~3077, MAE ~46.1, R2 ~0.419).
# The context manager closes the file handle even if pickling fails; the previous
# bare open(...) was never closed.
# Only unpickle these files from trusted sources: pickle.load can execute arbitrary code.
with open('model_ridge.pkl', 'wb') as model_file:
    pickle.dump(ridge, model_file)

In [29]:
# Save the fitted LinearRegression and Lasso models to disk.
# Each file is opened in a context manager so its handle is flushed and closed
# as soon as the dump finishes; the previous bare open(...) calls were never closed.
with open('model_lr.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)
with open('model_lasso.pkl', 'wb') as model_file:
    pickle.dump(lasso, model_file)

In [30]:
# Save the fitted ElasticNet model to disk; the context manager closes the file
# handle, which the previous bare open(...) never did.
with open('model_elastic.pkl', 'wb') as model_file:
    pickle.dump(elastic, model_file)

In [31]:
# Summary statistics for every column, including 'target'.
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118,346.0
