# Diabetes Dataset - Linear Regression

Ten baseline variables (age, sex, body mass index, average blood pressure, and six blood serum measurements) were obtained for each of n = 442 diabetes patients, along with the response of interest: a quantitative measure of disease progression one year after baseline.

Source: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

In [1]:
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the diabetes dataset that ships with scikit-learn. The returned object
# exposes the attributes used below: `.data`, `.target` and `.feature_names`.
diabetes = datasets.load_diabetes()

In [3]:
# Peek at the feature matrix (NumPy elides the middle rows/columns with `...`).
# NOTE(review): the values are small and centred near zero, so they are
# presumably already standardised by scikit-learn rather than raw measurements.
diabetes.data

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

In [4]:
# (rows, columns): one row per patient, one column per baseline variable.
diabetes.data.shape

(442, 10)

In [5]:
# Use every feature column as a model input (no column selection is applied).
diabetes_X = diabetes.data

In [6]:
# Column names; these are later paired with the fitted coefficients
# to build the tidypredict terms.
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [7]:
# Hold out the final 20 rows of the feature matrix for testing; every row
# before them is used for training. The split is positional (no shuffling).
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]

In [8]:
# Sanity check: all rows except the last 20, with every feature column kept.
diabetes_X_train.shape

(422, 10)

In [9]:
# Sanity check: the 20 held-out rows, with every feature column kept.
diabetes_X_test.shape

(20, 10)

In [10]:
# Split the targets with the same positional cut as the features, so the
# train/test rows of X and y stay aligned: last 20 values for testing.
diabetes_y_train, diabetes_y_test = diabetes.target[:-20], diabetes.target[-20:]

In [11]:
# Create the linear regression estimator with scikit-learn's default settings
# (the repr printed after fitting shows fit_intercept=True).
regr = linear_model.LinearRegression()

In [12]:
# Train the model on the training split. As the last expression in the cell,
# the call's return value is displayed as the estimator's repr.
regr.fit(diabetes_X_train, diabetes_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [13]:
# Make predictions for the 20 held-out test rows.
# NOTE(review): diabetes_y_pred is not used by any later cell shown here; the
# imported mean_squared_error / r2_score look intended for evaluating it.
diabetes_y_pred = regr.predict(diabetes_X_test)

In [14]:
# The label goes to stdout; the coefficient array itself is shown as the
# cell's output value. Entries are later paired positionally with
# diabetes.feature_names when the tidypredict terms are built.
print('Coefficients:')
regr.coef_

Coefficients:


array([ 3.03499549e-01, -2.37639315e+02,  5.10530605e+02,  3.27736980e+02,
       -8.14131709e+02,  4.92814588e+02,  1.02848452e+02,  1.84606489e+02,
        7.43519617e+02,  7.60951722e+01])

## Create tidypredict.yml

In [15]:
import yaml

In [16]:
# Header section of the tidypredict model spec: a plain linear model ("lm")
# for regression, flagged as not being a GLM. `residual` and `sigma2` are
# written as 0 placeholders.
general = dict(
    model="lm",
    version=2.0,
    type="regression",
    residual=0,
    sigma2=0,
    is_glm=0,
)

In [17]:
# The intercept is the first entry of the term list. float() converts the
# NumPy scalar into a plain Python float so YAML serialises it as a number.
terms_intercept = dict(
    label="(Intercept)",
    coef=float(regr.intercept_),
    is_intercept=1,
    fields=[dict(type="ordinary", col="(Intercept)")],
)
terms = [terms_intercept]

In [18]:
# Add one "ordinary" term per feature, pairing each column name with its
# fitted coefficient (float() turns the NumPy scalar into a plain Python float
# so YAML serialises it as a number).

# zip() silently stops at the shorter input, which would drop terms unnoticed.
if len(diabetes.feature_names) != len(regr.coef_):
    raise ValueError("feature_names and coef_ have different lengths")

# Slice-assign everything after the intercept (index 0) instead of appending,
# so re-running this cell replaces the feature terms rather than duplicating them.
terms[1:] = [
    {
        "label": term_name,
        "coef": float(coef_),
        "is_intercept": 0,
        "fields": [{"type": "ordinary", "col": term_name}],
    }
    for term_name, coef_ in zip(diabetes.feature_names, regr.coef_)
]

In [19]:
# Assemble the full model spec (header + terms) and print it as block-style
# YAML; default_flow_style=False keeps nested mappings/lists on separate lines.
tidypredict_data = dict(general=general, terms=terms)
print(
    yaml.dump(
        tidypredict_data,
        default_flow_style=False,
    )
)

general:
  is_glm: 0
  model: lm
  residual: 0
  sigma2: 0
  type: regression
  version: 2.0
terms:
- coef: 152.76430691633442
  fields:
  - col: (Intercept)
    type: ordinary
  is_intercept: 1
  label: (Intercept)
- coef: 0.3034995490660432
  fields:
  - col: age
    type: ordinary
  is_intercept: 0
  label: age
- coef: -237.63931533353403
  fields:
  - col: sex
    type: ordinary
  is_intercept: 0
  label: sex
- coef: 510.5306054362253
  fields:
  - col: bmi
    type: ordinary
  is_intercept: 0
  label: bmi
- coef: 327.7369804093466
  fields:
  - col: bp
    type: ordinary
  is_intercept: 0
  label: bp
- coef: -814.1317093725387
  fields:
  - col: s1
    type: ordinary
  is_intercept: 0
  label: s1
- coef: 492.81458798373217
  fields:
  - col: s2
    type: ordinary
  is_intercept: 0
  label: s2
- coef: 102.8484521916802
  fields:
  - col: s3
    type: ordinary
  is_intercept: 0
  label: s3
- coef: 184.60648905984
  fields:
  - col: s4
    type: ordinary
  is_intercept: 0
  label: s4

## Compare tidypredict formula with sklearn

In [20]:
# Fitted intercept, shown again for comparison with the `(Intercept)` term in
# the YAML and the leading constant of the tidypredict formula.
regr.intercept_


152.76430691633442

In [21]:
# Fitted coefficients, shown again for comparison with the per-feature
# multipliers in the tidypredict formula.
regr.coef_

array([ 3.03499549e-01, -2.37639315e+02,  5.10530605e+02,  3.27736980e+02,
       -8.14131709e+02,  4.92814588e+02,  1.02848452e+02,  1.84606489e+02,
        7.43519617e+02,  7.60951722e+01])

In [22]:
# Reference predictions from scikit-learn on the training rows; the
# hand-written tidypredict formula is evaluated on the same rows and compared.
sklearn_predict = regr.predict(diabetes_X_train)

In [23]:
# Bind each of the ten training-set columns (indices 0..9) to a variable
# named after its feature, so the formula can refer to columns by name.
age, sex, bmi, bp, s1, s2, s3, s4, s5, s6 = (
    diabetes_X_train[:, column_index] for column_index in range(10)
)

In [24]:
# Formula from tidypredict: the intercept plus each feature column multiplied
# by its coefficient. Terms stay in their original left-to-right order so the
# floating-point result is unchanged.
f = (
    152.764306916334
    + (age * 0.303499549066043)
    + (sex * -237.639315333534)
    + (bmi * 510.530605436225)
    + (bp * 327.736980409347)
    + (s1 * -814.131709372539)
    + (s2 * 492.814587983732)
    + (s3 * 102.84845219168)
    + (s4 * 184.60648905984)
    + (s5 * 743.519616750542)
    + (s6 * 76.0951722166239)
)

In [25]:
# Total absolute disagreement between scikit-learn's predictions and the
# tidypredict formula over the training rows; a value near zero means the
# two agree up to floating-point rounding.
np.abs(sklearn_predict - f).sum()

1.8099655108017032e-10