# Loading the Dataset (Diabetes)

In [1]:
from sklearn import datasets

# Load Dataset 

In [2]:
data = datasets.load_diabetes()

In [3]:
data

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [4]:
print(data.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

# Feature names

In [5]:
print(data.feature_names)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


# Creating X and Y matricies

In [6]:
X, Y = datasets.load_diabetes(return_X_y=True)

X.shape, Y.shape

((442, 10), (442,))

# Data Split

## Import Library

In [7]:
from sklearn.model_selection import train_test_split

## Perform 80/20 Data Split

In [8]:
X_train,X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)

### Dimensions

In [9]:
X_train.shape, Y_train.shape

((353, 10), (353,))

In [10]:
X_test.shape, Y_test.shape

((89, 10), (89,))

# Build Linear Regression

## Import Libraries

In [11]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Build linear regression

### Defines regression model

In [12]:
model = linear_model.LinearRegression()

### Build training model

In [13]:
model.fit(X_train,Y_train)

LinearRegression()

### Apply model to test set

In [14]:
Y_pred = model.predict(X_test)

# Prediction results

## Model performance

In [15]:
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)
print(f'Mean Squared Error (MSE):{mean_squared_error(Y_test,Y_pred):.2f}')
print(f'Coefficient of determination R^2: {r2_score(Y_test, Y_pred):.2f}')

Coefficients: [   3.86214866 -258.8302145   524.59158573  381.46296725 -921.86472546
  586.80431537  183.99044552  195.86725379  762.00005952   61.00339948]
Intercept: 154.08486908558717
Mean Squared Error (MSE):3023.29
Coefficient of determination R^2: 0.43


Y = 3.86(age) -258.83(sex) + ...+154.08