# Data science training (sklearn 'diabetes' dataset)

In [1]:
from sklearn.datasets import load_diabetes
import numpy as np
import pandas as pd

## Dataset import

In [2]:
dataset = load_diabetes()

In [3]:
print(dataset.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [4]:
dataset.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])

In [5]:
print(dataset.feature_names)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


In [6]:
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)

In [7]:
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [8]:
df["disease_progression"] = dataset.target

In [9]:
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,disease_progression
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


## Dataset cleaning

In [10]:
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,disease_progression
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-3.634285e-16,1.308343e-16,-8.045349e-16,1.281655e-16,-8.835316000000001e-17,1.327024e-16,-4.574646e-16,3.777301e-16,-3.830854e-16,-3.412882e-16,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123996,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260974,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665645,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324879,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670611,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947634,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564384,0.02835801,0.02984439,0.0293115,0.03430886,0.03243323,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320442,0.1539137,0.198788,0.1811791,0.1852344,0.133599,0.1356118,346.0


### null checking

In [11]:
df.isnull().any()

age                    False
sex                    False
bmi                    False
bp                     False
s1                     False
s2                     False
s3                     False
s4                     False
s5                     False
s6                     False
disease_progression    False
dtype: bool

### duplicates checking

In [12]:
df.duplicated().any()

False

## Supervised learning

### train-test set split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = df.iloc[:, :-1]
X.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [15]:
Y = df.iloc[:, -1]
Y.head()

0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: disease_progression, dtype: float64

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, random_state=1)

In [17]:
X.shape

(442, 10)

In [18]:
x_train.shape

(353, 10)

### regression (MLPRegressor)

In [19]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

#### best hyperparameters

- I took a cue from this [post](https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw) regarding the choice of these parameters.
- we have to choose the number of **hidden layers** of the neural network. We do it based on the current number of neurons in the input and output layers.
- **input layer**: we only have 1 input layer in the neural network, the number of neurons in the input layer is equals to the number of features (we have 10 features in this dataset), so we have **10** neurons here.
- **output layer**: we only have 1 output layer in the neural network, and since this is a **regression problem**, we have **1** neuron here.
- **hidden layers**:
    - determining the **number of hidden layers**:
        - **0** - only capable of representing linear separable functions or decisions.
        - **1** - can approximate any function that contains a continuous mapping
from one finite space to another.
        - **2** - can represent an arbitrary decision boundary to arbitrary accuracy
with rational activation functions and can approximate any smooth
mapping to any accuracy.
    - determining the **number of neurons in the hidden layers**:
        - **rule of thumb**:
            1. the number of hidden neurons should be between the size of the input layer and the size of the output layer.
            2. the number of hidden neurons should be 2/3 the size of the input layer, plus the size of the output layer.
            3. the number of hidden neurons should be less than twice the size of the input layer.
        - **conclusions** (input layer size: 10, output layer size: 1):
            1. hidden layer size: between $[10,1]$
            2. hidden layer size: (${2 \over 3} \cdot 10) + 1 = 6.67 + 1 = 7.67 \approx 8$
            3. hidden layer size: < $2 \cdot 10 = 20$

In [20]:
param_grid = {'hidden_layer_sizes': [(8,), (8,8), (8,8,8), (10,), (8,10,8), (10,10), (10,10,10), (100,)]}

In [21]:
grid = GridSearchCV(MLPRegressor(max_iter=5000), param_grid, cv=7, verbose=3)

In [22]:
grid.fit(x_train, y_train)

Fitting 7 folds for each of 8 candidates, totalling 56 fits




[CV 1/7] END ...........hidden_layer_sizes=(8,);, score=0.400 total time=   4.8s




[CV 2/7] END ...........hidden_layer_sizes=(8,);, score=0.225 total time=   4.6s




[CV 3/7] END ...........hidden_layer_sizes=(8,);, score=0.535 total time=   4.5s




[CV 4/7] END ...........hidden_layer_sizes=(8,);, score=0.413 total time=   4.5s




[CV 5/7] END ...........hidden_layer_sizes=(8,);, score=0.529 total time=   4.5s




[CV 6/7] END ...........hidden_layer_sizes=(8,);, score=0.527 total time=   4.7s




[CV 7/7] END ...........hidden_layer_sizes=(8,);, score=0.525 total time=   4.9s
[CV 1/7] END .........hidden_layer_sizes=(8, 8);, score=0.450 total time=   4.5s
[CV 2/7] END .........hidden_layer_sizes=(8, 8);, score=0.364 total time=   4.3s
[CV 3/7] END .........hidden_layer_sizes=(8, 8);, score=0.591 total time=   4.5s
[CV 4/7] END .........hidden_layer_sizes=(8, 8);, score=0.374 total time=   4.9s
[CV 5/7] END .........hidden_layer_sizes=(8, 8);, score=0.548 total time=   4.8s
[CV 6/7] END .........hidden_layer_sizes=(8, 8);, score=0.567 total time=   5.0s
[CV 7/7] END .........hidden_layer_sizes=(8, 8);, score=0.540 total time=   2.4s
[CV 1/7] END ......hidden_layer_sizes=(8, 8, 8);, score=0.454 total time=   4.0s
[CV 2/7] END ......hidden_layer_sizes=(8, 8, 8);, score=0.378 total time=   3.2s
[CV 3/7] END ......hidden_layer_sizes=(8, 8, 8);, score=0.585 total time=   2.0s
[CV 4/7] END ......hidden_layer_sizes=(8, 8, 8);, score=0.371 total time=   3.8s
[CV 5/7] END ......hidden_la



[CV 1/7] END ..........hidden_layer_sizes=(10,);, score=0.428 total time=   4.7s




[CV 2/7] END ..........hidden_layer_sizes=(10,);, score=0.221 total time=   4.7s




[CV 3/7] END ..........hidden_layer_sizes=(10,);, score=0.572 total time=   4.7s




[CV 4/7] END ..........hidden_layer_sizes=(10,);, score=0.358 total time=   4.7s




[CV 5/7] END ..........hidden_layer_sizes=(10,);, score=0.534 total time=   4.6s




[CV 6/7] END ..........hidden_layer_sizes=(10,);, score=0.530 total time=   4.6s




[CV 7/7] END ..........hidden_layer_sizes=(10,);, score=0.541 total time=   4.6s
[CV 1/7] END .....hidden_layer_sizes=(8, 10, 8);, score=0.442 total time=   4.4s
[CV 2/7] END .....hidden_layer_sizes=(8, 10, 8);, score=0.362 total time=   3.0s
[CV 3/7] END .....hidden_layer_sizes=(8, 10, 8);, score=0.592 total time=   3.7s
[CV 4/7] END .....hidden_layer_sizes=(8, 10, 8);, score=0.339 total time=   3.2s
[CV 5/7] END .....hidden_layer_sizes=(8, 10, 8);, score=0.548 total time=   2.6s
[CV 6/7] END .....hidden_layer_sizes=(8, 10, 8);, score=0.551 total time=   5.5s
[CV 7/7] END .....hidden_layer_sizes=(8, 10, 8);, score=0.494 total time=   6.3s
[CV 1/7] END .......hidden_layer_sizes=(10, 10);, score=0.452 total time=   4.3s
[CV 2/7] END .......hidden_layer_sizes=(10, 10);, score=0.369 total time=   3.7s
[CV 3/7] END .......hidden_layer_sizes=(10, 10);, score=0.586 total time=   4.7s
[CV 4/7] END .......hidden_layer_sizes=(10, 10);, score=0.381 total time=   2.8s
[CV 5/7] END .......hidden_l



[CV 7/7] END ...hidden_layer_sizes=(10, 10, 10);, score=0.516 total time=   8.0s
[CV 1/7] END .........hidden_layer_sizes=(100,);, score=0.451 total time=   5.3s
[CV 2/7] END .........hidden_layer_sizes=(100,);, score=0.364 total time=   6.0s
[CV 3/7] END .........hidden_layer_sizes=(100,);, score=0.585 total time=   5.6s
[CV 4/7] END .........hidden_layer_sizes=(100,);, score=0.363 total time=   5.5s
[CV 5/7] END .........hidden_layer_sizes=(100,);, score=0.545 total time=   4.8s
[CV 6/7] END .........hidden_layer_sizes=(100,);, score=0.568 total time=   6.0s
[CV 7/7] END .........hidden_layer_sizes=(100,);, score=0.546 total time=   6.0s


GridSearchCV(cv=7, estimator=MLPRegressor(max_iter=5000),
             param_grid={'hidden_layer_sizes': [(8,), (8, 8), (8, 8, 8), (10,),
                                                (8, 10, 8), (10, 10),
                                                (10, 10, 10), (100,)]},
             verbose=3)

In [23]:
grid.best_params_

{'hidden_layer_sizes': (10, 10)}

In [24]:
grid.best_score_

0.49235967792262364

#### learning

In [25]:
reg = MLPRegressor(max_iter=10000, random_state=1, **grid.best_params_)

In [26]:
reg.fit(x_train, y_train)

MLPRegressor(hidden_layer_sizes=(10, 10), max_iter=10000, random_state=1)

#### score

In [27]:
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score

In [28]:
y_pred = reg.predict(x_test)

In [30]:
explained_variance_score(y_test, y_pred), mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

(0.42993694134867577, 3056.737290640017, 0.4263925110882929)

### regression (LinearRegression)

In [31]:
from sklearn.linear_model import LinearRegression

#### best hyperparameters

In [32]:
param_grid = {'fit_intercept': [True, False], 'normalize': [True, False]}

In [33]:
grid = GridSearchCV(LinearRegression(), param_grid, cv=7, verbose=3)

In [34]:
grid.fit(x_train, y_train)

Fitting 7 folds for each of 4 candidates, totalling 28 fits
[CV 1/7] END fit_intercept=True, normalize=True;, score=0.420 total time=   0.7s
[CV 2/7] END fit_intercept=True, normalize=True;, score=0.344 total time=   0.0s
[CV 3/7] END fit_intercept=True, normalize=True;, score=0.587 total time=   0.0s
[CV 4/7] END fit_intercept=True, normalize=True;, score=0.340 total time=   0.0s
[CV 5/7] END fit_intercept=True, normalize=True;, score=0.540 total time=   0.0s
[CV 6/7] END fit_intercept=True, normalize=True;, score=0.572 total time=   0.0s
[CV 7/7] END fit_intercept=True, normalize=True;, score=0.535 total time=   0.0s
[CV 1/7] END fit_intercept=True, normalize=False;, score=0.420 total time=   0.0s
[CV 2/7] END fit_intercept=True, normalize=False;, score=0.344 total time=   0.0s
[CV 3/7] END fit_intercept=True, normalize=False;, score=0.587 total time=   0.0s
[CV 4/7] END fit_intercept=True, normalize=False;, score=0.340 total time=   0.0s
[CV 5/7] END fit_intercept=True, normalize=Fa

GridSearchCV(cv=7, estimator=LinearRegression(),
             param_grid={'fit_intercept': [True, False],
                         'normalize': [True, False]},
             verbose=3)

In [35]:
grid.best_params_

{'fit_intercept': True, 'normalize': False}

In [36]:
grid.best_score_

0.4768863974833785

#### learning

In [37]:
reg = LinearRegression(**grid.best_params_)

In [38]:
reg.fit(x_train, y_train)

LinearRegression()

#### score

In [39]:
y_pred = reg.predict(x_test)

In [40]:
explained_variance_score(y_test, y_pred), mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

(0.4407260889108393, 2992.557681452944, 0.43843604017332705)

### regression (PolynomialRegression)

In [41]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [42]:
def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))

#### best hyperparameters

In [43]:
param_grid = {'polynomialfeatures__degree':np.arange(1, 10), 'linearregression__fit_intercept': [True, False], 'linearregression__normalize': [True, False]}

In [44]:
grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7, verbose=3)

In [45]:
grid.fit(x_train, y_train)

Fitting 7 folds for each of 36 candidates, totalling 252 fits
[CV 1/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=1;, score=0.420 total time=   0.0s
[CV 2/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=1;, score=0.344 total time=   0.0s
[CV 3/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=1;, score=0.587 total time=   0.0s
[CV 4/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=1;, score=0.340 total time=   0.0s
[CV 5/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=1;, score=0.540 total time=   0.0s
[CV 6/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=1;, score=0.572 total time=   0.0s
[CV 7/7] END linearregression__fit_intercept=True, linea

[CV 7/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=8;, score=-11.129 total time=   1.1s
[CV 1/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=9;, score=-37.488 total time=   2.7s
[CV 2/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=9;, score=-562.764 total time=   2.8s
[CV 3/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=9;, score=-9.423 total time=   2.7s
[CV 4/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=9;, score=-19.314 total time=   2.7s
[CV 5/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=9;, score=-15.504 total time=   2.8s
[CV 6/7] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__d

[CV 6/7] END linearregression__fit_intercept=True, linearregression__normalize=False, polynomialfeatures__degree=7;, score=-28.832 total time=   0.4s
[CV 7/7] END linearregression__fit_intercept=True, linearregression__normalize=False, polynomialfeatures__degree=7;, score=-16.318 total time=   0.4s
[CV 1/7] END linearregression__fit_intercept=True, linearregression__normalize=False, polynomialfeatures__degree=8;, score=-37.762 total time=   1.0s
[CV 2/7] END linearregression__fit_intercept=True, linearregression__normalize=False, polynomialfeatures__degree=8;, score=-30.989 total time=   1.1s
[CV 3/7] END linearregression__fit_intercept=True, linearregression__normalize=False, polynomialfeatures__degree=8;, score=-8.737 total time=   1.1s
[CV 4/7] END linearregression__fit_intercept=True, linearregression__normalize=False, polynomialfeatures__degree=8;, score=-18.378 total time=   1.0s
[CV 5/7] END linearregression__fit_intercept=True, linearregression__normalize=False, polynomialfeatu

[CV 5/7] END linearregression__fit_intercept=False, linearregression__normalize=True, polynomialfeatures__degree=6;, score=-16.191 total time=   0.1s
[CV 6/7] END linearregression__fit_intercept=False, linearregression__normalize=True, polynomialfeatures__degree=6;, score=-28.833 total time=   0.1s
[CV 7/7] END linearregression__fit_intercept=False, linearregression__normalize=True, polynomialfeatures__degree=6;, score=-16.318 total time=   0.1s
[CV 1/7] END linearregression__fit_intercept=False, linearregression__normalize=True, polynomialfeatures__degree=7;, score=-37.767 total time=   0.4s
[CV 2/7] END linearregression__fit_intercept=False, linearregression__normalize=True, polynomialfeatures__degree=7;, score=-30.988 total time=   0.4s
[CV 3/7] END linearregression__fit_intercept=False, linearregression__normalize=True, polynomialfeatures__degree=7;, score=-8.737 total time=   0.3s
[CV 4/7] END linearregression__fit_intercept=False, linearregression__normalize=True, polynomialfeatu

[CV 4/7] END linearregression__fit_intercept=False, linearregression__normalize=False, polynomialfeatures__degree=5;, score=-18.387 total time=   0.0s
[CV 5/7] END linearregression__fit_intercept=False, linearregression__normalize=False, polynomialfeatures__degree=5;, score=-16.189 total time=   0.0s
[CV 6/7] END linearregression__fit_intercept=False, linearregression__normalize=False, polynomialfeatures__degree=5;, score=-28.842 total time=   0.0s
[CV 7/7] END linearregression__fit_intercept=False, linearregression__normalize=False, polynomialfeatures__degree=5;, score=-16.314 total time=   0.0s
[CV 1/7] END linearregression__fit_intercept=False, linearregression__normalize=False, polynomialfeatures__degree=6;, score=-37.767 total time=   0.1s
[CV 2/7] END linearregression__fit_intercept=False, linearregression__normalize=False, polynomialfeatures__degree=6;, score=-30.990 total time=   0.1s
[CV 3/7] END linearregression__fit_intercept=False, linearregression__normalize=False, polynom

GridSearchCV(cv=7,
             estimator=Pipeline(steps=[('polynomialfeatures',
                                        PolynomialFeatures()),
                                       ('linearregression',
                                        LinearRegression())]),
             param_grid={'linearregression__fit_intercept': [True, False],
                         'linearregression__normalize': [True, False],
                         'polynomialfeatures__degree': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
             verbose=3)

In [46]:
grid.best_params_

{'linearregression__fit_intercept': True,
 'linearregression__normalize': False,
 'polynomialfeatures__degree': 1}

In [47]:
grid.best_score_

0.4768863974833785

#### learning

In [48]:
reg = PolynomialRegression(degree=grid.best_params_['polynomialfeatures__degree'], \
                           fit_intercept=grid.best_params_['linearregression__fit_intercept'], \
                          normalize=grid.best_params_['linearregression__normalize'])

In [49]:
reg.fit(x_train, y_train)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=1)),
                ('linearregression', LinearRegression())])

#### score

In [50]:
y_pred = reg.predict(x_test)

In [51]:
explained_variance_score(y_test, y_pred), mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

(0.4407260889108393, 2992.557681452943, 0.43843604017332716)

## Unsupervised learning