## Advanced Machine Learning - Scikit Learn 
### with Adreas Mueller

## Regression
* Import packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

#### Load Boston Housing dataset

In [2]:
from sklearn.datasets import load_boston
boston = load_boston()
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [3]:
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

### Explore the dataset
* GOAL: Predict median house price in thousands of dollars (float) based on several features 
* Target is the value of the house that we want to predict
* Split the data into TRAIN and TEST sets

In [4]:
boston.data.shape

(506, 13)

In [5]:
boston.target.shape

(506,)

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)

### Build Regression Model: "Learning a Regressor"
* Data features (X) are used to predict target outcomes of interest (y)
* Estimated regression coefficients and intercept are stored in ridge object

In [9]:
from sklearn.linear_model import Ridge
ridge = Ridge()

#### 1) Fit model to TEST data

In [10]:
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

#### 2) Predict new values from TEST set 

In [11]:
pred_test = ridge.predict(X_test)
pred_test

array([ 12.89711632,  17.38897704,  16.74387154,  24.59529923,
        27.92988296,  17.66465451,  13.09002963,  20.08072197,
        32.37444434,  28.64756608,  26.0749564 ,  20.28064123,
        16.6201264 ,  37.80213959,  32.18297883,  18.9703282 ,
        39.68878315,  29.74987887,  13.0035633 ,  22.95532742,
         1.52895515,  12.29735804,  27.76505738,  16.82578929,
        20.30124444,  28.67984681,  17.51674701,  32.03347219,
        30.52658488,  17.1320772 ,  34.51776459,  18.94472003,
        26.11904486,  20.18398479,  21.83309189,  30.01006102,
        17.29906086,  32.25939931,  21.51210914,  22.97437792,
        25.4904392 ,  17.47162479,   9.72076232,  34.76220522,
        30.30249674,  21.69511244,  19.40009935,  34.11796261,
        44.13099447,  28.26473985,   2.45644762,  17.070151  ,
        24.42947896,  36.36200056,  32.0450444 ,  11.12459781,
        22.09628895,  20.22811984,  11.50619471,  27.6152111 ,
        22.86580039,  25.18352666,  38.5159734 ,  15.86

#### 3) Evaluate model using R2 score, calculate MSE

In [12]:
ridge.score(X_test, y_test)

0.58268237456221206

In [13]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, pred_test)

32.763835755759779

## Random Forests Regression
* Import Random Forest Regressor model and instantiate it
* Call fit function using the training data
* Call predict function using the test data
* Evaluate the model using R2 or MSE

In [25]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [26]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [29]:
rf.predict(X_test)

array([ 31.9 ,  18.94,  10.96,  25.02,  44.68,  37.26,  19.1 ,  19.45,
        18.04,  26.17,  21.93,  18.5 ,  21.36,   8.02,  17.32,  15.54,
        23.79,  23.63,  15.11,  16.97,   9.71,  20.71,  22.49,  23.85,
        25.28,  11.61,  14.  ,  19.64,  46.1 ,  20.21,  13.47,  15.1 ,
        12.62,  20.64,  19.72,  34.57,  43.14,  14.46,  23.41,  20.84,
        17.77,  21.08,  20.77,  22.73,  26.04,  18.03,  15.43,  21.76,
        22.15,  18.58,  14.16,  13.81,  20.44,  45.51,  19.57,  20.78,
        20.32,  13.68,  16.2 ,  19.97,  31.35,  34.98,  15.65,  19.07,
        15.81,  20.46,  27.99,  16.42,  45.39,  20.98,  20.07,  20.66,
        31.  ,  29.66,   8.66,  19.71,  21.31,  18.97,  15.31,  11.47,
        22.06,  22.28,  25.73,  27.77,  22.59,  21.69,  28.33,  12.79,
        23.47,  25.53,  45.92,  19.54,   9.72,  20.51,  19.66,  28.74,
        29.39,  27.83,  25.92,  14.96,  20.7 ,  15.51,  24.67,  23.78,
        25.13,  18.71,  22.95,  19.93,  20.48,  16.51,  19.65,  32.62,
      

In [30]:
rf.score(X_test, y_test)

0.86704967526132126

In [31]:
mean_squared_error(y_test, rf.predict(X_test))

9.2267503937007849