# Comparing Regressors
This notebook compares several regression methods on multiple datasets and evaluates them in terms of the $r^2$ score (coefficient of determination), estimated by cross-validation.

In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

try:
    # Preferred: the modeltrees package is installed or already on the path
    from modeltrees import ModelTreeRegressor
except ImportError:
    # Fallback: assume the project layout, i.e. the package lives in the
    # parent directory of the notebook's working directory.
    # Only ImportError is caught so that Ctrl-C and unrelated errors raised
    # while importing the package are not silently swallowed.
    import sys
    sys.path.append("..")
    from modeltrees import ModelTreeRegressor

import pandas as pd
import numpy as np

## 1. Datasets
This gives a generator that iterates over all datasets.  
Each dataset is a triple consisting of 
- Features Matrix `X`, 
- Target Vector `y`, and 
- Name of the Dataset

See [Section 3.3](#characteristics) for a list of dataset characteristics.

In [2]:
def get_datasets():
    """Lazily yield every benchmark dataset.

    Yields
    ------
    tuple
        ``(X, y, name)`` where ``X`` is the feature matrix, ``y`` the target
        vector and ``name`` the short label used in the result tables.
    """
    # A generator keeps only one dataset in memory at a time.

    # Dataset 1: California Housing
    housing = fetch_california_housing()
    yield housing.data, housing.target, "Cal. Housing"

## 2. Regressors
We are comparing the following regressors:
- Linear Regression `Lin. Reg.`
- Decision Trees `DT` with maximal depth 3 and 6 
- Model Trees with maximal depth 1 and 3. We compare two split criteria:
    - Plain Gradient `MT-G`
    - Gradient with renormalization `MT-GR`

In [3]:
def get_regressors():
    """Create the regressors that take part in the comparison.

    Every call builds new estimator instances.

    Returns
    -------
    list of tuple
        ``(estimator, label)`` pairs; ``label`` is the column name used in
        the results table.
    """
    return [
        # The former ``normalize=True`` argument was removed from
        # LinearRegression in scikit-learn 1.2. For unregularised least
        # squares, feature scaling does not change the predictions, so the
        # default constructor is used instead.
        (LinearRegression(), "Lin. Reg."),
        (DecisionTreeRegressor(max_depth=3), "DT[D=3]"),
        (DecisionTreeRegressor(max_depth=6), "DT[D=6]"),
        # Model trees with the estimator's default split criterion (MT-G)
        (ModelTreeRegressor(max_depth=1), "MT-G[D=1]"),
        (ModelTreeRegressor(max_depth=3), "MT-G[D=3]"),
        # Model trees with the "gradient-renorm-z" split criterion (MT-GR)
        (ModelTreeRegressor(max_depth=1, criterion="gradient-renorm-z"), "MT-GR[D=1]"),
        (ModelTreeRegressor(max_depth=3, criterion="gradient-renorm-z"), "MT-GR[D=3]"),
    ]

## 3. Comparison
### 3.1 Parameters

In [4]:
# Seed passed to the cross-validation splitter.
# We suggest trying other values to get a feeling for the stability.
seed = 42

# Number of folds used for cross-validation
n_fold = 5

### 3.2 Evaluation
Iterating over datasets and regressors

In [5]:
# Scorer passed to every cross-validation run
scorer = make_scorer(r2_score)

# One seeded splitter is shared by all runs: use the same seed for comparing
# different regressors.
kfold = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

# Collect plain Python rows first and build each DataFrame exactly once at the
# end, instead of enlarging the frames cell by cell through `.loc`.
dataset_names = []
characteristics_rows = []   # one (n_samples, n_features) pair per dataset
result_rows = []            # one {regressor label: "mean ± std"} dict per dataset

# Iterate over Datasets
for X, y, ds_name in get_datasets():
    dataset_names.append(ds_name)

    # Store dataset characteristics (see 3.3)
    n_samples, n_features = X.shape
    characteristics_rows.append((n_samples, n_features))

    # Iterate over Regressors
    row = {}
    for model, m_name in get_regressors():
        scores = cross_val_score(model, X, y, scoring=scorer, cv=kfold)

        mean_score = np.mean(scores)
        std_score = np.std(scores)

        # Scores are reported in percent as "mean ± standard deviation"
        row[m_name] = f"{mean_score*100:.2f} ± {std_score*100:.2f}"
    result_rows.append(row)

# DataFrame for the Dataset Characteristics (see 3.3)
ds_characteristics = pd.DataFrame(
    characteristics_rows,
    index=dataset_names,
    columns=["#Samples", "#Features"],
)

# DataFrame for results (see 3.4): one row per dataset, one column per regressor
results = pd.DataFrame(result_rows, index=dataset_names)

### 3.3 Dataset Characteristics <a id='characteristics'></a>

In [6]:
# Cast the count columns to integers for display.
# `np.int` was only an alias of the builtin `int` and was removed in
# NumPy 1.24, so the builtin is used directly.
ds_characteristics["#Samples"] = ds_characteristics["#Samples"].astype(int)
ds_characteristics["#Features"] = ds_characteristics["#Features"].astype(int)
ds_characteristics

Unnamed: 0,#Samples,#Features
Cal. Housing,20640,8


### 3.4 Results
The regressors are evaluated in terms of the $r^2$ metric.  
All values are given in percent as mean ± standard deviation of the $r^2$ score over the cross-validation folds.

In [7]:
results

Unnamed: 0,Lin. Reg.,DT[D=3],DT[D=6],MT-G[D=1],MT-G[D=3],MT-GR[D=1],MT-GR[D=3]
Cal. Housing,60.14 ± 1.70,52.70 ± 1.46,64.16 ± 1.29,67.09 ± 1.57,68.32 ± 1.68,67.16 ± 1.50,72.24 ± 1.05
