# Cross-Validation Comparison

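*This notebook compares the performance of ridge regression models tuned with different cross-validation metrics (leave-one-out cross-validation and generalized cross-validation) on the pollution dataset, using ordinary least-squares linear regression as a baseline.*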

## Import Dependencies

In [1]:
import peak_engines
from collections import defaultdict
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_validate
import numpy as np
import pandas as pd

## Set up datasets and models

In [2]:
# Load the whitespace-delimited pollution table; the first row holds the column names.
# `sep=r'\s+'` replaces `delim_whitespace=True`, which is deprecated in recent pandas.
df = pd.read_csv('pollution.tsv', header=0, sep=r'\s+')

# Every column except the last is a feature; the last column is the regression target.
X = df.iloc[:, :-1].to_numpy(dtype=float)
y = df.iloc[:, -1].to_numpy(dtype=float)

In [3]:
# Registry mapping a dataset name to its (features, target) pair.
datasets = {'pollution': (X, y)}

In [4]:
# Estimators to compare, keyed by the column label used in the score matrix.
models = {
    # Ordinary least squares baseline. The `normalize` argument was removed from
    # scikit-learn's LinearRegression (TypeError on >= 1.2); it is dropped here
    # because OLS predictions with an intercept do not depend on feature scaling.
    'LR' : LinearRegression(),
    # Ridge regression configured with the leave-one-out cross-validation score.
    'RR-LOOCV' : peak_engines.RidgeRegressionModel(normalize=True, score='loocv'),
    # Ridge regression configured with the generalized cross-validation score.
    'RR-GCV' : peak_engines.RidgeRegressionModel(normalize=True, score='gcv'),
}

## Compute the score matrix

In [5]:
def compute_score_matrix(datasets, models):
    """Build a table of leave-one-out RMSE scores for each dataset/model pair.

    Parameters
    ----------
    datasets : dict
        Maps a dataset name to an ``(X, y)`` pair passed to ``cross_validate``.
    models : dict
        Maps a model name to the estimator passed to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per dataset (in iteration order of ``datasets``) and one
        column per model. Each cell is the square root of the mean squared
        error averaged over the leave-one-out folds.
    """
    score_matrix = defaultdict(list)
    for X, y in datasets.values():
        for name, model in models.items():
            cv_result = cross_validate(
                model, X, y,
                cv=LeaveOneOut(),
                scoring='neg_mean_squared_error',
            )
            # The scorer reports negated MSE, so flip the sign before the root.
            rmse = np.sqrt(-cv_result['test_score'].mean())
            score_matrix[name].append(rmse)
    # Row labels are the dataset names, in the same order the scores were appended.
    return pd.DataFrame(score_matrix, index=list(datasets))

In [6]:
# Score every registered model on every registered dataset; the returned
# DataFrame is the cell's last expression, so it is rendered as the output.
compute_score_matrix(datasets, models)

Unnamed: 0,LR,RR-LOOCV,RR-GCV
pollution,46.22537,41.130251,40.89447
