In [1]:
%load_ext autoreload
%autoreload 2

In [28]:
from data_cleaning import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
import seaborn as sb
from scipy import stats
from sklearn.model_selection import ParameterGrid
from scipy.stats import pearsonr
from tqdm import tqdm

In [40]:
def dataset_creator(methods, models_names: list, columns_names: list, k1: int):
    header = pd.MultiIndex.from_product([methods, models_names, columns_names])
    df = pd.DataFrame(columns=header)
    df['KFold'] = np.arange(1, k1+1)
    df.set_index('KFold', inplace=True)
    return df

def twolevelcv(df, k1: int, k2: int, models: list, params: dict, rs: int, fill_methods: list, std_method: str):
    """Allows to compute two level crossvalidation.

    Args:
        X (np.array): Features (numeric)
        y (np.array): Class (objective variable)
        k1 (int): Nº of outer folds
        k2 (int): Nº of inner folds
        models (list): List of models for comparison
        params (dict): Dictionary including the set of parameters. In this case we only tune 1 parameter per model.
        rs (int): Random state
    Returns:
        df: Dataframe
    """
    test_error_dict = {}
    k = 0
    names = [type(m).__name__ for m in models]
    col_names = ['Param. Value', 'Error']
    results_df = dataset_creator(fill_methods, names, col_names, k1)
    kf1 = KFold(k1, shuffle = True, random_state=rs)
    X_raw = df.drop('y', axis=1)
    y = df['y']
    # first level split
    for train_idx1, test_idx1 in kf1.split(X_raw, y):
        k += 1
        kf2 = KFold(k2, shuffle = True, random_state=rs)
        print(f'Computing KFold {k}/{k1}...')
        # second level split
        for train_idx2, test_idx2 in tqdm(kf2.split(X_raw.iloc[train_idx1, :], y[train_idx1]), total = k2):
            for method in fill_methods:
                X_train = transform_data(X_raw.iloc[train_idx2, :], fill_method=method, std_method=std_method).values
                y_train = y[train_idx2]
                X_test = transform_data(X_raw, fill_method=method, std_method=std_method).iloc[test_idx2, :].values
                y_test = y[test_idx2]
                for name, model in zip(names, models):
                    grid = ParameterGrid(params[name])
                    error_test = []
                    for p_ in grid:
                        model = model.set_params(**p_)
                        # train the model
                        model.fit(X_train, y_train)
                        # evaluate performance
                        pred2_test = model.predict(X_test)
                        error_test.append(mse(y_test, pred2_test, squared=False))
                    idx = np.argmin(error_test)
                    min_param = grid[idx]
                    results_df.loc(axis = 1)[method, name, 'Error'][k] = error_test[idx]
                    results_df.loc(axis = 1)[method, name, 'Param. Value'][k] = min_param
    return results_df, test_idx1

### **Data flow**

In [41]:
# std_methods = ['standard', 'minmax', 'maxabs', 'robust']

In [42]:
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.dummy import DummyRegressor


df_raw, df_Xn_raw = load_data()
fill_methods = ['mean', 'median']
random_state = 3
params = {}
lam = np.logspace(-2, 4, 10)
# C = [200000000, 10000000, 0.1519911082952933, 0.2848035868435805 ]
params['DummyRegressor'] = {'strategy': ['mean', 'median']}
params['LinearRegression'] = {'fit_intercept': [True, False]}
params['Ridge'] = {'alpha': lam, 'fit_intercept': [True, False]}
models = [DummyRegressor(),
          LinearRegression(),
          Ridge(random_state=random_state)]
k1 = 10
k2 = 10
Table, test_set_outer = twolevelcv(df_raw, k1=k1, k2=k2, models=models, params=params, rs=random_state, fill_methods=fill_methods, std_method= 'standard')

Computing KFold 1/10...


100%|██████████| 10/10 [00:01<00:00,  5.17it/s]


Computing KFold 2/10...


100%|██████████| 10/10 [00:01<00:00,  5.18it/s]


Computing KFold 3/10...


100%|██████████| 10/10 [00:02<00:00,  4.71it/s]


Computing KFold 4/10...


100%|██████████| 10/10 [00:02<00:00,  4.83it/s]


Computing KFold 5/10...


100%|██████████| 10/10 [00:01<00:00,  5.05it/s]


Computing KFold 6/10...


100%|██████████| 10/10 [00:01<00:00,  5.18it/s]


Computing KFold 7/10...


100%|██████████| 10/10 [00:01<00:00,  5.02it/s]


Computing KFold 8/10...


100%|██████████| 10/10 [00:01<00:00,  5.53it/s]


Computing KFold 9/10...


100%|██████████| 10/10 [00:01<00:00,  5.53it/s]


Computing KFold 10/10...


100%|██████████| 10/10 [00:02<00:00,  4.33it/s]


In [43]:
Table

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,median,median,median,median,median,median
Unnamed: 0_level_1,DummyRegressor,DummyRegressor,LinearRegression,LinearRegression,Ridge,Ridge,DummyRegressor,DummyRegressor,LinearRegression,LinearRegression,Ridge,Ridge
Unnamed: 0_level_2,Param. Value,Error,Param. Value,Error,Param. Value,Error,Param. Value,Error,Param. Value,Error,Param. Value,Error
KFold,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
1,{'strategy': 'mean'},22.852697,{'fit_intercept': False},62.348789,"{'fit_intercept': True, 'alpha': 100.0}",13.336815,{'strategy': 'mean'},22.852697,{'fit_intercept': True},61.077778,"{'fit_intercept': True, 'alpha': 100.0}",13.200254
2,{'strategy': 'mean'},22.852697,{'fit_intercept': False},62.348789,"{'fit_intercept': True, 'alpha': 100.0}",13.336815,{'strategy': 'mean'},22.852697,{'fit_intercept': True},61.077778,"{'fit_intercept': True, 'alpha': 100.0}",13.200254
3,{'strategy': 'mean'},22.852697,{'fit_intercept': False},62.348789,"{'fit_intercept': True, 'alpha': 100.0}",13.336815,{'strategy': 'mean'},22.852697,{'fit_intercept': True},61.077778,"{'fit_intercept': True, 'alpha': 100.0}",13.200254
4,{'strategy': 'mean'},22.852697,{'fit_intercept': False},62.348789,"{'fit_intercept': True, 'alpha': 100.0}",13.336815,{'strategy': 'mean'},22.852697,{'fit_intercept': True},61.077778,"{'fit_intercept': True, 'alpha': 100.0}",13.200254
5,{'strategy': 'mean'},22.852697,{'fit_intercept': False},62.348789,"{'fit_intercept': True, 'alpha': 100.0}",13.336815,{'strategy': 'mean'},22.852697,{'fit_intercept': True},61.077778,"{'fit_intercept': True, 'alpha': 100.0}",13.200254
6,{'strategy': 'mean'},22.852697,{'fit_intercept': False},62.348789,"{'fit_intercept': True, 'alpha': 100.0}",13.336815,{'strategy': 'mean'},22.852697,{'fit_intercept': True},61.077778,"{'fit_intercept': True, 'alpha': 100.0}",13.200254
7,{'strategy': 'mean'},22.852697,{'fit_intercept': False},62.348789,"{'fit_intercept': True, 'alpha': 100.0}",13.336815,{'strategy': 'mean'},22.852697,{'fit_intercept': True},61.077778,"{'fit_intercept': True, 'alpha': 100.0}",13.200254
8,{'strategy': 'mean'},22.852697,{'fit_intercept': False},62.348789,"{'fit_intercept': True, 'alpha': 100.0}",13.336815,{'strategy': 'mean'},22.852697,{'fit_intercept': True},61.077778,"{'fit_intercept': True, 'alpha': 100.0}",13.200254
9,{'strategy': 'mean'},22.852697,{'fit_intercept': False},62.348789,"{'fit_intercept': True, 'alpha': 100.0}",13.336815,{'strategy': 'mean'},22.852697,{'fit_intercept': True},61.077778,"{'fit_intercept': True, 'alpha': 100.0}",13.200254
10,{'strategy': 'mean'},22.852697,{'fit_intercept': False},62.348789,"{'fit_intercept': True, 'alpha': 100.0}",13.336815,{'strategy': 'mean'},22.852697,{'fit_intercept': True},61.077778,"{'fit_intercept': True, 'alpha': 100.0}",13.200254
