In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from data_cleaning import *
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
import seaborn as sb
from scipy import stats
from sklearn.model_selection import ParameterGrid
from scipy.stats import pearsonr
from tqdm import tqdm

In [4]:
def dataset_creator(methods, models_names: list, columns_names: list, k1: int):
    header = pd.MultiIndex.from_product([methods, models_names, columns_names])
    df = pd.DataFrame(columns=header)
    df['KFold'] = np.arange(1, k1+1)
    df.set_index('KFold', inplace=True)
    return df

def twolevelcv(df, k1: int, k2: int, models: list, params: dict, rs: int, fill_methods: list, std_method: str):
    """Allows to compute two level crossvalidation.

    Args:
        X (np.array): Features (numeric)
        y (np.array): Class (objective variable)
        k1 (int): Nº of outer folds
        k2 (int): Nº of inner folds
        models (list): List of models for comparison
        params (dict): Dictionary including the set of parameters. In this case we only tune 1 parameter per model.
        rs (int): Random state
    Returns:
        df: Dataframe
    """
    k = 0
    min_error = np.inf
    min_param = None
    names = [type(m).__name__ for m in models]
    col_names = ['Param. Value', 'Error']
    results_df = dataset_creator(fill_methods, names, col_names, k1)
    kf1 = KFold(k1, shuffle = True, random_state=rs)
    X_raw = df.drop('y', axis=1)
    y = df['y']
    # first level split
    for train_idx1, test_idx1 in kf1.split(X_raw, y):
        error_test = {}
        k += 1
        kf2 = KFold(k2, shuffle = True, random_state=rs)
        print(f'Computing KFold {k}/{k1}...')
        # second level split
        mean_errors = []
        for train_idx2, test_idx2 in tqdm(kf2.split(X_raw.iloc[train_idx1, :], y[train_idx1]), total = k2):
            for method in fill_methods:
                X_train = transform_data(X_raw.iloc[train_idx2, :], fill_method=method, std_method=std_method).values
                y_train = y[train_idx2]
                X_test = transform_data(X_raw, fill_method=method, std_method=std_method).iloc[test_idx2, :].values
                y_test = y[test_idx2]
                for name, model in zip(names, models):
                    grid = ParameterGrid(params[name])
                    for p_ in grid:
                        model = model.set_params(**p_)
                        # train the model
                        model.fit(X_train, y_train)
                        # evaluate performance
                        pred2_test = model.predict(X_test)
                        error = mse(pred2_test, y_test)
                        if error < min_error:
                            min_error = error
                            min_param = p_


    # results_df.loc(axis = 1)[method, name, 'Error'][k] = error_test[idx]
    # results_df.loc(axis = 1)[method, name, 'Param. Value'][k] = min_param
    return error_test, test_idx1

### **Data flow**

In [5]:
# std_methods = ['standard', 'minmax', 'maxabs', 'robust']

In [7]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

df_raw, df_Xn_raw = load_data()
fill_methods = ['mean', 'median']
random_state = 3
params = {}
lam = np.logspace(-2, 4, 100)
lam_1 = np.logspace(2, 5, 100)
# C = [200000000, 10000000, 0.1519911082952933, 0.2848035868435805 ]
params['DummyRegressor'] = {'strategy': ['mean', 'median']}
params['LinearRegression'] = {'fit_intercept': [True, False]}
params['Ridge'] = {'alpha': lam, 'fit_intercept': [True, False]}
params['Lasso'] = {'alpha': lam_1, 'fit_intercept': [True, False], 'max_iter': [1000]}
params['ElasticNet'] = {'alpha': lam_1, 'l1_ratio': [0.1, 0.5], 'fit_intercept': [True,False], 'max_iter':[1000]}
params['RandomForestRegressor'] = {'n_estimators':[10, 50], 'max_depth':[None ,5], 'min_samples_split':[2 ,10], 'random_state':[random_state]}
params['GradientBoostingRegressor'] = {'n_estimators':[10 ,50], 'learning_rate':[0.01 ,0.1], 'max_depth':[3 ,5], 'min_samples_split':[2 ,10], 'random_state':[random_state]}

# Create a list of models
models = [DummyRegressor(),
          LinearRegression(),
          Ridge(random_state=random_state)]
        #   Lasso(random_state=random_state),
        #   ElasticNet(random_state=random_state),
        #   RandomForestRegressor(random_state=random_state),
        #   GradientBoostingRegressor(random_state=random_state)]
k1 = 2
k2 = 2
Table, test_set_outer = twolevelcv(df_raw, k1=k1, k2=k2, models=models, params=params, rs=random_state, fill_methods=fill_methods, std_method= 'standard')

Computing KFold 1/2...


100%|██████████| 2/2 [00:00<00:00,  4.74it/s]


Computing KFold 2/2...


100%|██████████| 2/2 [00:00<00:00,  4.89it/s]


In [9]:
Table

{('mean', 'DummyRegressor', "{'strategy': 'mean'}"): 41.81661555174129,
 ('mean', 'DummyRegressor', "{'strategy': 'median'}"): 43.56938954972183,
 ('mean', 'LinearRegression', "{'fit_intercept': True}"): 37.294577231091715,
 ('mean', 'LinearRegression', "{'fit_intercept': False}"): 38.285680660815466,
 ('mean',
  'Ridge',
  "{'alpha': 0.01, 'fit_intercept': True}"): 37.294080218907204,
 ('mean',
  'Ridge',
  "{'alpha': 0.01, 'fit_intercept': False}"): 38.2852522207144,
 ('mean',
  'Ridge',
  "{'alpha': 0.011497569953977356, 'fit_intercept': True}"): 37.294005915425345,
 ('mean',
  'Ridge',
  "{'alpha': 0.011497569953977356, 'fit_intercept': False}"): 38.2851882663566,
 ('mean',
  'Ridge',
  "{'alpha': 0.013219411484660288, 'fit_intercept': True}"): 37.29392052546618,
 ('mean',
  'Ridge',
  "{'alpha': 0.013219411484660288, 'fit_intercept': False}"): 38.28511480101334,
 ('mean',
  'Ridge',
  "{'alpha': 0.01519911082952934, 'fit_intercept': True}"): 37.293822401906155,
 ('mean',
  'Ridge'