In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from data_cleaning import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
import seaborn as sb
from scipy import stats
from sklearn.model_selection import ParameterGrid
from scipy.stats import pearsonr
from tqdm import tqdm
import itertools

In [15]:
def dataset_creator(methods, models_names: list, columns_names: list, k1: int):
    header = pd.MultiIndex.from_product([methods, models_names, columns_names])
    df = pd.DataFrame(columns=header)
    df['KFold'] = np.arange(1, k1+1)
    df.set_index('KFold', inplace=True)
    return df

def twolevelcv(df, k1: int, k2: int, models: list, params: dict, rs: int, fill_methods: list, std_method: list):
    """Allows to compute two level crossvalidation.

    Args:
        X (np.array): Features (numeric)
        y (np.array): Class (objective variable)
        k1 (int): Nº of outer folds
        k2 (int): Nº of inner folds
        models (list): List of models for comparison
        params (dict): Dictionary including the set of parameters. In this case we only tune 1 parameter per model.
        rs (int): Random state
    Returns:
        df: Dataframe
    """
    k = 0
    min_error = np.inf
    min_param = None
    names = [type(m).__name__ for m in models]
    col_names = ['Param. Value', 'Error']
    results_df = dataset_creator(fill_methods, names, col_names, k1)
    kf1 = KFold(k1, shuffle = True, random_state=rs)
    X_raw = df.drop('y', axis=1)
    y = df['y']
    combs = list(itertools.product(fill_methods,std_method,zip(names,models)))
    it_combs = len(combs)
    
    best_fill = [0]*k1
    best_std = [0]*k1
    best_name_model = [0]*k1
    errors_out = [0]*k1
    
    # first level split
    for z,(train_idx1, test_idx1) in enumerate(kf1.split(X_raw, y)):
        error_test = {}
        k += 1
        kf2 = KFold(k2, shuffle = True, random_state=rs)
        print(f'Computing KFold {k}/{k1}...')
        err = [[] for pp in range(it_combs)]
        # second level split
        for train_idx2, test_idx2 in tqdm(kf2.split(X_raw.iloc[train_idx1, :], y[train_idx1]), total = k2):
            for i in range(it_combs):
                method = combs[i][0]
                std_method = combs[i][1]
                name,model = combs[i][2]
                X_train,X_test = transform_data(X_raw.iloc[train_idx2, :],X_raw.iloc[test_idx2, :] \
                                                , fill_method=method, std_method=std_method)
                y_train = y[train_idx2]
                y_test = y[test_idx2]
                grid = list(ParameterGrid(params[name]))
                n_p = len(grid)
                if err[i] == []:
                    err[i] = [0]*n_p
                for j in range(n_p):
                    p_ = grid[j]
                    model = model.set_params(**p_)
                    # train the model
                    model.fit(X_train, y_train)
                    # evaluate performance
                    pred2_test = model.predict(X_test)
                    error = mse(pred2_test, y_test,squared = False)
                    err[i][j] += error*len(test_idx2)/len(train_idx1)
        # inner cv has finished, choose model and param
        best_err = np.inf
        i_best = None
        j_best = None
        for i in range(it_combs):
            name,model = combs[i][2]
            for j in range(len(err[i])):
                if name == "LinearRegression":
                    print(err[i][j])
                if err[i][j] < best_err:
                    i_best = i
                    j_best = j
                    best_err = err[i][j]
        method = combs[i_best][0]
        std_method = combs[i_best][1]
        name,model = combs[i_best][2]
        grid = list(ParameterGrid(params[name]))
        p_ = grid[j_best]
        model = model.set_params(**p_)

        X_tr,X_te = transform_data(X_raw.iloc[train_idx1, :], X_raw.iloc[test_idx1, :],  \
                                   fill_method=method, std_method=std_method)
        y_te = y[test_idx1]
        y_tr = y[train_idx1]
        model.fit(X_tr,y_tr)
        pred = model.predict(X_te)
        error = mse(pred,y_te,squared = False)
        
        best_fill[z] = method
        best_std[z] = std_method
        best_name_model[z] = (name,model)
        print(f"fill method: {method}, std_method: {std_method}, model: {name} with parameter: {p_}")
        print(f"error: {error}")
        errors_out[z] = error
    # results_df.loc(axis = 1)[method, name, 'Error'][k] = error_test[idx]
    # results_df.loc(axis = 1)[method, name, 'Param. Value'][k] = min_param
    return best_fill,best_std,best_name_model,errors_out

### **Data flow**

In [5]:
# std_methods = ['standard', 'minmax', 'maxabs', 'robust']

In [16]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

df_raw, df_Xn_raw = load_data()
std_methods = ['standard','minmax']
fill_methods = ['mean', 'median']
random_state = 3
params = {}
lam = np.logspace(-2, 4, 1000)
lam_1 = np.logspace(2, 5, 1000)
# C = [200000000, 10000000, 0.1519911082952933, 0.2848035868435805 ]
params['DummyRegressor'] = {'strategy': ['mean', 'median']}
params['LinearRegression'] = {'fit_intercept': [True, False]}
params['Ridge'] = {'alpha': lam, 'fit_intercept': [True, False]}
params['Lasso'] = {'alpha': lam_1, 'fit_intercept': [True, False], 'max_iter': [1000]}
params['ElasticNet'] = {'alpha': lam_1, 'l1_ratio': [0.1, 0.5], 'fit_intercept': [True,False], 'max_iter':[1000]}
params['RandomForestRegressor'] = {'n_estimators':[10, 50], 'max_depth':[None ,5], 'min_samples_split':[2 ,10], 'random_state':[random_state]}
params['GradientBoostingRegressor'] = {'n_estimators':[10 ,50], 'learning_rate':[0.01 ,0.1], 'max_depth':[3 ,5], 'min_samples_split':[2 ,10], 'random_state':[random_state]}
params['KNeighborsRegressor'] = {'n_neighbors':list(range(1,25)),'n_jobs': [-1],'weights':['uniform', 'distance'],'p':[1,2]}

# Create a list of models
models = [DummyRegressor(),
          LinearRegression(),
          Ridge(random_state=random_state),
          Lasso(random_state=random_state),
          ElasticNet(random_state=random_state),
          RandomForestRegressor(random_state=random_state),
          GradientBoostingRegressor(random_state=random_state),
          KNeighborsRegressor()
         ]
k1 = 5
k2 = 5
best_fill,best_std,best_name_model,errors_out = twolevelcv(df_raw, k1=k1, k2=k2, models=models,params=params, rs=random_state, fill_methods=fill_methods, std_method = std_methods)

Computing KFold 1/5...


100%|█████████████████████████████████████████████| 5/5 [03:22<00:00, 40.57s/it]


37.89471796138704
37.45820582035938
4583.906481322263
2569.3480682610234
37.72863752629877
37.00731940906497
4323.230208365834
2748.6906265607404
fill method: median, std_method: standard, model: Ridge with parameter: {'alpha': 14.426439512181574, 'fit_intercept': False}
error: 29.981012131384013
Computing KFold 2/5...


100%|█████████████████████████████████████████████| 5/5 [03:27<00:00, 41.43s/it]


37.89471796138704
37.45820582035938
4583.906481322263
2569.3480682610234
37.72863752629877
37.00731940906497
4323.230208365834
2748.6906265607404
fill method: median, std_method: standard, model: Ridge with parameter: {'alpha': 14.426439512181574, 'fit_intercept': False}
error: 30.062271223665828
Computing KFold 3/5...


100%|█████████████████████████████████████████████| 5/5 [03:25<00:00, 41.06s/it]


37.89471796138704
37.45820582035938
4583.906481322263
2569.3480682610234
37.72863752629877
37.00731940906497
4323.230208365834
2748.6906265607404
fill method: median, std_method: standard, model: Ridge with parameter: {'alpha': 14.426439512181574, 'fit_intercept': False}
error: 23.935875102475872
Computing KFold 4/5...


100%|█████████████████████████████████████████████| 5/5 [03:25<00:00, 41.07s/it]


37.89471796138704
37.45820582035938
4583.906481322263
2569.3480682610234
37.72863752629877
37.00731940906497
4323.230208365834
2748.6906265607404
fill method: median, std_method: standard, model: Ridge with parameter: {'alpha': 14.426439512181574, 'fit_intercept': False}
error: 36.88154633277559
Computing KFold 5/5...


100%|█████████████████████████████████████████████| 5/5 [03:48<00:00, 45.68s/it]

37.89471796138704
37.45820582035938
4583.906481322263
2569.3480682610234
37.72863752629877
37.00731940906497
4323.230208365834
2748.6906265607404
fill method: median, std_method: standard, model: Ridge with parameter: {'alpha': 14.426439512181574, 'fit_intercept': False}
error: 27.114728730466926





In [18]:
np.mean(errors_out)

29.595086704153648