In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from data_cleaning import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
import seaborn as sb
from scipy import stats
from sklearn.model_selection import ParameterGrid
from scipy.stats import pearsonr
from tqdm import tqdm
import itertools
import warnings
warnings.filterwarnings('ignore')

In [57]:
def twolevelcv(df, k1: int, k2: int, models: list, params: dict, rs: int, fill_methods: list, std_method: list):
    """Estimate the generalization error with two-level (nested) cross-validation.

    For every outer fold, an inner K-fold CV is run on the outer-training rows
    only. Every combination of (fill method, standardization method, model,
    hyper-parameter setting) is scored by its size-weighted RMSE over the inner
    folds; the best combination is then refitted on the whole outer-training
    split and scored on the outer-test split.

    Args:
        df (pd.DataFrame): Data set; must contain the target in column 'y'.
            All remaining columns are passed to `transform_data` as features.
        k1 (int): Number of outer folds.
        k2 (int): Number of inner folds.
        models (list): Estimator instances to compare. They are cloned before
            fitting, so the objects passed in are never mutated.
        params (dict): Maps each estimator class name (`type(m).__name__`) to a
            `ParameterGrid`-compatible grid specification.
        rs (int): Random state. The outer split uses `rs`; the inner split of
            outer fold z (0-based) uses `rs + z + 1`.
        fill_methods (list): Candidate values forwarded to `transform_data` as
            `fill_method`.
        std_method (list): Candidate values forwarded to `transform_data` as
            `std_method`.

    Returns:
        tuple: `(best_fill, best_std, best_name_model, errors_out, gen_error)`.
            The first four are lists with one entry per outer fold:
            the selected fill method, the selected standardization method, a
            `(class_name, fitted_estimator)` pair, and the outer-test RMSE
            weighted by `len(test_fold) / N`. `gen_error` is the sum of
            `errors_out`, i.e. the size-weighted mean outer RMSE.

    Raises:
        ValueError: If there is nothing to compare (an empty `models`,
            `fill_methods` or `std_method`), or if no candidate produced a
            finite inner-CV error.
        KeyError: If `params` has no entry for one of the model class names.
    """
    # Function-scope import keeps this cell self-contained.
    from sklearn.base import clone

    names = [type(m).__name__ for m in models]
    X_raw = df.drop('y', axis=1)
    y = df['y']
    N = X_raw.shape[0]

    combs = list(itertools.product(fill_methods, std_method, zip(names, models)))
    if not combs:
        raise ValueError("models, fill_methods and std_method must all be non-empty")

    # The grids do not depend on the fold, so expand them once.
    grids = {name: list(ParameterGrid(params[name])) for name in names}

    best_fill = []
    best_std = []
    best_name_model = []
    errors_out = []

    kf1 = KFold(k1, shuffle=True, random_state=rs)

    # first level split
    for z, (train_idx1, test_idx1) in enumerate(kf1.split(X_raw, y)):
        print(f'Computing KFold {z + 1}/{k1}...')

        # Positional (.iloc) slicing throughout: KFold yields positions, not
        # index labels, so label-based y[idx] is only right for a RangeIndex.
        X_outer_tr = X_raw.iloc[train_idx1, :]
        y_outer_tr = y.iloc[train_idx1]
        X_outer_te = X_raw.iloc[test_idx1, :]
        y_outer_te = y.iloc[test_idx1]
        n_outer_tr = len(train_idx1)

        kf2 = KFold(k2, shuffle=True, random_state=rs + z + 1)

        # err[i][j] collects the weighted inner-fold errors of combination i
        # evaluated with the j-th setting of its model's parameter grid.
        err = [[[] for _ in grids[name]] for (_, _, (name, _)) in combs]

        # second level split
        # The inner indices are positions *within the outer-training subset*,
        # so they must be applied to X_outer_tr / y_outer_tr. Applying them to
        # the full data set would select the wrong rows and leak outer-test
        # samples into model selection.
        for train_idx2, test_idx2 in tqdm(kf2.split(X_outer_tr, y_outer_tr), total=k2):
            y_train = y_outer_tr.iloc[train_idx2]
            y_test = y_outer_tr.iloc[test_idx2]
            weight = len(test_idx2) / n_outer_tr

            for i, (fill, std, (name, model)) in enumerate(combs):
                X_train, X_test = transform_data(X_outer_tr.iloc[train_idx2, :],
                                                 X_outer_tr.iloc[test_idx2, :],
                                                 fill_method=fill, std_method=std)
                for j, p_ in enumerate(grids[name]):
                    # Fresh unfitted copy: no state carried over between
                    # settings, and the caller's estimator is left untouched.
                    candidate = clone(model).set_params(**p_)
                    candidate.fit(X_train, y_train)
                    pred2_test = candidate.predict(X_test)
                    # RMSE as sqrt(MSE): equals squared=False for a single
                    # target and works on every scikit-learn version.
                    rmse = np.sqrt(mse(y_test, pred2_test))
                    err[i][j].append(rmse * weight)

        # inner cv has finished, choose model and param
        best_err = np.inf
        i_best = None
        j_best = None
        for i, per_setting in enumerate(err):
            for j, fold_errors in enumerate(per_setting):
                total = np.sum(fold_errors)
                if total < best_err:
                    i_best, j_best, best_err = i, j, total
        if i_best is None:
            raise ValueError(f"no candidate produced a finite inner-CV error in outer fold {z + 1}")

        fill, std, (name, model) = combs[i_best]
        p_ = grids[name][j_best]
        # A separate clone per outer fold, so each entry of best_name_model
        # keeps the fit of its own fold instead of aliasing one shared object.
        best_model = clone(model).set_params(**p_)

        X_tr, X_te = transform_data(X_outer_tr, X_outer_te,
                                    fill_method=fill, std_method=std)
        best_model.fit(X_tr, y_outer_tr)
        pred = best_model.predict(X_te)
        error = np.sqrt(mse(y_outer_te, pred))

        best_fill.append(fill)
        best_std.append(std)
        best_name_model.append((name, best_model))
        print(f"fill method: {fill}, std_method: {std}, model: {name} with parameter: {p_}")
        print(f"error: {error}")
        # Weight by the fold's share of the data set (was a hard-coded /100,
        # which is only correct when the data set has exactly 100 rows).
        errors_out.append(error * len(test_idx1) / N)

    gen_error = np.sum(errors_out)

    return best_fill, best_std, best_name_model, errors_out, gen_error

### **Data flow**

In [58]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

# ---- Data ---------------------------------------------------------------
df_raw, df_Xn_raw = load_data()

# ---- Preprocessing candidates --------------------------------------------
# Both lists are forwarded to transform_data through twolevelcv.
# NOTE(review): the integer entries (10, 15, ..., 80) are interpreted by
# transform_data; their meaning is not visible in this notebook.
fill_methods = ['mean', 'median']
std_methods = ['standard', 'minmax', 'maxabs', 'robust'] + list(range(10, 81, 5))

random_state = 3

# ---- Hyper-parameter axes --------------------------------------------------
lam = np.logspace(-2, 3, 100)      # regularisation strengths
lam_1 = lam[:]
lam_2 = lam[:]
l1 = np.linspace(0, 1, 100)        # ElasticNet L1/L2 mixing ratios
n_est = list(range(20, 200, 20))
m_depth = list(range(3, 20, 7))
lr = np.logspace(-3, 0, 10)
m_sm_spl = list(range(2, 81, 7))
n_neigh = list(range(5, 80, 5))

# ---- Search grids, keyed by estimator class name ---------------------------
# Every grid is expanded for each of the 2 x 19 (fill, std) combinations in
# each inner fold; the ElasticNet grid alone has 100 * 100 * 2 = 20,000
# settings, so a full run is very expensive.
params = {
    'DummyRegressor': {'strategy': ['mean', 'median']},
    'LinearRegression': {'fit_intercept': [True, False]},
    'Ridge': {
        'alpha': lam,
        'fit_intercept': [True, False],
        'random_state': [1657],
    },
    'Lasso': {
        'alpha': lam_1,
        'fit_intercept': [True, False],
        'max_iter': [1000],
        'random_state': [909123],
    },
    'ElasticNet': {
        'alpha': lam_2,
        'l1_ratio': l1,
        'fit_intercept': [True, False],
        'max_iter': [1000],
        'random_state': [123],
    },
    'RandomForestRegressor': {
        'n_estimators': n_est,
        'max_depth': m_depth,
        'min_samples_split': m_sm_spl[:],
        'random_state': [456],
    },
    'GradientBoostingRegressor': {
        'n_estimators': n_est,
        'learning_rate': lr,
        'max_depth': m_depth,
        'min_samples_split': m_sm_spl,
        'random_state': [789],
    },
    'KNeighborsRegressor': {
        'n_neighbors': n_neigh,
        'n_jobs': [-1],
        'weights': ['uniform', 'distance'],
        'p': [1, 2],
    },
}

# ---- Candidate estimators (default-constructed; tuned via `params`) --------
models = [
    Ridge(),
    KNeighborsRegressor(),
    LinearRegression(),
    Lasso(),
    ElasticNet(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    DummyRegressor(),
]

# ---- Nested cross-validation: 10 outer folds x 10 inner folds --------------
k1 = 10
k2 = 10
best_fill, best_std, best_name_model, errors_out, gen_err = twolevelcv(
    df_raw,
    k1=k1,
    k2=k2,
    models=models,
    params=params,
    rs=random_state,
    fill_methods=fill_methods,
    std_method=std_methods,
)

Computing KFold 1/10...


  0%|                                                    | 0/10 [00:17<?, ?it/s]


KeyboardInterrupt: 

In [46]:
# Show the generalization-error estimate returned by twolevelcv (the sum of
# the size-weighted outer-fold errors).
# NOTE(review): this cell's execution count (46) is lower than that of the
# cell defining twolevelcv (57) and of the interrupted CV run (58), so the
# value displayed below comes from an earlier run - re-run after the CV cell
# completes.
gen_err

27.742535519630007