In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data loading

In [4]:
# Both workbooks are relabelled with the same nine column names, so the test
# workbook is expected to carry a ninth ("Target") column too, even when every
# value in it is NaN.
COLUMN_NAMES = ["Id","Gender","Age","A1","A2","A3","A4","A5","Target"]

# Gender is stored as text in the workbooks; encode it numerically.
GENDER_CODES = {"男":1.0,"女":0.0}


def _read_dataset(path):
    """Read one workbook, apply the shared column names, encode Gender, drop Id."""
    frame = pd.read_excel(path)
    frame.columns = COLUMN_NAMES
    frame["Gender"] = frame["Gender"].map(GENDER_CODES)
    return frame.drop("Id", axis=1)


# Training rows containing any NaN are discarded; the test set is kept as read.
df_train_original = _read_dataset("train.xlsx").dropna(axis=0, how='any')
df_test_original = _read_dataset("test.xlsx")

## Functions for Preprocessing
    - (Removal of columns)
    - (Add reciprocal features)
    - (Oversampling) 
    - Outlier removal on training set for numerical features
    - Polynomial expansion on numerical features
    - MinMax normalization

In [5]:
# Function for outlier removal
def IQR_Outliers(X, features, factor=2.5):
    """Return the index labels of rows that are IQR outliers in any given column.

    For every column in ``features`` the 25th and 75th percentiles are
    computed (NaN values are ignored), and a value is treated as an outlier
    when it lies strictly below ``Q1 - factor * IQR`` or strictly above
    ``Q3 + factor * IQR``.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to scan.
    features : iterable of str
        Names of the columns of ``X`` to check.
    factor : float, default 2.5
        Multiplier applied to the inter-quartile range. The default of 2.5
        only removes the most extreme outliers.

    Returns
    -------
    list
        Sorted, de-duplicated index labels of the rows flagged in at least
        one column. Empty when nothing is flagged or ``features`` is empty.
    """
    flagged = set()

    for col in features:
        column = X[col]

        # nanpercentile instead of percentile because of possible NaN values
        q1 = np.nanpercentile(column, 25.)
        q3 = np.nanpercentile(column, 75.)

        cut_off = (q3 - q1) * factor
        lower, upper = q1 - cut_off, q3 + cut_off

        # NaN compares False on both sides, so missing values are never flagged.
        is_outlier = (column < lower) | (column > upper)

        # A set de-duplicates rows that are outliers in several columns.
        flagged.update(column[is_outlier].index.tolist())

    return sorted(flagged)

In [6]:
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE


def preprocessing(df_train, df_test, target_name, drop_col, non_numeric_features, outlier_rem_func, deg_poly,
                  oversampling=False, oversampling_name=None, oversampling_range=None,
                  add_reciprocal_features=False, features_for_reciprocal=None, random_state=None):
    """Turn the raw train/test frames into scaled, polynomially expanded frames.

    Steps, in order: optional column removal, optional SMOTE oversampling of
    the training rows, optional reciprocal features, optional outlier removal
    on the training rows, polynomial expansion of the numeric features, and
    MinMax scaling. The polynomial expansion and the scaler are fitted on the
    training data only and then applied to the test data.

    Neither input frame is modified; all work happens on copies.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Training and test data. ``df_train`` must contain the target column(s).
    target_name : str or list of str
        Name(s) of the target column(s).
    drop_col : list of str or None
        Columns removed from both frames before anything else.
    non_numeric_features : list of str or None
        Columns excluded from outlier detection and polynomial expansion;
        they are appended unchanged (before scaling) to the expanded features.
    outlier_rem_func : callable or None
        Called as ``outlier_rem_func(df_train, columns)``; must return the
        index labels of the training rows to drop.
    deg_poly : int
        Degree handed to ``PolynomialFeatures``.
    oversampling : bool, default False
        When true, training rows whose ``oversampling_name`` value lies
        strictly inside ``oversampling_range`` are treated as the minority
        class and grown to 1.5 times their count with SMOTE.
    oversampling_name : str or None
        Column used to select the minority rows.
    oversampling_range : sequence of two numbers or None
        Exclusive ``(low, high)`` bounds of the minority range.
    add_reciprocal_features : bool, default False
        When true, a ``"1/<feature>"`` column is added to both frames for each
        entry of ``features_for_reciprocal``.
    features_for_reciprocal : list of str or None
        Columns for which the reciprocal is added.
    random_state : int or None, default None
        Seed forwarded to SMOTE. ``None`` keeps the previous, unseeded
        behaviour; pass an integer for reproducible oversampling.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The transformed training frame (features plus target column(s)) and
        the transformed test frame (features only), both with a fresh
        positional index.
    """
    # Work on private copies: the steps below add columns, and the caller's
    # frames are reused across several experiments in this notebook.
    df_train = df_train.copy()
    df_test = df_test.copy()

    # Accept the target either as a single name or as a list of names.
    target_cols = [target_name] if isinstance(target_name, str) else list(target_name)

    # Drop certain columns
    if drop_col:
        df_train = df_train.drop(drop_col, axis=1)
        df_test = df_test.drop(drop_col, axis=1)

    # Oversampling
    if oversampling:
        # Rows strictly inside the range form the minority class (label 1).
        selector = df_train[oversampling_name]
        in_range = (selector > oversampling_range[0]) & (selector < oversampling_range[1])
        num_samples = int(in_range.sum())

        print((num_samples,))

        # Increase the minority by a factor 1.5
        samp_strategy = {1: int(num_samples * 1.5)}
        oversampler = SMOTE(sampling_strategy=samp_strategy, random_state=random_state)

        # The whole training frame (features and target) is resampled, so the
        # synthetic rows carry interpolated values in every column.
        df_train, _ = oversampler.fit_resample(df_train, in_range.astype(int).to_numpy())

    # Add reciprocal features
    if add_reciprocal_features:
        for feature in features_for_reciprocal:
            reciprocal_name = "1/{}".format(feature)
            df_train[reciprocal_name] = 1.0 / df_train[feature].values
            df_test[reciprocal_name] = 1.0 / df_test[feature].values

    # Extract the target
    y = df_train[target_cols]

    # Numeric features plus target: every training column not declared non-numeric
    excluded = set(non_numeric_features or [])
    col_numeric_features_plus_target = [col for col in df_train.columns if col not in excluded]

    # Remove outliers on the training set for numeric features as well as target
    if outlier_rem_func:
        outlier_index = outlier_rem_func(df_train, col_numeric_features_plus_target)

        df_train = df_train.drop(index=outlier_index)
        y = y.drop(index=outlier_index)

    # Numeric feature columns (target excluded) for the polynomial transformation
    col_numeric = [col for col in col_numeric_features_plus_target if col not in target_cols]

    df_numeric_train = df_train[col_numeric]
    df_numeric_test = df_test[col_numeric]

    # Polynomial transformation, fitted on the training rows only
    p = PolynomialFeatures(degree=deg_poly, interaction_only=False).fit(df_numeric_train)
    poly_columns = p.get_feature_names_out(df_numeric_train.columns)

    df_train_X_withpoly = pd.DataFrame(p.transform(df_numeric_train), columns=poly_columns)
    df_test_X_withpoly = pd.DataFrame(p.transform(df_numeric_test), columns=poly_columns)

    # Re-attach the untouched non-numeric columns. ``.values`` assigns by
    # position because the expanded frames have a fresh RangeIndex.
    if non_numeric_features:
        df_train_X_withpoly[non_numeric_features] = df_train[non_numeric_features].values
        df_test_X_withpoly[non_numeric_features] = df_test[non_numeric_features].values

    # Normalization, fitted on the training rows only
    n = MinMaxScaler().fit(df_train_X_withpoly)

    df_out_train = pd.DataFrame(n.transform(df_train_X_withpoly), columns=df_train_X_withpoly.columns)
    df_out_test = pd.DataFrame(n.transform(df_test_X_withpoly), columns=df_test_X_withpoly.columns)

    # The target is appended unscaled.
    df_out_train[target_cols] = y.values

    return df_out_train, df_out_test

## Functions for evaluation

In [7]:
def check_acc(predictions, target, scale=0.68, tolerance=0.5):
    """Return the fraction of predictions that are close enough to the target.

    A prediction counts as a hit when ``abs((target - prediction) * scale)``
    is strictly below ``tolerance``. Inputs are compared position by
    position: pandas objects are converted to plain arrays first, so
    differing index labels cannot silently turn every comparison into NaN.

    Parameters
    ----------
    predictions, target : array-like
        Predicted and true values with the same number of elements.
    scale : float, default 0.68
        Factor applied to the error before thresholding.
    tolerance : float, default 0.5
        Exclusive upper bound on the scaled absolute error of a hit.

    Returns
    -------
    float
        Share of hits in the range [0, 1] (a fraction, not a percentage).

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of elements.
    """
    pred = np.asarray(predictions, dtype=float).ravel()
    true = np.asarray(target, dtype=float).ravel()

    if pred.shape != true.shape:
        raise ValueError(
            "predictions and target must have the same number of elements, "
            "got {} and {}".format(pred.size, true.size)
        )
    if pred.size == 0:
        raise ValueError("check_acc needs at least one prediction")

    hits = np.abs((true - pred) * scale) < tolerance
    return float(hits.mean())


## Models

In [8]:
from pycaret.regression import *

### Model 1 with count 0.761
* no column removal, no oversampling, no reciprocal features
* 5th order polynomial transformation on ["A1", "A2", "A3", "A4", "A5"]
* ML model: Huber regression

In [8]:
# Preprocessing
# Model 1 design matrices: all columns kept, degree-5 polynomial expansion of
# the numeric features, IQR-based outlier removal on the training rows.
df_trans_train, df_trans_test = preprocessing(
    df_train_original,
    df_test_original,
    target_name=["Target"],
    drop_col=None,
    non_numeric_features=["Gender", "Age"],
    outlier_rem_func=IQR_Outliers,
    deg_poly=5,
)

# Initialise the PyCaret regression experiment on the transformed training
# frame, holding out 15 % of the rows.
# NOTE(review): with use_gpu=True this cell logs repeated LightGBM
# "GPU Tree Learner was not enabled in this build" messages; only a Huber
# regressor is trained afterwards, so the flag looks unnecessary - confirm.
exp_reg = setup(
    data=df_trans_train,
    target="Target",
    session_id=1,
    bin_numeric_features=["Gender"],
    train_size=0.85,
    experiment_name="regression1",
    n_jobs=5,
    use_gpu=True,
)



[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recomp

Unnamed: 0,Description,Value
0,Session id,1
1,Target,Target
2,Target type,Regression
3,Original data shape,"(1003, 255)"
4,Transformed data shape,"(1003, 255)"
5,Transformed train set shape,"(852, 255)"
6,Transformed test set shape,"(151, 255)"
7,Numeric features,254
8,Preprocess,True
9,Imputation type,simple


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


In [9]:
# Fit a baseline Huber regressor, then tune it with PyCaret.

# Candidate values sampled by the hyper-parameter search.
huber_params = {
    "alpha": [1e-7, 5e-7, 1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 0.001, 0.005,
              0.008, 0.01, 0.02, 0.03, 0.1, 1.0, 2.0, 3.0, 10.0],
    "epsilon": [1.03, 1.05, 1.07, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.5, 3.0,
                3.5, 4.5, 5.0, 10.0],
}

# Baseline model with default settings.
huber = create_model('huber')

# Try 50 candidates from the custom grid, optimizing MAE.
tuned_huber = tune_model(
    huber,
    custom_grid=huber_params,
    n_iter=50,
    optimize="mae",
    choose_better=True,
)

# Re-create the model with the tuned parameters, but with a larger iteration
# budget and a tighter tolerance than the search itself was run with.
paras_huber = {**tuned_huber.get_params(), "max_iter": 10000, "tol": 1e-6}
huber_manually_toned = create_model('huber', **paras_huber)

# Finalize the model and persist it together with its pipeline.
huber_final = finalize_model(huber_manually_toned)
save_model(huber_final, "model_huber_withAgeA5_poly5noAge_v2")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4572,0.4274,0.6537,0.9824,0.1246,0.1166
1,0.6023,1.4023,1.1842,0.958,0.2631,0.2573
2,0.5458,0.5597,0.7481,0.9819,0.175,0.1953
3,0.6311,4.3394,2.0831,0.8261,0.1684,0.2319
4,0.6435,1.4632,1.2096,0.9444,0.1938,0.2584
5,0.6561,0.8029,0.896,0.9719,0.1739,0.727
6,0.516,0.5687,0.7542,0.9758,0.14,0.159
7,0.587,1.0528,1.026,0.956,0.2575,0.5933
8,0.4748,0.3931,0.627,0.9834,0.159,0.2695
9,0.6911,3.6935,1.9218,0.8188,0.2396,0.2945


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4568,0.4038,0.6355,0.9834,0.1182,0.1162
1,0.5573,1.3744,1.1723,0.9588,0.2607,0.2242
2,0.4608,0.4235,0.6508,0.9863,0.1687,0.1676
3,0.604,4.1909,2.0472,0.832,0.1612,0.2209
4,0.6057,1.4246,1.1936,0.9459,0.1927,0.246
5,0.5629,0.6918,0.8318,0.9758,0.1652,0.6058
6,0.48,0.5002,0.7072,0.9787,0.1405,0.1721
7,0.5761,1.0354,1.0176,0.9567,0.2555,0.5448
8,0.4564,0.3325,0.5766,0.986,0.1559,0.2985
9,0.6538,3.4092,1.8464,0.8327,0.2377,0.2185


Fitting 10 folds for each of 50 candidates, totalling 500 fits


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['1', 'A1', 'A2', 'A3', 'A4', 'A5',
                                              'A1^2', 'A1 A2', 'A1 A3', 'A1 A4',
                                              'A1 A5', 'A2^2', 'A2 A3', 'A2 A4',
                                              'A2 A5', 'A3^2', 'A3 A4', 'A3 A5',
                                              'A4^2', 'A4 A5', 'A5^2', 'A1^3',
                                              'A1^2 A2', 'A1^2 A3', 'A1^2 A4',
                                              'A1^2 A5', 'A1 A2^2', 'A1 A2 A3',
                                              'A1 A2 A4', 'A1 A2 A5', ...],
                                     transformer=SimpleImputer()...
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('bin_numeric_features',
                  Tra

In [9]:
# Reload the pipeline and estimator persisted under this name by save_model.
model = load_model('model_huber_withAgeA5_poly5noAge_v2')

Transformation Pipeline and Model Successfully Loaded


In [12]:
# One hand-built sample. Keys follow the training schema
# ("Gender","Age","A1","A2","A3","A4","A5"); "Id" is dropped at load time and
# never read by preprocessing, and "Target" is the value being predicted.
data = {'Gender':1.0, "Age":66, "A1":26.34,"A2":43.44,"A3":44.1,"A4":3.53,"A5":4.1}
input_df = pd.DataFrame([data])

# Re-run the Model 1 preprocessing so the sample is expanded and scaled with
# transformers fitted on the training data; the transformed training frame
# itself is not needed here.
_, df_pred = preprocessing(df_train_original, input_df, target_name=["Target"],
                           drop_col=None, non_numeric_features=["Gender", "Age"],
                           outlier_rem_func=IQR_Outliers, deg_poly=5)
predictions = predict_model(model, df_pred)

# Positional access: the first (only) row, whatever its index label is.
predictions["prediction_label"].iloc[0]

11.964653467219478

### Model 2 with count 0.748
* no oversampling, no reciprocal features
* drop "A5"
* 5th order polynomial transformation on ["A1", "A2", "A3", "A4"]
* ML model: Huber regression

In [141]:
# Model 2 design matrices: "A5" removed, degree-5 polynomial expansion of the
# remaining numeric features, IQR-based outlier removal on the training rows.
df_trans_train, df_trans_test = preprocessing(
    df_train_original,
    df_test_original,
    target_name=["Target"],
    drop_col=["A5"],
    non_numeric_features=["Gender", "Age"],
    outlier_rem_func=IQR_Outliers,
    deg_poly=5,
)

# Initialise the PyCaret regression experiment on the transformed training
# frame, holding out 15 % of the rows.
exp_reg = setup(
    data=df_trans_train,
    target="Target",
    session_id=1,
    bin_numeric_features=["Gender"],
    train_size=0.85,
    experiment_name="regression1",
    n_jobs=5,
    use_gpu=True,
)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,Target
2,Target type,Regression
3,Original data shape,"(1005, 129)"
4,Transformed data shape,"(1005, 129)"
5,Transformed train set shape,"(854, 129)"
6,Transformed test set shape,"(151, 129)"
7,Numeric features,128
8,Preprocess,True
9,Imputation type,simple


In [142]:
# Fit a baseline Huber regressor, then tune it with PyCaret.

# Candidate values sampled by the hyper-parameter search.
huber_params = {
    "alpha": [1e-7, 5e-7, 1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 0.001, 0.005,
              0.008, 0.01, 0.02, 0.03, 0.1, 1.0, 2.0, 3.0, 10.0],
    "epsilon": [1.03, 1.05, 1.07, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.5, 3.0,
                3.5, 4.5, 5.0, 10.0],
}

# Baseline model with default settings.
huber = create_model('huber')

# Try 50 candidates from the custom grid, optimizing MAE.
tuned_huber = tune_model(
    huber,
    custom_grid=huber_params,
    n_iter=50,
    optimize="mae",
    choose_better=True,
)

# Re-create the model with the tuned parameters, but with a larger iteration
# budget and a tighter tolerance than the search itself was run with.
paras_huber = {**tuned_huber.get_params(), "max_iter": 10000, "tol": 1e-6}
huber_manually_toned = create_model('huber', **paras_huber)

# Finalize the model and persist it together with its pipeline.
huber_final = finalize_model(huber_manually_toned)
save_model(huber_final, "model_huber_withAgeNoA5_poly5noAge")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5292,0.6162,0.785,0.9725,0.1411,0.1183
1,0.4138,0.3262,0.5711,0.9869,0.142,0.2644
2,0.7496,3.8554,1.9635,0.8877,0.2663,0.6193
3,0.7209,5.1077,2.26,0.821,0.2569,0.243
4,0.4937,0.5297,0.7278,0.9783,0.1549,0.2421
5,0.5244,0.7042,0.8392,0.9755,0.1843,0.2148
6,0.4051,0.3307,0.575,0.9851,0.1516,0.186
7,0.601,1.2553,1.1204,0.9525,0.2213,0.258
8,0.4755,0.3699,0.6082,0.9836,0.1565,0.2835
9,0.4935,0.4496,0.6705,0.9819,0.1831,0.4413


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5124,0.5982,0.7734,0.9733,0.1418,0.1123
1,0.4281,0.3567,0.5973,0.9857,0.1425,0.2692
2,0.7279,3.7911,1.9471,0.8896,0.2673,0.6055
3,0.718,5.1101,2.2606,0.8209,0.2612,0.2484
4,0.5221,0.5116,0.7153,0.979,0.163,0.5565
5,0.5215,0.6983,0.8356,0.9757,0.1809,0.1903
6,0.414,0.336,0.5797,0.9848,0.1478,0.1957
7,0.5867,1.23,1.1091,0.9534,0.2209,0.2425
8,0.4793,0.364,0.6033,0.9839,0.1576,0.294
9,0.4684,0.4047,0.6362,0.9837,0.1867,0.4536


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 50 candidates, totalling 500 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5181,0.596,0.772,0.9734,0.1452,0.1095
1,0.4518,0.3736,0.6113,0.985,0.1542,0.2795
2,0.7366,3.97,1.9925,0.8844,0.2582,0.6074
3,0.7217,5.1884,2.2778,0.8182,0.2616,0.2319
4,0.4924,0.5354,0.7317,0.9781,0.1573,0.469
5,0.5271,0.7082,0.8416,0.9754,0.18,0.1941
6,0.4313,0.3499,0.5915,0.9842,0.1489,0.1981
7,0.6034,1.2557,1.1206,0.9525,0.2272,0.2711
8,0.4781,0.3641,0.6034,0.9839,0.1573,0.2986
9,0.4807,0.4255,0.6523,0.9828,0.2069,0.5732


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\KAIJIE\AppData\Local\Temp\joblib),
          steps=[('clean_column_names',
                  TransformerWrapper(transformer=CleanColumnNames())),
                 ('numerical_imputer',
                  TransformerWrapper(include=['1', 'A1', 'A2', 'A3', 'A4',
                                              'A1^2', 'A1 A2', 'A1 A3', 'A1 A4',
                                              'A2^2', 'A2 A3', 'A2 A4', 'A3^2',
                                              'A3 A4', 'A4^2', 'A1^3', 'A1^2 A2',
                                              'A1^2 A3', 'A1^2 A4', 'A1 A2^2',
                                              'A1 A...
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('bin_numeric_features',
                  Transfo

### Model 3 with count 0.761
* Oversampling of samples with 28.8 < A1 < 29.8
* Reciprocal features
* 3rd order polynomial transformation on ["A1", "A2", "A3", "A4", "A5"]
* ML model: Huber regression

In [128]:
# Model 3 design matrices: SMOTE oversampling of the rows with
# 28.8 < A1 < 29.8, reciprocal features for A1..A5, degree-3 polynomial
# expansion, IQR-based outlier removal on the training rows.
df_trans_train, df_trans_test = preprocessing(
    df_train_original,
    df_test_original,
    target_name=["Target"],
    drop_col=None,
    non_numeric_features=["Gender", "Age"],
    outlier_rem_func=IQR_Outliers,
    deg_poly=3,
    oversampling=True,
    oversampling_name="A1",
    oversampling_range=[28.8, 29.8],
    add_reciprocal_features=True,
    features_for_reciprocal=["A1", "A2", "A3", "A4", "A5"],
)

# Initialise the PyCaret regression experiment on the transformed training
# frame, holding out 15 % of the rows.
exp_reg = setup(
    data=df_trans_train,
    target="Target",
    session_id=1,
    bin_numeric_features=["Gender"],
    train_size=0.85,
    experiment_name="regression1",
    n_jobs=5,
    use_gpu=True,
)

(114,)


Unnamed: 0,Description,Value
0,Session id,1
1,Target,Target
2,Target type,Regression
3,Original data shape,"(1051, 289)"
4,Transformed data shape,"(1051, 289)"
5,Transformed train set shape,"(893, 289)"
6,Transformed test set shape,"(158, 289)"
7,Numeric features,288
8,Preprocess,True
9,Imputation type,simple


In [130]:
# Fit a baseline Huber regressor, then tune it with PyCaret.

# Candidate values sampled by the hyper-parameter search.
huber_params = {
    "alpha": [1e-7, 5e-7, 1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 0.001, 0.005,
              0.008, 0.01, 0.02, 0.03, 0.1, 1.0, 2.0, 3.0, 10.0],
    "epsilon": [1.03, 1.05, 1.07, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.5, 3.0,
                3.5, 4.5, 5.0, 10.0],
}

# Baseline model with default settings.
huber = create_model('huber')

# Try 50 candidates from the custom grid, optimizing MAE.
tuned_huber = tune_model(
    huber,
    custom_grid=huber_params,
    n_iter=50,
    optimize="mae",
    choose_better=True,
)

# Re-create the model with the tuned parameters, but with a larger iteration
# budget and a tighter tolerance than the search itself was run with.
paras_huber = {**tuned_huber.get_params(), "max_iter": 10000, "tol": 1e-6}
huber_manually_toned = create_model('huber', **paras_huber)

# Finalize the model and persist it together with its pipeline.
huber_final = finalize_model(huber_manually_toned)
save_model(huber_final, "model_huber_withAgeA5_withrev_poly3noAge_OversampA1_28.8_29.8")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5135,0.5786,0.7607,0.97,0.2049,0.1467
1,0.4499,0.4284,0.6545,0.9858,0.1725,0.2293
2,0.3774,0.2588,0.5088,0.9902,0.1045,0.0965
3,0.4361,0.3401,0.5832,0.9855,0.156,0.6353
4,0.4683,0.4392,0.6627,0.983,0.1458,0.2131
5,0.6772,1.7724,1.3313,0.943,0.271,0.329
6,0.443,0.3742,0.6117,0.9814,0.1401,0.2302
7,0.4641,0.4105,0.6407,0.9835,0.1782,0.1988
8,0.3807,0.3321,0.5763,0.9855,0.1266,0.29
9,0.5391,1.315,1.1467,0.949,0.2634,0.2363


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5134,0.5801,0.7617,0.9699,0.2066,0.1479
1,0.4487,0.4281,0.6543,0.9858,0.1729,0.2282
2,0.3752,0.2577,0.5076,0.9903,0.1041,0.0955
3,0.4334,0.3405,0.5835,0.9855,0.1565,0.6304
4,0.4679,0.4358,0.6601,0.9831,0.1456,0.2142
5,0.6789,1.7732,1.3316,0.9429,0.2714,0.3273
6,0.4396,0.3689,0.6074,0.9817,0.1398,0.2284
7,0.4656,0.4151,0.6443,0.9833,0.1746,0.1985
8,0.3773,0.3269,0.5717,0.9857,0.1263,0.2919
9,0.5368,1.3433,1.159,0.9479,0.2675,0.2406


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 50 candidates, totalling 500 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5214,0.5826,0.7633,0.9698,0.1982,0.1443
1,0.4705,0.4457,0.6676,0.9852,0.1792,0.2608
2,0.404,0.3954,0.6288,0.9851,0.1205,0.1075
3,0.4287,0.3481,0.59,0.9851,0.1579,0.7313
4,0.4854,0.4559,0.6752,0.9823,0.1567,0.2341
5,0.702,1.7803,1.3343,0.9427,0.2749,0.3673
6,0.4621,0.3861,0.6214,0.9808,0.1517,0.2502
7,0.4713,0.4224,0.6499,0.9831,0.1795,0.2074
8,0.367,0.2881,0.5367,0.9874,0.1312,0.3149
9,0.5461,1.308,1.1437,0.9493,0.2675,0.2385


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\KAIJIE\AppData\Local\Temp\joblib),
          steps=[('clean_column_names',
                  TransformerWrapper(transformer=CleanColumnNames())),
                 ('numerical_imputer',
                  TransformerWrapper(include=['1', 'A1', 'A2', 'A3', 'A4', 'A5',
                                              '1/A1', '1/A2', '1/A3', '1/A4',
                                              '1/A5', 'A1^2', 'A1 A2', 'A1 A3',
                                              'A1 A4', 'A1 A5', 'A1 1/A1',
                                              'A1 1/A2', 'A1 1/A3', 'A1 1/A4',
                                              'A1 1/A...
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('bin_numeric_features',
                  Transform