### Data set formatting

In [9]:
import pandas as pd

#Read the training data CSV into a DataFrame
file = '../datasets/training_data.csv'
df = pd.read_csv(file)

#Configure the PyCaret regression experiment
from pycaret.regression import *

setup_options = dict(
    target = 'prod',              #Column holding the output values
    train_size = 0.8,             #Ratio of training data to test data
    data_split_shuffle = True,
    fold = 10,                    #Number of folds for cross-validation
    session_id = 1,
    #Yeo-Johnson transformation, enabled for the variables and for the target
    transformation = True,
    transformation_method = "yeo-johnson",
    transform_target = True,
    transform_target_method = "yeo-johnson",
)
exp1 = setup(df, **setup_options)

#Pull the train/test splits and the full X / y out of the experiment config
X_train, y_train = get_config("X_train"), get_config("y_train")
X_test, y_test = get_config("X_test"), get_config("y_test")
X, y = get_config("X"), get_config("y")

Unnamed: 0,Description,Value
0,session_id,1
1,Target,prod
2,Original Data,"(168, 9)"
3,Missing Values,False
4,Numeric Features,8
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(134, 8)"


### Building a single ML model

In [3]:
#Identifiers passed to create_model, listed in training order.
model_ids = (
    "ada", "ard", "br", "dt", "en", "et", "gbr", "huber",
    "knn", "kr", "lar", "lasso", "lightgbm", "llar", "lr", "mlp",
    "omp", "par", "ransac", "rf", "ridge", "svm", "tr", "xgboost",
)

#Call create_model once per identifier (in the order above) and bind every
#result to a variable named after its identifier; later cells use these names.
(
    ada, ard, br, dt, en, et, gbr, huber,
    knn, kr, lar, lasso, lightgbm, llar, lr, mlp,
    omp, par, ransac, rf, ridge, svm, tr, xgboost,
) = [create_model(model_id) for model_id in model_ids]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.1469,6.207,2.4914,-0.228,0.4707,0.6541
1,1.5952,4.9202,2.2181,-0.3056,0.3563,0.2541
2,1.5999,3.2528,1.8036,0.2541,0.3499,0.4521
3,1.2328,2.5053,1.5828,0.1034,0.286,0.2785
4,1.6682,4.2104,2.0519,0.1507,0.3427,0.4112
5,1.4166,3.5072,1.8727,-1.168,0.3748,0.4584
6,1.859,5.2436,2.2899,-0.2761,0.4069,0.4328
7,1.3736,2.8717,1.6946,0.4323,0.318,0.3983
8,1.6487,3.7896,1.9467,-0.3369,0.3279,0.4235
9,1.3167,2.9804,1.7264,0.1479,0.3114,0.3108


### Building ensemble ML models

Voting model

In [7]:
import random
import csv
import math
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate

n_trial = 1000 #Number of models to build
filename0 = "voting_score.csv"

#Map each trained single model (the estimator object is the key) to the short
#name written to the CSV. The dict's insertion order defines the model order,
#so the candidate list is derived from it instead of being typed out twice.
DIC = {ada:"ada",ard:"ard",br:"br",dt:"dt",en:"en",et:"et",gbr:"gbr",huber:"huber",knn:"knn",kr:"kr",lar:"lar",lasso:"lasso",lightgbm:"lightgbm",llar:"llar",lr:"lr",mlp:"mlp",omp:"omp",par:"par",ransac:"ransac",rf:"rf",ridge:"ridge",svm:"svm",tr:"tr",xgboost:"xgboost"}
tuned_model = list(DIC)

#Specify the type of table to be output.
#Each row holds the four metrics followed by one field per selected model, so
#the header names enough "model_i" columns for the largest possible ensemble.
hedder = ["(train) RMSE","(train) R2","(test) RMSE","(test) R2"]
hedder += ["model_" + str(i) for i in range(1, len(tuned_model) + 1)]

#Append mode keeps results from earlier (possibly interrupted) runs. The header
#is written only when the file is still empty, so re-running this cell does not
#insert extra header rows in the middle of the results.
with open(filename0, 'a', newline='') as f:
    if f.tell() == 0:
        csv.writer(f, lineterminator='\n').writerow(hedder)

for num in range(n_trial):
    print(str(num/n_trial*100)  + "％ finished")

    #Pick a random ensemble size (2 .. number of single models) and sample that
    #many distinct single models to use in the ensemble model
    numbers = random.randint(2, len(tuned_model))
    modeling = random.sample(tuned_model, k=numbers)

    blender = blend_models(estimator_list = modeling, choose_better = False, optimize = 'RMSE')

    #Cross-validated RMSE and coefficient of determination on the training data.
    #Both metrics come from a single cross-validation pass. The RMSE scorer is
    #negated by scikit-learn ("neg_..."), so the sign is flipped back to store
    #a positive RMSE, consistent with the test RMSE below.
    cv_scores = cross_validate(
        blender, X=X_train, y=y_train,
        scoring=('neg_root_mean_squared_error', 'r2'), cv=10,
    )
    train_rmse = -cv_scores['test_neg_root_mean_squared_error'].mean()
    train_r2 = cv_scores['test_r2'].mean()

    #RMSE and coefficient of determination on the held-out test data
    y_pred = blender.predict(X_test)
    test_rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)

    #Row = metrics, then the names of the selected models in tuned_model order
    lio = [train_rmse, train_r2, test_rmse, test_r2]
    lio.extend(DIC[model] for model in tuned_model if model in modeling)

    #Write each trial immediately so an interrupted run keeps its finished rows
    with open(filename0, 'a', newline='') as f:
        csv.writer(f, lineterminator='\n').writerow(lio)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.7751,4.5512,2.1334,0.0996,0.403,0.5276
1,1.3116,3.2417,1.8005,0.1398,0.2585,0.2109
2,1.6092,3.5308,1.879,0.1904,0.3321,0.3964
3,1.0642,1.9362,1.3915,0.3071,0.2428,0.2595
4,1.3097,2.9246,1.7101,0.4101,0.2822,0.3243
5,1.2301,2.2697,1.5065,-0.403,0.3216,0.3932
6,1.6117,4.6088,2.1468,-0.1216,0.3517,0.3246
7,1.281,3.5638,1.8878,0.2955,0.3278,0.3439
8,1.1646,2.0709,1.4391,0.2694,0.2494,0.2984
9,1.1678,2.5311,1.5909,0.2763,0.2807,0.3009


KeyboardInterrupt: 

Bagging model

In [None]:
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.metrics import mean_squared_error, r2_score
import math,csv

#Map each trained single model (the estimator object is the key) to the short
#name written to the CSV. The dict's insertion order is the order in which the
#models are bagged, so the model list is derived from it.
DIC = {ada:"ada",ard:"ard",br:"br",dt:"dt",en:"en",et:"et",gbr:"gbr",huber:"huber",knn:"knn",kr:"kr",lar:"lar",lasso:"lasso",lightgbm:"lightgbm",llar:"llar",lr:"lr",mlp:"mlp",omp:"omp",par:"par",ransac:"ransac",rf:"rf",ridge:"ridge",svm:"svm",tr:"tr",xgboost:"xgboost"}
tuned_model = list(DIC)

filename0 = "bagging_score.csv"

#Every row starts with the base model's name, so the header needs a "model"
#column in front of the four metric columns to stay aligned with the data.
hedder = ["model","(train) RMSE","(train) R2","(test) RMSE","(test) R2"]

#Append mode keeps earlier results. The header is written only when the file
#is still empty, so re-running this cell does not add extra header rows.
with open(filename0, 'a', newline='') as f:
    if f.tell() == 0:
        csv.writer(f, lineterminator='\n').writerow(hedder)

for bag in tuned_model:
    model = ensemble_model(bag, method = 'Bagging', n_estimators = 10)

    #Cross-validated scores on the training data. Both metrics come from one
    #cross-validation pass. The RMSE scorer is negated by scikit-learn
    #("neg_..."), so the sign is flipped back to store a positive RMSE,
    #consistent with the test RMSE below.
    cv_scores = cross_validate(
        model, X=X_train, y=y_train,
        scoring=('neg_root_mean_squared_error', 'r2'), cv=10,
    )
    train_rmse = -cv_scores['test_neg_root_mean_squared_error'].mean()
    train_r2 = cv_scores['test_r2'].mean()

    #Scores on the held-out test data
    y_pred = model.predict(X_test)
    test_rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)

    #Row = base model name followed by the four metrics
    ado = [DIC[bag], train_rmse, train_r2, test_rmse, test_r2]

    #Write each result immediately so an interrupted run keeps finished rows
    with open(filename0, 'a', newline='') as f:
        csv.writer(f, lineterminator='\n').writerow(ado)

Boosting model

In [None]:
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.metrics import mean_squared_error, r2_score
import math,csv

#Map each trained single model (the estimator object is the key) to the short
#name written to the CSV. The dict's insertion order is the order in which the
#models are boosted, so the model list is derived from it.
DIC = {ada:"ada",ard:"ard",br:"br",dt:"dt",en:"en",et:"et",gbr:"gbr",huber:"huber",knn:"knn",kr:"kr",lar:"lar",lasso:"lasso",lightgbm:"lightgbm",llar:"llar",lr:"lr",mlp:"mlp",omp:"omp",par:"par",ransac:"ransac",rf:"rf",ridge:"ridge",svm:"svm",tr:"tr",xgboost:"xgboost"}
tuned_model = list(DIC)

filename0 = "boosting_score.csv"

#Every row starts with the base model's name, so the header needs a "model"
#column in front of the four metric columns to stay aligned with the data.
hedder = ["model","(train) RMSE","(train) R2","(test) RMSE","(test) R2"]

#Append mode keeps earlier results. The header is written only when the file
#is still empty, so re-running this cell does not add extra header rows.
with open(filename0, 'a', newline='') as f:
    if f.tell() == 0:
        csv.writer(f, lineterminator='\n').writerow(hedder)

for bag in tuned_model:
    model = ensemble_model(bag, method = 'Boosting', n_estimators = 10)

    #Cross-validated scores on the training data. Both metrics come from one
    #cross-validation pass. The RMSE scorer is negated by scikit-learn
    #("neg_..."), so the sign is flipped back to store a positive RMSE,
    #consistent with the test RMSE below.
    cv_scores = cross_validate(
        model, X=X_train, y=y_train,
        scoring=('neg_root_mean_squared_error', 'r2'), cv=10,
    )
    train_rmse = -cv_scores['test_neg_root_mean_squared_error'].mean()
    train_r2 = cv_scores['test_r2'].mean()

    #Scores on the held-out test data
    y_pred = model.predict(X_test)
    test_rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)

    #Row = base model name followed by the four metrics
    ado = [DIC[bag], train_rmse, train_r2, test_rmse, test_r2]

    #Write each result immediately so an interrupted run keeps finished rows
    with open(filename0, 'a', newline='') as f:
        csv.writer(f, lineterminator='\n').writerow(ado)

Stacking model

In [8]:
import random
import csv
import math
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate

n_trial = 1000 #Number of models to build
filename0 = "stacking_score.csv"

#Map each trained single model (the estimator object is the key) to the short
#name written to the CSV. The dict's insertion order defines the model order,
#so the candidate list is derived from it instead of being typed out twice.
DIC = {ada:"ada",ard:"ard",br:"br",dt:"dt",en:"en",et:"et",gbr:"gbr",huber:"huber",knn:"knn",kr:"kr",lar:"lar",lasso:"lasso",lightgbm:"lightgbm",llar:"llar",lr:"lr",mlp:"mlp",omp:"omp",par:"par",ransac:"ransac",rf:"rf",ridge:"ridge",svm:"svm",tr:"tr",xgboost:"xgboost"}
tuned_model = list(DIC)

#Each row holds the four metrics, the meta model name and then one field per
#selected base model, so the header names enough "model_i" columns for the
#largest possible ensemble.
hedder = ["(train) RMSE","(train) R2","(test) RMSE","(test) R2","meta_model"]
hedder += ["model_" + str(i) for i in range(1, len(tuned_model) + 1)]

#Append mode keeps results from earlier (possibly interrupted) runs. The header
#is written only when the file is still empty, so re-running this cell does not
#insert extra header rows in the middle of the results.
with open(filename0, 'a', newline='') as f:
    if f.tell() == 0:
        csv.writer(f, lineterminator='\n').writerow(hedder)

for num in range(n_trial):
    print(str(num/n_trial*100)  + "％ finished")

    #Pick a random ensemble size (2 .. number of single models), sample that
    #many distinct base models, and pick the meta model from the same pool
    numbers = random.randint(2, len(tuned_model))
    modeling = random.sample(tuned_model, k=numbers)
    meta = random.choice(tuned_model)

    stacker = stack_models(estimator_list = modeling, meta_model = meta, choose_better = False, optimize = 'RMSE')

    #Cross-validated RMSE and coefficient of determination on the training data.
    #Both metrics come from a single cross-validation pass. The RMSE scorer is
    #negated by scikit-learn ("neg_..."), so the sign is flipped back to store
    #a positive RMSE, consistent with the test RMSE below.
    cv_scores = cross_validate(
        stacker, X=X_train, y=y_train,
        scoring=('neg_root_mean_squared_error', 'r2'), cv=10,
    )
    train_rmse = -cv_scores['test_neg_root_mean_squared_error'].mean()
    train_r2 = cv_scores['test_r2'].mean()

    #RMSE and coefficient of determination on the held-out test data
    y_pred = stacker.predict(X_test)
    test_rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)

    #Row = metrics, meta model name, then the names of the selected base
    #models in tuned_model order
    lio = [train_rmse, train_r2, test_rmse, test_r2, DIC[meta]]
    lio.extend(DIC[model] for model in tuned_model if model in modeling)

    #Write each trial immediately so an interrupted run keeps its finished rows
    with open(filename0, 'a', newline='') as f:
        csv.writer(f, lineterminator='\n').writerow(lio)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.8251,4.7243,2.1736,0.0654,0.4182,0.5643
1,1.3309,3.3045,1.8178,0.1231,0.2652,0.211
2,1.7063,3.8758,1.9687,0.1112,0.3465,0.4102
3,0.9262,1.5789,1.2565,0.4349,0.2174,0.2217
4,1.5987,3.7004,1.9236,0.2536,0.3264,0.4028
5,1.2899,2.7956,1.672,-0.7281,0.3519,0.3914
6,1.6622,5.0159,2.2396,-0.2206,0.3777,0.3472
7,1.3062,3.8501,1.9622,0.239,0.3641,0.3694
8,1.2704,2.6666,1.633,0.0592,0.271,0.3257
9,1.1659,2.4794,1.5746,0.2911,0.2643,0.2741


KeyboardInterrupt: 