In [1]:
import pandas as pd

# Define the file paths and their corresponding methods
file_paths = {
    'randomsearch': 'hasil/uji4/combined_xgboost_evaluations_with_group_randomsearch.csv',
    'gridsearch': 'hasil/uji4/combined_xgboost_evaluations_with_group_gridsearch.csv',
    'notuning': 'hasil/uji4/combined_xgboost_evaluations_with_group_notuning.csv'
}


In [2]:

# Create an empty DataFrame to store the best evaluations summary
best_evaluations_summary = pd.DataFrame()

# Define the evaluation metrics
metrics = ['MSE', 'RMSE', 'MAE', 'R2']

# Iterate over each file and process it
for method, file_path in file_paths.items():
    try:
        # Load the data
        data = pd.read_csv(file_path)
        # Add a column for the method
        data['Method'] = method
        
        # Create a dictionary to store the best evaluations for each metric
        best_evaluations = {}
        
        for metric in metrics:
            if metric in data.columns:
                # For each metric, find the row with the best value (min for errors, max for R2)
                if metric == 'R2':
                    best_evaluations[metric] = data.loc[data.groupby('Group')[metric].idxmax()]
                else:
                    best_evaluations[metric] = data.loc[data.groupby('Group')[metric].idxmin()]
        
        # Combine the best evaluations into a single DataFrame for the current method
        best_evaluations_df = pd.concat(best_evaluations.values()).drop_duplicates().reset_index(drop=True)
        
        # Append to the summary DataFrame
        best_evaluations_summary = pd.concat([best_evaluations_summary, best_evaluations_df], ignore_index=True)
        
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Display the summary DataFrame
# import ace_tools as tools; tools.display_dataframe_to_user(name="Best Evaluations Summary", dataframe=best_evaluations_summary)

# Save the summary DataFrame to a new CSV file
output_file = 'hasil/uji4/09091_0_best_evaluations_summary_all.csv'
best_evaluations_summary.to_csv(output_file, index=False)

# Provide the path for the summary CSV file
output_file
best_evaluations_summary

Unnamed: 0,Test Size,Method,Best Params,Start Time,End Time,Duration (s),MAE,MSE,RMSE,R2,Group
0,0.2,randomsearch,"{'subsample': 0.8, 'min_child_weight': 4, 'max...",2024-06-10 22:28:57,2024-06-10 22:29:18,20.960901,2.107251,12.305288,3.507889,0.613779,CV_RandomSearch_IDO
1,0.2,randomsearch,"{'subsample': 0.6, 'min_child_weight': 1, 'max...",2024-06-11 00:07:22,2024-06-11 00:07:41,18.545788,2.132653,11.863642,3.444364,0.536568,CV_RandomSearch_id
2,0.2,randomsearch,"{'subsample': 0.6, 'min_child_weight': 1, 'max...",2024-06-10 23:49:09,2024-06-10 23:49:22,13.573731,2.132653,11.863642,3.444364,0.536568,CV_RandomSearch_if_by
3,0.25,randomsearch,"{'subsample': 0.8, 'min_child_weight': 2, 'max...",2024-06-10 22:50:11,2024-06-10 22:50:27,16.194669,2.259697,12.986075,3.60362,0.794887,CV_RandomSearch_if_by_lof
4,0.2,randomsearch,"{'subsample': 0.8, 'min_child_weight': 2, 'max...",2024-06-10 23:56:25,2024-06-10 23:56:52,26.704711,2.178499,10.503711,3.240943,0.504632,CV_RandomSearch_if_lof
5,0.25,randomsearch,"{'subsample': 0.8, 'min_child_weight': 2, 'max...",2024-06-10 23:56:54,2024-06-10 23:57:19,25.546977,2.152573,10.705231,3.271885,0.447152,CV_RandomSearch_if_lof
6,0.3,randomsearch,"{'subsample': 0.7, 'min_child_weight': 4, 'max...",2024-06-10 22:29:43,2024-06-10 22:30:03,20.362327,2.207441,12.458306,3.529633,0.627463,CV_RandomSearch_IDO
7,0.2,randomsearch,"{'subsample': 0.8, 'min_child_weight': 2, 'max...",2024-06-10 22:49:52,2024-06-10 22:50:09,17.56033,2.262814,13.040409,3.611151,0.799246,CV_RandomSearch_if_by_lof
8,0.2,gridsearch,"{'colsample_bytree': 0.9, 'learning_rate': 0.1...",2024-06-10 22:22:14,2024-06-10 22:22:54,40.510912,2.248336,12.84676,3.584238,0.615847,CV_GridSearch_IDO
9,0.2,gridsearch,"{'colsample_bytree': 0.9, 'learning_rate': 0.1...",2024-06-11 00:02:44,2024-06-11 00:03:52,67.549993,2.141528,11.6673,3.415743,0.544237,CV_GridSearch_id


In [3]:
import pandas as pd

# Load the 'No Tuning' CSV file
no_tuning_file_path = 'hasil/uji4/combined_xgboost_evaluations_with_group_notuning.csv'
no_tuning_data = pd.read_csv(no_tuning_file_path)

# Display the first few rows to understand the data structure
print(no_tuning_data.head())

# Define the evaluation metrics
metrics = ['MSE', 'RMSE', 'MAE', 'R2']

# Create a dictionary to store the best evaluations for each metric
best_evaluations = {}

for metric in metrics:
    if metric in no_tuning_data.columns:
        # Find the row with the best value (min for errors, max for R2)
        if metric == 'R2':
            best_evaluations[metric] = no_tuning_data.loc[no_tuning_data.groupby('Group')[metric].idxmax()]
        else:
            best_evaluations[metric] = no_tuning_data.loc[no_tuning_data.groupby('Group')[metric].idxmin()]

# Combine the best evaluations into a single DataFrame
best_evaluations_df = pd.concat(best_evaluations.values()).drop_duplicates().reset_index(drop=True)

# Add a column for the method group
best_evaluations_df['MethodGroup'] = 'No Tuning'

# Display the summarized results
# import ace_tools as tools; tools.display_dataframe_to_user(name="Best Evaluations Summary for No Tuning", dataframe=best_evaluations_df)

# Save the summarized results to a CSV file
summary_file_path = 'hasil/uji4/09091_0_best_evaluations_summary_no_tuning.csv'
best_evaluations_df.to_csv(summary_file_path, index=False)

summary_file_path
best_evaluations_df

   Test Size           Method       MAE        MSE      RMSE        R2  \
0       0.20    NoCV NoTuning  2.222683  11.382420  3.373784  0.463191   
1       0.25    NoCV NoTuning  2.201396  11.093006  3.330616  0.427126   
2       0.30    NoCV NoTuning  2.356404  13.598621  3.687631  0.346781   
3       0.35    NoCV NoTuning  2.406574  13.597178  3.687435  0.322187   
4       0.20  XGB.CV NoTuning  2.268856  12.776146  3.574373  0.457696   

             Group  
0  NoTuning_if_lof  
1  NoTuning_if_lof  
2  NoTuning_if_lof  
3  NoTuning_if_lof  
4  NoTuning_if_lof  


Unnamed: 0,Test Size,Method,MAE,MSE,RMSE,R2,Group,MethodGroup
0,0.2,XGB.CV NoTuning,2.247128,12.620082,3.552475,0.622625,NoTuning_IDO,No Tuning
1,0.2,XGB.CV NoTuning,2.225079,12.349899,3.514242,0.472592,NoTuning_id,No Tuning
2,0.2,XGB.CV NoTuning,2.225079,12.349899,3.514242,0.472592,NoTuning_if_by,No Tuning
3,0.2,NoCV NoTuning,2.328151,13.738608,3.706563,0.788497,NoTuning_if_by_lof,No Tuning
4,0.25,NoCV NoTuning,2.201396,11.093006,3.330616,0.427126,NoTuning_if_lof,No Tuning
5,0.2,NoCV NoTuning,2.147627,12.516828,3.537913,0.511052,NoTuning_id,No Tuning
6,0.2,NoCV NoTuning,2.147627,12.516828,3.537913,0.511052,NoTuning_if_by,No Tuning
7,0.2,NoCV NoTuning,2.222683,11.38242,3.373784,0.463191,NoTuning_if_lof,No Tuning


In [4]:
import pandas as pd

# Load the 'No Tuning' CSV file
no_tuning_file_path = 'hasil/uji4/combined_xgboost_evaluations_with_group_gridsearch.csv'
no_tuning_data = pd.read_csv(no_tuning_file_path)

# Display the first few rows to understand the data structure
print(no_tuning_data.head())

# Define the evaluation metrics
metrics = ['MSE', 'RMSE', 'MAE', 'R2']

# Create a dictionary to store the best evaluations for each metric
best_evaluations = {}

for metric in metrics:
    if metric in no_tuning_data.columns:
        # Find the row with the best value (min for errors, max for R2)
        if metric == 'R2':
            best_evaluations[metric] = no_tuning_data.loc[no_tuning_data.groupby('Group')[metric].idxmax()]
        else:
            best_evaluations[metric] = no_tuning_data.loc[no_tuning_data.groupby('Group')[metric].idxmin()]

# Combine the best evaluations into a single DataFrame
best_evaluations_df = pd.concat(best_evaluations.values()).drop_duplicates().reset_index(drop=True)

# Add a column for the method group
best_evaluations_df['MethodGroup'] = 'Grid Search'

# Display the summarized results
# import ace_tools as tools; tools.display_dataframe_to_user(name="Best Evaluations Summary for No Tuning", dataframe=best_evaluations_df)

# Save the summarized results to a CSV file
summary_file_path = 'hasil/uji4/09091_0_best_evaluations_summary_gridsearch.csv'
best_evaluations_df.to_csv(summary_file_path, index=False)

summary_file_path
best_evaluations_df

   Test Size                             Method  \
0       0.20  No Cross Validation (Grid Search)   
1       0.20               xgb.cv (Grid Search)   
2       0.25  No Cross Validation (Grid Search)   
3       0.25               xgb.cv (Grid Search)   
4       0.30  No Cross Validation (Grid Search)   

                                         Best Params           Start Time  \
0  {'colsample_bytree': 0.9, 'learning_rate': 0.1...  2024-06-10 23:52:27   
1  {'colsample_bytree': 0.9, 'learning_rate': 0.1...  2024-06-10 23:52:27   
2  {'colsample_bytree': 0.8, 'learning_rate': 0.1...  2024-06-10 23:53:17   
3  {'colsample_bytree': 0.8, 'learning_rate': 0.1...  2024-06-10 23:53:17   
4  {'colsample_bytree': 0.7, 'learning_rate': 0.1...  2024-06-10 23:54:15   

              End Time  Duration (s)       MAE        MSE      RMSE        R2  \
0  2024-06-10 23:53:15     47.913392  2.158425  10.538540  3.246312  0.502989   
1  2024-06-10 23:53:15     47.913392  2.266055  12.854361  3.585298 

Unnamed: 0,Test Size,Method,Best Params,Start Time,End Time,Duration (s),MAE,MSE,RMSE,R2,Group,MethodGroup
0,0.2,xgb.cv (Grid Search),"{'colsample_bytree': 0.9, 'learning_rate': 0.1...",2024-06-10 22:22:14,2024-06-10 22:22:54,40.510912,2.248336,12.84676,3.584238,0.615847,CV_GridSearch_IDO,Grid Search
1,0.2,No Cross Validation (Grid Search),"{'colsample_bytree': 0.9, 'learning_rate': 0.1...",2024-06-11 00:02:44,2024-06-11 00:03:52,67.549993,2.141528,11.6673,3.415743,0.544237,CV_GridSearch_id,Grid Search
2,0.2,No Cross Validation (Grid Search),"{'colsample_bytree': 0.9, 'learning_rate': 0.1...",2024-06-10 23:44:37,2024-06-10 23:45:32,55.237929,2.141528,11.6673,3.415743,0.544237,CV_GridSearch_if_by,Grid Search
3,0.2,No Cross Validation (Grid Search),"{'colsample_bytree': 0.9, 'learning_rate': 0.1...",2024-06-10 22:42:07,2024-06-10 22:43:22,75.056266,2.299184,13.222709,3.636304,0.796439,CV_GridSearch_if_by_lof,Grid Search
4,0.2,No Cross Validation (Grid Search),"{'colsample_bytree': 0.9, 'learning_rate': 0.1...",2024-06-10 23:52:27,2024-06-10 23:53:15,47.913392,2.158425,10.53854,3.246312,0.502989,CV_GridSearch_if_lof,Grid Search
5,0.2,No Cross Validation (Grid Search),"{'colsample_bytree': 0.9, 'learning_rate': 0.1...",2024-06-10 22:22:14,2024-06-10 22:22:54,40.510912,2.192995,13.139077,3.624786,0.587609,CV_GridSearch_IDO,Grid Search
6,0.25,No Cross Validation (Grid Search),"{'colsample_bytree': 0.8, 'learning_rate': 0.1...",2024-06-10 22:43:27,2024-06-10 22:44:34,67.737101,2.228306,13.255879,3.640862,0.790625,CV_GridSearch_if_by_lof,Grid Search
7,0.25,No Cross Validation (Grid Search),"{'colsample_bytree': 0.8, 'learning_rate': 0.1...",2024-06-10 23:53:17,2024-06-10 23:54:13,56.681327,2.100097,10.623501,3.259371,0.451373,CV_GridSearch_if_lof,Grid Search


In [5]:
import pandas as pd

# Load the 'No Tuning' CSV file
no_tuning_file_path = 'hasil/uji4/combined_xgboost_evaluations_with_group_randomsearch.csv'
no_tuning_data = pd.read_csv(no_tuning_file_path)

# Display the first few rows to understand the data structure
print(no_tuning_data.head())

# Define the evaluation metrics
metrics = ['MSE', 'RMSE', 'MAE', 'R2']

# Create a dictionary to store the best evaluations for each metric
best_evaluations = {}

for metric in metrics:
    if metric in no_tuning_data.columns:
        # Find the row with the best value (min for errors, max for R2)
        if metric == 'R2':
            best_evaluations[metric] = no_tuning_data.loc[no_tuning_data.groupby('Group')[metric].idxmax()]
        else:
            best_evaluations[metric] = no_tuning_data.loc[no_tuning_data.groupby('Group')[metric].idxmin()]

# Combine the best evaluations into a single DataFrame
best_evaluations_df = pd.concat(best_evaluations.values()).drop_duplicates().reset_index(drop=True)

# Add a column for the method group
best_evaluations_df['MethodGroup'] = 'Random Search'

# Display the summarized results
# import ace_tools as tools; tools.display_dataframe_to_user(name="Best Evaluations Summary for No Tuning", dataframe=best_evaluations_df)

# Save the summarized results to a CSV file
summary_file_path = 'hasil/uji4/09091_0_best_evaluations_summary_randomsearch.csv'
best_evaluations_df.to_csv(summary_file_path, index=False)

summary_file_path
best_evaluations_df

   Test Size                               Method  \
0       0.20  No Cross Validation (Random Search)   
1       0.20               xgb.cv (Random Search)   
2       0.25  No Cross Validation (Random Search)   
3       0.25               xgb.cv (Random Search)   
4       0.30  No Cross Validation (Random Search)   

                                         Best Params           Start Time  \
0  {'subsample': 0.8, 'min_child_weight': 4, 'max...  2024-06-10 22:28:57   
1  {'subsample': 0.8, 'min_child_weight': 4, 'max...  2024-06-10 22:28:57   
2  {'subsample': 0.8, 'min_child_weight': 2, 'max...  2024-06-10 22:29:21   
3  {'subsample': 0.8, 'min_child_weight': 2, 'max...  2024-06-10 22:29:21   
4  {'subsample': 0.7, 'min_child_weight': 4, 'max...  2024-06-10 22:29:43   

              End Time  Duration (s)       MAE        MSE      RMSE        R2  \
0  2024-06-10 22:29:18     20.960901  2.107251  12.305288  3.507889  0.613779   
1  2024-06-10 22:29:18     20.960901  2.219266  12.63016

Unnamed: 0,Test Size,Method,Best Params,Start Time,End Time,Duration (s),MAE,MSE,RMSE,R2,Group,MethodGroup
0,0.2,No Cross Validation (Random Search),"{'subsample': 0.8, 'min_child_weight': 4, 'max...",2024-06-10 22:28:57,2024-06-10 22:29:18,20.960901,2.107251,12.305288,3.507889,0.613779,CV_RandomSearch_IDO,Random Search
1,0.2,No Cross Validation (Random Search),"{'subsample': 0.6, 'min_child_weight': 1, 'max...",2024-06-11 00:07:22,2024-06-11 00:07:41,18.545788,2.132653,11.863642,3.444364,0.536568,CV_RandomSearch_id,Random Search
2,0.2,No Cross Validation (Random Search),"{'subsample': 0.6, 'min_child_weight': 1, 'max...",2024-06-10 23:49:09,2024-06-10 23:49:22,13.573731,2.132653,11.863642,3.444364,0.536568,CV_RandomSearch_if_by,Random Search
3,0.25,No Cross Validation (Random Search),"{'subsample': 0.8, 'min_child_weight': 2, 'max...",2024-06-10 22:50:11,2024-06-10 22:50:27,16.194669,2.259697,12.986075,3.60362,0.794887,CV_RandomSearch_if_by_lof,Random Search
4,0.2,No Cross Validation (Random Search),"{'subsample': 0.8, 'min_child_weight': 2, 'max...",2024-06-10 23:56:25,2024-06-10 23:56:52,26.704711,2.178499,10.503711,3.240943,0.504632,CV_RandomSearch_if_lof,Random Search
5,0.25,No Cross Validation (Random Search),"{'subsample': 0.8, 'min_child_weight': 2, 'max...",2024-06-10 23:56:54,2024-06-10 23:57:19,25.546977,2.152573,10.705231,3.271885,0.447152,CV_RandomSearch_if_lof,Random Search
6,0.3,xgb.cv (Random Search),"{'subsample': 0.7, 'min_child_weight': 4, 'max...",2024-06-10 22:29:43,2024-06-10 22:30:03,20.362327,2.207441,12.458306,3.529633,0.627463,CV_RandomSearch_IDO,Random Search
7,0.2,No Cross Validation (Random Search),"{'subsample': 0.8, 'min_child_weight': 2, 'max...",2024-06-10 22:49:52,2024-06-10 22:50:09,17.56033,2.262814,13.040409,3.611151,0.799246,CV_RandomSearch_if_by_lof,Random Search


In [7]:
import pandas as pd
import numpy as np

# Definisikan file path untuk setiap metode tuning
file_paths = {
    'randomsearch': 'hasil/uji4/combined_xgboost_evaluations_with_group_randomsearch.csv',
    'gridsearch': 'hasil/uji4/combined_xgboost_evaluations_with_group_gridsearch.csv',
    'notuning': 'hasil/uji4/combined_xgboost_evaluations_with_group_notuning.csv'
}

# Metode untuk menyimpan hasil evaluasi terbaik
summary_data = {}

# Metrik evaluasi yang akan digunakan
metrics = ['MSE', 'RMSE', 'MAE', 'R2']

# Load dan proses setiap file metode tuning
for method_group, file_path in file_paths.items():
    try:
        # Load data
        data = pd.read_csv(file_path)
        # Hapus kolom 'B'
        
        data = data.drop('Best Params', axis=1)

        data
        # Simpan evaluasi terbaik untuk setiap metrik
        best_evaluations = {}
        
        for metric in metrics:
            if metric in data.columns:
                # Temukan baris dengan nilai terbaik (min untuk error, max untuk R2)
                if metric == 'R2':
                    best_evaluations[metric] = data.loc[data.groupby('Group')[metric].idxmax()]
                else:
                    best_evaluations[metric] = data.loc[data.groupby('Group')[metric].idxmin()]
        
        # Kombinasikan hasil terbaik dalam satu DataFrame
        best_evaluations_df = pd.concat(best_evaluations.values()).drop_duplicates().reset_index(drop=True)
        
        # Tambahkan kolom untuk metode group
        best_evaluations_df['MethodGroup'] = method_group
        
        # Simpan hasil terbaik ke dalam summary data
        summary_data[method_group] = best_evaluations_df
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Gabungkan semua summary dalam satu DataFrame
summary_df = pd.concat(summary_data.values(), ignore_index=True)

# Hitung rata-rata metrik evaluasi untuk setiap metode
average_metrics = summary_df.groupby('MethodGroup')[metrics].mean()

# Tampilkan hasil rata-rata untuk setiap metode tuning
print(average_metrics)
average_metrics

# Tentukan metode terbaik berdasarkan nilai rata-rata (lebih rendah lebih baik kecuali R2)
best_method = average_metrics.idxmin() if 'R2' not in average_metrics.columns else average_metrics.idxmax()
print("\nMetode terbaik berdasarkan rata-rata metrik evaluasi:")
print(best_method)

# Simpan hasil summary ke file CSV
summary_file_path = 'hasil/uji4/09091_0_average_score_evaluation_summary.csv'
summary_df.to_csv(summary_file_path, index=False)

print(f"Hasil summary disimpan ke file: {summary_file_path}")
summary_df


Error processing hasil/uji4/combined_xgboost_evaluations_with_group_notuning.csv: "['Best Params'] not found in axis"
                    MSE      RMSE       MAE        R2
MethodGroup                                          
gridsearch    12.120133  3.477920  2.188800  0.604170
randomsearch  11.965788  3.456731  2.179198  0.607537

Metode terbaik berdasarkan rata-rata metrik evaluasi:
MSE       gridsearch
RMSE      gridsearch
MAE       gridsearch
R2      randomsearch
dtype: object
Hasil summary disimpan ke file: hasil/uji4/09091_0_average_score_evaluation_summary.csv


Unnamed: 0,Test Size,Method,Start Time,End Time,Duration (s),MAE,MSE,RMSE,R2,Group,MethodGroup
0,0.2,No Cross Validation (Random Search),2024-06-10 22:28:57,2024-06-10 22:29:18,20.960901,2.107251,12.305288,3.507889,0.613779,CV_RandomSearch_IDO,randomsearch
1,0.2,No Cross Validation (Random Search),2024-06-11 00:07:22,2024-06-11 00:07:41,18.545788,2.132653,11.863642,3.444364,0.536568,CV_RandomSearch_id,randomsearch
2,0.2,No Cross Validation (Random Search),2024-06-10 23:49:09,2024-06-10 23:49:22,13.573731,2.132653,11.863642,3.444364,0.536568,CV_RandomSearch_if_by,randomsearch
3,0.25,No Cross Validation (Random Search),2024-06-10 22:50:11,2024-06-10 22:50:27,16.194669,2.259697,12.986075,3.60362,0.794887,CV_RandomSearch_if_by_lof,randomsearch
4,0.2,No Cross Validation (Random Search),2024-06-10 23:56:25,2024-06-10 23:56:52,26.704711,2.178499,10.503711,3.240943,0.504632,CV_RandomSearch_if_lof,randomsearch
5,0.25,No Cross Validation (Random Search),2024-06-10 23:56:54,2024-06-10 23:57:19,25.546977,2.152573,10.705231,3.271885,0.447152,CV_RandomSearch_if_lof,randomsearch
6,0.3,xgb.cv (Random Search),2024-06-10 22:29:43,2024-06-10 22:30:03,20.362327,2.207441,12.458306,3.529633,0.627463,CV_RandomSearch_IDO,randomsearch
7,0.2,No Cross Validation (Random Search),2024-06-10 22:49:52,2024-06-10 22:50:09,17.56033,2.262814,13.040409,3.611151,0.799246,CV_RandomSearch_if_by_lof,randomsearch
8,0.2,xgb.cv (Grid Search),2024-06-10 22:22:14,2024-06-10 22:22:54,40.510912,2.248336,12.84676,3.584238,0.615847,CV_GridSearch_IDO,gridsearch
9,0.2,No Cross Validation (Grid Search),2024-06-11 00:02:44,2024-06-11 00:03:52,67.549993,2.141528,11.6673,3.415743,0.544237,CV_GridSearch_id,gridsearch
