# **LIBRARIES**

In [None]:
!pip install joblib
!pip install shap
!pip install pytorch-tabnet
!pip install pykan
!pip install SHAP
!pip install xgboost



In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Ignore the warning
import warnings
warnings.filterwarnings("ignore")

# **DATA PREPROCESSING**

### *Pre-Trained*

In [None]:
# Read the cleaned SF dataset from Google Drive, then preview its first five rows
dataset = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Dataset/SF_clean_dataset.csv'
)
dataset.head(n=5)

Unnamed: 0,PGA(g),B(m),DF(m),Y(kN/m3),Beban(kN),SF
0,0.0,0.5,0.5,17,50,2.25
1,0.0,0.6,0.5,17,50,2.46
2,0.0,0.7,0.5,17,50,2.51
3,0.0,0.8,0.5,17,50,2.54
4,0.0,0.9,0.5,17,50,2.66


In [None]:
# Summary statistics, one row per column
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PGA(g),3840.0,0.4,0.393752,0.0,0.075,0.3,0.625,1.0
B(m),3840.0,2.09375,0.661967,0.5,1.6,2.2,2.625,3.0
DF(m),3840.0,1.1875,0.634388,0.5,0.5,1.0,1.5,2.5
Y(kN/m3),3840.0,19.0,1.633206,17.0,17.0,19.0,21.0,21.0
Beban(kN),3840.0,125.0,55.90898,50.0,87.5,125.0,162.5,200.0
SF,3840.0,2.34417,0.879235,1.07,1.68,2.18,2.76,6.62


In [None]:
# Five input columns form the feature matrix; the SF column is the regression target
X = dataset.loc[:, ['PGA(g)', 'B(m)', 'DF(m)', 'Y(kN/m3)', 'Beban(kN)']]
y = dataset.loc[:, 'SF']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Bundle the four split parts under their own names and pickle them with joblib
datasets = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
joblib.dump(
    value=datasets,
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/datasets.pkl',
)

['/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/datasets.pkl']

### *Trained*

In [None]:
# Restore the pickled split written by the "Pre-Trained" section
datasets = joblib.load(
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/datasets.pkl'
)

# Unpack the four parts back into the working names
X_train, X_test, y_train, y_test = (
    datasets[part] for part in ('X_train', 'X_test', 'y_train', 'y_test')
)

In [None]:
# Report the number of training and test rows (labels are in Indonesian)
print(
    f'Jumlah data latih: {len(X_train)}',
    f'Jumlah data uji: {len(X_test)}',
    sep='\n',
)

Jumlah data latih: 3072
Jumlah data uji: 768


In [None]:
# Put the training features and the training target side by side in one frame
train_df = pd.concat(objs=[X_train, y_train], axis='columns')

# Write the training frame to CSV without the row index
train_df.to_csv(
    '/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Output Data/train_120225.csv',
    index=False,
)

# Preview the training frame
train_df.head(n=5)

Unnamed: 0,PGA(g),B(m),DF(m),Y(kN/m3),Beban(kN),SF
1729,0.1,1.7,1.5,21,100,2.69
2422,0.5,2.7,0.5,19,150,2.05
433,0.0,1.7,1.0,19,100,2.61
2603,0.5,2.7,1.0,21,50,3.09
371,0.0,1.9,1.5,19,50,4.05


In [None]:
# Put the test features and the test target side by side in one frame
test_df = pd.concat(objs=[X_test, y_test], axis='columns')

# Write the test frame to CSV without the row index
test_df.to_csv(
    '/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Output Data/test_120225.csv',
    index=False,
)

# Preview the test frame
test_df.head(n=5)

Unnamed: 0,PGA(g),B(m),DF(m),Y(kN/m3),Beban(kN),SF
746,0.0,1.0,1.0,21,100,2.23
3837,1.0,2.8,2.5,21,200,2.38
3218,1.0,2.3,0.5,19,50,1.6
2778,0.5,2.6,1.5,21,150,2.25
1904,0.1,2.1,2.0,21,200,2.19


# **MODELING**

## **Multiple Linear Regression**

### *Pre-Trained*

In [None]:
# Two-step pipeline: standardise the features, then fit a linear regression
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])

# Candidate hyper-parameters for the regressor step
lr_param_grid = dict(regressor__fit_intercept=[True, False])

# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by negative mean squared error
lr_grid_search = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
)
lr_grid_search.fit(X_train, y_train)

# Keep the pipeline selected by the search
lr_best_model = lr_grid_search.best_estimator_

# Pickle the selected pipeline for the "Trained" section
joblib.dump(
    value=lr_best_model,
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/lr_best_model.pkl',
)

# Report the winning parameter combination
lr_best_params = lr_grid_search.best_params_
print(f'Best parameters: {lr_best_params}')

# Report how many parameter combinations the search evaluated
print(f'Number of estimators tried: {len(lr_grid_search.cv_results_["params"])}')

Best parameters: {'regressor__fit_intercept': True}
Number of estimators tried: 2


### *Trained*

In [None]:
# Restore the pickled Linear Regression pipeline
lr_best_model = joblib.load(
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/lr_best_model.pkl'
)

In [None]:
# Fold-wise validation RMSE and R-squared for the Linear Regression pipeline.
# The splitter is created here (same settings as in the grid-search cell) so this
# cell also runs when only the "Trained" cells are executed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
lr_fold_rmse = []
lr_fold_r2 = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit an unfitted copy on the training fold, so lr_best_model itself is not
    # refitted on a subset and later cells still predict with the loaded model
    lr_fold_model = clone(lr_best_model).fit(X_train_fold, y_train_fold)

    # Make predictions on the validation fold
    lr_y_pred_fold = lr_fold_model.predict(X_val_fold)

    # Calculate metrics
    lr_rmse_fold = np.sqrt(mean_squared_error(y_val_fold, lr_y_pred_fold))
    lr_r2_fold = r2_score(y_val_fold, lr_y_pred_fold)

    # Append to lists
    lr_fold_rmse.append(lr_rmse_fold)
    lr_fold_r2.append(lr_r2_fold)

# Print fold-wise RMSE and R-squared
print(f'Fold-wise RMSE: {lr_fold_rmse}')
print(f'Fold-wise R-squared: {lr_fold_r2}')

Fold-wise RMSE: [0.3864451804806857, 0.38661297095534836, 0.35613533369085115, 0.3979609435618536, 0.36850471436900906]
Fold-wise R-squared: [0.8174616830427134, 0.8221409381509411, 0.8028519793505189, 0.8145594208352307, 0.83263251251451]


In [None]:
# Make predictions on the train set
lr_train_pred = lr_best_model.predict(X_train)

# Calculate RMSE and R-squared on the training data
lr_train_rmse = np.sqrt(mean_squared_error(y_train, lr_train_pred))
lr_train_r2 = r2_score(y_train, lr_train_pred)

# Print the training metrics (previously mislabelled as "Test")
print(f'Train RMSE: {lr_train_rmse}')
print(f'Train R-squared: {lr_train_r2}')

Test RMSE: 0.37825944322342575
Test R-squared: 0.8198279584032697


In [None]:
# Predict SF for the held-out test features
lr_y_pred = lr_best_model.predict(X_test)

# Score the test predictions: RMSE (square root of the MSE) and R-squared
lr_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=lr_y_pred))
lr_r2 = r2_score(y_true=y_test, y_pred=lr_y_pred)

# Report both metrics, one per line
print(f'Test RMSE: {lr_rmse}', f'Test R-squared: {lr_r2}', sep='\n')

Test RMSE: 0.35495303494541536
Test R-squared: 0.8167975836481938


## **Decision Tree Regression**

### *Pre-Trained*

In [None]:
# Create a pipeline with standard scaling and Decision Tree Regressor.
# random_state is fixed so the search is repeatable: the grid includes
# splitter='random', which otherwise gives different trees on every run.
dt_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# Define the parameter grid for GridSearchCV
dt_param_grid = {
    'regressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'regressor__splitter': ['best', 'random'],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 10, 20],
    'regressor__min_samples_leaf': [1, 5, 10]
}

# Initialize KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV with the pipeline, parameter grid, and cross-validation
dt_grid_search = GridSearchCV(dt_pipeline, dt_param_grid, cv=kf, scoring='neg_mean_squared_error')

# Fit the model
dt_grid_search.fit(X_train, y_train)

# Get the best estimator
dt_best_model = dt_grid_search.best_estimator_

# Save the trained model using joblib
joblib.dump(dt_best_model, '/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/dt_best_model.pkl')

# Show the best parameters
dt_best_params = dt_grid_search.best_params_
print(f'Best parameters: {dt_best_params}')

# Number of parameter combinations the search evaluated
print(f'Number of estimators tried: {len(dt_grid_search.cv_results_["params"])}')

Best parameters: {'regressor__criterion': 'poisson', 'regressor__max_depth': 20, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__splitter': 'best'}
Number of estimators tried: 288


### *Trained*

In [None]:
# Restore the pickled Decision Tree pipeline
dt_best_model = joblib.load(
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/dt_best_model.pkl'
)

In [None]:
# Fold-wise validation RMSE and R-squared for the Decision Tree pipeline.
# The splitter is created here (same settings as in the grid-search cell) so this
# cell also runs when only the "Trained" cells are executed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
dt_fold_rmse = []
dt_fold_r2 = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit an unfitted copy on the training fold, so dt_best_model itself is not
    # refitted on a subset and later cells still predict with the loaded model
    dt_fold_model = clone(dt_best_model).fit(X_train_fold, y_train_fold)

    # Make predictions on the validation fold
    dt_y_pred_fold = dt_fold_model.predict(X_val_fold)

    # Calculate metrics
    dt_rmse_fold = np.sqrt(mean_squared_error(y_val_fold, dt_y_pred_fold))
    dt_r2_fold = r2_score(y_val_fold, dt_y_pred_fold)

    # Append to lists
    dt_fold_rmse.append(dt_rmse_fold)
    dt_fold_r2.append(dt_r2_fold)

# Print fold-wise RMSE and R-squared
print(f'Fold-wise RMSE: {dt_fold_rmse}')
print(f'Fold-wise R-squared: {dt_fold_r2}')

Fold-wise RMSE: [0.08244727312366648, 0.08332881288552339, 0.08245117990910687, 0.12136772803000855, 0.09040498818788052]
Fold-wise R-squared: [0.9916913427512902, 0.9917374605573926, 0.9894328945075904, 0.9827523347154536, 0.9899267459385691]


In [None]:
# Make predictions on the train set
dt_train_pred = dt_best_model.predict(X_train)

# Calculate RMSE and R-squared on the training data
dt_train_rmse = np.sqrt(mean_squared_error(y_train, dt_train_pred))
dt_train_r2 = r2_score(y_train, dt_train_pred)

# Print the training metrics (previously mislabelled as "Test")
print(f'Train RMSE: {dt_train_rmse}')
print(f'Train R-squared: {dt_train_r2}')

Test RMSE: 0.057148077163840835
Test R-squared: 0.9958874509884388


In [None]:
# Predict SF for the held-out test features
dt_y_pred = dt_best_model.predict(X_test)

# Score the test predictions: RMSE (square root of the MSE) and R-squared
dt_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=dt_y_pred))
dt_r2 = r2_score(y_true=y_test, y_pred=dt_y_pred)

# Report both metrics, one per line
print(f'RMSE: {dt_rmse}', f'R-squared: {dt_r2}', sep='\n')

RMSE: 0.0962918530862059
R-squared: 0.9865175599180853


## **K-Nearest Neighbors Regression**

### *Pre-Trained*

In [None]:
# Two-step pipeline: standardise the features, then fit a k-nearest-neighbours regressor
kn_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', KNeighborsRegressor()),
])

# Candidate hyper-parameters for the regressor step
kn_param_grid = dict(
    regressor__n_neighbors=[3, 5, 7, 10],
    regressor__weights=['uniform', 'distance'],
    regressor__p=[1, 2],  # p=1 for Manhattan distance, p=2 for Euclidean distance
)

# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by negative mean squared error
kn_grid_search = GridSearchCV(
    estimator=kn_pipeline,
    param_grid=kn_param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
)
kn_grid_search.fit(X_train, y_train)

# Keep the pipeline selected by the search
kn_best_model = kn_grid_search.best_estimator_

# Pickle the selected pipeline for the "Trained" section
joblib.dump(
    value=kn_best_model,
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/kn_best_model.pkl',
)

# Report the winning parameter combination
kn_best_params = kn_grid_search.best_params_
print(f'Best parameters: {kn_best_params}')

# Report how many parameter combinations the search evaluated
print(f'Number of estimators tried: {len(kn_grid_search.cv_results_["params"])}')

Best parameters: {'regressor__n_neighbors': 3, 'regressor__p': 1, 'regressor__weights': 'distance'}
Number of estimators tried: 16


### *Trained*

In [None]:
# Restore the pickled K-Nearest Neighbors pipeline
kn_best_model = joblib.load(
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/kn_best_model.pkl'
)

In [None]:
# Fold-wise validation RMSE and R-squared for the K-Nearest Neighbors pipeline.
# The splitter is created here (same settings as in the grid-search cell) so this
# cell also runs when only the "Trained" cells are executed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kn_fold_rmse = []
kn_fold_r2 = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit an unfitted copy on the training fold, so kn_best_model itself is not
    # refitted on a subset and later cells still predict with the loaded model
    kn_fold_model = clone(kn_best_model).fit(X_train_fold, y_train_fold)

    # Make predictions on the validation fold
    kn_y_pred_fold = kn_fold_model.predict(X_val_fold)

    # Calculate metrics
    kn_rmse_fold = np.sqrt(mean_squared_error(y_val_fold, kn_y_pred_fold))
    kn_r2_fold = r2_score(y_val_fold, kn_y_pred_fold)

    # Append to lists
    kn_fold_rmse.append(kn_rmse_fold)
    kn_fold_r2.append(kn_r2_fold)

# Print fold-wise RMSE and R-squared
print(f'Fold-wise RMSE: {kn_fold_rmse}')
print(f'Fold-wise R-squared: {kn_fold_r2}')

Fold-wise RMSE: [0.09420975821735957, 0.10313704314098493, 0.08996336058822443, 0.09518578560469097, 0.09004708651994002]
Fold-wise R-squared: [0.9891514916013229, 0.9873423667295611, 0.98741962365044, 0.9893911558724358, 0.9900063454866391]


In [None]:
# Make predictions on the train set
kn_train_pred = kn_best_model.predict(X_train)

# Calculate RMSE and R-squared on the training data
kn_train_rmse = np.sqrt(mean_squared_error(y_train, kn_train_pred))
kn_train_r2 = r2_score(y_train, kn_train_pred)

# Print the training metrics (previously mislabelled as "Test")
print(f'Train RMSE: {kn_train_rmse}')
print(f'Train R-squared: {kn_train_r2}')

Test RMSE: 0.0
Test R-squared: 1.0


In [None]:
# Predict SF for the held-out test features
kn_y_pred = kn_best_model.predict(X_test)

# Score the test predictions: RMSE (square root of the MSE) and R-squared
kn_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=kn_y_pred))
kn_r2 = r2_score(y_true=y_test, y_pred=kn_y_pred)

# Report both metrics, one per line
print(f'RMSE: {kn_rmse}', f'R-squared: {kn_r2}', sep='\n')

RMSE: 0.08060909634188462
R-squared: 0.9905516164475394


## **Support Vector Regression**

### *Pre-Trained*

In [None]:
# Two-step pipeline: standardise the features, then fit a support vector regressor
svr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', SVR()),
])

# Candidate hyper-parameters for the regressor step (3 x 3 x 3 = 27 combinations).
# NOTE(review): the output recorded for this cell reports kernel 'rbf', C=100 and 48
# combinations, none of which this grid can produce - presumably it came from an
# earlier, wider grid. Confirm which grid produced the saved model.
svr_param_grid = dict(
    regressor__kernel=['linear', 'poly', 'sigmoid'],
    regressor__C=[0.1, 1, 10],
    regressor__epsilon=[0.01, 0.1, 1],
)

# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by negative mean squared error
svr_grid_search = GridSearchCV(
    estimator=svr_pipeline,
    param_grid=svr_param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
)
svr_grid_search.fit(X_train, y_train)

# Keep the pipeline selected by the search
svr_best_model = svr_grid_search.best_estimator_

# Pickle the selected pipeline for the "Trained" section
joblib.dump(
    value=svr_best_model,
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/svr_best_model.pkl',
)

# Report the winning parameter combination
svr_best_params = svr_grid_search.best_params_
print(f'Best parameters: {svr_best_params}')

# Report how many parameter combinations the search evaluated
print(f'Number of estimators tried: {len(svr_grid_search.cv_results_["params"])}')

Best parameters: {'regressor__C': 100, 'regressor__epsilon': 0.01, 'regressor__kernel': 'rbf'}
Number of estimators tried: 48


### *Trained*

In [None]:
# Restore the pickled Support Vector Regression pipeline
svr_best_model = joblib.load(
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/svr_best_model.pkl'
)

In [None]:
# Fold-wise validation RMSE and R-squared for the Support Vector Regression pipeline.
# The splitter is created here (same settings as in the grid-search cell) so this
# cell also runs when only the "Trained" cells are executed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
svr_fold_rmse = []
svr_fold_r2 = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit an unfitted copy on the training fold, so svr_best_model itself is not
    # refitted on a subset and later cells still predict with the loaded model
    svr_fold_model = clone(svr_best_model).fit(X_train_fold, y_train_fold)

    # Make predictions on the validation fold
    svr_y_pred_fold = svr_fold_model.predict(X_val_fold)

    # Calculate metrics
    svr_rmse_fold = np.sqrt(mean_squared_error(y_val_fold, svr_y_pred_fold))
    svr_r2_fold = r2_score(y_val_fold, svr_y_pred_fold)

    # Append to lists
    svr_fold_rmse.append(svr_rmse_fold)
    svr_fold_r2.append(svr_r2_fold)

# Print fold-wise RMSE and R-squared
print(f'Fold-wise RMSE: {svr_fold_rmse}')
print(f'Fold-wise R-squared: {svr_fold_r2}')

Fold-wise RMSE: [0.050220368474644846, 0.05530631863208187, 0.05754056707035336, 0.06013931989018722, 0.05284023182245248]
Fold-wise R-squared: [0.9969172518849639, 0.9963602407472191, 0.99485351669402, 0.9957651220734508, 0.9965587642292135]


In [None]:
# Make predictions on the train set
svr_train_pred = svr_best_model.predict(X_train)

# Calculate RMSE and R-squared on the training data
svr_train_rmse = np.sqrt(mean_squared_error(y_train, svr_train_pred))
svr_train_r2 = r2_score(y_train, svr_train_pred)

# Print the training metrics (previously mislabelled as "Test")
print(f'Train RMSE: {svr_train_rmse}')
print(f'Train R-squared: {svr_train_r2}')

Test RMSE: 0.1679046885076507
Test R-squared: 0.9644995802119201


In [None]:
# Predict SF for the held-out test features
svr_y_pred = svr_best_model.predict(X_test)

# Score the test predictions: RMSE (square root of the MSE) and R-squared
svr_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=svr_y_pred))
svr_r2 = r2_score(y_true=y_test, y_pred=svr_y_pred)

# Report both metrics, one per line
print(f'RMSE: {svr_rmse}', f'R-squared: {svr_r2}', sep='\n')

RMSE: 0.16475800339579375
R-squared: 0.9605286072743483


## **XGBoost Regression**

### *Pre-Trained*

In [None]:
# Two-step pipeline: standardise the features, then fit an XGBoost regressor
xgb_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', XGBRegressor()),
])

# Candidate hyper-parameters for the regressor step
xgb_param_grid = dict(
    regressor__n_estimators=[50, 100, 200],
    regressor__max_depth=[3, 6, 10],
    regressor__learning_rate=[0.01, 0.1, 0.2],
    regressor__subsample=[0.8, 1.0],
    regressor__colsample_bytree=[0.8, 1.0],
)

# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by negative mean squared error
xgb_grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
)
xgb_grid_search.fit(X_train, y_train)

# Keep the pipeline selected by the search
xgb_best_model = xgb_grid_search.best_estimator_

# Pickle the selected pipeline for the "Trained" section
joblib.dump(
    value=xgb_best_model,
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/xgb_best_model.pkl',
)

# Report the winning parameter combination
xgb_best_params = xgb_grid_search.best_params_
print(f'Best parameters: {xgb_best_params}')

# Report how many parameter combinations the search evaluated
print(f'Number of estimators tried: {len(xgb_grid_search.cv_results_["params"])}')

Best parameters: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__n_estimators': 50, 'regressor__subsample': 0.8}
Number of estimators tried: 108


### *Trained*

In [None]:
# Restore the pickled XGBoost pipeline.
# NOTE(review): this reads 'xgb_v2_best_model.pkl', while the grid-search cell in this
# notebook writes 'xgb_best_model.pkl' - confirm the v2 file is the intended model.
xgb_best_model = joblib.load(
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/xgb_v2_best_model.pkl'
)

In [None]:
# Fold-wise validation RMSE and R-squared for the XGBoost pipeline.
# The splitter is created here (same settings as in the grid-search cell) so this
# cell also runs when only the "Trained" cells are executed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
xgb_fold_rmse = []
xgb_fold_r2 = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit an unfitted copy on the training fold, so xgb_best_model itself is not
    # refitted on a subset and later cells still predict with the loaded model
    xgb_fold_model = clone(xgb_best_model).fit(X_train_fold, y_train_fold)

    # Make predictions on the validation fold
    xgb_y_pred_fold = xgb_fold_model.predict(X_val_fold)

    # Calculate metrics
    xgb_rmse_fold = np.sqrt(mean_squared_error(y_val_fold, xgb_y_pred_fold))
    xgb_r2_fold = r2_score(y_val_fold, xgb_y_pred_fold)

    # Append to lists
    xgb_fold_rmse.append(xgb_rmse_fold)
    xgb_fold_r2.append(xgb_r2_fold)

# Print fold-wise RMSE and R-squared
print(f'Fold-wise RMSE: {xgb_fold_rmse}')
print(f'Fold-wise R-squared: {xgb_fold_r2}')

Fold-wise RMSE: [0.04796394892314542, 0.049017311858611086, 0.046207881586065994, 0.06180465691520925, 0.04847017272892805]
Fold-wise R-squared: [0.9971880466288306, 0.9971409474547603, 0.9966810981409386, 0.9955273360287609, 0.9971044295572935]


In [None]:
# Handle on the bare XGBoost step of the pipeline (kept because the following cell
# refers to final_model). This step is fitted on standardised features inside the
# pipeline, so it must not be given the raw feature frame.
final_model = xgb_best_model.named_steps["regressor"]

# Make predictions on the train set through the whole pipeline, so the scaler is
# applied before the regressor (predicting with final_model on raw X_train skipped it)
xgb_train_pred = xgb_best_model.predict(X_train)

# Calculate RMSE and R-squared on the training data
xgb_train_rmse = np.sqrt(mean_squared_error(y_train, xgb_train_pred))
xgb_train_r2 = r2_score(y_train, xgb_train_pred)

# Print the training metrics (previously mislabelled as "Test")
print(f'Train RMSE: {xgb_train_rmse}')
print(f'Train R-squared: {xgb_train_r2}')

Test RMSE: 0.8539843366629984
Test R-squared: 0.08165122268295366


In [None]:
# Make predictions on the test set through the whole pipeline, so the scaler is
# applied before the regressor (the bare regressor step would see unscaled features)
xgb_y_pred = xgb_best_model.predict(X_test)

# Calculate RMSE and R-squared
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_y_pred))
xgb_r2 = r2_score(y_test, xgb_y_pred)

# Print RMSE and R-squared values
print(f'RMSE: {xgb_rmse}')
print(f'R-squared: {xgb_r2}')

RMSE: 0.7947352057095302
R-squared: 0.08159484236345316


## **TabNet Regressor**

### *Pre-Trained*

In [None]:
# Reshape y to be 2D (one column) for the TabNet cells below.
# np.asarray accepts both the original Series and an already reshaped array, so
# re-running this cell no longer fails on a missing `.values` attribute.
y_train = np.asarray(y_train).reshape(-1, 1)
y_test = np.asarray(y_test).reshape(-1, 1)

In [None]:
# Two-step pipeline: standardise the features, then fit a TabNet regressor
tn_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', TabNetRegressor()),
])

# Candidate hyper-parameters for the regressor step
tn_param_grid = dict(
    regressor__n_d=[16, 24, 32],
    regressor__n_a=[16, 24, 32],
    regressor__n_steps=[2, 3, 5],
    regressor__gamma=[1.3, 1.5],
    regressor__n_independent=[2, 3],
    regressor__n_shared=[2, 3],
)

# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by negative MSE, using all available cores
tn_grid_search = GridSearchCV(
    estimator=tn_pipeline,
    param_grid=tn_param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
tn_grid_search.fit(X_train, y_train)

# Keep the pipeline selected by the search
tn_best_model = tn_grid_search.best_estimator_

# Report the winning parameter combination
print("Best parameters found: ", tn_grid_search.best_params_)

epoch 0  | loss: 4.58216 |  0:00:00s
epoch 1  | loss: 0.81571 |  0:00:00s
epoch 2  | loss: 0.27756 |  0:00:00s
epoch 3  | loss: 0.19024 |  0:00:00s
epoch 4  | loss: 0.14548 |  0:00:00s
epoch 5  | loss: 0.12596 |  0:00:00s
epoch 6  | loss: 0.10731 |  0:00:01s
epoch 7  | loss: 0.09459 |  0:00:01s
epoch 8  | loss: 0.09538 |  0:00:01s
epoch 9  | loss: 0.08362 |  0:00:01s
epoch 10 | loss: 0.06862 |  0:00:01s
epoch 11 | loss: 0.07368 |  0:00:01s
epoch 12 | loss: 0.07445 |  0:00:01s
epoch 13 | loss: 0.0687  |  0:00:01s
epoch 14 | loss: 0.0652  |  0:00:02s
epoch 15 | loss: 0.05651 |  0:00:02s
epoch 16 | loss: 0.05701 |  0:00:02s
epoch 17 | loss: 0.04637 |  0:00:02s
epoch 18 | loss: 0.04957 |  0:00:02s
epoch 19 | loss: 0.05202 |  0:00:02s
epoch 20 | loss: 0.04734 |  0:00:02s
epoch 21 | loss: 0.0442  |  0:00:03s
epoch 22 | loss: 0.04097 |  0:00:03s
epoch 23 | loss: 0.04516 |  0:00:03s
epoch 24 | loss: 0.04673 |  0:00:03s
epoch 25 | loss: 0.04806 |  0:00:03s
epoch 26 | loss: 0.04442 |  0:00:03s
e

In [None]:
# Pickle the selected TabNet pipeline for the "Trained" section
joblib.dump(
    value=tn_best_model,
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/tn_best_model.pkl',
)

In [None]:
# Second TabNet search: standardise the features, then fit a TabNet regressor.
# NOTE(review): the pipeline, grid and splitter below have the same values as the
# first TabNet search in this notebook - confirm whether a different grid was intended.
tn_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', TabNetRegressor()),
])

# Candidate hyper-parameters for the regressor step
tn_param_grid_2 = dict(
    regressor__n_d=[16, 24, 32],
    regressor__n_a=[16, 24, 32],
    regressor__n_steps=[2, 3, 5],
    regressor__gamma=[1.3, 1.5],
    regressor__n_independent=[2, 3],
    regressor__n_shared=[2, 3],
)

# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by negative MSE, using all available cores
tn_grid_search_2 = GridSearchCV(
    estimator=tn_pipeline,
    param_grid=tn_param_grid_2,
    cv=kf,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
tn_grid_search_2.fit(X_train, y_train)

# Keep the pipeline selected by the search
tn_best_model_2 = tn_grid_search_2.best_estimator_

# Report the winning parameter combination
print("Best parameters found: ", tn_grid_search_2.best_params_)

In [None]:
# Pickle the pipeline selected by the second TabNet search
joblib.dump(
    value=tn_best_model_2,
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/tn_best_model_2.pkl',
)

### *Trained*

In [None]:
# Restore the pickled TabNet pipeline
tn_best_model = joblib.load(
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/tn_best_model.pkl'
)

In [None]:
# Predict SF for the training features
tn_train_pred = tn_best_model.predict(X_train)

# Score the training predictions: RMSE (square root of the MSE) and R-squared
tn_train_rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=tn_train_pred))
tn_train_r2 = r2_score(y_true=y_train, y_pred=tn_train_pred)

# Report both metrics, one per line
print(f'Train RMSE: {tn_train_rmse}', f'Train R-squared: {tn_train_r2}', sep='\n')

Train RMSE: 0.0902155621005605
Train R-squared: 0.9897512532761336


In [None]:
# Predict SF for the held-out test features
tn_y_pred = tn_best_model.predict(X_test)

# Score the test predictions: RMSE (square root of the MSE) and R-squared
tn_test_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=tn_y_pred))
tn_test_r2 = r2_score(y_true=y_test, y_pred=tn_y_pred)

# Report both metrics, one per line
print(f'Test RMSE: {tn_test_rmse}', f'Test R-squared: {tn_test_r2}', sep='\n')

Test RMSE: 0.0938044370070563
Test R-squared: 0.9872051213200111


In [None]:
# Pair the actual training targets with the TabNet training predictions.
# y_train is flattened because the TabNet section reshapes it to a 2-D column, which
# pd.DataFrame cannot take as a single column; the row labels come from X_train so
# the frame keeps the original sample indices.
tn_train_out = pd.DataFrame(
    {'Actual': np.ravel(y_train), 'Predicted': tn_train_pred.flatten()},
    index=X_train.index,
)

# Save tn_train_out into a csv file (row index is not written)
tn_train_out.to_csv('/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Output Data/tn_train_out_120225.csv', index=False)

tn_train_out

Unnamed: 0,Actual,Predicted
1729,2.69,2.779933
2422,2.05,1.985445
433,2.61,2.498246
2603,3.09,3.054048
371,4.05,3.996905
...,...,...
1130,1.51,1.601439
1294,2.83,2.870162
860,2.98,2.940365
3507,1.72,1.781179


In [None]:
# Pair the actual test targets with the TabNet test predictions.
# y_test is flattened because the TabNet section reshapes it to a 2-D column, which
# pd.DataFrame cannot take as a single column; the row labels come from X_test so
# the frame keeps the original sample indices.
tn_test_out = pd.DataFrame(
    {'Actual': np.ravel(y_test), 'Predicted': tn_y_pred.flatten()},
    index=X_test.index,
)

# Save tn_test_out into a csv file (row index is not written)
tn_test_out.to_csv('/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Output Data/tn_test_out_120225.csv', index=False)

tn_test_out

Unnamed: 0,Actual,Predicted
746,2.23,2.333257
3837,2.38,2.321867
3218,1.60,1.540810
2778,2.25,2.244516
1904,2.19,2.198639
...,...,...
1018,4.06,3.981230
449,2.88,2.839347
1288,2.61,2.628154
376,4.30,4.357842


## **Multi Layer Perceptron**

### *Pre-Trained*

In [None]:
# Create a pipeline with standard scaling and MLPRegressor.
# random_state is fixed so weight initialisation (and therefore the search result)
# is repeatable between runs.
mlp_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', MLPRegressor(max_iter=500, random_state=42))
])

# Define the parameter grid for GridSearchCV
mlp_param_grid = {
    'regressor__hidden_layer_sizes': [(30,)],
    'regressor__activation': ['relu', 'tanh', 'logistic'],
    'regressor__solver': ['adam', 'lbfgs'],
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__learning_rate': ['constant', 'invscaling', 'adaptive']
}

# Initialize KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV with the pipeline, parameter grid, and cross-validation.
# return_train_score=True adds 'mean_train_score' to cv_results_, which the RMSE
# curve cell reads; it is not recorded by default.
mlp_grid_search = GridSearchCV(
    mlp_pipeline,
    mlp_param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

# Fit the model. The target is flattened because y_train may already have been
# reshaped to a 2-D column by the TabNet section.
mlp_grid_search.fit(X_train, np.ravel(y_train))

# Get the best estimator
mlp_best_model = mlp_grid_search.best_estimator_

# Save the trained model using joblib
# NOTE(review): the dump is disabled, but the "Trained" section loads
# mlp_30_best_model.pkl - re-enable it if that file has to be regenerated.
#joblib.dump(mlp_best_model, '/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/mlp_30_best_model.pkl')

# Show the best parameters
mlp_best_params = mlp_grid_search.best_params_
print(f'Best parameters: {mlp_best_params}')

# Number of parameter combinations the search evaluated
print(f'Number of estimators tried: {len(mlp_grid_search.cv_results_["params"])}')

Best parameters: {'regressor__activation': 'relu', 'regressor__alpha': 0.01, 'regressor__hidden_layer_sizes': (30,), 'regressor__learning_rate': 'invscaling', 'regressor__solver': 'lbfgs'}
Number of estimators tried: 54


In [None]:
# Plot mean training / validation RMSE for every parameter combination of the MLP search.
mlp_cv_results = mlp_grid_search.cv_results_

# Scores are negative MSE, so negate before taking the square root
mlp_validation_rmse = np.sqrt(-mlp_cv_results['mean_test_score'])

plt.figure(figsize=(10, 6))

# 'mean_train_score' only exists when the search was created with
# return_train_score=True; skip the training curve instead of raising KeyError
if 'mean_train_score' in mlp_cv_results:
    mlp_train_rmse = np.sqrt(-mlp_cv_results['mean_train_score'])
    plt.plot(range(len(mlp_train_rmse)), mlp_train_rmse, label='Training RMSE', marker='o')

# The x positions previously used the undefined names train_rmse / validation_rmse
plt.plot(range(len(mlp_validation_rmse)), mlp_validation_rmse, label='Validation RMSE', marker='x')

# Add labels and title
plt.xlabel('Parameter Combination Index')
plt.ylabel('RMSE')
plt.title('Training and Validation RMSE Curves During Grid Search')
plt.legend()
plt.grid(True)
plt.show()

### *Trained*

In [None]:
# Restore the pickled Multilayer Perceptron pipeline
mlp_best_model = joblib.load(
    filename='/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Pickles/mlp_30_best_model.pkl'
)

In [None]:
# Fold-wise validation RMSE and R-squared for the Multilayer Perceptron pipeline
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mlp_fold_rmse = []
mlp_fold_r2 = []

# y_train may be a Series or, after the TabNet section, a 2-D column array (which has
# no .iloc); a flat array can be indexed by position in both cases
y_train_flat = np.ravel(y_train)

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train_flat[train_index], y_train_flat[val_index]

    # Fit an unfitted copy on the training fold, so mlp_best_model itself is not
    # refitted on a subset and later cells still predict with the loaded model
    mlp_fold_model = clone(mlp_best_model).fit(X_train_fold, y_train_fold)

    # Make predictions on the validation fold
    mlp_y_pred_fold = mlp_fold_model.predict(X_val_fold)

    # Calculate metrics
    mlp_rmse_fold = np.sqrt(mean_squared_error(y_val_fold, mlp_y_pred_fold))
    mlp_r2_fold = r2_score(y_val_fold, mlp_y_pred_fold)

    # Append to lists
    mlp_fold_rmse.append(mlp_rmse_fold)
    mlp_fold_r2.append(mlp_r2_fold)

# Print fold-wise RMSE and R-squared
print(f'Fold-wise RMSE: {mlp_fold_rmse}')
print(f'Fold-wise R-squared: {mlp_fold_r2}')

Fold-wise RMSE: [0.03732176484127716, 0.035537797425130455, 0.03401343642268756, 0.035679647180412985, 0.036726791084166]
Fold-wise R-squared: [0.9982974393354108, 0.9984971896727414, 0.9982016951476252, 0.9985093867629268, 0.9983375409015023]


In [None]:
# Make predictions on the train set
mlp_train_pred = mlp_best_model.predict(X_train)

# Calculate RMSE and R-squared on the training data
mlp_train_rmse = np.sqrt(mean_squared_error(y_train, mlp_train_pred))
mlp_train_r2 = r2_score(y_train, mlp_train_pred)

# Print the training metrics (previously mislabelled as "Test")
print(f'Train RMSE: {mlp_train_rmse}')
print(f'Train R-squared: {mlp_train_r2}')

Test RMSE: 0.07003931979887994
Test R-squared: 0.9938227997381763


In [None]:
# Pair the actual training targets with the MLP training predictions.
# Both are flattened because y_train may be a 2-D column after the TabNet section,
# which pd.DataFrame cannot take as a single column; the row labels come from
# X_train so the frame keeps the original sample indices.
mlp_train_out = pd.DataFrame(
    {'Actual': np.ravel(y_train), 'Predicted': np.ravel(mlp_train_pred)},
    index=X_train.index,
)

# Save mlp_train_out into a csv file (row index is not written)
mlp_train_out.to_csv('/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Output Data/mlp_train_out_120225.csv', index=False)

In [None]:
# Predict SF for the held-out test features
mlp_y_pred = mlp_best_model.predict(X_test)

# Score the test predictions: RMSE (square root of the MSE) and R-squared
mlp_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=mlp_y_pred))
mlp_r2 = r2_score(y_true=y_test, y_pred=mlp_y_pred)

# Report both metrics, one per line
print(f'RMSE: {mlp_rmse}', f'R-squared: {mlp_r2}', sep='\n')

RMSE: 0.07326873552912774
R-squared: 0.9921940313475407


In [None]:
# Pair the actual test targets with the MLP test predictions.
# Both are flattened because y_test may be a 2-D column after the TabNet section,
# which pd.DataFrame cannot take as a single column; the row labels come from
# X_test so the frame keeps the original sample indices.
mlp_test_out = pd.DataFrame(
    {'Actual': np.ravel(y_test), 'Predicted': np.ravel(mlp_y_pred)},
    index=X_test.index,
)

# Save mlp_test_out into a csv file (row index is not written)
mlp_test_out.to_csv('/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Output Data/mlp_test_out_120225.csv', index=False)

# **RESULT EXTRACTION**

In [None]:
# Collect the training-set predictions of every model next to the actual values.
# y_train is flattened because the TabNet section may have reshaped it to a 2-D
# column, which pd.DataFrame cannot take as a single column.
train_outputs = {
    "y_actual": np.ravel(y_train),
    "y_linear": lr_train_pred,
    "y_supportvector": svr_train_pred,
    "y_decisiontree": dt_train_pred,
    "y_xgboost": xgb_train_pred,
    "y_kneighbors": kn_train_pred
}

# Turn the dictionary into a dataframe; the row labels come from X_train so the
# frame keeps the original sample indices
train_output_df = pd.DataFrame(train_outputs, index=X_train.index)

# Save the dataframe to csv file
train_output_df.to_csv('/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Output Data/outputs_train_120225.csv', index=False)

# Show the output dataframe
train_output_df

Unnamed: 0,y_actual,y_linear,y_supportvector,y_decisiontree,y_xgboost,y_kneighbors
1729,2.69,2.938335,2.759585,2.680000,2.650202,2.69
2422,2.05,1.935664,1.832353,2.166667,2.000052,2.05
433,2.61,2.666353,2.552237,2.610000,2.621479,2.61
2603,3.09,3.147709,3.061021,2.972500,3.035399,3.09
371,4.05,3.437544,3.893770,4.026667,4.118050,4.05
...,...,...,...,...,...,...
1130,1.51,1.703802,1.469565,1.534000,1.607371,1.51
1294,2.83,2.818186,2.984685,2.792500,2.784090,2.83
860,2.98,3.062804,2.838130,2.910000,2.929428,2.98
3507,1.72,1.720277,1.953100,1.736000,1.694535,1.72


In [None]:
# Collect the test-set predictions of every model next to the actual values.
# y_test is flattened because the TabNet section may have reshaped it to a 2-D
# column, which pd.DataFrame cannot take as a single column.
test_outputs = {
    "y_actual": np.ravel(y_test),
    "y_linear": lr_y_pred,
    "y_supportvector": svr_y_pred,
    "y_decisiontree": dt_y_pred,
    "y_xgboost": xgb_y_pred,
    "y_kneighbors": kn_y_pred
}

# Turn the dictionary into a dataframe; the row labels come from X_test so the
# frame keeps the original sample indices
test_output_df = pd.DataFrame(test_outputs, index=X_test.index)

# Save the dataframe to csv file
test_output_df.to_csv('/content/drive/MyDrive/Research/Regression Analysis - Qult and SF/Modeling - SF/Output Data/outputs_test_120225.csv', index=False)

# Show the output dataframe
test_output_df

Unnamed: 0,y_actual,y_linear,y_supportvector,y_decisiontree,y_xgboost,y_kneighbors
746,2.23,2.475230,2.329879,2.322000,2.253752,2.226565
3837,2.38,2.250088,2.411766,2.268000,2.423724,2.363697
3218,1.60,2.159784,1.619671,1.605000,1.513176,1.603500
2778,2.25,2.521585,2.294335,2.196667,2.246303,2.258500
1904,2.19,2.519968,2.375549,1.960000,2.104186,2.242638
...,...,...,...,...,...,...
1018,4.06,3.537044,4.089720,4.211667,3.871821,4.176959
449,2.88,2.930221,2.798168,2.892500,2.812025,2.818551
1288,2.61,2.568877,2.826619,2.552500,2.633450,2.648642
376,4.30,3.645301,4.399290,4.310000,4.443676,4.342012
