In [4]:
import pandas as pd
import numpy as np

from scipy.stats import norm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, RBF, RationalQuadratic
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [5]:
# Load the data from the Excel file
df = pd.read_csv('data/count_encode_vanillate_total.csv')

In [6]:
#specify the feature column(x) and target column(y)
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [7]:
# Split into training and testing data (90:10 split)
'''
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1, random_state=42)
'''

'\nX_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1, random_state=42)\n'

In [8]:
# setup K-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [9]:
model = GaussianProcessRegressor(kernel= Matern(length_scale=1.0, nu=0.01), alpha = 1e-4, random_state = 42)

# Store metrics (e.g., Mean Squared Error and R²) for each fold
mse_list = []
r2_list = []

# Perform K-fold cross val
for train_index, test_index in kf.split(X):
    # split data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    #fit model on train
    model.fit(X_train, y_train)

    #predict on test data
    y_pred = model.predict(X_test)

    # calculate performance metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    #store results
    mse_list.append(mse)
    r2_list.append(r2)

# Print the results for all folds
print(f"Mean Squared Error for each fold: {mse_list}")
print(f"R² score for each fold: {r2_list}")

# Calculate and print the average performance metrics across all folds
print(f"Average Mean Squared Error: {np.mean(mse_list)}")
print(f"Average R² score: {np.mean(r2_list)}")

Mean Squared Error for each fold: [990691.885196874, 1742415.3333911004, 880983.2652980501, 1892648.8068419397, 758996.0595131952]
R² score for each fold: [-0.3052150881474043, -0.213105732924173, -0.34972322903116493, -0.13178173509679114, -0.1956205320417157]
Average Mean Squared Error: 1253147.070048232
Average R² score: -0.23908926344824982
