In [24]:
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils import resample


In [25]:

# --- Configuration -----------------------------------------------------------

# Directory containing the .npy data files. An empty string means "the current
# working directory", which is also how os.path.join('', name) resolves later.
data_directory = ''

# os.listdir('') raises FileNotFoundError, so fall back to os.curdir when the
# directory is left empty. Sorting makes the processing order deterministic
# (os.listdir returns entries in arbitrary order).
data_files = sorted(
    f for f in os.listdir(data_directory or os.curdir) if f.endswith('.npy')
)

# Regressors to evaluate on every data file, keyed by the label used in the
# printed report. Estimators that draw random numbers are seeded so that a
# fresh "Restart & Run All" reproduces the same numbers.
models = {
    'Linear Regression': LinearRegression(),
    'Support Vector Regression': SVR(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

# Bootstrap settings for the confidence intervals of the test metrics.
n_bootstraps = 1000  # number of bootstrap resamples of the test set
alpha = 0.05  # 95% confidence interval


In [None]:
def squared_correlation(y_true, y_pred):
    """Return the squared Pearson correlation between two 1-D arrays.

    Returns NaN (without emitting a floating-point warning) when either input
    has zero variance, because the correlation is undefined in that case.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        correlation_xy = np.corrcoef(y_true, y_pred)[0, 1]
    return correlation_xy ** 2


def bootstrap_confidence_intervals(y_true, y_pred, n_bootstraps, alpha, rng):
    """Percentile-bootstrap confidence intervals for MSE and squared correlation.

    The predictions are held fixed; only the (target, prediction) pairs are
    resampled with replacement, each resample having the same size as the
    original arrays.

    Parameters
    ----------
    y_true, y_pred : 1-D numpy arrays of equal length
        Targets and the corresponding predictions.
    n_bootstraps : int
        Number of bootstrap resamples to draw.
    alpha : float
        Significance level; the interval spans the alpha/2 and 1 - alpha/2
        percentiles of the bootstrap distribution.
    rng : numpy.random.Generator
        Source of randomness, passed in so the caller controls the seed.

    Returns
    -------
    (mse_interval, scc_interval) : tuple of two arrays, each [lower, upper].

    Raises
    ------
    ValueError
        If the arrays are empty (there is nothing to resample).
    """
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("Cannot bootstrap an empty test set")

    mse_samples = np.empty(n_bootstraps)
    scc_samples = np.empty(n_bootstraps)
    for i in range(n_bootstraps):
        # Draw row indices with replacement; the same indices are applied to
        # targets and predictions so the pairs stay aligned.
        resample_indices = rng.integers(0, n_samples, size=n_samples)
        y_resampled = y_true[resample_indices]
        predictions_resampled = y_pred[resample_indices]

        mse_samples[i] = mean_squared_error(y_resampled, predictions_resampled)
        scc_samples[i] = squared_correlation(y_resampled, predictions_resampled)

    percentiles = [100 * alpha / 2, 100 * (1 - alpha / 2)]
    # nanpercentile: a resample with zero variance yields a NaN correlation,
    # which would otherwise turn the whole interval into NaN.
    return (
        np.nanpercentile(mse_samples, percentiles),
        np.nanpercentile(scc_samples, percentiles),
    )


# One seeded generator for all bootstrap draws so the reported intervals are
# reproducible from a fresh kernel.
bootstrap_rng = np.random.default_rng(42)
ci_level = 100 * (1 - alpha)

for file_name in data_files:
    file_path = os.path.join(data_directory, file_name)
    print(f"Processing {file_name}")

    data = np.load(file_path)
    if data.ndim != 2 or data.shape[1] < 2:
        raise ValueError(
            f"{file_name}: expected a 2-D array with at least two columns "
            f"(features + target), got shape {data.shape}"
        )
    # The last column is used as the target; all preceding columns are features.
    X = data[:, :-1]
    y = data[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Point estimates on the full test set.
        mse = mean_squared_error(y_test, predictions)
        scc = squared_correlation(y_test, predictions)

        # Uncertainty of those estimates from resampling the test set.
        mse_confidence, scc_confidence = bootstrap_confidence_intervals(
            y_test, predictions, n_bootstraps, alpha, bootstrap_rng
        )

        # Report the interval as an interval: a percentile CI is generally not
        # symmetric around the point estimate, so "±" would be misleading.
        print(
            f"{name} MSE: {mse:.6g} "
            f"({ci_level:g}% CI: [{mse_confidence[0]:.6g}, {mse_confidence[1]:.6g}])"
        )
        print(
            f"{name} SCC: {scc:.6g} "
            f"({ci_level:g}% CI: [{scc_confidence[0]:.6g}, {scc_confidence[1]:.6g}])"
        )

Processing ccle_data_pca.npy
