In [1]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score, precision_score, recall_score, fbeta_score  # Add accuracy_score here


# Known data
X_known = np.array([
    [4.438920455, 5.285132921, 4.560795403, 4.390586582, 3.892868265, 4.897159647, 5.64068132,  5.40664279],
    [np.nan, np.nan, 9.954516157, 9.271499642, 8.972183987, 8.070824172, 7.431900566, 7.492638483],
    [9.238216655, 8.223075595, 8.016096321, 7.447864927, 6.822957889, 6.576394689, 5.6417114 ,  5.40752516],
    [8.503818525, 9.480793808, 8.496032353, 6.594218189, 5.872404397, 5.533452487, 4.975099626, 5.40453246],
    [7.421535813, 6.591132291, 7.34705273, 6.382394802, 5.447780302, 6.12332374, 6.035003018, 5.71944956],
    [9.287097098, 7.891107971, 6.957538145, 6.584759574, 6.435627635, 6.679654884, 5.775478282, 4.89180948],
    [10.95223, 10.72156, 11.27192, 9.62067598717756, np.nan, 5.96438668 , 5.64595999 , 5.41149996],
    [5.563877595, 5.458904944, 4.908698213, 4.772533701, 4.367120287, 4.768909644, 5.064849554, 4.493651],
    [7.200221545, 5.727048852, 4.822065773, 5.45553737, 5.057657293,5.95554526 , 5.64023731 , 5.40527255],
    [5.682786839, 5.240815471, 4.812782751, 5.251273434, 4.56412597, 4.544215214, 4.543595802, 4.795933049],
    [np.nan, 7.099751509, 4.954910316, 4.835356, 5.644296, 4.738663, 5.64318416 , 5.40986682],
    [np.nan, 8.076386463, 7.740272574, 7.942890616, 7.260156202, 7.153075823, 5.64606215 , 5.4067328],
    [5.65355043, 4.967463117, 4.251158441, 4.202563564, 4.082299151, 4.216920543, 4.210881, 4.162677],
    [5.337603416, 4.790189692, 4.526935265, 4.385387888, 4.676393565, 4.554563673, 4.647488, 4.57645],
    [5.408328826, 5.325758921, 5.479026742, 5.589194224, 5.334732219, 5.95295397 , 5.64525189 , 5.4074844],
    [np.nan, 6.745362563, 6.549002914, 6.363752068, 6.443714157, 6.237135907, 6.080136, 6.137982],
    [5.228211429, 5.152779925, 5.386189809, 4.226006846, 4.285408185, 4.436754071, 5.6405497 ,  5.40199019],
    [6.930967563, 6.947339169, 6.404098623, 6.683598449, 6.619447938, 5.96674103 , 5.64761233,  5.41005782],
    [5.140643328, 5.124860231, 4.814215064, 4.898882609, 4.577649419, 4.925226113, 4.926969, 4.69985],
    [7.47160789, 5.600358423, 6.203089138, 6.378771449, 5.382131324, 5.96674103 , 5.64761233,  5.41005782],
    [5.599731213, 6.513385006, 4.858378273, 6.243756244, 5.904930617, 5.068680622, 5.529139, 5.5617],
    [5.325664377, 4.937296337, 5.652272213, 4.179029629, np.nan,5.95956732,  5.64542066,  5.40676063],
    [15.50411634, 18.69787966, 9.567546881, 8.4623847, 8.52442247, 7.391529307, 7.071136, 6.1379],
    [5.929088106, 5.525167136, 6.105378839, 5.694436536, 5.50509221, 5.857201429, 5.822416, 5.40912467],
    [6.023552089, 6.06718804, 6.218982823, 5.84040509, 6.152785981, 6.044597037, 5.715723, 5.41352483],
    [9.267840593, 8.14000814, 6.927126628, 8.680555556, 9.272137228, 10.82708069,5.6474692  , 5.41095663],
    [10.41037706, 7.099751509, 6.310740881, 5.764685536, 5.28178313, 5.97073732 , 5.64462795,  5.40472864],
    [6.918220864, 6.228265947, 7.975647689, 7.975647689, 6.525672589, 5.652458584, 5.583604, 4.40867247],
    [10.21815767, 8.297722275, np.nan, 10.67950105, 7.687951187, 6.999168654, 5.890962217, 6.225439775]
])

y_known = np.array([
    [4.897159647, 5.64068132,  5.40664279],
    [8.070824172, 7.431900566, 7.492638483],
    [6.576394689, 5.6417114 ,  5.40752516],
    [5.533452487, 4.975099626, 5.40453246],
    [6.12332374, 6.035003018, 5.71944956],
    [6.679654884, 5.775478282, 4.89180948],
    [5.96438668 , 5.64595999 , 5.41149996],
    [4.768909644, 5.064849554, 4.493651],
    [5.95554526 , 5.64023731 , 5.40527255],
    [4.544215214, 4.543595802, 4.795933049],
    [4.738663, 5.64318416 , 5.40986682],
    [7.153075823, 5.64606215 , 5.4067328],
    [4.216920543, 4.210881, 4.162677],
    [4.554563673, 4.647488, 4.57645],
    [5.95295397 , 5.64525189 , 5.4074844],
    [6.237135907, 6.080136, 6.137982],
    [4.436754071, 5.6405497 ,  5.40199019],
    [5.96105183 , 5.65078962 , 5.41220723],
    [4.925226113, 4.926969, 4.69985],
    [5.96674103 , 5.64761233,  5.41005782],
    [5.068680622, 5.529139, 5.5617],
    [ 5.95956732,  5.64542066,  5.40676063],
    [7.391529307, 7.071136, 6.1379],
    [5.857201429, 5.822416, 5.40912467],
    [6.044597037, 5.715723, 5.41352483],
    [10.82708069, 5.6474692  , 5.41095663],
    [5.97073732 , 5.64462795,  5.40472864],
    [5.652458584, 5.583604, 4.40867247],
    [6.999168654, 5.890962217, 6.225439775]
])

# Impute missing values
imputer_X = SimpleImputer(strategy='mean')
imputer_y = SimpleImputer(strategy='mean')

X_known_imputed = imputer_X.fit_transform(X_known)
y_known_imputed = imputer_y.fit_transform(y_known)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_known_imputed, y_known_imputed, test_size=0.3, random_state=42)

# Create a Gradient Boosting Regressor for each target column
regression_models = [GradientBoostingRegressor(random_state=42) for _ in range(y_known_imputed.shape[1])]

# Fit each model separately on the training data
for i, model in enumerate(regression_models):
    model.fit(X_train, y_train[:, i])

# Displaying the results on the test set
predicted_targets_test = np.column_stack([model.predict(X_test) for model in regression_models])

# Calculate R-squared score for each column on the test set
r2_scores_test = [r2_score(y_test[:, i], predicted_targets_test[:, i]) for i in range(y_known_imputed.shape[1])]

print("\nR-squared scores on the test set:")
for i, score in enumerate(r2_scores_test):
    print(f"Target {i + 1}: {score}")

# Displaying the results on the test set
print("\nOriginal values on the test set:")
print(y_test)

# Print predicted regression values on the test set
print("\nPredicted regression values on the test set:")
print(predicted_targets_test)

# Evaluate accuracy without thresholding
accuracy_test_regression = r2_score(y_test, predicted_targets_test)
print("Accuracy (Regression):", accuracy_test_regression)

# Optionally, you can convert the regression predictions to classification predictions and evaluate accuracy
threshold = 5.0  # Set your threshold based on your problem
y_pred_class_test = (predicted_targets_test > threshold).astype(int)
y_test_class = (y_test > threshold).astype(int)

# Classification metrics on the test set
accuracy_test = accuracy_score(y_test_class, y_pred_class_test)
precision_test = precision_score(y_test_class, y_pred_class_test, average='micro')
recall_test = recall_score(y_test_class, y_pred_class_test, average='micro')
f2_test = fbeta_score(y_test_class, y_pred_class_test, beta=2, average='micro')

print("\nAccuracy on the test set:", accuracy_test)
print("Precision on the test set:", precision_test)
print("Recall on the test set:", recall_test)
print("F2 Score on the test set:", f2_test)


R-squared scores on the test set:
Target 1: 0.8849282410211541
Target 2: 0.9062001206318206
Target 3: 0.7403122311847203

Original values on the test set:
[[5.65245858 5.583604   4.40867247]
 [4.43675407 5.6405497  5.40199019]
 [4.21692054 4.210881   4.162677  ]
 [7.39152931 7.071136   6.1379    ]
 [5.95554526 5.64023731 5.40527255]
 [4.54421521 4.5435958  4.79593305]
 [5.95956732 5.64542066 5.40676063]
 [4.89715965 5.64068132 5.40664279]
 [5.97073732 5.64462795 5.40472864]]

Predicted regression values on the test set:
[[5.39929564 5.53760393 4.73852282]
 [5.0277709  5.64043769 5.35722546]
 [4.56992646 4.70809308 4.50592593]
 [7.94845475 6.67093165 6.86112381]
 [5.66638599 5.63172455 5.40553991]
 [4.6698861  4.81483052 4.68807242]
 [5.95192895 5.64421061 5.40669852]
 [4.83395427 5.63160231 5.36089988]
 [5.96598276 5.64514637 5.40460962]]
Accuracy (Regression): 0.8438135309458984

Accuracy on the test set: 0.8888888888888888
Precision on the test set: 0.9473684210526315
Recall on the 