In [1]:
import os
import sys

# Locate the project relative to the kernel's working directory, which is
# taken to be the folder that holds this notebook.
notebook_directory = os.getcwd()

# The project root is one level above the notebook folder; the project's
# Python modules live in its 'src' sub-folder.
root_directory = os.path.abspath(os.path.join(notebook_directory, os.pardir))
src_directory = os.path.join(root_directory, "src")

# Make both locations importable (root first, then 'src') so that later cells
# can import the project modules by their bare names.
sys.path.extend([root_directory, src_directory])

In [2]:
import numpy as np
import pandas as pd

from rulefit import RuleFit
from sklearn.exceptions import NotFittedError
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression, RidgeCV, ElasticNet, Ridge
from pyod.models.ecod import ECOD
from sklearn.ensemble import RandomForestClassifier, IsolationForest, RandomForestRegressor, GradientBoostingRegressor, \
    StackingRegressor, AdaBoostRegressor, VotingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import QuantileTransformer, StandardScaler, RobustScaler
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process.kernels import RationalQuadratic, Matern, WhiteKernel
from sklearn.gaussian_process.kernels import RBF

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Now you can import the preprocessing module and use its functions
from preprocessing import Preprocessing

In [4]:
# --- Transformers handed to the preprocessing pipeline ----------------------
# Imputers: a median-based simple imputer and a 15-neighbour KNN imputer.
imputer = SimpleImputer(strategy='median')
knn_imputer = KNNImputer(n_neighbors=15, weights="uniform")

# Outlier detector configured with a contamination of 0.01.
ecod = ECOD(contamination=0.01)

# Feature filtering: drop zero-variance columns, then keep the 175 features
# with the best univariate regression score.
constant_filter = VarianceThreshold(threshold=0)
selector = SelectKBest(score_func=f_regression, k=175)

# Feature scaler.
scaler_x = RobustScaler()

# --- Pipeline object and raw training data -----------------------------------
preprocessing = Preprocessing(
    imputer,
    ecod,
    knn_imputer,
    constant_filter,
    selector,
    scaler_x,
)

# Training features, and the 'y' column of the training targets, both read
# from the project's data folder.
X_train = Preprocessing.load_data(os.path.join(root_directory, "data", "X_train.csv"))
y_train = Preprocessing.load_data(os.path.join(root_directory, "data", "y_train.csv"))['y']

# Column labels of the raw feature table, needed by the imputation step.
feature_names = X_train.columns

In [5]:
# Imputation and outlier detection; the transformers are fitted on this data.
# The results are bound back to X_train / y_train, so re-running this cell
# feeds already processed data into the step again.
X_train, y_train = preprocessing.simple_imputation_and_outlier_ECOD(
    X_train,
    feature_names,
    y_train,
    fit_transformers=True,
)

In [6]:
# Scale the data, fitting the scaler(s) on the training set.
# X_train / y_train are rebound to the scaled output of this step.
X_train, y_train = preprocessing.scale_data(
    X_train,
    y_train,
    fit_transformers=True,
)

In [7]:
# Feature selection, fitted on the training set. The result is stored under
# new names, so X_train / y_train keep their pre-selection content.
X_train_updated, y_train_updated = preprocessing.feature_selection(
    X_train,
    y_train,
    fit_transformers=True,
)

In [8]:
notebook_filename = "Prediction_of_Brain_Age.ipynb"

# abspath() resolves the bare file name against the current working
# directory, so this is the folder the kernel is running in.
notebook_directory = os.path.dirname(os.path.abspath(notebook_filename))

# Main project folder: the parent of the notebook folder.
main_folder_path = os.path.dirname(notebook_directory)

# Destination files inside the project's data folder.
X_train_updated_path, y_train_updated_path = (
    os.path.join(main_folder_path, 'data', csv_name)
    for csv_name in ('X_train_updated.csv', 'y_train_updated.csv')
)

# Persist the selected features and the matching targets without an index column.
pd.DataFrame(X_train_updated).to_csv(X_train_updated_path, index=False)
y_train_updated.to_csv(y_train_updated_path, index=False)


In [None]:
from train import Training

In [None]:
# Perform cross validation and Grid Search to see the best hyperparameters
from hyperparameters import SVR_PARAMS



# Cross-validated grid search for the support vector regressor over SVR_PARAMS.
svr_model = SVR()
svr = Training(svr_model, param_grid=SVR_PARAMS)
svr.find_best_parameters(
    X_train_updated,
    y_train_updated,
)

In [None]:
# Perform cross validation and Grid Search to see the best hyperparameters
from hyperparameters import RFR_PARAMS


# Cross-validated grid search for the random forest regressor over RFR_PARAMS.
rfr_model = RandomForestRegressor()
rfr = Training(rfr_model, param_grid=RFR_PARAMS)
rfr.find_best_parameters(
    X_train_updated,
    y_train_updated,
)

In [None]:
from hyperparameters import GB_PARAMS


# Hyperparameter search for the gradient boosting regressor over GB_PARAMS.
gb_model = GradientBoostingRegressor()
gb = Training(gb_model, param_grid=GB_PARAMS)
gb.find_best_parameters(
    X_train_updated,
    y_train_updated,
)

In [None]:
from hyperparameters import DTR_PARAMS

# Hyperparameter search for the decision tree regressor over DTR_PARAMS.
dtr_model = DecisionTreeRegressor()
dtr = Training(dtr_model, param_grid=DTR_PARAMS)
dtr.find_best_parameters(
    X_train_updated,
    y_train_updated,
)

In [None]:
# Search grid for AdaBoost, declared inline in this notebook.
AB_PARAMS = {
    'n_estimators': [50, 100, 500],
    'learning_rate': [0.001, 0.01, 0.1],
    'loss': ['linear', 'square', 'exponential'],
    'random_state': [42],
}

# Wrap an untuned AdaBoost regressor together with its grid ...
ab_model = AdaBoostRegressor()
ab = Training(ab_model, param_grid=AB_PARAMS)

# ... and run the search on the selected training features and targets.
ab.find_best_parameters(
    X_train_updated,
    y_train_updated,
)

In [None]:
from hyperparameters import GPR_PARAMS

# Hyperparameter search for the Gaussian process regressor over GPR_PARAMS.
gpr_model = GaussianProcessRegressor()
gpr = Training(gpr_model, param_grid=GPR_PARAMS)
gpr.find_best_parameters(
    X_train_updated,
    y_train_updated,
)

In [None]:
from hyperparameters import XGB_PARAMS

# Hyperparameter search for the XGBoost regressor over XGB_PARAMS.
xgb_model = XGBRegressor()
xgb = Training(xgb_model, param_grid=XGB_PARAMS)
xgb.find_best_parameters(
    X_train_updated,
    y_train_updated,
)

In [None]:
from hyperparameters import ET_PARAMS

# Hyperparameter search for the extra-trees regressor over ET_PARAMS.
et_model = ExtraTreesRegressor()
et = Training(et_model, param_grid=ET_PARAMS)
et.find_best_parameters(
    X_train_updated,
    y_train_updated,
)

In [None]:
# Import the grid from the top-level `hyperparameters` module, exactly like
# the other tuning cells do. The 'src' folder is on sys.path, so going through
# `src.hyperparameters` would load the same file a second time under a
# different module name.
from hyperparameters import RIDGE_PARAMS

# Hyperparameter search for the ridge regressor over RIDGE_PARAMS.
ridge_model = Ridge()
ridge = Training(ridge_model, RIDGE_PARAMS)
ridge.find_best_parameters(X_train_updated, y_train_updated)

In [None]:
# Import the grid from the top-level `hyperparameters` module, exactly like
# the other tuning cells do. The 'src' folder is on sys.path, so going through
# `src.hyperparameters` would load the same file a second time under a
# different module name.
from hyperparameters import ELASTICNET_PARAMS

# Hyperparameter search for the elastic-net regressor over ELASTICNET_PARAMS.
elasticnet_model = ElasticNet()
elasticnet = Training(elasticnet_model, ELASTICNET_PARAMS)
elasticnet.find_best_parameters(X_train_updated, y_train_updated)

Optimize the stacking regressor to achieve the best performance

In [None]:
from skopt import gp_minimize
from skopt.space import Categorical
from tuned_models import svr, rfr, gb, dtr, ab, gpr, xgb, et, ridge, elasticnet
from preprocessing import Preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingRegressor
from sklearn.gaussian_process.kernels import ConstantKernel



# Candidate base learners for the stack, keyed by the short name they carry
# inside the StackingRegressor. Insertion order matters: the optimiser's
# on/off switches are matched to these entries by position.
base_estimators_pool = dict(
    svr=svr,
    rf=rfr,
    gb=gb,
    dt=dtr,
    ab=ab,
    et=et,
    xg=xgb,
    gpr=gpr,
    en=elasticnet,
)

def objective(params, empty_penalty=1.0):
    """
    Objective function for optimizing a stacking regressor.

    Parameters:
    -----------
    params : sequence of bool
        One on/off switch per entry of ``base_estimators_pool``, in the
        pool's iteration order. An estimator is part of the stack when its
        switch is truthy.
    empty_penalty : float, default=1.0
        Value returned when the candidate cannot be scored: either no base
        estimator is switched on, or cross-validation did not produce a
        finite mean score. The default equals the loss of a model with
        R^2 = -1.

    Returns:
    --------
    float
        Negative mean of the 5-fold cross-validated R^2 scores (negated
        because the optimiser minimises), or ``empty_penalty`` for
        candidates that cannot be scored.
    """
    # Keep only the estimators whose switch is on.
    base_estimators = [
        (name, estimator)
        for (name, estimator), use in zip(base_estimators_pool.items(), params)
        if use
    ]

    # StackingRegressor needs at least one base estimator. With an empty
    # selection, cross-validation raises or yields NaN (depending on the
    # scikit-learn version), and either outcome breaks the optimiser.
    if not base_estimators:
        return float(empty_penalty)

    # Define the stacking regressor
    stack = StackingRegressor(estimators=base_estimators, final_estimator=ridge)

    # Compute the cross-validated R^2 score
    scores = cross_val_score(stack, X_train_updated, y_train_updated, cv=5, scoring='r2', n_jobs=-1)
    mean_score = np.mean(scores)

    # A NaN/inf mean (e.g. a fold that failed to fit) cannot be used by the
    # optimiser's surrogate model; report the penalty instead.
    if not np.isfinite(mean_score):
        return float(empty_penalty)

    return -mean_score  # Negate the score because we want to maximize R^2

# One True/False switch per candidate base estimator, in pool order.
search_space = [Categorical([True, False]) for _ in range(len(base_estimators_pool))]

# Bayesian optimisation over the switches: 50 evaluations of the objective,
# 10 of them initial points, with a fixed seed.
result = gp_minimize(
    objective,
    search_space,
    n_calls=50,
    n_initial_points=10,
    random_state=0,
    n_jobs=-1,
)

# Translate the best switch vector back into estimator names.
best_combination = [name for name, use in zip(base_estimators_pool, result.x) if use]

print(f"Best combination of base estimators: {best_combination}")

Optimize the parameters of the kernels used in GPR to achieve the best performance

In [None]:
# Objective function to minimize: Negative R-squared score of GPR with Rational Quadratic kernel

from skopt import gp_minimize
from skopt.space import Real
from sklearn.gaussian_process.kernels import ConstantKernel
from sklearn.gaussian_process.kernels import RationalQuadratic, Matern, WhiteKernel

# For objective gaussian
def objective_gaussian(params):
    """
    Score one kernel configuration of a Gaussian process regressor.

    The kernel is ConstantKernel(constant_value**2) * RationalQuadratic(
    length_scale, alpha); the model is evaluated with 5-fold cross-validation
    on X_train_updated / y_train_updated using the R^2 metric.

    Parameters:
    -----------
    params : array-like
        Three values, in this order: constant_value, length_scale, alpha.

    Returns:
    --------
    float
        Negative mean of the cross-validated R^2 scores. The sign is flipped
        because the optimiser minimises its objective while a higher R^2 is
        better.
    """
    amplitude, scale, shape = params

    # Build the product kernel from its two factors.
    scaling_kernel = ConstantKernel(constant_value=amplitude**2)
    rq_kernel = RationalQuadratic(length_scale=scale, alpha=shape)
    regressor = GaussianProcessRegressor(kernel=scaling_kernel * rq_kernel, random_state=0)

    fold_scores = cross_val_score(
        regressor,
        X_train_updated,
        y_train_updated,
        cv=5,
        scoring='r2',
        n_jobs=-1,
    )
    return -np.mean(fold_scores)

# Search bounds, in the order objective_gaussian unpacks them.
search_space = [
    Real(0.1, 2.0, name='constant_value'),  # scaling factor (squared inside the kernel)
    Real(0.1, 10.0, name='length_scale'),   # RationalQuadratic length scale
    Real(0.1, 10.0, name='alpha'),          # RationalQuadratic alpha
]

# Bayesian optimisation: 200 evaluations of the objective, 25 of them
# initial points, with a fixed seed.
result = gp_minimize(
    objective_gaussian,
    search_space,
    n_calls=200,
    n_initial_points=25,
    random_state=0,
    n_jobs=-1,
)

# Unpack the best point and rebuild the corresponding kernel.
best_constant_value, best_length_scale, best_alpha = result.x[0], result.x[1], result.x[2]
best_kernel = (
    ConstantKernel(constant_value=best_constant_value**2)
    * RationalQuadratic(length_scale=best_length_scale, alpha=best_alpha)
)
print(f"Best kernel: {best_kernel}")

## Instantiating StackingRegressor for performing predictions on unseen data

In [None]:
from tuned_models import svr, rfr, gb, dtr, ab, gpr, xgb, et, ridge, elasticnet

# Tuned base learners, in the order the slices below rely on.
# Candidates that were commented out (lgbm, cb, rvr, ridge) are not included.
estimators = [
    # members of the first stack
    ('svr', svr),
    ('rf', rfr),
    ('gb', gb),
    ('dt', dtr),
    # members of the second stack
    ('ab', ab),
    ('et', et),
    ('xg', xgb),
    ('gpr', gpr),
    ('en', elasticnet),
]

# Two first-level stacks, each with the tuned GPR as meta-learner.
stack1 = StackingRegressor(estimators=estimators[:4], final_estimator=gpr, n_jobs=-1)
stack2 = StackingRegressor(estimators=estimators[4:], final_estimator=gpr, n_jobs=-1)

# Second-level stack that combines the two first-level stacks, again with the
# tuned GPR on top.
stack_final = StackingRegressor(
    estimators=[("stack1", stack1), ("stack2", stack2)],
    final_estimator=gpr,
    n_jobs=-1,
)

In [None]:
# Training the datasets and performing predictions
from predict import Prediction
from train import Training

# Wrap the final stacked model for training, and the trainer for prediction.
train_final = Training(stack_final)
prediction = Prediction(train_final)

# Locate the test set in the project's data folder. This is the same folder
# the training CSVs are read from; a path relative to the working directory
# ('./data') would point inside the notebook folder instead.
data_directory = os.path.join(root_directory, 'data')
X_test = os.path.join(data_directory, 'X_test.csv')

# Generating predictions
# NOTE(review): X_test is handed over as a file path, and X_train / y_train
# are the imputed and scaled (but not feature-selected) versions produced by
# the earlier cells. Confirm that generate_predictions expects exactly these
# inputs.
y_pred = prediction.generate_predictions(X_train, y_train, X_test)

y_pred

In [None]:

# Build the submission table from `y_pred`, the output of the prediction
# cell: one row per prediction, with ids 0..n-1. The name `y_pred_final` used
# here before is not defined anywhere in this notebook and raised a NameError
# on a fresh run.
# NOTE(review): if the predictions still need post-processing (for example
# undoing a target scaling), apply it before this step. TODO confirm.
df_final = pd.DataFrame({
    'id': np.arange(len(y_pred)),
    'y': np.asarray(y_pred).flatten(),
})

# Destination inside the project's data folder (main_folder_path is set in
# the cell that saves the selected training features).
csv_file_path = os.path.join(main_folder_path, "data", "y_pred_stackdoublestack_2.csv")

# Save DataFrame to CSV
df_final.to_csv(csv_file_path, index=False)

print("CSV file saved successfully at:", csv_file_path)
