# Bagging Model one one drug with Tuning

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from skopt import BayesSearchCV
from sklearn.model_selection import RandomizedSearchCV
import os

_FOLDER = "fitted_datasets_drug_properties/"
_FOLDER_2 = "test_train/"

import json

## Coding

In [8]:
# Define parameters and labels
params = ['param_1', 'param_2', 'param_3', 'param_4']
notIncludeColumns = [ 'COSMIC_ID', 'DRUG_ID', 'DRUGID_COSMICID', 'MAX_CONC', 'Target_Pathway', 'elements', 'Target', 'Drug_Name', 'molecular_formula', ] + params
conc_labels = ["fd_num_"+str(i) for i in range(10)]
resp_labels = ['norm_cells_'+str(i) for i in range(10)]
pred_resp_labels = ['pred_norm_cells_'+str(i) for i in range(10)]
fullPredLabels = ['DRUG_COSMICID'] + pred_resp_labels

# Initialize storage for results
predictedResults = pd.DataFrame(columns=fullPredLabels)
trueResults = pd.DataFrame(columns=['DRUG_COSMICID'] + resp_labels + params)
performanceMetrics = []

# Loop through datasets
for dataset in os.listdir(_FOLDER):
    if dataset.startswith("merged_fitted_drug_properties"):
        # Extract dataset version
        version = dataset.split("_")[-1].split(".csv")[0]
        
        # Load dataset
        drugProfiles = pd.read_csv(_FOLDER + dataset)

        # Load train and test splits
        train_data = pd.read_csv(_FOLDER_2 + f"train_{version}.csv.csv")
        test_data = pd.read_csv(_FOLDER_2 + f"test_{version}.csv.csv")

        for drugId in train_data['DRUG_ID'].unique():
            trainIndex = train_data[train_data['DRUG_ID'] == drugId].index
            testIndex = test_data[test_data['DRUG_ID'] == drugId].index
            
            # Prepare data
            trainData = drugProfiles.loc[drugProfiles.index.isin(trainIndex)]
            testData = drugProfiles.loc[drugProfiles.index.isin(testIndex)]
            testDataCosmicIds = np.array(testData['DRUGID_COSMICID'])

            notXValuesFilter = resp_labels + notIncludeColumns + conc_labels
            trainXValues = trainData.drop(columns=notXValuesFilter)
            trainYValues = trainData[resp_labels]
            testXValues = testData.drop(columns=notXValuesFilter)
            testYValues = testData[resp_labels]
            testParamsValues = testData[params]

            # Define model and hyperparameter search space
            base_estimator = SVR()
            valModel = MultiOutputRegressor(BaggingRegressor(base_estimator))
            searchSpace = {
                # search space definition as before
            }

            # Perform randomized search
            bayes = RandomizedSearchCV(valModel, searchSpace, cv=5, n_iter=20, n_jobs=-1)
            bayes.fit(trainXValues, trainYValues)
            bestParams = bayes.best_params_

            # After performing randomized search
            bestParams = bayes.best_params_
            print(bestParams)  # Debug: Print the bestParams to see the actual keys

            # Then access parameters (use correct keys as per the print output)
            svm_bagging = MultiOutputRegressor(BaggingRegressor(base_estimator=SVR(
                kernel=bestParams['<correct_key_for_kernel>'], 
                C=bestParams['<correct_key_for_C>'],
                gamma=bestParams['<correct_key_for_gamma>'],
                degree=bestParams['<correct_key_for_degree>'],
                epsilon=bestParams['<correct_key_for_epsilon>']),
                n_estimators=bestParams['<correct_key_for_n_estimators>'],
                max_samples=bestParams['<correct_key_for_max_samples>'],
                max_features=bestParams['<correct_key_for_max_features>'],
                bootstrap=bestParams['<correct_key_for_bootstrap>'],
                bootstrap_features=bestParams['<correct_key_for_bootstrap_features>']))


            # Train the best model
            svm_bagging = MultiOutputRegressor(BaggingRegressor(base_estimator=SVR(
                kernel=bestParams['estimator__base_estimator__kernel'], 
                C=bestParams['estimator__base_estimator__C'],
                gamma=bestParams['estimator__base_estimator__gamma'],
                degree=bestParams['estimator__base_estimator__degree'],
                epsilon=bestParams['estimator__base_estimator__epsilon']),
                n_estimators=bestParams['estimator__n_estimators'],
                max_samples=bestParams['estimator__max_samples'],
                max_features=bestParams['estimator__max_features'],
                bootstrap=bestParams['estimator__bootstrap'],
                bootstrap_features=bestParams['estimator__bootstrap_features']))
            svm_bagging.fit(trainXValues, trainYValues)

            # Predict and evaluate
            yTestPredict = svm_bagging.predict(testXValues)
            test_mse = mean_squared_error(testYValues, yTestPredict)

            # Store results
            trueResult = pd.DataFrame(testDataCosmicIds, columns=['DRUG_COSMICID'])
            trueResult[resp_labels] = testYValues
            trueResult[params] = testParamsValues
            trueResults = trueResults.append(trueResult, ignore_index=True, sort=False)

            result = pd.DataFrame(testDataCosmicIds, columns=['DRUG_COSMICID'])
            result[pred_resp_labels] = yTestPredict
            predictedResults = predictedResults.append(result, ignore_index=True, sort=False)

            # Log performance
            performanceMetrics.append({'dataset_version': version, 'drug_id': drugId, 'test_mse': test_mse})



{}


KeyError: '<correct_key_for_kernel>'