In [1]:
# Import necessary libraries for data processing, machine learning, and evaluation
import pandas as pd  # For handling tabular data
import numpy as np  # For numerical operations
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text into numerical features
from sklearn.ensemble import RandomForestClassifier  # For using the Random Forest algorithm
from sklearn.pipeline import Pipeline  # For combining preprocessing and modeling into a single workflow
from sklearn.model_selection import GridSearchCV  # For finding the best model parameters through cross-validation
from sklearn.metrics import (  # For evaluating model performance
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, average_precision_score
)
from sklearn.compose import ColumnTransformer  # For preprocessing multiple types of features
from sklearn.preprocessing import StandardScaler  # For standardizing numerical data

# Step 1: Load the data from CSV files
data9 = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_3948.csv')  # Training dataset
data10 = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_1664.csv')  # Testing dataset

# Step 2: Define a vectorizer for text data preprocessing
vectorizer = TfidfVectorizer(
    stop_words='english',  # Ignore common English words
    ngram_range=(1, 3),  # Include single words, bigrams, and trigrams
    max_features=3000  # Limit the number of features to the top 3000
)

# Step 3: Define a preprocessor for combining text and numerical data
preprocessor = ColumnTransformer([
    ('vectorizer', vectorizer, 'report_text'),  # Apply text vectorization to the 'report_text' column
    ('scaler', StandardScaler(), ['icd', 'med'])  # Scale numerical features 'icd' and 'med'
], n_jobs=-1)

# Step 4: Define a Random Forest classifier
model = RandomForestClassifier(
    n_estimators=50,  # Number of trees in the forest
    max_depth=10,  # Maximum depth of each tree
    class_weight="balanced",  # Handle class imbalance
    random_state=42,  # Ensure reproducibility
    n_jobs=-1
)

# Step 5: Combine preprocessing and the model into a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)  # Train the model
])

# Step 6: Define a parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [50, 100, 200],  # Vary the number of trees
    'classifier__max_depth': [10, None, 20],  # Test different tree depths
    'classifier__min_samples_split': [2, 5, 10],  # Minimum samples to split a node
    'classifier__min_samples_leaf': [1, 2, 4]  # Minimum samples per leaf node
}

# Step 7: Perform hyperparameter tuning using GridSearchCV
grid_search4 = GridSearchCV(
    pipeline,  # Pipeline to optimize
    param_grid=param_grid,  # Parameter grid
    cv=5,  # 5-fold cross-validation
    n_jobs=-1  # Use all available processors
)
grid_search4.fit(data9[['icd', 'med', 'report_text']], data9['annot'])  # Train the model on the training data

# Step 8: Output the best parameters and score from GridSearchCV
print("Best Parameters: ", grid_search4.best_params_)
print("Best Score: ", grid_search4.best_score_)

# Step 9: Test the model on the test dataset
X_test_data10 = data10[['icd', 'med', 'report_text']]  # Features from the test set
y_test_data10 = data10['annot']  # Labels from the test set
y_pred_data10 = grid_search4.predict(X_test_data10)  # Make predictions on the test set

# Step 10: Evaluate model performance using bootstrapping
n_iterations = 10  # Number of bootstrap iterations
metrics_values = {  # Dictionary to store metrics for each iteration
    'accuracy': [], 'precision': [], 'recall': [], 'f1': [],
    'specificity': [], 'roc_auc': [], 'auprc': []
}

for _ in range(n_iterations):
    # Sample the test data with replacement
    indices = np.random.choice(len(X_test_data10), len(X_test_data10), replace=True)
    X_sampled = X_test_data10.iloc[indices]  # Sampled features
    y_sampled = y_test_data10.iloc[indices]  # Sampled labels

    # Make predictions on the sampled data
    y_pred_sampled = grid_search4.predict(X_sampled)
    y_pred_prob_sampled = grid_search4.predict_proba(X_sampled)[:, 1]  # Predicted probabilities for the positive class

    # Calculate specificity
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_pred_sampled).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Store calculated metrics
    metrics_values['accuracy'].append(accuracy_score(y_sampled, y_pred_sampled))
    metrics_values['precision'].append(precision_score(y_sampled, y_pred_sampled))
    metrics_values['recall'].append(recall_score(y_sampled, y_pred_sampled))
    metrics_values['f1'].append(f1_score(y_sampled, y_pred_sampled))
    metrics_values['specificity'].append(specificity)
    metrics_values['roc_auc'].append(roc_auc_score(y_sampled, y_pred_prob_sampled))
    metrics_values['auprc'].append(average_precision_score(y_sampled, y_pred_prob_sampled))

# Step 11: Calculate the mean and 95% confidence interval for each metric
for metric, values in metrics_values.items():
    mean_value = np.mean(values)  # Mean value of the metric
    lower_band = np.percentile(values, 2.5)  # Lower bound of 95% confidence interval
    upper_band = np.percentile(values, 97.5)  # Upper bound of 95% confidence interval

    print(f"{metric.capitalize()}: Mean={mean_value}, 95% CI=({lower_band}, {upper_band})")




Best Parameters:  {'classifier__max_depth': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
Best Score:  0.8946488906001828
Accuracy: Mean=0.8450120192307692, 95% CI=(0.8318960336538462, 0.8544170673076923)
Precision: Mean=0.9384197883573051, 95% CI=(0.9252238647013717, 0.9479235258685723)
Recall: Mean=0.7137192533087208, 95% CI=(0.6859680013309672, 0.7342510485780072)
F1: Mean=0.8106716991823179, 95% CI=(0.7918895830625711, 0.8243219158649506)
Specificity: Mean=0.9591649711402059, 95% CI=(0.9498505455553304, 0.967382274001964)
Roc_auc: Mean=0.9467531422120677, 95% CI=(0.9374200838358637, 0.9545132094463199)
Auprc: Mean=0.907748938775204, 95% CI=(0.8919393323985926, 0.9261643761661414)


Best Parameters:  {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best Score:  0.8890718903916189
Accuracy: Mean=0.8220552884615383, 95% CI=(0.8156400240384616, 0.8317908653846153)
Precision: Mean=0.9239405839191643, 95% CI=(0.9080767114494016, 0.9383105140790494)
Recall: Mean=0.6697311911722641, 95% CI=(0.6487397195895668, 0.6919827669201887)
F1: Mean=0.7764011612218064, 95% CI=(0.7621629541730918, 0.7944219648926621)
Specificity: Mean=0.9526991896825967, 95% CI=(0.94106142936084, 0.9608481107222899)
Roc_auc: Mean=0.9413359556233969, 95% CI=(0.9300670303544616, 0.946915722355956)
Auprc: Mean=0.9013509970938347, 95% CI=(0.8780906732584564, 0.9233003272627427)

## Random Forest

## Naive Bayes

In [2]:
# Import necessary libraries for data handling, machine learning, and performance evaluation
import pandas as pd  # For working with tabular data
import numpy as np  # For numerical operations
from sklearn.feature_extraction.text import TfidfVectorizer  # For text feature extraction
from sklearn.naive_bayes import BernoulliNB  # Naive Bayes classifier (Bernoulli for binary features)
from sklearn.pipeline import Pipeline  # For combining preprocessing and modeling into a single workflow
from sklearn.model_selection import GridSearchCV  # For hyperparameter optimization
from sklearn.metrics import (  # For evaluating model performance
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, average_precision_score
)
from sklearn.compose import ColumnTransformer  # For preprocessing text and numerical data
from sklearn.preprocessing import StandardScaler  # For scaling numerical data

# Step 1: Load datasets
data9 = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_3948.csv')  # Training dataset
data10 = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_1664.csv')  # Test dataset

# Step 2: Define the text vectorizer
vectorizer = TfidfVectorizer(
    stop_words='english',  # Exclude common English words
    ngram_range=(1, 3),  # Include unigrams, bigrams, and trigrams
    max_features=3000,  # Limit features to the top 3000 by importance
    norm='l1'  # Normalize to make values non-negative
)

# Step 3: Define preprocessing for both text and numerical features
preprocessor = ColumnTransformer([
    ('vectorizer', vectorizer, 'report_text'),  # Apply text vectorizer to 'report_text'
    ('scaler', StandardScaler(), ['icd', 'med'])  # Scale numerical features ('icd', 'med')
], sparse_threshold=0, n_jobs=-1)  # Ensure output is a dense matrix (not sparse)

# Step 4: Define the Naive Bayes classifier
model = BernoulliNB()  # Suitable for binary or indicator features

# Step 5: Combine preprocessing and model into a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocessing step
    ('classifier', model)  # Classification step
])

# Step 6: Define hyperparameter grid for optimization
param_grid = {
    'classifier__alpha': [0.01, 0.1, 1.0, 10.0]  # Regularization parameter for Naive Bayes
}

# Step 7: Use GridSearchCV for hyperparameter tuning
grid_search4 = GridSearchCV(
    pipeline,  # The pipeline to optimize
    param_grid=param_grid,  # Hyperparameter grid
    cv=5,  # 5-fold cross-validation
    n_jobs=-1  # Utilize all available CPU cores
)
grid_search4.fit(data9[['icd', 'med', 'report_text']], data9['annot'])  # Train the model

# Step 8: Output the best parameters and corresponding score
print("Best Parameters: ", grid_search4.best_params_)
print("Best Score: ", grid_search4.best_score_)

# Step 9: Predict on test data
X_test_data10 = data10[['icd', 'med', 'report_text']]  # Features from test dataset
y_test_data10 = data10['annot']  # Labels from test dataset
y_pred_data10 = grid_search4.predict(X_test_data10)  # Predictions
y_pred_prob_data10 = grid_search4.predict_proba(X_test_data10)[:, 1]  # Probabilities for positive class

# Step 10: Evaluate metrics using bootstrapping
n_iterations = 10  # Number of bootstrap samples
metrics_values = {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'specificity': [], 'roc_auc': [], 'auprc': []}

for _ in range(n_iterations):
    # Resample test data with replacement
    indices = np.random.choice(len(X_test_data10), len(X_test_data10), replace=True)
    X_sampled = X_test_data10.iloc[indices]
    y_sampled = y_test_data10.iloc[indices]

    # Make predictions on the resampled data
    y_pred_sampled = grid_search4.best_estimator_.predict(X_sampled)
    y_pred_prob_sampled = grid_search4.best_estimator_.predict_proba(X_sampled)[:, 1]

    # Calculate confusion matrix and specificity
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_pred_sampled).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Store evaluation metrics
    metrics_values['accuracy'].append(accuracy_score(y_sampled, y_pred_sampled))
    metrics_values['precision'].append(precision_score(y_sampled, y_pred_sampled))
    metrics_values['recall'].append(recall_score(y_sampled, y_pred_sampled))
    metrics_values['f1'].append(f1_score(y_sampled, y_pred_sampled))
    metrics_values['specificity'].append(specificity)
    metrics_values['roc_auc'].append(roc_auc_score(y_sampled, y_pred_prob_sampled))
    metrics_values['auprc'].append(average_precision_score(y_sampled, y_pred_prob_sampled))

# Step 11: Calculate mean and 95% confidence intervals for each metric
for metric, values in metrics_values.items():
    mean_value = np.mean(values)
    lower_band = np.percentile(values, 2.5)
    upper_band = np.percentile(values, 97.5)

    print(f"{metric.capitalize()}: Mean={mean_value}, 95% CI=({lower_band}, {upper_band})")




Best Parameters:  {'classifier__alpha': 0.1}
Best Score:  0.7594041488184049
Accuracy: Mean=0.6150841346153845, 95% CI=(0.6003004807692308, 0.6306640625)
Precision: Mean=0.5811091435381704, 95% CI=(0.5584917272080078, 0.6175252723785309)
Recall: Mean=0.6204689002111665, 95% CI=(0.5999882237355656, 0.6405943650377909)
F1: Mean=0.6000091860930791, 95% CI=(0.5823706771114112, 0.6275441710399119)
Specificity: Mean=0.6102356476928656, 95% CI=(0.586841814821466, 0.6302638295952626)
Roc_auc: Mean=0.6413962575594194, 95% CI=(0.6283590832835436, 0.6559458587480677)
Auprc: Mean=0.5731513227041299, 95% CI=(0.5520553530710124, 0.6043068560895215)


Best Parameters:  {'classifier__alpha': 10.0}
Best Score:  0.7594019027450225
Accuracy: Mean=0.6019831730769232, 95% CI=(0.5893629807692308, 0.615850360576923)
Precision: Mean=0.5708670822863977, 95% CI=(0.5555554262844379, 0.6017076752239154)
Recall: Mean=0.5895687241376917, 95% CI=(0.5717752270574469, 0.6043827865633813)
F1: Mean=0.5798908941338062, 95% CI=(0.5698344613614915, 0.590695557574248)
Specificity: Mean=0.612975671861568, 95% CI=(0.5940088395126994, 0.6413795649018011)
Roc_auc: Mean=0.6268913992151288, 95% CI=(0.6127829991337536, 0.6447232939760171)
Auprc: Mean=0.5636663277837618, 95% CI=(0.5443730018353451, 0.5914388665507918)

## MLP

In [3]:
# Import required libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text data into numerical features
from sklearn.neural_network import MLPClassifier  # Multi-layer Perceptron (MLP) classifier for classification tasks
from sklearn.pipeline import Pipeline  # For creating a workflow combining preprocessing and modeling
from sklearn.model_selection import GridSearchCV  # For hyperparameter tuning using cross-validation
from sklearn.metrics import (  # For evaluating model performance
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)
from sklearn.compose import ColumnTransformer  # For preprocessing both text and numerical data
from sklearn.preprocessing import StandardScaler  # For standardizing numerical features

# Load the datasets
data9 = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_3948.csv')  # Training dataset
data10 = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_1664.csv')  # Testing dataset

# Define a TF-IDF vectorizer for text preprocessing
vectorizer = TfidfVectorizer(
    stop_words='english',  # Exclude common English words
    ngram_range=(1, 3),  # Consider unigrams, bigrams, and trigrams
    max_features=3000  # Limit the number of features to the top 3000
)

# Define preprocessing for both text and numerical data
preprocessor = ColumnTransformer([
    ('vectorizer', vectorizer, 'report_text'),  # Apply the TF-IDF vectorizer to the 'report_text' column
    ('scaler', StandardScaler(), ['icd', 'med'])  # Standardize the numerical columns 'icd' and 'med'
], sparse_threshold=0, n_jobs=-1)  # Ensure dense output for compatibility with the classifier

# Define an MLPClassifier for classification
model = MLPClassifier(
    random_state=42,  # Set random seed for reproducibility
    max_iter=200  # Maximum number of iterations for training
)

# Combine preprocessing and modeling into a single pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)  # Apply the classifier
])

# Define the hyperparameter grid for tuning the MLP classifier
param_grid = {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (100, 50)],  # Network architectures to test
    'classifier__activation': ['relu', 'tanh'],  # Activation functions to test
    'classifier__solver': ['adam'],  # Optimization algorithm
    'classifier__learning_rate': ['constant', 'adaptive'],  # Learning rate strategies
    'classifier__alpha': [0.0001, 0.001]  # Regularization strength (L2 penalty)
}

# Perform hyperparameter tuning with GridSearchCV
grid_search4 = GridSearchCV(
    pipeline,  # Pipeline to optimize
    param_grid=param_grid,  # Hyperparameter grid
    cv=5,  # Use 5-fold cross-validation
    n_jobs=-1  # Use all available CPU cores
)
grid_search4.fit(data9[['icd', 'med', 'report_text']], data9['annot'])  # Train the model on the training data

# Output the best hyperparameters and corresponding cross-validation score
print("Best Parameters: ", grid_search4.best_params_)
print("Best Score: ", grid_search4.best_score_)

# Use the trained model to make predictions on the test dataset
X_test_data10 = data10[['icd', 'med', 'report_text']]  # Extract test features
y_test_data10 = data10['annot']  # Extract test labels
y_pred_data10 = grid_search4.predict(X_test_data10)  # Make predictions
y_pred_prob_data10 = grid_search4.predict_proba(X_test_data10)[:, 1]  # Get probabilities for the positive class

# Evaluate the model using bootstrapping
n_iterations = 10  # Number of bootstrap iterations
metrics_values = {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'specificity': [], 'roc_auc': [], 'auprc': []}

for _ in range(n_iterations):
    # Resample the test data with replacement
    indices = np.random.choice(len(X_test_data10), len(X_test_data10), replace=True)
    X_sampled = X_test_data10.iloc[indices]  # Resampled test features
    y_sampled = y_test_data10.iloc[indices]  # Resampled test labels

    # Make predictions on the resampled data
    y_pred_sampled = grid_search4.best_estimator_.predict(X_sampled)
    y_pred_prob_sampled = grid_search4.best_estimator_.predict_proba(X_sampled)[:, 1]

    # Calculate confusion matrix and specificity
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_pred_sampled).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Store the evaluation metrics
    metrics_values['accuracy'].append(accuracy_score(y_sampled, y_pred_sampled))
    metrics_values['precision'].append(precision_score(y_sampled, y_pred_sampled))
    metrics_values['recall'].append(recall_score(y_sampled, y_pred_sampled))
    metrics_values['f1'].append(f1_score(y_sampled, y_pred_sampled))
    metrics_values['specificity'].append(specificity)
    metrics_values['roc_auc'].append(roc_auc_score(y_sampled, y_pred_prob_sampled))
    metrics_values['auprc'].append(average_precision_score(y_sampled, y_pred_prob_sampled))

# Calculate and print the mean and 95% confidence intervals for each metric
for metric, values in metrics_values.items():
    mean_value = np.mean(values)  # Mean value of the metric
    lower_band = np.percentile(values, 2.5)  # Lower bound of the 95% confidence interval
    upper_band = np.percentile(values, 97.5)  # Upper bound of the 95% confidence interval

    print(f"{metric.capitalize()}: Mean={mean_value}, 95% CI=({lower_band}, {upper_band})")




Best Parameters:  {'classifier__activation': 'tanh', 'classifier__alpha': 0.001, 'classifier__hidden_layer_sizes': (100,), 'classifier__learning_rate': 'constant', 'classifier__solver': 'adam'}
Best Score:  0.8255019171840658
Accuracy: Mean=0.7959735576923077, 95% CI=(0.7854717548076924, 0.8047475961538462)
Precision: Mean=0.8658851926664337, 95% CI=(0.8339552207345452, 0.8823349520531104)
Recall: Mean=0.6658160627228595, 95% CI=(0.6443331102690677, 0.6884194181104936)
F1: Mean=0.7526593004338398, 95% CI=(0.7361451973797446, 0.7697121571150587)
Specificity: Mean=0.9097651234457265, 95% CI=(0.8911432323756404, 0.9251365982837222)
Roc_auc: Mean=0.8821861441445712, 95% CI=(0.8677481027791552, 0.8909715485188199)
Auprc: Mean=0.8770937185026456, 95% CI=(0.8465984478810411, 0.88986911380543)


Best Parameters:  {'classifier__activation': 'tanh', 'classifier__alpha': 0.001, 'classifier__hidden_layer_sizes': (100,), 'classifier__learning_rate': 'constant', 'classifier__solver': 'adam'}
Best Score:  0.8247427443808057
Accuracy: Mean=0.8024038461538462, 95% CI=(0.7956730769230769, 0.8152794471153846)
Precision: Mean=0.8778107180430034, 95% CI=(0.8525246423766508, 0.9009188418841885)
Recall: Mean=0.6778622178207565, 95% CI=(0.6664623410193493, 0.696118538108178)
F1: Mean=0.764903747393255, 95% CI=(0.7516295025728988, 0.7779750548627472)
Specificity: Mean=0.9149055840295353, 95% CI=(0.9012071938901207, 0.9315183585546821)
Roc_auc: Mean=0.882651963169548, 95% CI=(0.8676672446863012, 0.8916546078941331)
Auprc: Mean=0.8810523197871515, 95% CI=(0.8611588246415243, 0.8960769652776367)

## Logistic Regression

In [4]:
# Import necessary libraries
import pandas as pd  # For handling tabular data
import numpy as np  # For numerical operations
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text into numerical features using TF-IDF
from sklearn.linear_model import LogisticRegression  # Logistic regression classifier
from sklearn.pipeline import Pipeline  # For creating a workflow combining preprocessing and modeling
from sklearn.model_selection import GridSearchCV  # For hyperparameter tuning using cross-validation
from sklearn.metrics import (  # For evaluating model performance
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)
from sklearn.compose import ColumnTransformer  # For preprocessing multiple types of data (e.g., text and numerical)
from sklearn.preprocessing import StandardScaler  # For standardizing numerical data

# Step 1: Load data
data9 = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_3948.csv')  # Training dataset
data10 = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_1664.csv')  # Testing dataset

# Step 2: Define a TF-IDF vectorizer for text data
vectorizer = TfidfVectorizer(
    stop_words='english',  # Remove common English stop words
    ngram_range=(1, 3),  # Include unigrams, bigrams, and trigrams
    min_df=2,  # Ignore terms appearing in fewer than 2 documents
    max_df=0.9  # Ignore terms appearing in more than 90% of documents
)

# Step 3: Define class weights for handling class imbalance
class_weights = {0: 1.0, 1: 3.0}  # Assign higher weight to the positive class (1)

# Step 4: Define the logistic regression model
model = LogisticRegression(
    solver='liblinear',  # Use 'liblinear' solver (suitable for small datasets and L1 regularization)
    class_weight=class_weights,  # Incorporate class weights
    n_jobs=-1,  # Ensure reproducibility
)

# Step 5: Define a preprocessor for both text and numerical data
preprocessor = ColumnTransformer([
    ('vectorizer', vectorizer, 'report_text'),  # Apply the TF-IDF vectorizer to the 'report_text' column
    ('scaler', StandardScaler(), ['icd', 'med'])  # Standardize the numerical columns 'icd' and 'med'
], n_jobs=-1)

# Step 6: Create a pipeline combining preprocessing and logistic regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocessing step
    ('classifier', model)  # Logistic regression classifier
])

# Step 7: Define a hyperparameter grid for tuning the logistic regression model
param_grid = {
    'classifier__penalty': ['l1'],  # Use L1 regularization (Lasso)
    'classifier__C': [0.01, 0.1, 1.0, 10.0]  # Regularization strength
}

# Step 8: Perform hyperparameter tuning with GridSearchCV
grid_search4 = GridSearchCV(
    pipeline,  # Pipeline to optimize
    param_grid=param_grid,  # Hyperparameter grid
    cv=5  # 5-fold cross-validation
)
grid_search4.fit(data9[['icd', 'med', 'report_text']], data9['annot'])  # Train the model on the training dataset

# Step 9: Output the best hyperparameters and corresponding score
print("Best Parameters: ", grid_search4.best_params_)
print("Best Score: ", grid_search4.best_score_)

# Step 10: Make predictions on the test dataset
X_test_data10 = data10[['icd', 'med', 'report_text']]  # Extract features from the test dataset
y_test_data10 = data10['annot']  # Extract labels from the test dataset
y_pred_data10 = grid_search4.predict(X_test_data10)  # Predict labels for the test dataset

# Step 11: Initialize metrics dictionary for bootstrapping
n_iterations = 10  # Number of bootstrap iterations
metrics_values = {  # Store evaluation metrics
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'specificity': [],
    'roc_auc': [],
    'auprc': []
}

# Step 12: Perform bootstrapping to evaluate metrics
for _ in range(n_iterations):
    # Sample test data with replacement
    indices = np.random.choice(len(X_test_data10), len(X_test_data10), replace=True)
    X_sampled = X_test_data10.iloc[indices]  # Resampled test features
    y_sampled = y_test_data10.iloc[indices]  # Resampled test labels

    # Predict labels and probabilities for the resampled data
    y_pred_sampled = grid_search4.predict(X_sampled)
    y_pred_prob_sampled = grid_search4.predict_proba(X_sampled)[:, 1]  # Probability of the positive class

    # Calculate confusion matrix and specificity
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_pred_sampled).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0  # True negative rate

    # Append calculated metrics to the dictionary
    metrics_values['accuracy'].append(accuracy_score(y_sampled, y_pred_sampled))
    metrics_values['precision'].append(precision_score(y_sampled, y_pred_sampled))
    metrics_values['recall'].append(recall_score(y_sampled, y_pred_sampled))
    metrics_values['f1'].append(f1_score(y_sampled, y_pred_sampled))
    metrics_values['specificity'].append(specificity)
    metrics_values['roc_auc'].append(roc_auc_score(y_sampled, y_pred_prob_sampled))
    metrics_values['auprc'].append(average_precision_score(y_sampled, y_pred_prob_sampled))

# Step 13: Calculate mean and confidence intervals for each metric
for metric, values in metrics_values.items():
    mean_value = np.mean(values)  # Calculate mean of the metric
    lower_band = np.percentile(values, 2.5)  # 2.5th percentile (lower bound)
    upper_band = np.percentile(values, 97.5)  # 97.5th percentile (upper bound)

    # Print the mean and confidence interval for the metric
    print(f"{metric.capitalize()}: Mean={mean_value}, 95% CI=({lower_band}, {upper_band})")




Best Parameters:  {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best Score:  0.9032725289181947
Accuracy: Mean=0.9147836538461538, 95% CI=(0.9036508413461538, 0.9253455528846154)
Precision: Mean=0.9269922475500639, 95% CI=(0.9145133786733259, 0.93889917151419)
Recall: Mean=0.8861268045542816, 95% CI=(0.8696290984667979, 0.9059884075655887)
F1: Mean=0.9060557923702705, 95% CI=(0.8952551674495561, 0.9197443237138295)
Specificity: Mean=0.9395262129625529, 95% CI=(0.9256916458398418, 0.9499064647398154)
Roc_auc: Mean=0.9685777480247992, 95% CI=(0.964485710029028, 0.9741742940236007)
Auprc: Mean=0.9691984571858736, 95% CI=(0.9651149537049218, 0.9737572526108763)


Best Parameters:  {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best Score:  0.9032725289181947
Accuracy: Mean=0.911778846153846, 95% CI=(0.8979717548076923, 0.9196664663461539)
Precision: Mean=0.9237657028521212, 95% CI=(0.9127599616450032, 0.9401544893310957)
Recall: Mean=0.8842612220300854, 95% CI=(0.8644202758908641, 0.9010599211563732)
F1: Mean=0.9035477237691338, 95% CI=(0.8879237372156347, 0.9138079434184725)
Specificity: Mean=0.9359959705818681, 95% CI=(0.9222324805339266, 0.9462493721848502)
Roc_auc: Mean=0.9685768369699211, 95% CI=(0.9568631442155783, 0.9764631731839605)
Auprc: Mean=0.9693851581842349, 95% CI=(0.9587637922673002, 0.9782104340774328)