## Table 3: Average performance and [95% confidence intervals] for logistic regression model using all features in the different testing sets

## 1.1 trained on combined data [MGH + BIDMC], tested on combined data [MGH + BIDMC]





In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Load training and test datasets
train_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_3948.csv')  # Training dataset
test_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_1664.csv')  # Test dataset

# Define a text feature extractor using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))  # Unigrams, bigrams, and trigrams

# Define class weights to handle class imbalance
class_weight_dict = {0: 1.0, 1: 3.0}  # Higher weight for the positive class

# Define the logistic regression classifier
logistic_classifier = LogisticRegression(solver='liblinear', random_state=2025, class_weight=class_weight_dict)

# Define a data preprocessor
data_preprocessor = ColumnTransformer([
    ('text_vectorizer', tfidf_vectorizer, 'report_text'),  # Apply TF-IDF to the 'report_text' column
    ('scaler', StandardScaler(), ['icd', 'med'])  # Scale numerical features 'icd' and 'med'
], n_jobs=-1)

# Create a pipeline for preprocessing and classification
classification_pipeline = Pipeline([
    ('preprocessor', data_preprocessor),
    ('classifier', logistic_classifier)
])

# Define hyperparameters for grid search
hyperparameter_grid = {
    'classifier__penalty': ['l1'],  # L1 regularization
    'classifier__C': [0.01, 0.1, 1.0, 10.0]  # Regularization strength
}

# Perform grid search with 5-fold cross-validation to find the best hyperparameters
grid_search1 = GridSearchCV(classification_pipeline, param_grid=hyperparameter_grid, cv=5, n_jobs=-1)
grid_search1.fit(train_data[['icd', 'med', 'report_text']], train_data['annot'])

# Display the best parameters and the best cross-validation score
print("Best Parameters: ", grid_search1.best_params_)
print("Best Score: ", grid_search1.best_score_)

# Evaluate the model on the test dataset
X_test = test_data[['icd', 'med', 'report_text']]  # Features
y_test = test_data['annot']  # Target labels
y_pred = grid_search1.predict(X_test)

# Perform bootstrapping on the test set to calculate evaluation metrics
n_iterations = 10  # Number of bootstrap iterations
metrics_values = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'specificity': [],
    'roc_auc': [],
    'auprc': []
}

for _ in range(n_iterations):
    # Sample the test data with replacement
    sample_indices = np.random.choice(len(X_test), len(X_test), replace=True)
    X_sampled = X_test.iloc[sample_indices]
    y_sampled = y_test.iloc[sample_indices]

    # Predict on the sampled data
    y_pred_sampled = grid_search1.predict(X_sampled)
    y_pred_prob_sampled = grid_search1.predict_proba(X_sampled)[:, 1]  # Probability of the positive class

    # Calculate confusion matrix for specificity calculation
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_pred_sampled).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Append calculated metrics to the respective lists
    metrics_values['accuracy'].append(accuracy_score(y_sampled, y_pred_sampled))
    metrics_values['precision'].append(precision_score(y_sampled, y_pred_sampled))
    metrics_values['recall'].append(recall_score(y_sampled, y_pred_sampled))
    metrics_values['f1'].append(f1_score(y_sampled, y_pred_sampled))
    metrics_values['specificity'].append(specificity)
    metrics_values['roc_auc'].append(roc_auc_score(y_sampled, y_pred_prob_sampled))
    metrics_values['auprc'].append(average_precision_score(y_sampled, y_pred_prob_sampled))

# Calculate mean and 95% confidence intervals for each metric
for metric, values in metrics_values.items():
    mean_value = np.mean(values)
    lower_bound = np.percentile(values, 2.5)  # Lower bound of the 95% CI
    upper_bound = np.percentile(values, 97.5)  # Upper bound of the 95% CI

    print(f"{metric.capitalize()}: Mean={mean_value:.4f}, 95% CI=({lower_bound:.4f}, {upper_bound:.4f})")


Best Parameters:  {'classifier__C': 1.0, 'classifier__penalty': 'l1'}
Best Score:  0.8941525083826667
Accuracy: Mean=0.8885, 95% CI=(0.8756, 0.8982)
Precision: Mean=0.8848, 95% CI=(0.8621, 0.9049)
Recall: Mean=0.8754, 95% CI=(0.8594, 0.8909)
F1: Mean=0.8800, 95% CI=(0.8615, 0.8877)
Specificity: Mean=0.9000, 95% CI=(0.8843, 0.9204)
Roc_auc: Mean=0.9476, 95% CI=(0.9374, 0.9552)
Auprc: Mean=0.9472, 95% CI=(0.9343, 0.9542)


Best Parameters:  {'classifier__C': 1.0, 'classifier__penalty': 'l1'}
Best Score:  0.8941525083826667
Accuracy: Mean=0.8980, 95% CI=(0.8890, 0.9058)
Precision: Mean=0.8939, 95% CI=(0.8750, 0.9061)
Recall: Mean=0.8902, 95% CI=(0.8797, 0.9049)
F1: Mean=0.8920, 95% CI=(0.8806, 0.9012)
Specificity: Mean=0.9050, 95% CI=(0.8935, 0.9172)
Roc_auc: Mean=0.9532, 95% CI=(0.9471, 0.9606)
Auprc: Mean=0.9528, 95% CI=(0.9458, 0.9606)

## 1.2 trained on combined data [MGH + BIDMC], tested on MGH data

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Load the training and test datasets
train_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_3948.csv')  # Training dataset
test_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_680.csv')  # Test dataset

# Define the text feature extractor using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))  # Use unigrams, bigrams, and trigrams

# Define class weights to address class imbalance
class_weight_dict = {0: 1.0, 1: 3.0}  # Assign higher weight to the positive class

# Define the logistic regression model
logistic_model = LogisticRegression(solver='liblinear', random_state=2025, class_weight=class_weight_dict)

# Define the data preprocessor
data_preprocessor = ColumnTransformer([
    ('text_features', tfidf_vectorizer, 'report_text'),  # Apply TF-IDF to 'report_text'
    ('scale_numeric', StandardScaler(), ['icd', 'med'])  # Scale numerical features 'icd' and 'med'
], n_jobs=-1)

# Create a pipeline for data preprocessing and classification
classification_pipeline = Pipeline([
    ('preprocessor', data_preprocessor),
    ('classifier', logistic_model)
])

# Define hyperparameters for grid search
param_grid = {
    'classifier__penalty': ['l1'],  # Use L1 regularization
    'classifier__C': [0.01, 0.1, 1.0, 10.0]  # Regularization strength
}

# Perform grid search with 5-fold cross-validation
grid_search2 = GridSearchCV(classification_pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search2.fit(train_data[['icd', 'med', 'report_text']], train_data['annot'])

# Display the best parameters and corresponding score
print("Best Parameters: ", grid_search2.best_params_)
print("Best Score: ", grid_search2.best_score_)

# Evaluate the model on the test dataset
X_test = test_data[['icd', 'med', 'report_text']]  # Features
y_test = test_data['annot']  # Target labels
y_pred = grid_search2.predict(X_test)

# Perform bootstrapping to compute evaluation metrics
n_iterations = 10  # Number of bootstrap iterations
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'specificity': [],
    'roc_auc': [],
    'auprc': []
}

for _ in range(n_iterations):
    # Sample test data with replacement
    sampled_indices = np.random.choice(len(X_test), len(X_test), replace=True)
    X_sampled = X_test.iloc[sampled_indices]
    y_sampled = y_test.iloc[sampled_indices]

    # Predict on the sampled data
    y_sampled_pred = grid_search2.predict(X_sampled)
    y_sampled_prob = grid_search2.predict_proba(X_sampled)[:, 1]  # Probability of the positive class

    # Calculate confusion matrix and specificity
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_sampled_pred).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Store metrics
    metrics['accuracy'].append(accuracy_score(y_sampled, y_sampled_pred))
    metrics['precision'].append(precision_score(y_sampled, y_sampled_pred))
    metrics['recall'].append(recall_score(y_sampled, y_sampled_pred))
    metrics['f1'].append(f1_score(y_sampled, y_sampled_pred))
    metrics['specificity'].append(specificity)
    metrics['roc_auc'].append(roc_auc_score(y_sampled, y_sampled_prob))
    metrics['auprc'].append(average_precision_score(y_sampled, y_sampled_prob))

# Calculate mean and 95% confidence intervals for each metric
for metric_name, metric_values in metrics.items():
    mean_value = np.mean(metric_values)
    lower_bound = np.percentile(metric_values, 2.5)  # Lower bound of 95% CI
    upper_bound = np.percentile(metric_values, 97.5)  # Upper bound of 95% CI

    print(f"{metric_name.capitalize()}: Mean={mean_value:.4f}, 95% CI=({lower_bound:.4f}, {upper_bound:.4f})")


Best Parameters:  {'classifier__C': 1.0, 'classifier__penalty': 'l1'}
Best Score:  0.8941525083826667
Accuracy: Mean=0.9368, 95% CI=(0.9147, 0.9497)
Precision: Mean=0.9624, 95% CI=(0.9489, 0.9788)
Recall: Mean=0.9118, 95% CI=(0.8797, 0.9452)
F1: Mean=0.9363, 95% CI=(0.9140, 0.9529)
Specificity: Mean=0.9627, 95% CI=(0.9485, 0.9759)
Roc_auc: Mean=0.9824, 95% CI=(0.9731, 0.9885)
Auprc: Mean=0.9850, 95% CI=(0.9781, 0.9911)


Best Parameters:  {'classifier__C': 1.0, 'classifier__penalty': 'l1'}
Best Score:  0.8941525083826667
Accuracy: Mean=0.9321, 95% CI=(0.9191, 0.9461)
Precision: Mean=0.9607, 95% CI=(0.9468, 0.9737)
Recall: Mean=0.9053, 95% CI=(0.8741, 0.9339)
F1: Mean=0.9320, 95% CI=(0.9160, 0.9455)
Specificity: Mean=0.9606, 95% CI=(0.9471, 0.9742)
Roc_auc: Mean=0.9803, 95% CI=(0.9705, 0.9864)
Auprc: Mean=0.9843, 95% CI=(0.9771, 0.9892)

## 1.3 trained on combined data [MGH + BIDMC], tested on BIDMC data




In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Load the training and test datasets
train_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_bidmc_3948.csv')  # Training dataset
test_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/bidmc_984.csv')  # Test dataset

# Define the text feature extractor using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))  # Use unigrams, bigrams, and trigrams

# Define class weights to address class imbalance
class_weight_dict = {0: 1.0, 1: 3.0}  # Assign higher weight to the positive class

# Define the logistic regression classifier
logistic_classifier = LogisticRegression(solver='liblinear', random_state=2025, class_weight=class_weight_dict)

# Define the data preprocessor
data_preprocessor = ColumnTransformer([
    ('text_features', tfidf_vectorizer, 'report_text'),  # Apply TF-IDF to 'report_text'
    ('scale_numeric', StandardScaler(), ['icd', 'med'])  # Scale numerical features 'icd' and 'med'
], n_jobs=-1)

# Create a pipeline for preprocessing and classification
classification_pipeline = Pipeline([
    ('preprocessor', data_preprocessor),
    ('classifier', logistic_classifier)
])

# Define hyperparameters for grid search
hyperparameter_grid = {
    'classifier__penalty': ['l1'],  # Use L1 regularization
    'classifier__C': [0.01, 0.1, 1.0, 10.0]  # Regularization strength
}

# Perform grid search with 5-fold cross-validation
grid_search3 = GridSearchCV(classification_pipeline, param_grid=hyperparameter_grid, cv=5, n_jobs=-1)
grid_search3.fit(train_data[['icd', 'med', 'report_text']], train_data['annot'])

# Display the best hyperparameters and corresponding score
print("Best Parameters: ", grid_search3.best_params_)
print("Best Score: ", grid_search3.best_score_)

# Evaluate the model on the test dataset
X_test = test_data[['icd', 'med', 'report_text']]  # Features
y_test = test_data['annot']  # Target labels
y_pred = grid_search3.predict(X_test)

# Perform bootstrapping to compute evaluation metrics
n_iterations = 10  # Number of bootstrap iterations
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'specificity': [],
    'roc_auc': [],
    'auprc': []
}

for _ in range(n_iterations):
    # Sample test data with replacement
    sampled_indices = np.random.choice(len(X_test), len(X_test), replace=True)
    X_sampled = X_test.iloc[sampled_indices]
    y_sampled = y_test.iloc[sampled_indices]

    # Predict on the sampled data
    y_sampled_pred = grid_search3.predict(X_sampled)
    y_sampled_prob = grid_search3.predict_proba(X_sampled)[:, 1]  # Probability of the positive class

    # Calculate confusion matrix and specificity
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_sampled_pred).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Store metrics
    metrics['accuracy'].append(accuracy_score(y_sampled, y_sampled_pred))
    metrics['precision'].append(precision_score(y_sampled, y_sampled_pred))
    metrics['recall'].append(recall_score(y_sampled, y_sampled_pred))
    metrics['f1'].append(f1_score(y_sampled, y_sampled_pred))
    metrics['specificity'].append(specificity)
    metrics['roc_auc'].append(roc_auc_score(y_sampled, y_sampled_prob))
    metrics['auprc'].append(average_precision_score(y_sampled, y_sampled_prob))

# Calculate mean and 95% confidence intervals for each metric
for metric_name, metric_values in metrics.items():
    mean_value = np.mean(metric_values)
    lower_bound = np.percentile(metric_values, 2.5)  # Lower bound of 95% CI
    upper_bound = np.percentile(metric_values, 97.5)  # Upper bound of 95% CI

    print(f"{metric_name.capitalize()}: Mean={mean_value:.4f}, 95% CI=({lower_bound:.4f}, {upper_bound:.4f})")


Best Parameters:  {'classifier__C': 1.0, 'classifier__penalty': 'l1'}
Best Score:  0.8941525083826667
Accuracy: Mean=0.8572, 95% CI=(0.8419, 0.8784)
Precision: Mean=0.8206, 95% CI=(0.8045, 0.8586)
Recall: Mean=0.8581, 95% CI=(0.8429, 0.8804)
F1: Mean=0.8388, 95% CI=(0.8244, 0.8597)
Specificity: Mean=0.8564, 95% CI=(0.8394, 0.8891)
Roc_auc: Mean=0.9218, 95% CI=(0.9099, 0.9324)
Auprc: Mean=0.9082, 95% CI=(0.8994, 0.9218)


Best Parameters:  {'classifier__C': 1.0, 'classifier__penalty': 'l1'}
Best Score:  0.8941525083826667
Accuracy: Mean=0.8635, 95% CI=(0.8490, 0.8801)
Precision: Mean=0.8248, 95% CI=(0.7973, 0.8512)
Recall: Mean=0.8685, 95% CI=(0.8471, 0.8847)
F1: Mean=0.8459, 95% CI=(0.8332, 0.8646)
Specificity: Mean=0.8599, 95% CI=(0.8349, 0.8836)
Roc_auc: Mean=0.9271, 95% CI=(0.9166, 0.9418)
Auprc: Mean=0.9108, 95% CI=(0.8911, 0.9313)

## 1.4 trained on BIDMC data, tested on MGH data

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the training and test datasets
train_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/bidmc_3280.csv')  # Training dataset
test_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_2332.csv')  # Test dataset

# Define the text feature extractor using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))  # Use unigrams, bigrams, and trigrams

# Define class weights to address class imbalance
class_weight_dict = {0: 1.0, 1: 3.0}  # Assign higher weight to the positive class

# Define the logistic regression classifier
logistic_classifier = LogisticRegression(solver='liblinear', random_state=2025, class_weight=class_weight_dict)

# Define the data preprocessor
data_preprocessor = ColumnTransformer([
    ('text_features', tfidf_vectorizer, 'report_text'),  # Apply TF-IDF to 'report_text'
    ('scale_numeric', StandardScaler(), ['icd', 'med'])  # Scale numerical features 'icd' and 'med'
], n_jobs=-1)

# Create a pipeline for preprocessing and classification
classification_pipeline = Pipeline([
    ('preprocessor', data_preprocessor),
    ('classifier', logistic_classifier)
])

# Define hyperparameters for grid search
hyperparameter_grid = {
    'classifier__penalty': ['l1'],  # Use L1 regularization
    'classifier__C': [0.01, 0.1, 1.0, 10.0]  # Regularization strength
}

# Perform grid search with 5-fold cross-validation
grid_search4 = GridSearchCV(classification_pipeline, param_grid=hyperparameter_grid, cv=5, n_jobs=-1)
grid_search4.fit(train_data[['icd', 'med', 'report_text']], train_data['annot'])

# Display the best hyperparameters and corresponding score
print("Best Parameters: ", grid_search4.best_params_)
print("Best Score: ", grid_search4.best_score_)

# Evaluate the model on the test dataset
X_test = test_data[['icd', 'med', 'report_text']]  # Features from test data
y_test = test_data['annot']  # Target labels from test data
y_pred = grid_search4.predict(X_test)

# Perform bootstrapping to compute evaluation metrics
n_iterations = 10  # Number of bootstrap iterations
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'specificity': [],
    'roc_auc': [],
    'auprc': []
}

for _ in range(n_iterations):
    # Sample test data with replacement
    sampled_indices = np.random.choice(len(X_test), len(X_test), replace=True)
    X_sampled = X_test.iloc[sampled_indices]
    y_sampled = y_test.iloc[sampled_indices]

    # Predict on the sampled data
    y_sampled_pred = grid_search4.predict(X_sampled)
    y_sampled_prob = grid_search4.predict_proba(X_sampled)[:, 1]  # Probability of the positive class

    # Calculate confusion matrix and specificity
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_sampled_pred).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Store metrics
    metrics['accuracy'].append(accuracy_score(y_sampled, y_sampled_pred))
    metrics['precision'].append(precision_score(y_sampled, y_sampled_pred))
    metrics['recall'].append(recall_score(y_sampled, y_sampled_pred))
    metrics['f1'].append(f1_score(y_sampled, y_sampled_pred))
    metrics['specificity'].append(specificity)
    metrics['roc_auc'].append(roc_auc_score(y_sampled, y_sampled_prob))
    metrics['auprc'].append(average_precision_score(y_sampled, y_sampled_prob))

# Calculate mean and 95% confidence intervals for each metric
for metric_name, metric_values in metrics.items():
    mean_value = np.mean(metric_values)
    lower_bound = np.percentile(metric_values, 2.5)  # Lower bound of 95% CI
    upper_bound = np.percentile(metric_values, 97.5)  # Upper bound of 95% CI

    print(f"{metric_name.capitalize()}: Mean={mean_value:.4f}, 95% CI=({lower_bound:.4f}, {upper_bound:.4f})")


Best Parameters:  {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best Score:  0.9137195121951219
Accuracy: Mean=0.9431, 95% CI=(0.9350, 0.9523)
Precision: Mean=0.9153, 95% CI=(0.9057, 0.9283)
Recall: Mean=0.8937, 95% CI=(0.8772, 0.9115)
F1: Mean=0.9043, 95% CI=(0.8913, 0.9180)
Specificity: Mean=0.9643, 95% CI=(0.9600, 0.9717)
Roc_auc: Mean=0.9767, 95% CI=(0.9696, 0.9848)
Auprc: Mean=0.9584, 95% CI=(0.9428, 0.9711)


Best Parameters:  {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best Score:  0.9137195121951219
Accuracy: Mean=0.9448, 95% CI=(0.9397, 0.9500)
Precision: Mean=0.9092, 95% CI=(0.8964, 0.9171)
Recall: Mean=0.9032, 95% CI=(0.8882, 0.9196)
F1: Mean=0.9062, 95% CI=(0.8960, 0.9140)
Specificity: Mean=0.9622, 95% CI=(0.9574, 0.9670)
Roc_auc: Mean=0.9774, 95% CI=(0.9735, 0.9806)
Auprc: Mean=0.9581, 95% CI=(0.9495, 0.9628)

## 1.5 trained on MGH data, tested on BIDMC data

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the training and test datasets
train_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_2332.csv')  # Training dataset
test_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/bidmc_3280.csv')  # Test dataset

# Define the text feature extractor using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))  # Extract unigrams, bigrams, and trigrams

# Define class weights to address class imbalance
class_weight_dict = {0: 1.0, 1: 3.0}  # Assign higher weight to the positive class

# Define the logistic regression classifier
logistic_classifier = LogisticRegression(solver='liblinear', random_state=2025, class_weight=class_weight_dict)

# Define the data preprocessor
data_preprocessor = ColumnTransformer([
    ('text_features', tfidf_vectorizer, 'report_text'),  # Apply TF-IDF to 'report_text'
    ('scale_numeric', StandardScaler(), ['icd', 'med'])  # Scale numerical features 'icd' and 'med'
], n_jobs=-1)

# Create a pipeline for preprocessing and classification
classification_pipeline = Pipeline([
    ('preprocessor', data_preprocessor),
    ('classifier', logistic_classifier)
])

# Define hyperparameters for grid search
hyperparameter_grid = {
    'classifier__penalty': ['l1'],  # Use L1 regularization
    'classifier__C': [0.01, 0.1, 1.0, 10.0]  # Regularization strength
}

# Perform grid search with 5-fold cross-validation
grid_search5 = GridSearchCV(classification_pipeline, param_grid=hyperparameter_grid, cv=5, n_jobs=-1)
grid_search5.fit(train_data[['icd', 'med', 'report_text']], train_data['annot'])

# Display the best parameters and corresponding score
print("Best Parameters: ", grid_search5.best_params_)
print("Best Score: ", grid_search5.best_score_)

# Evaluate the model on the test dataset
X_test = test_data[['icd', 'med', 'report_text']]  # Features
y_test = test_data['annot']  # Target labels
y_pred = grid_search5.predict(X_test)

# Perform bootstrapping to compute evaluation metrics
n_iterations = 10  # Number of bootstrap iterations
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'specificity': [],
    'roc_auc': []
}

for _ in range(n_iterations):
    # Sample test data with replacement
    sampled_indices = np.random.choice(len(X_test), len(X_test), replace=True)
    X_sampled = X_test.iloc[sampled_indices]
    y_sampled = y_test.iloc[sampled_indices]

    # Predict on the sampled data
    y_sampled_pred = grid_search5.predict(X_sampled)
    y_sampled_prob = grid_search5.predict_proba(X_sampled)[:, 1]  # Probability of the positive class

    # Calculate confusion matrix and specificity
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_sampled_pred).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Store metrics
    metrics['accuracy'].append(accuracy_score(y_sampled, y_sampled_pred))
    metrics['precision'].append(precision_score(y_sampled, y_sampled_pred))
    metrics['recall'].append(recall_score(y_sampled, y_sampled_pred))
    metrics['f1'].append(f1_score(y_sampled, y_sampled_pred))
    metrics['specificity'].append(specificity)
    metrics['roc_auc'].append(roc_auc_score(y_sampled, y_sampled_prob))

# Calculate mean and confidence interval for each metric
for metric_name, metric_values in metrics.items():
    mean_value = np.mean(metric_values)
    lower_bound = np.percentile(metric_values, 2.5)  # Lower bound of 95% CI
    upper_bound = np.percentile(metric_values, 97.5)  # Upper bound of 95% CI

    print(f"{metric_name.capitalize()}: Mean={mean_value:.4f}, 95% CI=({lower_bound:.4f}, {upper_bound:.4f})")


Best Parameters:  {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best Score:  0.9502495152144543
Accuracy: Mean=0.9104, 95% CI=(0.9035, 0.9180)
Precision: Mean=0.7975, 95% CI=(0.7793, 0.8144)
Recall: Mean=0.8788, 95% CI=(0.8671, 0.8910)
F1: Mean=0.8361, 95% CI=(0.8221, 0.8499)
Specificity: Mean=0.9215, 95% CI=(0.9151, 0.9282)
Roc_auc: Mean=0.9457, 95% CI=(0.9384, 0.9519)


Best Parameters:  {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best Score:  0.9502495152144543
Accuracy: Mean=0.9108, 95% CI=(0.9043, 0.9231)
Precision: Mean=0.8019, 95% CI=(0.7760, 0.8262)
Recall: Mean=0.8757, 95% CI=(0.8528, 0.8965)
F1: Mean=0.8371, 95% CI=(0.8239, 0.8593)
Specificity: Mean=0.9233, 95% CI=(0.9161, 0.9334)
Roc_auc: Mean=0.9449, 95% CI=(0.9347, 0.9550)

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the training and test datasets
train_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_1652.csv')  # Training dataset
test_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/mgh_680.csv')  # Test dataset

# Define the text feature extractor using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))  # Extract unigrams, bigrams, and trigrams

# Define class weights to address class imbalance
class_weight_dict = {0: 1.0, 1: 3.0}  # Assign higher weight to the positive class

# Define the logistic regression classifier
logistic_classifier = LogisticRegression(solver='liblinear', random_state=2025, class_weight=class_weight_dict)

# Define the data preprocessor
data_preprocessor = ColumnTransformer([
    ('text_features', tfidf_vectorizer, 'report_text'),  # Apply TF-IDF to 'report_text'
    ('scale_numeric', StandardScaler(), ['icd', 'med'])  # Scale numerical features 'icd' and 'med'
], n_jobs=-1)

# Create a pipeline for preprocessing and classification
classification_pipeline = Pipeline([
    ('preprocessor', data_preprocessor),
    ('classifier', logistic_classifier)
])

# Define hyperparameters for grid search
hyperparameter_grid = {
    'classifier__penalty': ['l1'],  # Use L1 regularization
    'classifier__C': [0.01, 0.1, 1.0, 10.0]  # Regularization strength
}

# Perform grid search with 5-fold cross-validation
grid_search6 = GridSearchCV(classification_pipeline, param_grid=hyperparameter_grid, cv=5, n_jobs=-1)
grid_search6.fit(train_data[['icd', 'med', 'report_text']], train_data['annot'])

# Display the best parameters and corresponding score
print("Best Parameters: ", grid_search6.best_params_)
print("Best Score: ", grid_search6.best_score_)

# Evaluate the model on the test dataset
X_test = test_data[['icd', 'med', 'report_text']]  # Features
y_test = test_data['annot']  # Target labels
y_pred = grid_search6.predict(X_test)

# Perform bootstrapping to compute evaluation metrics
n_iterations = 10  # Number of bootstrap iterations
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'specificity': [],
    'roc_auc': []
}

for _ in range(n_iterations):
    # Sample test data with replacement
    sampled_indices = np.random.choice(len(X_test), len(X_test), replace=True)
    X_sampled = X_test.iloc[sampled_indices]
    y_sampled = y_test.iloc[sampled_indices]

    # Predict on the sampled data
    y_sampled_pred = grid_search6.predict(X_sampled)
    y_sampled_prob = grid_search6.predict_proba(X_sampled)[:, 1]  # Probability of the positive class

    # Calculate confusion matrix and specificity
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_sampled_pred).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Store metrics
    metrics['accuracy'].append(accuracy_score(y_sampled, y_sampled_pred))
    metrics['precision'].append(precision_score(y_sampled, y_sampled_pred))
    metrics['recall'].append(recall_score(y_sampled, y_sampled_pred))
    metrics['f1'].append(f1_score(y_sampled, y_sampled_pred))
    metrics['specificity'].append(specificity)
    metrics['roc_auc'].append(roc_auc_score(y_sampled, y_sampled_prob))

# Calculate mean and confidence interval for each metric
for metric_name, metric_values in metrics.items():
    mean_value = np.mean(metric_values)
    lower_bound = np.percentile(metric_values, 2.5)  # Lower bound of 95% CI
    upper_bound = np.percentile(metric_values, 97.5)  # Upper bound of 95% CI

    print(f"{metric_name.capitalize()}: Mean={mean_value:.4f}, 95% CI=({lower_bound:.4f}, {upper_bound:.4f})")


Best Parameters:  {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best Score:  0.9582312551496841
Accuracy: Mean=0.9465, 95% CI=(0.9397, 0.9541)
Precision: Mean=0.9812, 95% CI=(0.9708, 0.9908)
Recall: Mean=0.9148, 95% CI=(0.9010, 0.9276)
F1: Mean=0.9468, 95% CI=(0.9410, 0.9541)
Specificity: Mean=0.9810, 95% CI=(0.9699, 0.9908)
Roc_auc: Mean=0.9812, 95% CI=(0.9758, 0.9861)


Best Parameters:  {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best Score:  0.9582312551496841
Accuracy: Mean=0.9513, 95% CI=(0.9471, 0.9570)
Precision: Mean=0.9848, 95% CI=(0.9727, 0.9957)
Recall: Mean=0.9187, 95% CI=(0.9020, 0.9282)
F1: Mean=0.9505, 95% CI=(0.9433, 0.9567)
Specificity: Mean=0.9854, 95% CI=(0.9737, 0.9956)
Roc_auc: Mean=0.9831, 95% CI=(0.9785, 0.9890)

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the training and test datasets
train_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/bidmc_2296.csv')  # Training dataset
test_data = pd.read_csv('/home/niels/cdac Dropbox/Niels Turley/codes/data/bidmc_984.csv')  # Test dataset

# Define the text feature extractor using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))  # Extract unigrams, bigrams, and trigrams

# Define class weights to address class imbalance
class_weight_dict = {0: 1.0, 1: 3.0}  # Assign higher weight to the positive class

# Define the logistic regression classifier
logistic_classifier = LogisticRegression(solver='liblinear', random_state=2025, class_weight=class_weight_dict)

# Define the data preprocessor
data_preprocessor = ColumnTransformer([
    ('text_features', tfidf_vectorizer, 'report_text'),  # Apply TF-IDF to 'report_text'
    ('scale_numeric', StandardScaler(), ['icd', 'med'])  # Scale numerical features 'icd' and 'med'
], n_jobs=-1)

# Create a pipeline for preprocessing and classification
classification_pipeline = Pipeline([
    ('preprocessor', data_preprocessor),
    ('classifier', logistic_classifier)
])

# Define hyperparameters for grid search
hyperparameter_grid = {
    'classifier__penalty': ['l1'],  # Use L1 regularization
    'classifier__C': [0.01, 0.1, 1.0, 10.0]  # Regularization strength
}

# Perform grid search with 5-fold cross-validation
grid_search7 = GridSearchCV(classification_pipeline, param_grid=hyperparameter_grid, cv=5, n_jobs=-1)
grid_search7.fit(train_data[['icd', 'med', 'report_text']], train_data['annot'])

# Display the best parameters and corresponding score
print("Best Parameters: ", grid_search7.best_params_)
print("Best Score: ", grid_search7.best_score_)

# Evaluate the model on the test dataset
X_test = test_data[['icd', 'med', 'report_text']]  # Features
y_test = test_data['annot']  # Target labels
y_pred = grid_search7.predict(X_test)

# Perform bootstrapping to compute evaluation metrics
n_iterations = 10  # Number of bootstrap iterations
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'specificity': [],
    'roc_auc': []
}

for _ in range(n_iterations):
    # Sample test data with replacement
    sampled_indices = np.random.choice(len(X_test), len(X_test), replace=True)
    X_sampled = X_test.iloc[sampled_indices]
    y_sampled = y_test.iloc[sampled_indices]

    # Predict on the sampled data
    y_sampled_pred = grid_search7.predict(X_sampled)
    y_sampled_prob = grid_search7.predict_proba(X_sampled)[:, 1]  # Probability of the positive class

    # Calculate confusion matrix and specificity
    tn, fp, fn, tp = confusion_matrix(y_sampled, y_sampled_pred).ravel()
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

    # Store metrics
    metrics['accuracy'].append(accuracy_score(y_sampled, y_sampled_pred))
    metrics['precision'].append(precision_score(y_sampled, y_sampled_pred))
    metrics['recall'].append(recall_score(y_sampled, y_sampled_pred))
    metrics['f1'].append(f1_score(y_sampled, y_sampled_pred))
    metrics['specificity'].append(specificity)
    metrics['roc_auc'].append(roc_auc_score(y_sampled, y_sampled_prob))

# Calculate mean and confidence interval for each metric
for metric_name, metric_values in metrics.items():
    mean_value = np.mean(metric_values)
    lower_bound = np.percentile(metric_values, 2.5)  # Lower bound of 95% CI
    upper_bound = np.percentile(metric_values, 97.5)  # Upper bound of 95% CI

    print(f"{metric_name.capitalize()}: Mean={mean_value:.4f}, 95% CI=({lower_bound:.4f}, {upper_bound:.4f})")


Best Parameters:  {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best Score:  0.9011149000663068
Accuracy: Mean=0.8796, 95% CI=(0.8597, 0.8997)
Precision: Mean=0.8337, 95% CI=(0.8104, 0.8622)
Recall: Mean=0.9062, 95% CI=(0.8771, 0.9377)
F1: Mean=0.8683, 95% CI=(0.8442, 0.8917)
Specificity: Mean=0.8591, 95% CI=(0.8428, 0.8809)
Roc_auc: Mean=0.9472, 95% CI=(0.9324, 0.9600)


Best Parameters:  {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best Score:  0.9011149000663068
Accuracy: Mean=0.8815, 95% CI=(0.8672, 0.8963)
Precision: Mean=0.8308, 95% CI=(0.8048, 0.8647)
Recall: Mean=0.9144, 95% CI=(0.9087, 0.9260)
F1: Mean=0.8705, 95% CI=(0.8557, 0.8873)
Specificity: Mean=0.8563, 95% CI=(0.8322, 0.8842)
Roc_auc: Mean=0.9510, 95% CI=(0.9398, 0.9592)