In [1]:
import pandas as pd
from sklearn.metrics import roc_curve
import numpy as np

In [56]:
number = 42

In [57]:
# Load the provided CSV file
file_path = f'./results/results{number}/metadata_with_predictions_test_results_{number}.csv_updated.csv'
data = pd.read_csv(file_path)

# Display the first few rows and the columns of the dataset to understand its structure
data.head(), data.columns

(                         patient_id  \
 0  0085793617387570009bc6b879caaf29   
 1  00947e679a0e660dfe4f39a0b1c9a2be   
 2  00a55961d0d9a988e15dada3d9ad3688   
 3  00ddc6b627e5600946d20879c2fc56ff   
 4  00ddc6b627e5600946d20879c2fc56ff   
 
                                     cxr_filename            cxr_time_offset  \
 0  0085793617387570009bc6b879caaf29_ae69c033.jpg  3540 days 12:24:52.908070   
 1  00947e679a0e660dfe4f39a0b1c9a2be_89aa45c4.jpg  1386 days 17:58:31.914816   
 2  00a55961d0d9a988e15dada3d9ad3688_bdd766ef.jpg  3025 days 22:15:18.992757   
 3  00ddc6b627e5600946d20879c2fc56ff_0c11726f.jpg  4847 days 12:42:15.500848   
 4  00ddc6b627e5600946d20879c2fc56ff_73843c3f.jpg  5218 days 08:10:15.500848   
 
    cxr_year                                           cxr_path  \
 0      2015  ./cxrs/0085793617387570009bc6b879caaf29_ae69c0...   
 1      2014  ./cxrs/00947e679a0e660dfe4f39a0b1c9a2be_89aa45...   
 2      2015  ./cxrs/00a55961d0d9a988e15dada3d9ad3688_bdd766...   
 3      

In [58]:
data['no_finding_labs'] = abs(1 - data["composite_labs"])
data['no_finding_preds'] = abs(1 - data["composite_preds"])

In [59]:
# Extract relevant data
y_true = data[['slvh_labs', 'dlv_labs', 'composite_labs', 'no_finding_labs']].values
y_preds = data[['slvh_preds', 'dlv_preds', 'composite_preds', 'no_finding_preds']].values
labels = ['slvh', 'dlv', 'composite', 'no_finding']

In [60]:
# Function to calculate optimal thresholds
def calculate_optimal_thresholds(y_true, y_preds, labels):
    thresholds = {}
    for i, label in enumerate(labels):
        # Extract true labels and predicted probabilities for the current label
        true = y_true[:, i]
        preds = y_preds[:, i]
        
        # Compute ROC curve data
        fpr, tpr, thresh = roc_curve(true, preds)
        
        # Calculate Youden's J statistic
        j_scores = tpr - fpr
        optimal_index = np.argmax(j_scores)
        optimal_threshold = thresh[optimal_index]
        
        # Store the optimal threshold
        thresholds[label] = optimal_threshold
    return thresholds

In [61]:
# Calculate thresholds
thresholds = calculate_optimal_thresholds(y_true, y_preds, labels)
thresholds

{'slvh': 0.2625946,
 'dlv': inf,
 'composite': 0.25753528,
 'no_finding': 0.7424798}

In [62]:
# Apply the thresholds to classify the predictions
data['slvh_classified'] = (data['slvh_preds'] >= thresholds['slvh']).astype(int)
data['dlv_classified'] = (data['dlv_preds'] >= thresholds['dlv']).astype(int)
data['composite_classified'] = (data['composite_preds'] >= thresholds['composite']).astype(int)
data['no_finding_classified'] = (data['no_finding_preds'] >= thresholds['no_finding']).astype(int)

In [63]:
# Display the first few rows of the updated dataset with the classified predictions
data[['slvh_preds', 'slvh_classified', 'slvh_labs', 'dlv_preds', 'dlv_classified', 'dlv_labs', 'composite_preds', 'composite_classified', 'composite_labs', 'no_finding_preds', 'no_finding_classified', 'no_finding_labs']].head()

Unnamed: 0,slvh_preds,slvh_classified,slvh_labs,dlv_preds,dlv_classified,dlv_labs,composite_preds,composite_classified,composite_labs,no_finding_preds,no_finding_classified,no_finding_labs
0,0.270441,1,0.0,0.0,0,0.0,0.270441,1,0.0,0.729559,0,1.0
1,0.241343,0,0.0,0.0,0,0.0,0.241343,0,0.0,0.758657,1,1.0
2,0.273055,1,0.0,0.0,0,0.0,0.273055,1,0.0,0.726945,0,1.0
3,0.228331,0,0.0,0.0,0,0.0,0.228331,0,0.0,0.771669,1,1.0
4,0.230147,0,0.0,0.0,0,0.0,0.230147,0,0.0,0.769853,1,1.0


In [64]:
data.to_csv(f"./results/results{number}/metadata_with_predictions_output_labels_{number}.csv")