In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve

In [39]:
# Identifier of the results run; it is interpolated into the
# ./results/results{number}/ paths used to read the input CSV and write the output CSV.
number = 9

In [40]:
# Read the prediction results of the selected run into a DataFrame
file_path = f'./results/results{number}/metadata_with_predictions_test_results_{number}.csv_updated.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows together with the column index to inspect the layout
(data.head(), data.columns)

(                         patient_id  \
 0  00632724788f0fbfc6a0dd0fd34adcc4   
 1  0178076bb36776d6ca72a6cb22fea16d   
 2  0178076bb36776d6ca72a6cb22fea16d   
 3  0178076bb36776d6ca72a6cb22fea16d   
 4  01e639a07cbebd4084ec9b074318f515   
 
                                     cxr_filename            cxr_time_offset  \
 0  00632724788f0fbfc6a0dd0fd34adcc4_acf63c2f.jpg  4963 days 03:14:32.355690   
 1  0178076bb36776d6ca72a6cb22fea16d_6d07adbb.jpg  1956 days 16:55:44.419366   
 2  0178076bb36776d6ca72a6cb22fea16d_862536c0.jpg  1838 days 11:48:44.419366   
 3  0178076bb36776d6ca72a6cb22fea16d_e0b448a0.jpg  1886 days 12:03:44.419366   
 4  01e639a07cbebd4084ec9b074318f515_37b9f596.jpg  3200 days 00:13:00.650946   
 
    cxr_year                                           cxr_path  \
 0      2017  ./cxrs/00632724788f0fbfc6a0dd0fd34adcc4_acf63c...   
 1      2015  ./cxrs/0178076bb36776d6ca72a6cb22fea16d_6d07ad...   
 2      2014  ./cxrs/0178076bb36776d6ca72a6cb22fea16d_862536...   
 3      

In [41]:
# Add "no finding" label/prediction columns, each computed as |1 - composite value|
data['no_finding_labs'] = (1 - data["composite_labs"]).abs()
data['no_finding_preds'] = (1 - data["composite_preds"]).abs()

In [42]:
# Label names, listed in the same order as the columns selected below
labels = ['slvh', 'dlv', 'composite', 'no_finding']

# Pull the *_labs and *_preds columns out of the frame as 2-D NumPy arrays
# (one column per entry of `labels`)
y_true = data[['slvh_labs', 'dlv_labs', 'composite_labs', 'no_finding_labs']].to_numpy()
y_preds = data[['slvh_preds', 'dlv_preds', 'composite_preds', 'no_finding_preds']].to_numpy()

In [43]:
# Function to calculate optimal thresholds
def calculate_optimal_thresholds(y_true, y_preds, labels, degenerate_threshold=float('inf')):
    """Choose one decision threshold per label by maximising Youden's J (TPR - FPR).

    Args:
        y_true: 2-D array-like; column ``i`` holds the ground-truth values for
            ``labels[i]``.
        y_preds: 2-D array-like with the same row count; column ``i`` holds the
            predicted scores for ``labels[i]``.
        labels: Sequence of label names; only the first ``len(labels)`` columns
            of the two arrays are used.
        degenerate_threshold: Value stored for a label whose threshold cannot
            be determined (ground truth has fewer than two classes, or no
            real threshold achieves J > 0). The default ``inf`` means that
            ``score >= threshold`` never classifies a sample as positive.

    Returns:
        dict mapping each label to its threshold as a Python ``float``
        (or to ``degenerate_threshold`` for degenerate labels).

    Raises:
        ValueError: If ``y_true`` and ``y_preds`` have different row counts.

    Warns:
        UserWarning: Once per degenerate label, so a fallback threshold never
            goes unnoticed.
    """
    # Accept any array-like (lists, DataFrames, ...) while keeping ndarray indexing below.
    y_true = np.asarray(y_true)
    y_preds = np.asarray(y_preds)
    if y_true.shape[0] != y_preds.shape[0]:
        raise ValueError(
            f"y_true has {y_true.shape[0]} rows but y_preds has {y_preds.shape[0]}"
        )

    thresholds = {}
    for i, label in enumerate(labels):
        # Extract true labels and predicted scores for the current label
        true = y_true[:, i]
        preds = y_preds[:, i]

        # With fewer than two classes the ROC curve is undefined (TPR or FPR
        # would be NaN), so argmax over J would be meaningless.
        if np.unique(true).size < 2:
            warnings.warn(
                f"Label {label!r}: ground truth has fewer than two classes; "
                f"using degenerate_threshold={degenerate_threshold!r}",
                stacklevel=2,
            )
            thresholds[label] = degenerate_threshold
            continue

        # Compute ROC curve data
        fpr, tpr, thresh = roc_curve(true, preds)

        # roc_curve prepends a sentinel threshold (no sample predicted positive,
        # fpr = tpr = 0). It is not a real score, so leave it out of the search.
        j_scores = tpr[1:] - fpr[1:]
        best = int(np.argmax(j_scores))

        # J <= 0 everywhere means no real threshold beats "predict nothing".
        if not j_scores[best] > 0:
            warnings.warn(
                f"Label {label!r}: no threshold achieves Youden's J > 0; "
                f"using degenerate_threshold={degenerate_threshold!r}",
                stacklevel=2,
            )
            thresholds[label] = degenerate_threshold
            continue

        # Store the optimal threshold as a plain float for a uniform return type
        thresholds[label] = float(thresh[1:][best])
    return thresholds

In [44]:
# Derive one decision threshold per label, then show the mapping as the cell output
thresholds = calculate_optimal_thresholds(y_true=y_true, y_preds=y_preds, labels=labels)
thresholds

{'slvh': 0.1585661,
 'dlv': inf,
 'composite': 0.15856612,
 'no_finding': 0.84150803}

In [45]:
# Binarise each prediction column: 1 where the score reaches its label's threshold, else 0
data['slvh_classified'] = data['slvh_preds'].ge(thresholds['slvh']).astype(int)
data['dlv_classified'] = data['dlv_preds'].ge(thresholds['dlv']).astype(int)
data['composite_classified'] = data['composite_preds'].ge(thresholds['composite']).astype(int)
data['no_finding_classified'] = data['no_finding_preds'].ge(thresholds['no_finding']).astype(int)

In [46]:
# Preview score, thresholded class and ground truth side by side for every label
data[[
    'slvh_preds', 'slvh_classified', 'slvh_labs',
    'dlv_preds', 'dlv_classified', 'dlv_labs',
    'composite_preds', 'composite_classified', 'composite_labs',
    'no_finding_preds', 'no_finding_classified', 'no_finding_labs',
]].head(5)

Unnamed: 0,slvh_preds,slvh_classified,slvh_labs,dlv_preds,dlv_classified,dlv_labs,composite_preds,composite_classified,composite_labs,no_finding_preds,no_finding_classified,no_finding_labs
0,0.140301,0,0,0,0,0,0.140301,0,0,0.859699,1,1
1,0.138689,0,0,0,0,0,0.138689,0,0,0.861311,1,1
2,0.155258,0,0,0,0,0,0.155258,0,0,0.844742,1,1
3,0.145356,0,0,0,0,0,0.145356,0,0,0.854644,1,1
4,0.096522,0,0,0,0,0,0.096522,0,0,0.903478,1,1


In [47]:
# Write the frame, including the columns added above, into this run's results directory
data.to_csv(path_or_buf=f"./results/results{number}/metadata_with_predictions_output_labels_{number}.csv")