In [None]:
import os, sys
import warnings

# Silence every warning for the rest of the session, unless warning options
# were supplied explicitly when the interpreter was started (e.g. via -W).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_curve, auc


In [None]:
# Suffix (including its leading space) inserted before '.npy' in the file names
# of the BW2V and RV_BW2V arrays loaded below.
cutoff = ' (cutoff 500)'

In [None]:
def _load_true_pred_pair(prefix, suffix=''):
    """Load the saved (true labels, predictions) arrays for one model.

    File names follow '<prefix> test true array<suffix>.npy' and
    '<prefix> test prediction array<suffix>.npy'. The true array is read first.
    """
    true_array = np.load(f'{prefix} test true array{suffix}.npy')
    pred_array = np.load(f'{prefix} test prediction array{suffix}.npy')
    return true_array, pred_array


# Model 1: RV arrays (file names carry no cutoff suffix).
concat_trues1, concat_preds1 = _load_true_pred_pair('RV')

# Model 2: BW2V arrays.
concat_trues2, concat_preds2 = _load_true_pred_pair('BW2V', cutoff)

# Model 3: combined RV_BW2V arrays.
concat_trues3, concat_preds3 = _load_true_pred_pair('RV_BW2V', cutoff)

In [None]:
# Number of equally sized rows each concatenated array is split into.
# NOTE(review): presumably these rows are bootstrap resamples of the test set — confirm.
N_RESAMPLES = 1000


def summarize_roc(concat_trues, concat_preds, n_resamples=N_RESAMPLES):
    """Compute the pooled ROC curve and the per-row AUC distribution for one model.

    Parameters
    ----------
    concat_trues, concat_preds : np.ndarray
        Flat arrays of true labels and predicted scores whose length is a
        multiple of ``n_resamples``.
    n_resamples : int
        Number of rows the flat arrays are reshaped into.

    Returns
    -------
    tuple
        ``(fpr, tpr, thresholds, trues, preds, aucs, auc_low, auc_median, auc_high)``
        where ``fpr``/``tpr``/``thresholds`` describe the ROC curve over the
        whole concatenated arrays, ``trues``/``preds`` are the
        ``(n_resamples, -1)`` reshaped inputs, ``aucs`` is the list of per-row
        AUCs, and the last three values are its 2.5th, 50th and 97.5th
        percentiles.
    """
    # ROC curve over all rows pooled together (this is what gets plotted).
    fpr, tpr, thresholds = roc_curve(concat_trues, concat_preds)

    trues = concat_trues.reshape(n_resamples, -1)
    preds = concat_preds.reshape(n_resamples, -1)

    aucs = []
    for row_true, row_pred in zip(trues, preds):
        # Per-row thresholds are discarded on purpose so they can never
        # overwrite the thresholds that belong to the pooled curve.
        row_fpr, row_tpr, _ = roc_curve(row_true, row_pred)
        aucs.append(auc(row_fpr, row_tpr))

    auc_low, auc_median, auc_high = np.percentile(aucs, [2.5, 50, 97.5])
    return fpr, tpr, thresholds, trues, preds, aucs, auc_low, auc_median, auc_high


# Model 1 (RV arrays)
(fpr1, tpr1, thresholds1, trues1, preds1,
 aucs1, auc1l, auc1m, auc1h) = summarize_roc(concat_trues1, concat_preds1)

# Model 2 (BW2V arrays)
(fpr2, tpr2, thresholds2, trues2, preds2,
 aucs2, auc2l, auc2m, auc2h) = summarize_roc(concat_trues2, concat_preds2)

# Model 3 (RV_BW2V arrays)
(fpr3, tpr3, thresholds3, trues3, preds3,
 aucs3, auc3l, auc3m, auc3h) = summarize_roc(concat_trues3, concat_preds3)

In [None]:
# Report, for each model, the median per-row AUC followed by its
# 2.5th-97.5th percentile range.
auc_report_lines = [
    f'Structured Data AUC = {auc1m:.2f} ({auc1l:.2f}-{auc1h:.2f})',
    f'Text Data AUC = {auc2m:.2f} ({auc2l:.2f}-{auc2h:.2f})',
    f'Full Model AUC = {auc3m:.2f} ({auc3l:.2f}-{auc3h:.2f})',
]
print('\n'.join(auc_report_lines))

In [None]:
# Apply the Seaborn whitegrid theme before the figure is created.
sns.set(style='whitegrid')

fig, ax = plt.subplots(figsize=(8, 6))

# Three visually distinct colours, one per model.
palette = sns.color_palette("husl", 3)

# Each curve is the ROC over the whole concatenated arrays; the AUC quoted in
# the legend is the median of the per-row AUCs computed earlier.
ax.plot(fpr1, tpr1, color=palette[0], lw=2, label=f'Structured Data Only AUC = {auc1m:.2f}')
ax.plot(fpr2, tpr2, color=palette[1], lw=2, label=f'Text Data Only AUC = {auc2m:.2f}')
ax.plot(fpr3, tpr3, color=palette[2], lw=2, label=f'Full Model AUC = {auc3m:.2f}')

# Chance-level diagonal for reference.
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')

# Axis ranges; a little head-room above TPR = 1.
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')

plt.show()





In [None]:
preds = preds3
trues = trues3

# Order in which the metrics are stacked along axis 1 of `out`.
METRIC_NAMES = ('sensitivity', 'specificity', 'precision', 'recall', 'f1')

# Percentiles of the per-threshold sensitivity values at which an operating
# point is picked (axis 2 of `out`, in this order).
SENSITIVITY_PERCENTILES = (100, 90, 75, 50, 25, 10, 0)


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else np.nan


def operating_point_metrics(true, pred, percentiles=SENSITIVITY_PERCENTILES):
    """Compute classification metrics at selected ROC thresholds for one row.

    For every threshold returned by ``roc_curve`` (except the first, which
    scikit-learn sets above every score so nothing is predicted positive),
    scores ``>= threshold`` are treated as positive and sensitivity,
    specificity, precision, recall and F1 are computed against labels that
    equal 1 (positive) or 0 (negative).

    For each requested percentile, the first threshold whose sensitivity
    reaches that percentile of all per-threshold sensitivities is selected.

    Parameters
    ----------
    true : array-like
        True labels for one row.
    pred : array-like
        Predicted scores for the same row.
    percentiles : sequence of float
        Percentiles of the sensitivity values used to pick operating points.

    Returns
    -------
    np.ndarray
        Float array of shape ``(len(METRIC_NAMES), len(percentiles))``.
        Undefined metrics (zero denominator) are NaN.
    """
    true = np.asarray(true)
    pred = np.asarray(pred)
    _, _, thresholds = roc_curve(true, pred)

    is_positive = true == 1
    is_negative = true == 0

    rows = []
    for threshold in thresholds[1:]:
        flagged = pred >= threshold

        # Confusion-matrix counts, computed with numpy instead of Python loops.
        tp = int(np.count_nonzero(flagged & is_positive))
        fp = int(np.count_nonzero(flagged & is_negative))
        tn = int(np.count_nonzero(~flagged & is_negative))
        fn = int(np.count_nonzero(~flagged & is_positive))

        sensitivity = _ratio(tp, tp + fn)
        specificity = _ratio(tn, tn + fp)
        precision = _ratio(tp, tp + fp)
        recall = sensitivity
        if precision + recall == 0:
            # F1 is undefined here; use NaN explicitly so the array stays
            # float instead of relying on a None -> NaN cast.
            f1 = np.nan
        else:
            f1 = 2 * (precision * recall) / (precision + recall)

        rows.append((sensitivity, specificity, precision, recall, f1))

    metrics = np.array(rows, dtype=float).T  # (n_metrics, n_thresholds)
    sensitivities = metrics[0]
    # argmax on a boolean array returns the index of the first True.
    selected = [
        (sensitivities >= np.percentile(sensitivities, q)).argmax()
        for q in percentiles
    ]
    return metrics[:, selected]


out = np.zeros([preds.shape[0], len(METRIC_NAMES), len(SENSITIVITY_PERCENTILES)])
for i in range(preds.shape[0]):
    out[i] = operating_point_metrics(trues[i], preds[i])
out.shape



In [None]:
# Split `out` (rows x 5 metrics x 7 operating points) into one 2-D array per
# metric, in the order the metrics were stacked along axis 1.
(sensitivities,
 specificities,
 precisions,
 recalls,
 f1_scores) = np.swapaxes(out, 0, 1)

In [None]:
def confidence_intervals(variable):
    """Print the mean and 2.5th-97.5th percentile range of every column.

    Parameters
    ----------
    variable : array-like
        One- or two-dimensional numeric data with any number of columns.
        Each column is summarised separately. Rows containing a missing
        value in *any* column are dropped before summarising, so a NaN in
        one column removes that row from every column.

    Returns
    -------
    None
        One line per column is printed as ``mean (low-high)``, each value
        rounded to two decimal places.
    """
    # dropna() returns a new frame, so the caller's data is left untouched.
    complete_rows = pd.DataFrame(variable).dropna().to_numpy()

    means = complete_rows.mean(axis=0)
    # Both percentiles for all columns in one call; the result has shape
    # (2, n_columns).
    low_cis, high_cis = np.percentile(complete_rows, [2.5, 97.5], axis=0)

    # Iterate over however many columns the input has instead of assuming 7.
    for mean, low_ci, high_ci in zip(means, low_cis, high_cis):
        print(f"{mean:.2f} ({low_ci:.2f}-{high_ci:.2f})")

In [None]:
# Print, for each column of `precisions`, the mean and the 2.5th-97.5th
# percentile range across rows.
confidence_intervals(precisions)