In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [14]:
# Load the table that the metric cells below read from (`pred_test_Y`, `test_Y`,
# `lung_cancer`, plus the subgroup columns). Presumably the predictions logged by
# the run whose hyperparameters appear in the file name -- TODO confirm.
# NOTE(review): absolute, user-specific path; this cell only runs on the author's machine.
df = pd.read_csv(
    filepath_or_buffer='/Users/rbhalerao/Desktop/Project1CPH200A/project1/logs/clinical_utility/learning_rate_0.0006_batch_size_256_num_epochs_200_regularization_lambda_0'
)

In [21]:
def _screening_rates(y_true, y_pred):
    """Return ``(sensitivity, specificity, ppv)`` for 0/1 labels and 0/1 predictions.

    Each rate falls back to 0 when its denominator is zero.
    """
    # labels=[0, 1] pins the matrix to 2x2. Without it, an input that contains a
    # single class produces a smaller matrix and the four-way unpack raises.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
    return sensitivity, specificity, ppv


def _find_matching_threshold(y_true, scores, nlst_metric, matching):
    """Return the first cut-off on the 0.01 grid that meets the target, or None.

    Cut-offs are scanned in increasing order starting at 0; a row is predicted
    positive when its score is >= the cut-off.
    """
    for threshold in np.arange(0, 1.01, 0.01):
        predictions = (scores >= threshold).astype(int)
        sensitivity, specificity, ppv = _screening_rates(y_true, predictions)
        if matching == 'specificity':
            matched = specificity >= nlst_metric
        elif matching == 'sensitivity':
            # Sensitivity starts at its maximum for the lowest cut-off, so a ">="
            # test would stop immediately. Stop instead at the first cut-off where
            # sensitivity has dropped to or below the target.
            matched = sensitivity <= nlst_metric
        else:  # 'ppv'
            matched = ppv >= nlst_metric
        if matched:
            return threshold
    return None


def get_feature_metrics(df, feature, nlst_metric=None, matching=None):
    """Compute sensitivity, specificity and PPV of one column against `lung_cancer`.

    Rows with a missing value in `feature` or `test_Y`, or whose `feature` value
    cannot be converted to a number, are excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `feature`, 'test_Y' and 'lung_cancer'.
    feature : str
        Column to evaluate. For 'pred_test_Y' the values are treated as scores
        and binarised at a cut-off chosen via `nlst_metric` / `matching`; any
        other column is cast to int64 and used directly as the prediction.
    nlst_metric : float, optional
        Target value of the metric named by `matching`. Required when
        `feature == 'pred_test_Y'`.
    matching : {'specificity', 'sensitivity', 'ppv'}, optional
        Metric the cut-off is matched on. Required when
        `feature == 'pred_test_Y'`.

    Returns
    -------
    dict
        ``{"Sensitivity": ..., "Specificity": ..., "PPV": ...}``.

    Raises
    ------
    ValueError
        If `feature == 'pred_test_Y'` and `matching` / `nlst_metric` are missing
        or invalid, or no cut-off on the grid meets the target.

    Notes
    -----
    The chosen cut-off is printed when `feature == 'pred_test_Y'`.
    """
    # NOTE(review): rows are filtered on 'test_Y' but labels are read from
    # 'lung_cancer' -- confirm the two columns are meant to differ.
    df_filtered = df.dropna(subset=[feature, 'test_Y'])

    # errors='coerce' turns empty strings and other non-numeric entries into NaN,
    # so no separate replace step (and no write into the filtered frame) is needed.
    scores = pd.to_numeric(df_filtered[feature], errors='coerce')
    valid = scores.notna()
    df_filtered = df_filtered[valid]
    scores = scores[valid]

    y_true = df_filtered['lung_cancer'].astype('int64')

    if feature == 'pred_test_Y':
        if matching not in ('specificity', 'sensitivity', 'ppv'):
            raise ValueError(
                "matching must be 'specificity', 'sensitivity' or 'ppv' "
                f"when feature is 'pred_test_Y', got {matching!r}"
            )
        if nlst_metric is None:
            raise ValueError("nlst_metric is required when feature is 'pred_test_Y'")

        threshold = _find_matching_threshold(y_true, scores, nlst_metric, matching)
        if threshold is None:
            raise ValueError(
                f"no threshold in [0, 1] matches {matching}={nlst_metric!r}"
            )
        print(threshold)
        y_pred = (scores >= threshold).astype('int64')
    else:
        y_pred = scores.astype('int64')

    sensitivity, specificity, ppv = _screening_rates(y_true, y_pred)

    metric_tracker = {
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "PPV": ppv,
    }

    return metric_tracker


In [15]:
# Class balance of the `lung_cancer` label column.
lung_cancer_counts = df['lung_cancer'].value_counts()
print(lung_cancer_counts)

lung_cancer
0    29175
1      712
Name: count, dtype: int64


In [16]:
# Metrics for the `nlst_flag` column, cast to int and used directly as the prediction.
get_feature_metrics(df, feature='nlst_flag')

{'Sensitivity': np.float64(0.6666666666666666),
 'Specificity': np.float64(0.8029895687708356),
 'PPV': np.float64(0.07537012113055182)}

In [17]:
# Model scores binarised at the first cut-off whose specificity reaches 0.80.
get_feature_metrics(
    df,
    feature='pred_test_Y',
    nlst_metric=0.80,
    matching='specificity',
)

{'Sensitivity': np.float64(0.9887640449438202),
 'Specificity': np.float64(0.8051756640959726),
 'PPV': np.float64(0.11020663744520977)}

In [18]:
# Model scores binarised at the first cut-off whose sensitivity is at or below 0.66.
get_feature_metrics(
    df,
    feature='pred_test_Y',
    nlst_metric=0.66,
    matching='sensitivity',
)

{'Sensitivity': np.float64(0.6320224719101124),
 'Specificity': np.float64(0.9943444730077121),
 'PPV': np.float64(0.7317073170731707)}

In [19]:
# Model scores binarised at the first cut-off whose PPV reaches 0.07.
get_feature_metrics(
    df,
    feature='pred_test_Y',
    nlst_metric=0.07,
    matching='ppv',
)

{'Sensitivity': np.float64(0.9985955056179775),
 'Specificity': np.float64(0.6772236503856041),
 'PPV': np.float64(0.07020142180094786)}

In [22]:
# Model scores binarised at the first cut-off whose specificity reaches 0.9;
# the chosen cut-off is printed before the metrics.
get_feature_metrics(
    df,
    feature='pred_test_Y',
    nlst_metric=0.9,
    matching='specificity',
)

0.14


{'Sensitivity': np.float64(0.9592696629213483),
 'Specificity': np.float64(0.9076949443016281),
 'PPV': np.float64(0.20231042654028436)}

In [28]:
def get_subgroup_metrics(df, subgroup, threshold=0.14):
    """Compute specificity, sensitivity and PPV of `pred_test_Y` per subgroup value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `subgroup`, 'pred_test_Y' and 'lung_cancer'.
    subgroup : str
        Column whose distinct values define the groups.
    threshold : float, default 0.14
        A row is predicted positive when ``pred_test_Y >= threshold``.

    Returns
    -------
    dict
        Maps each distinct `subgroup` value (in order of first appearance) to
        ``{'Specificity': ..., 'Sensitivity': ..., 'PPV': ...}``. A rate is 0
        when its denominator is zero.

    Notes
    -----
    Rows with a missing subgroup value, score or label are excluded. A missing
    score would otherwise compare as False against the threshold and be counted
    as a negative prediction.
    """
    rows = df.dropna(subset=[subgroup, 'pred_test_Y', 'lung_cancer'])
    metric_tracker = {}
    for v in rows[subgroup].unique():
        subset = rows[rows[subgroup] == v]
        y_pred = (subset['pred_test_Y'] >= threshold).astype(int)
        y_true = subset['lung_cancer'].astype(int)
        # labels=[0, 1] keeps the matrix 2x2 for groups that contain only one
        # class; without it the four-way unpack raises for such groups.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
        metric_tracker[v] = {
            'Specificity': specificity,
            'Sensitivity': sensitivity,
            'PPV': ppv
        }
    return metric_tracker


In [29]:
# Per-group metrics of the thresholded model score, split on the `sex` column.
get_subgroup_metrics(df, subgroup='sex')

{np.int64(1): {'Specificity': np.float64(0.8721034792833922),
  'Sensitivity': np.float64(0.9698375870069605),
  'PPV': np.float64(0.18438464931627702)},
 np.int64(2): {'Specificity': np.float64(0.9426552520722924),
  'Sensitivity': np.float64(0.9430604982206405),
  'PPV': np.float64(0.23895401262398558)}}

In [30]:
# Per-group metrics of the thresholded model score, split on the `race7` column.
get_subgroup_metrics(df, subgroup='race7')

{np.int64(1): {'Specificity': np.float64(0.9123024604920984),
  'Sensitivity': np.float64(0.9585406301824212),
  'PPV': np.float64(0.20866425992779783)},
 np.int64(2): {'Specificity': np.float64(0.9514632405424697),
  'Sensitivity': np.float64(0.9722222222222222),
  'PPV': np.float64(0.33980582524271846)},
 np.int64(4): {'Specificity': np.float64(0.9669187145557656),
  'Sensitivity': np.float64(0.8846153846153846),
  'PPV': np.float64(0.39655172413793105)},
 np.int64(7): {'Specificity': np.float64(0.6462655601659751),
  'Sensitivity': np.float64(1.0),
  'PPV': np.float64(0.08333333333333333)},
 np.int64(5): {'Specificity': np.float64(0.9080459770114943),
  'Sensitivity': np.float64(1.0),
  'PPV': np.float64(0.23809523809523808)},
 np.int64(3): {'Specificity': np.float64(0.936),
  'Sensitivity': np.float64(1.0),
  'PPV': np.float64(0.23809523809523808)},
 np.int64(6): {'Specificity': np.float64(0.891566265060241),
  'Sensitivity': np.float64(1.0),
  'PPV': np.float64(0.1)}}

In [31]:
# Per-group metrics of the thresholded model score, split on the `educat` column.
get_subgroup_metrics(df, subgroup='educat')

{np.float64(7.0): {'Specificity': np.float64(0.9557588202352063),
  'Sensitivity': np.float64(0.8831168831168831),
  'PPV': np.float64(0.22295081967213115)},
 np.float64(2.0): {'Specificity': np.float64(0.8566392479435958),
  'Sensitivity': np.float64(0.9702970297029703),
  'PPV': np.float64(0.28654970760233917)},
 np.float64(3.0): {'Specificity': np.float64(0.9066750039080819),
  'Sensitivity': np.float64(0.9411764705882353),
  'PPV': np.float64(0.19433198380566802)},
 np.float64(5.0): {'Specificity': np.float64(0.9064193071745198),
  'Sensitivity': np.float64(0.9808917197452229),
  'PPV': np.float64(0.212707182320442)},
 np.float64(1.0): {'Specificity': np.float64(0.8924302788844621),
  'Sensitivity': np.float64(1.0),
  'PPV': np.float64(0.25)},
 np.float64(4.0): {'Specificity': np.float64(0.9015304649148137),
  'Sensitivity': np.float64(1.0),
  'PPV': np.float64(0.21247113163972287)},
 np.float64(6.0): {'Specificity': np.float64(0.9334016393442623),
  'Sensitivity': np.float64(0.956

In [32]:
# Per-group metrics of the thresholded model score, split on the `cig_stat` column.
get_subgroup_metrics(df, subgroup='cig_stat')

{np.float64(0.0): {'Specificity': np.float64(1.0),
  'Sensitivity': np.float64(0.46),
  'PPV': np.float64(1.0)},
 np.float64(2.0): {'Specificity': np.float64(0.8911013798231844),
  'Sensitivity': np.float64(0.9970845481049563),
  'PPV': np.float64(0.20602409638554217)},
 np.float64(1.0): {'Specificity': np.float64(0.6338526912181303),
  'Sensitivity': np.float64(0.9965277777777778),
  'PPV': np.float64(0.21725965177895534)}}

In [33]:
# Per-group metrics of the thresholded model score, split on the `nlst_flag` column.
get_subgroup_metrics(df, subgroup='nlst_flag')

{np.float64(0.0): {'Specificity': np.float64(0.9891076291236999),
  'Sensitivity': np.float64(0.875),
  'PPV': np.float64(0.44545454545454544)},
 np.float64(1.0): {'Specificity': np.float64(0.616448326055313),
  'Sensitivity': np.float64(1.0),
  'PPV': np.float64(0.1752738654147105)}}