In [None]:
# this notebook calculates interrater reliability (measured using the F1) between two manual
# annotaions of the same imaging report
# note the F1 metric is invariant to which annotation is considered the gold standard

In [1]:
import numpy as np
import pandas as pd

In [2]:
def find_metsite(dataset, mincode, maxcode):
    temp = np.where(((dataset['image_casite1'] >= mincode) & (dataset['image_casite1'] <= maxcode)) |
                    ((dataset['image_casite2'] >= mincode) & (dataset['image_casite2'] <= maxcode)) |
                    ((dataset['image_casite3'] >= mincode) & (dataset['image_casite3'] <= maxcode)) |
                    ((dataset['image_casite4'] >= mincode) & (dataset['image_casite4'] <= maxcode)) |
                    ((dataset['image_casite5'] >= mincode) & (dataset['image_casite5'] <= maxcode)) |
                    ((dataset['image_casite6'] >= mincode) & (dataset['image_casite6'] <= maxcode)) |
                    ((dataset['image_casite7'] >= mincode) & (dataset['image_casite7'] <= maxcode)) |
                    ((dataset['image_casite8'] >= mincode) & (dataset['image_casite8'] <= maxcode)) |
                    ((dataset['image_casite9'] >= mincode) & (dataset['image_casite9'] <= maxcode)) |
                    ((dataset['image_casite10'] >= mincode) & (dataset['image_casite10'] <= maxcode)) |
                    ((dataset['image_casite11'] >= mincode) & (dataset['image_casite11'] <= maxcode)) |
                    ((dataset['image_casite12'] >= mincode) & (dataset['image_casite12'] <= maxcode)) |
                    ((dataset['image_casite13'] >= mincode) & (dataset['image_casite13'] <= maxcode)) |
                    ((dataset['image_casite14'] >= mincode) & (dataset['image_casite14'] <= maxcode)) |
                    ((dataset['image_casite15'] >= mincode) & (dataset['image_casite15'] <= maxcode)),
                    1, 0)
    return temp
    #return np.where(np.isnan(temp), 0, temp)

In [3]:
def parse_imaging_file(dataset):
    newset = dataset.copy()
    newset = newset[newset.redcap_repeat_instrument == 'prissmm_imaging']
    newset = newset[newset.prissmm_imaging_complete == 2]
    newset['brain_met'] = find_metsite(newset, 700, 719)
    newset['bone_met'] = find_metsite(newset, 400, 419)
    newset['adrenal_met'] = find_metsite(newset, 740, 749)
    newset['liver_met'] = find_metsite(newset, 220, 220)
    newset['lung_met'] = find_metsite(newset, 340, 349)
    newset['node_met'] = find_metsite(newset, 770, 779)
    newset['peritoneal_met'] = find_metsite(newset, 481, 482)
    newset['ascites'] = find_metsite(newset, 901, 901)
    newset['peritoneal_met'] = np.where((newset.peritoneal_met==1) | (newset.ascites==1), 1, 0)
    newset['any_cancer'] = np.where(newset['image_ca'] == 1, 1, 0)
    newset['response'] = np.where(newset['image_overall'] == 1, 1, 0)
    newset['progression'] = np.where(newset['image_overall'] == 4, 1, 0)
    newset['class_status'] = np.where(newset['any_cancer'] == 0, 0, newset['image_overall'])
    newset = newset.rename(columns={'image_scansite___1':'head_imaged',
                           'image_scansite___2':'spine_imaged',
                           'image_scansite___3':'neck_imaged',
                           'image_scansite___4':'chest_imaged',
                           'image_scansite___5':'abdomen_imaged',
                           'image_scansite___6':'pelvis_imaged',
                           'image_scansite___7':'extremity_imaged',
                           'image_scansite___8':'whole_body_imaged'})
    newset['head_imaged'] = np.where(newset['whole_body_imaged']==1, 1, newset['head_imaged'])
    newset['spine_imaged'] = np.where(newset['whole_body_imaged']==1, 1, newset['spine_imaged'])
    newset['neck_imaged'] = np.where(newset['whole_body_imaged']==1, 1, newset['neck_imaged'])
    newset['chest_imaged'] = np.where(newset['whole_body_imaged']==1, 1, newset['chest_imaged'])
    newset['abdomen_imaged'] = np.where(newset['whole_body_imaged']==1, 1, newset['abdomen_imaged'])
    newset['pelvis_imaged'] = np.where(newset['whole_body_imaged']==1, 1, newset['pelvis_imaged'])
    newset['extremity_imaged'] = np.where(newset['whole_body_imaged']==1, 1, newset['extremity_imaged'])
    newset['date'] = pd.to_datetime(newset.image_scan_dt)

    return newset[['record_id','date','image_scan_type','any_cancer','progression','response','class_status','brain_met','bone_met','adrenal_met','liver_met','lung_met','node_met','peritoneal_met','head_imaged','neck_imaged','spine_imaged','chest_imaged','abdomen_imaged','pelvis_imaged','extremity_imaged','whole_body_imaged']]

In [5]:
# different imaging report types
prefix = '/mnt/d/Dropbox (Partners Healthcare)/'
dataset_list = []
for cancer_type in ['nsclc_phase2_existing','crc','breast','pancreas','bladder','prostate']:
    pts = pd.read_csv(prefix+'profile_3-2023/redcaps/' + cancer_type + '_ptchars.csv', low_memory=False)[['record_id','internal_mrn']]
    pts = pts.sort_values(by=['record_id','internal_mrn']).reset_index(drop=True)
    pts = pts.groupby('internal_mrn').first().reset_index(drop=False).rename(columns={'internal_mrn':'dfci_mrn'})
    
    # for prostate patients only take BPC cases? that would be inconsistent...
    
    imaging = pd.read_csv(prefix+'profile_3-2023/redcaps/' + cancer_type + '_imaging.csv', low_memory=False)
    imaging = parse_imaging_file(imaging)
    imaging = pd.merge(pts, imaging, on='record_id').drop(columns='record_id')
    imaging['cancer_type'] = cancer_type

    dataset_list.append(imaging)

In [6]:
final = pd.concat(dataset_list, axis=0)

In [8]:
final['num_curations'] = final.groupby(['dfci_mrn','cancer_type','image_scan_type','date','head_imaged','neck_imaged','spine_imaged','chest_imaged','abdomen_imaged','pelvis_imaged']).transform('size')

In [10]:
final.num_curations.value_counts()

1.0    67244
2.0     1078
3.0       51
4.0        8
Name: num_curations, dtype: int64

In [13]:
double = final[final.num_curations == 2]

In [15]:
double['id_in_group'] = double.groupby(['dfci_mrn','cancer_type','image_scan_type','date','head_imaged','neck_imaged','spine_imaged','chest_imaged','abdomen_imaged','pelvis_imaged']).cumcount()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  double['id_in_group'] = double.groupby(['dfci_mrn','cancer_type','image_scan_type','date','head_imaged','neck_imaged','spine_imaged','chest_imaged','abdomen_imaged','pelvis_imaged']).cumcount()


In [20]:
from sklearn.metrics import f1_score

In [22]:
for outcome in ['any_cancer','progression','response','brain_met','bone_met','adrenal_met','liver_met','lung_met','node_met','peritoneal_met']:
    first = double[double.id_in_group == 0]
    first[outcome + '_first'] = first[outcome]
    second = double[double.id_in_group == 1]
    second[outcome + '_second'] = second[outcome]
    combined = pd.merge(first, second, on = ['dfci_mrn','cancer_type','image_scan_type','date','head_imaged','neck_imaged','spine_imaged','chest_imaged','abdomen_imaged','pelvis_imaged'])
    combined = combined[[outcome + '_first', outcome + '_second']]
    print(outcome)
    print(pd.crosstab(combined[outcome + '_first'], combined[outcome + '_second']))
    print('f1 #1')
    first_f1 = f1_score(combined[outcome + '_first'], combined[outcome + '_second'])
    print(first_f1)
    print('f1 #2')
    second_f1 = f1_score(combined[outcome + '_second'], combined[outcome + '_first'])
    print(second_f1)
    print('average f1')
    print((first_f1 + second_f1)/2)
    print('\n')
    


any_cancer
any_cancer_second    0    1
any_cancer_first           
0                  254   36
1                   38  211
f1 #1
0.8508064516129032
f1 #2
0.8508064516129032
average f1
0.8508064516129032


progression
progression_second    0   1
progression_first          
0                   411  34
1                    31  63
f1 #1
0.6596858638743456
f1 #2
0.6596858638743456
average f1
0.6596858638743456


response
response_second    0  1
response_first         
0                513  7
1                 11  8
f1 #1
0.47058823529411764
f1 #2
0.47058823529411764
average f1
0.47058823529411764


brain_met
brain_met_second    0   1
brain_met_first          
0                 484  17
1                  12  26
f1 #1
0.6419753086419753
f1 #2
0.6419753086419753
average f1
0.6419753086419753


bone_met
bone_met_second    0   1
bone_met_first          
0                412  20
1                 14  93
f1 #1
0.8454545454545455
f1 #2
0.8454545454545455
average f1
0.8454545454545455


adrenal_met


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first[outcome + '_first'] = first[outcome]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second[outcome + '_second'] = second[outcome]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first[outcome + '_first'] = first[outcome]
A value is trying to be set on a copy of a slice from a DataFrame.
Try us