In [1]:
import numpy as np
import pandas as pd

In [None]:
# this notebook calculates interrater reliability as measured by F1 for two independent manual annotations 
# of the same oncologist notes at DFCI.
# (F1 scores are invariant to which annotation is considered the gold standard)

In [4]:
prefix ='/mnt/d/Dropbox (Partners Healthcare)/'

In [2]:
def parse_medonc_file(dataset):
    newset = dataset.copy()
    newset = newset[newset.redcap_repeat_instrument == 'prissmm_med_onc_assessment']
    newset = newset[newset.prissmm_med_onc_assessment_complete == 2]

    newset['any_cancer'] = np.where(newset['md_ca'] == 1, 1, 0)
    newset['response'] = np.where(newset['md_ca_status'] == 1, 1, 0)
    #newset['progression'] = np.where(newset['md_ca_status'] == 4, 1, 0)
    newset['progression'] = np.where(np.logical_or(newset['md_ca_status'] == 4, newset['md_ca_status']==3), 1, 0)


    newset['date'] = pd.to_datetime(newset.md_onc_visit_dt)

    return newset[['record_id','date','any_cancer','progression','response',]]

In [5]:
dataset_list = []
for cancer_type in ['nsclc_phase2_new','nsclc_phase2_existing','crc','breast','pancreas','bladder','prostate','rcc_barkouny']:
    try:
        pts = pd.read_csv(prefix+'profile_3-2023/redcaps/' + cancer_type + '_ptchars.csv', low_memory=False)[['record_id','internal_mrn']]
        pts = pts.sort_values(by=['record_id','internal_mrn']).reset_index(drop=True)
        pts = pts.groupby('internal_mrn').first().reset_index(drop=False).rename(columns={'internal_mrn':'dfci_mrn'})
    except:
        pts = pd.read_csv(prefix+'profile_3-2023/redcaps/' + cancer_type + '_ptchars.csv', low_memory=False)[['record_id','local_mrn']]
        pts = pts.sort_values(by=['record_id','local_mrn']).reset_index(drop=True)
        pts = pts.groupby('local_mrn').first().reset_index(drop=False).rename(columns={'local_mrn':'dfci_mrn'})        

    medonc = pd.read_csv(prefix+'profile_3-2023/redcaps/' + cancer_type + '_medonc.csv', low_memory=False)
    medonc = parse_medonc_file(medonc)
    medonc = pd.merge(pts, medonc, on='record_id').drop(columns='record_id')
    medonc['cancer_type'] = cancer_type

    dataset_list.append(medonc)

In [6]:
final = pd.concat(dataset_list, axis=0)

In [7]:
final['num_curations'] = final.groupby(['dfci_mrn','cancer_type','date']).transform('size')

In [8]:
final.num_curations.value_counts()

1.0    78882
2.0      102
3.0        3
Name: num_curations, dtype: int64

In [9]:
double = final[final.num_curations == 2]

In [11]:
double['id_in_group'] = double.groupby(['dfci_mrn','cancer_type']).cumcount()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  double['id_in_group'] = double.groupby(['dfci_mrn','cancer_type']).cumcount()


In [13]:
from sklearn.metrics import f1_score

In [14]:
for outcome in ['any_cancer','progression','response']:
    first = double[double.id_in_group == 0]
    first[outcome + '_first'] = first[outcome]
    second = double[double.id_in_group == 1]
    second[outcome + '_second'] = second[outcome]
    combined = pd.merge(first, second, on = ['dfci_mrn','cancer_type','date'])
    combined = combined[[outcome + '_first', outcome + '_second']]
    print(outcome)
    print(pd.crosstab(combined[outcome + '_first'], combined[outcome + '_second']))
    print('f1 #1')
    first_f1 = f1_score(combined[outcome + '_first'], combined[outcome + '_second'])
    print(first_f1)
    print('f1 #2')
    second_f1 = f1_score(combined[outcome + '_second'], combined[outcome + '_first'])
    print(second_f1)
    print('average f1')
    print((first_f1 + second_f1)/2)
    print('\n')
    


any_cancer
any_cancer_second   0   1
any_cancer_first         
0                  14   1
1                   2  19
f1 #1
0.9268292682926829
f1 #2
0.9268292682926829
average f1
0.9268292682926829


progression
progression_second   0  1
progression_first        
0                   25  4
1                    2  5
f1 #1
0.6250000000000001
f1 #2
0.6250000000000001
average f1
0.6250000000000001


response
response_second   0  1
response_first        
0                35  0
1                 0  1
f1 #1
1.0
f1 #2
1.0
average f1
1.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first[outcome + '_first'] = first[outcome]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second[outcome + '_second'] = second[outcome]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first[outcome + '_first'] = first[outcome]
A value is trying to be set on a copy of a slice from a DataFrame.
Try us