In [None]:
# This notebook computes basic demographics (gender, race, ethnicity, age at sequencing) for the DFCI cohort with labeled imaging reports, at both the report level and the patient level.

In [32]:
import numpy as np
import pandas as pd

In [33]:
# Build one row per patient (keyed by DFCI MRN) from the per-cancer-type REDCap
# patient-characteristics exports.
#
# This cell uses its own `redcap_dir` name instead of `prefix`: a later cell binds
# `prefix` to a different directory, and re-running this cell must not clobber it.
redcap_dir = '/mnt/d/Dropbox (Partners Healthcare)/profile_3-2023/redcaps/'
pt_dataset_list = []
for cancer_type in ['nsclc_phase2_existing','crc','breast','pancreas','bladder','prostate']:
    pts = pd.read_csv(redcap_dir + cancer_type + '_ptchars.csv', low_memory=False)[['record_id','internal_mrn','genie_patient_id']]
    # Sort first so that, within an MRN, rows are visited in record_id order.
    pts = pts.sort_values(by=['record_id','internal_mrn']).reset_index(drop=True)
    # groupby drops rows whose internal_mrn is missing; first() keeps the first
    # non-null value of each remaining column per MRN.
    pts = pts.groupby('internal_mrn').first().reset_index(drop=False).rename(columns={'internal_mrn':'dfci_mrn'})
    pts['cancer_type'] = cancer_type
    pts = pts[['dfci_mrn','genie_patient_id','cancer_type']]
    pt_dataset_list.append(pts)


pt_dataset = pd.concat(pt_dataset_list, axis=0)

# De-duplicate across cohorts: an MRN present in several exports keeps the
# cancer_type of the cohort listed first in the loop above.
pt_dataset = pt_dataset.groupby('dfci_mrn').first().reset_index()

In [35]:
# Print column dtypes and non-null counts for the de-duplicated patient table.
pt_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6124 entries, 0 to 6123
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   dfci_mrn          6124 non-null   float64
 1   genie_patient_id  6091 non-null   object 
 2   cancer_type       6124 non-null   object 
dtypes: float64(1), object(2)
memory usage: 143.7+ KB


In [72]:
# From here on, `prefix` points at the profile_3-2023 data directory.
prefix = '/mnt/d/Dropbox (Partners Healthcare)/profile_3-2023/'

# Keep only the MRN and test-order date of each genomic specimen, then reduce to
# the earliest order date per patient.
specimens = pd.read_csv(
    prefix + 'structured_data/REQ_KK71_105304_2_GENOMIC_SPECIMEN.csv',
    encoding='ISO-8859-1',
    low_memory=False,
)[['DFCI_MRN', 'TEST_ORDER_DT']]
specimens = specimens.assign(TEST_ORDER_DT=pd.to_datetime(specimens['TEST_ORDER_DT']))
specimens = (
    specimens
    .sort_values(by=['DFCI_MRN', 'TEST_ORDER_DT'])
    .groupby('DFCI_MRN')
    .first()
    .reset_index()
)


In [73]:
# Print column dtypes and non-null counts for the one-row-per-MRN specimen table.
specimens.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52058 entries, 0 to 52057
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DFCI_MRN       52058 non-null  int64         
 1   TEST_ORDER_DT  52058 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 813.5 KB


In [38]:
#no_genie = pt_dataset[pt_dataset.genie_patient_id.isnull()]

In [39]:
#reports[reports.dfci_mrn.isin(no_genie.dfci_mrn)].any_cancer.value_counts()

In [40]:
#no_genie.cancer_type.value_counts()

In [41]:
#pt_dataset.genie_patient_id.nunique()

In [None]:
# Load registration records, keep one row per MRN with gender and birth date, and
# attach the first genomic test-order date to derive age at sequencing (in years).
registration = pd.read_csv(prefix+'structured_data/REQ_KK71_105304_2_PT_INFO_STATUS_REGISTRATION.csv', sep=',', encoding='iso-8859-1', low_memory=False)
# first() keeps the first non-null value of every column per MRN.
registration = registration.groupby('DFCI_MRN').first().reset_index()
# .copy() so the column assignment below writes to an independent frame.
registration = registration[['DFCI_MRN','GENDER_NM','BIRTH_DT']].copy()
registration['BIRTH_DT'] = pd.to_datetime(registration.BIRTH_DT)

# Inner join: patients without a genomic specimen row are dropped here.
registration = pd.merge(registration, specimens, on='DFCI_MRN')

# Dividing by np.timedelta64(1, 'Y') is rejected by pandas 2.x because 'Y' is not a
# fixed duration. Use the same length NumPy assigned to that unit
# (365.2425 days = 31,556,952 s) as an explicit Timedelta, so the ages are unchanged.
one_year = pd.Timedelta(seconds=31556952)
registration['age_at_sequencing'] = (registration.TEST_ORDER_DT - registration.BIRTH_DT) / one_year
#registration.head()

In [78]:
# Load the demographics registration export and collapse it to one row per MRN
# (first non-null value of each column), then show the resulting schema.
demographics_path = prefix + 'structured_data/REQ_KK71_105304_2_DEMOGRAPHICS_REGISTRATION.csv'
demographics = (
    pd.read_csv(demographics_path, sep=',', encoding='iso-8859-1', low_memory=False)
    .groupby('DFCI_MRN')
    .first()
    .reset_index()
)
demographics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56869 entries, 0 to 56868
Data columns (total 35 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   DFCI_MRN                       56869 non-null  int64  
 1   DR_REQUEST_SEQ                 56869 non-null  int64  
 2   PATIENT_ID                     56869 non-null  int64  
 3   SRC_DIM_SEQ                    56869 non-null  int64  
 4   MARITAL_STATUS_NM              56869 non-null  object 
 5   RELIGION_NM                    56869 non-null  object 
 6   EDUCATION_LEVEL_NM             56869 non-null  object 
 7   BIRTHPLACE_NM                  52541 non-null  object 
 8   PRIM_LANG_NM                   56869 non-null  object 
 9   WRITTEN_LANG_NM                56869 non-null  object 
 10  ETHNICITY_1_NM                 56837 non-null  object 
 11  ETHNICITY_2_NM                 4744 non-null   object 
 12  ETHNICITY_3_NM                 490 non-null   

In [79]:
# Derive a single race label from the per-race 'Y' indicator columns.
# np.select returns the label of the first matching condition, so a patient with
# several indicators set receives the label that appears earliest in this list.
race_flags = [
    ('RACE_NAT_HAWAIIAN_PACIFIC_IND', 'Native Hawaiian/Pacific Islander'),
    ('RACE_AM_INDIAN_ALASKAN_IND', 'American Indian/Alaskan'),
    ('RACE_BLACK_IND', 'Black'),
    ('RACE_ASIAN_IND', 'Asian'),
    ('RACE_WHITE_IND', 'White'),
]
demographics['race'] = np.select(
    [demographics[flag_column] == 'Y' for flag_column, _ in race_flags],
    [race_label for _, race_label in race_flags],
    'Other/unknown',
)

# Only the key, the derived race and the Hispanic indicator are carried forward.
demographics = demographics[['DFCI_MRN', 'race', 'HISPANIC_IND']]

In [80]:
# Attach gender, birth date, first test-order date and age at sequencing.
# The default inner join keeps only MRNs present in both tables.
demographics = demographics.merge(registration, on='DFCI_MRN')

In [82]:
# Tabulate gender across the merged demographics table.
demographics['GENDER_NM'].value_counts()

FEMALE     27226
MALE       24830
UNKNOWN        2
Name: GENDER_NM, dtype: int64

In [90]:
# Lower-case the key so it matches the `dfci_mrn` column the imaging reports are merged on.
demographics = demographics.rename(columns=dict(DFCI_MRN='dfci_mrn'))

In [None]:
#demographics.head()

In [105]:
# Bin age at sequencing into decade bands.
# np.select returns the label of the FIRST true condition, so each `< N` test only
# has to state the upper edge of its band.
age = demographics['age_at_sequencing']
demographics['agecat'] = np.select(
    [
        # Must come first: NaN fails every `<` comparison, so a missing age would
        # otherwise fall through to the oldest band and be counted as '80+'.
        age.isna(),
        age < 30,
        age < 40,
        age < 50,
        age < 60,
        age < 70,
        age < 80,
        age >= 80,
    ],
    [
        'Unknown',
        '< 30',
        '30-39',
        '40-49',
        '50-59',
        '60-69',
        '70-79',
        '80+',
    ],
    # Explicit string default: np.select's implicit default is the integer 0, which
    # does not share a dtype with the string labels (recent NumPy can raise on that).
    default='Unknown',
)

In [107]:
# Load the labeled imaging reports.
# Built from the shared `prefix` (the profile_3-2023 directory) rather than a second
# hard-coded absolute path: the literal used here previously spelled the Dropbox
# root with different casing ('HealthCare') than every other load in this notebook
# ('Healthcare'), which only resolves to the same folder on a case-insensitive filesystem.
reports = pd.read_csv(prefix + 'derived_data/labeled_imaging_prissmm.csv')

In [108]:
# Attach patient demographics to every report. The default inner join drops
# reports whose dfci_mrn has no demographics row.
reports = reports.merge(demographics, on='dfci_mrn')

In [109]:
# Reports with class_status == 3 (mixed response) are counted as progression;
# every other report keeps its existing progression value.
mixed_response = reports['class_status'].eq(3)
reports['progression'] = np.where(mixed_response, 1, reports['progression'])

In [110]:

# Print column dtypes and non-null counts for the report table after the demographics merge.
reports.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37274 entries, 0 to 37273
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Unnamed: 0.1              37274 non-null  int64         
 1   dfci_mrn                  37274 non-null  float64       
 2   cancer_type               37274 non-null  object        
 3   image_scan_type           37274 non-null  float64       
 4   date                      37274 non-null  object        
 5   head_imaged               37274 non-null  float64       
 6   neck_imaged               37274 non-null  float64       
 7   spine_imaged              37274 non-null  float64       
 8   chest_imaged              37274 non-null  float64       
 9   abdomen_imaged            37274 non-null  float64       
 10  pelvis_imaged             37274 non-null  float64       
 11  any_cancer                37274 non-null  int64         
 12  progression       

In [111]:
# Number of reports per imaging scan-type code.
reports['image_scan_type'].value_counts()

1.0     22677
3.0      7613
7.0      3405
5.0      2889
11.0      690
Name: image_scan_type, dtype: int64

In [120]:
# Document-level table: one row per imaging report (independent deep copy of `reports`).
report_demographics = reports.copy(deep=True)
report_demographics.shape

(37274, 42)

In [121]:
# Shape after collapsing to one row per patient: the row count is the number of
# distinct dfci_mrn values, and the column count drops by one because the key
# becomes the index.
report_demographics.groupby('dfci_mrn').first().shape

(3213, 41)

In [123]:
# Document-level summary: for each variable print the raw counts and the share of
# all reports. The denominator is the total row count, so rows with a missing
# value count towards the total but not towards any category.
document_level_vars = [
    'GENDER_NM', 'race', 'HISPANIC_IND', 'agecat', 'cancer_type',
    'any_cancer', 'progression', 'response',
    'brain_met', 'bone_met', 'adrenal_met', 'liver_met', 'lung_met', 'node_met', 'peritoneal_met',
]
n_reports = report_demographics.shape[0]
for var in document_level_vars:
    report_counts = report_demographics[var].value_counts()
    print('document level')
    print(var)
    print(report_counts)
    print(report_counts / n_reports)
    print("\n")

document level
GENDER_NM
MALE      18967
FEMALE    18307
Name: GENDER_NM, dtype: int64
MALE      0.508853
FEMALE    0.491147
Name: GENDER_NM, dtype: float64


document level
race
White                      33532
Black                       1311
Asian                       1300
Other/unknown               1096
American Indian/Alaskan       35
Name: race, dtype: int64
White                      0.899608
Black                      0.035172
Asian                      0.034877
Other/unknown              0.029404
American Indian/Alaskan    0.000939
Name: race, dtype: float64


document level
HISPANIC_IND
N    36415
Y      859
Name: HISPANIC_IND, dtype: int64
N    0.976954
Y    0.023046
Name: HISPANIC_IND, dtype: float64


document level
agecat
60-69    11129
50-59     9538
70-79     7464
40-49     5177
80+       2003
30-39     1700
< 30       263
Name: agecat, dtype: int64
60-69    0.298573
50-59    0.255889
70-79    0.200247
40-49    0.138890
80+      0.053737
30-39    0.045608
< 30     0.0

In [116]:
# Patient-level table: one row per dfci_mrn. GroupBy.first() takes, column by
# column, the first non-null value among each patient's reports.
# NOTE(review): for report-level outcome columns (any_cancer, progression,
# response, *_met) this is therefore the value from an early report in file order,
# not "ever observed for this patient" — confirm that is the intended definition.
patient_demographics = reports.groupby('dfci_mrn').first().copy()

In [124]:
# Patient-level summary: for each variable print the raw counts and the share of
# all patients. The denominator is the total number of patient rows, so patients
# with a missing value count towards the total but not towards any category.
patient_level_vars = [
    'GENDER_NM', 'race', 'HISPANIC_IND', 'agecat', 'cancer_type',
    'any_cancer', 'progression', 'response',
    'brain_met', 'bone_met', 'adrenal_met', 'liver_met', 'lung_met', 'node_met', 'peritoneal_met',
]
n_patients = patient_demographics.shape[0]
for var in patient_level_vars:
    patient_counts = patient_demographics[var].value_counts()
    print('patient level')
    print(var)
    print(patient_counts)
    print(patient_counts / n_patients)
    print("\n")

patient level
GENDER_NM
MALE      1816
FEMALE    1397
Name: GENDER_NM, dtype: int64
MALE      0.565204
FEMALE    0.434796
Name: GENDER_NM, dtype: float64


patient level
race
White                      2901
Black                       119
Other/unknown               104
Asian                        85
American Indian/Alaskan       4
Name: race, dtype: int64
White                      0.902894
Black                      0.037037
Other/unknown              0.032369
Asian                      0.026455
American Indian/Alaskan    0.001245
Name: race, dtype: float64


patient level
HISPANIC_IND
N    3143
Y      70
Name: HISPANIC_IND, dtype: int64
N    0.978214
Y    0.021786
Name: HISPANIC_IND, dtype: float64


patient level
agecat
60-69    997
50-59    801
70-79    672
40-49    373
80+      214
30-39    132
< 30      24
Name: agecat, dtype: int64
60-69    0.310302
50-59    0.249300
70-79    0.209150
40-49    0.116091
80+      0.066604
30-39    0.041083
< 30     0.007470
Name: agecat, dtype: 