In [1]:
import numpy as np
import pandas as pd

In [None]:
# This notebook calculates basic demographics for the DFCI medical oncology (med onc) cohort.

In [2]:
# Build one row per patient (dfci_mrn) from the per-cancer-type REDCap
# patient-characteristics exports.
prefix = '/mnt/d/Dropbox (Partners Healthcare)/'
ptchar_columns = ['record_id', 'internal_mrn', 'genie_patient_id']
pt_dataset_list = []
for cancer_type in ['nsclc_phase2_existing', 'crc', 'breast', 'pancreas', 'bladder', 'prostate']:
    ptchars_path = f'{prefix}profile_3-2023/redcaps/{cancer_type}_ptchars.csv'
    pts = (
        pd.read_csv(ptchars_path, low_memory=False)[ptchar_columns]
        .sort_values(by=['record_id', 'internal_mrn'])
        .reset_index(drop=True)
        # GroupBy.first keeps the first non-null value of each column per MRN.
        .groupby('internal_mrn', as_index=False)
        .first()
        .rename(columns={'internal_mrn': 'dfci_mrn'})
        .assign(cancer_type=cancer_type)
        .loc[:, ['dfci_mrn', 'genie_patient_id', 'cancer_type']]
    )
    pt_dataset_list.append(pts)

# Stack all cancer types, then collapse again so an MRN that appears in more
# than one export is kept once (first non-null value per column, in list order).
pt_dataset = (
    pd.concat(pt_dataset_list, axis=0)
    .groupby('dfci_mrn')
    .first()
    .reset_index()
)

In [4]:
# Print the column dtypes and non-null counts of the combined patient table.
pt_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6124 entries, 0 to 6123
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   dfci_mrn          6124 non-null   float64
 1   genie_patient_id  6091 non-null   object 
 2   cancer_type       6124 non-null   object 
dtypes: float64(1), object(2)
memory usage: 143.7+ KB


In [5]:
# From here on, `prefix` points at the profile_3-2023 data directory.
prefix = '/mnt/d/Dropbox (Partners Healthcare)/profile_3-2023/'

# Reduce the genomic specimen table to one row per patient. Rows are sorted
# by order date within each MRN, so GroupBy.first returns the earliest
# non-null TEST_ORDER_DT for that patient.
specimen_path = prefix + 'structured_data/REQ_KK71_105304_2_GENOMIC_SPECIMEN.csv'
specimens = (
    pd.read_csv(specimen_path, encoding='ISO-8859-1', low_memory=False)
    .loc[:, ['DFCI_MRN', 'TEST_ORDER_DT']]
    .assign(TEST_ORDER_DT=lambda frame: pd.to_datetime(frame['TEST_ORDER_DT']))
    .sort_values(by=['DFCI_MRN', 'TEST_ORDER_DT'])
    .groupby('DFCI_MRN', as_index=False)
    .first()
)


In [6]:
# Print the column dtypes and non-null counts of the per-patient specimen table.
specimens.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52058 entries, 0 to 52057
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DFCI_MRN       52058 non-null  int64         
 1   TEST_ORDER_DT  52058 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 813.5 KB


In [None]:
# Load registration data, keep one row per patient (first non-null value per
# column), and compute age at the sequencing order date.
registration = pd.read_csv(prefix+'structured_data/REQ_KK71_105304_2_PT_INFO_STATUS_REGISTRATION.csv', sep=',', encoding='iso-8859-1', low_memory=False)
registration = registration.groupby('DFCI_MRN').first().reset_index()
registration = registration[['DFCI_MRN','GENDER_NM','BIRTH_DT']].copy()
registration['BIRTH_DT'] = pd.to_datetime(registration['BIRTH_DT'])

# Inner join: only patients that also have a specimen row are kept.
registration = pd.merge(registration, specimens, on='DFCI_MRN')

# Age in years. np.timedelta64(1, 'Y') is an ambiguous unit that newer pandas
# releases refuse to convert, so divide by an explicit mean Gregorian year
# instead (365.2425 days, the same length numpy assigns to one 'Y').
registration['age_at_sequencing'] = (
    (registration['TEST_ORDER_DT'] - registration['BIRTH_DT'])
    / pd.Timedelta(days=365.2425)
)
#registration.head()

In [12]:
# Load the demographics registration table and collapse it to one row per
# patient; GroupBy.first keeps the first non-null value of each column.
demographics_path = prefix + 'structured_data/REQ_KK71_105304_2_DEMOGRAPHICS_REGISTRATION.csv'
demographics = (
    pd.read_csv(demographics_path, sep=',', encoding='iso-8859-1', low_memory=False)
    .groupby('DFCI_MRN', as_index=False)
    .first()
)
demographics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56869 entries, 0 to 56868
Data columns (total 35 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   DFCI_MRN                       56869 non-null  int64  
 1   DR_REQUEST_SEQ                 56869 non-null  int64  
 2   PATIENT_ID                     56869 non-null  int64  
 3   SRC_DIM_SEQ                    56869 non-null  int64  
 4   MARITAL_STATUS_NM              56869 non-null  object 
 5   RELIGION_NM                    56869 non-null  object 
 6   EDUCATION_LEVEL_NM             56869 non-null  object 
 7   BIRTHPLACE_NM                  52541 non-null  object 
 8   PRIM_LANG_NM                   56869 non-null  object 
 9   WRITTEN_LANG_NM                56869 non-null  object 
 10  ETHNICITY_1_NM                 56837 non-null  object 
 11  ETHNICITY_2_NM                 4744 non-null   object 
 12  ETHNICITY_3_NM                 490 non-null   

In [13]:
# Ordered (indicator column, label) pairs. np.select uses the first matching
# condition, so a patient flagged 'Y' on several indicators receives the label
# that appears earliest in this list.
race_rules = [
    ('RACE_NAT_HAWAIIAN_PACIFIC_IND', 'Native Hawaiian/Pacific Islander'),
    ('RACE_AM_INDIAN_ALASKAN_IND', 'American Indian/Alaskan'),
    ('RACE_BLACK_IND', 'Black'),
    ('RACE_ASIAN_IND', 'Asian'),
    ('RACE_WHITE_IND', 'White'),
]
demographics['race'] = np.select(
    [demographics[flag] == 'Y' for flag, label in race_rules],
    [label for flag, label in race_rules],
    default='Other/unknown',
)

# Keep only the columns used downstream. Note this overwrites `demographics`,
# so the cell cannot be re-run without reloading the table first.
demographics = demographics.loc[:, ['DFCI_MRN', 'race', 'HISPANIC_IND']]

In [14]:
# Attach gender, birth date and age at sequencing (inner join on DFCI_MRN).
demographics = demographics.merge(registration, on='DFCI_MRN')

In [15]:
# Distribution of recorded gender across the merged patient table.
demographics['GENDER_NM'].value_counts()

FEMALE     27226
MALE       24830
UNKNOWN        2
Name: GENDER_NM, dtype: int64

In [16]:
# Use the lower-case key name so the table can be merged on 'dfci_mrn' later.
demographics = demographics.rename({'DFCI_MRN': 'dfci_mrn'}, axis='columns')

In [1]:
#demographics.head()

In [19]:
# Bucket age at sequencing into decade bands.
# A missing age must be tested first: every `<` comparison against NaN is
# False, so without this guard a missing age would fall through to the
# catch-all and be reported as '80+'.
age_years = demographics['age_at_sequencing']
demographics['agecat'] = np.select(
    [
        age_years.isna(),
        age_years < 30,
        age_years < 40,
        age_years < 50,
        age_years < 60,
        age_years < 70,
        age_years < 80,
    ],
    [
        'Unknown',
        '< 30',
        '30-39',
        '40-49',
        '50-59',
        '60-69',
        '70-79',
    ],
    default='80+',  # everything left is a non-missing age of 80 or more
)

In [20]:
# Load the labeled med-onc report annotations. The path is built from `prefix`
# (the profile_3-2023 directory) like every other read in this notebook; the
# previous hard-coded path spelled the Dropbox folder with different casing
# ('HealthCare'), which fails on a case-sensitive filesystem.
reports = pd.read_csv(prefix + 'derived_data/labeled_medonc_prissmm_mixedisprog.csv')

In [21]:
# Attach patient demographics to every report (inner join on dfci_mrn, so
# reports without a matching demographics row are dropped).
reports = reports.merge(demographics, on='dfci_mrn')

In [22]:
# Demographics are now merged onto the report annotations; print the
# resulting column dtypes and non-null counts.
reports.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39191 entries, 0 to 39190
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Unnamed: 0.1              39191 non-null  int64         
 1   dfci_mrn                  39191 non-null  float64       
 2   cancer_type               39191 non-null  object        
 3   date                      39191 non-null  object        
 4   any_cancer                39191 non-null  int64         
 5   progression               39191 non-null  int64         
 6   response                  39191 non-null  int64         
 7   Unnamed: 0                39191 non-null  int64         
 8   text                      39191 non-null  object        
 9   PROVIDER_DEPARTMENT       39191 non-null  object        
 10  patient_id                39191 non-null  int64         
 11  hybrid_death_ind          39191 non-null  object        
 12  hybrid_death_dt   

In [24]:
# Document-level table: one row per report. Work on a copy so `reports`
# itself is left untouched.
report_demographics = reports.copy()
# (n_reports, n_columns)
report_demographics.shape

(39191, 24)

In [25]:
# One row per distinct dfci_mrn, so the first element of the shape is the
# number of patients; dfci_mrn becomes the index, hence one fewer column.
report_demographics.groupby('dfci_mrn').first().shape

(3588, 23)

In [27]:
# Counts and proportions of each variable over all reports (document level).
# Proportions use the total number of reports as the denominator.
document_summary_columns = ['GENDER_NM', 'race', 'HISPANIC_IND', 'agecat',
                            'cancer_type', 'any_cancer', 'progression', 'response']
n_documents = report_demographics.shape[0]
for var in document_summary_columns:
    document_counts = report_demographics[var].value_counts()
    print('document level')
    print(var)
    print(document_counts)
    print(document_counts / n_documents)
    print("\n")

document level
GENDER_NM
MALE      20239
FEMALE    18952
Name: GENDER_NM, dtype: int64
MALE      0.51642
FEMALE    0.48358
Name: GENDER_NM, dtype: float64


document level
race
White                      35054
Black                       1402
Asian                       1399
Other/unknown               1319
American Indian/Alaskan       17
Name: race, dtype: int64
White                      0.894440
Black                      0.035774
Asian                      0.035697
Other/unknown              0.033656
American Indian/Alaskan    0.000434
Name: race, dtype: float64


document level
HISPANIC_IND
N    38301
Y      890
Name: HISPANIC_IND, dtype: int64
N    0.977291
Y    0.022709
Name: HISPANIC_IND, dtype: float64


document level
agecat
60-69    11746
50-59    10237
70-79     7876
40-49     5315
80+       2040
30-39     1743
< 30       234
Name: agecat, dtype: int64
60-69    0.299712
50-59    0.261208
70-79    0.200965
40-49    0.135618
80+      0.052053
30-39    0.044474
< 30     0.005

In [28]:
# Patient-level table: one row per dfci_mrn, taking the first non-null value
# of every column. GroupBy.first already returns a new frame, so no extra
# copy is needed.
# NOTE(review): report-level columns (any_cancer, progression, response) here
# reflect only each patient's first report in file order — confirm intended.
patient_demographics = reports.groupby('dfci_mrn').first()

In [29]:
# Counts and proportions of each variable over patients (one row per dfci_mrn).
# Proportions use the total number of patients as the denominator.
patient_summary_columns = ['GENDER_NM', 'race', 'HISPANIC_IND', 'agecat',
                           'cancer_type', 'any_cancer', 'progression', 'response']
n_patients = patient_demographics.shape[0]
for var in patient_summary_columns:
    patient_counts = patient_demographics[var].value_counts()
    print('patient level')
    print(var)
    print(patient_counts)
    print(patient_counts / n_patients)
    print("\n")

patient level
GENDER_NM
MALE      1897
FEMALE    1691
Name: GENDER_NM, dtype: int64
MALE      0.528707
FEMALE    0.471293
Name: GENDER_NM, dtype: float64


patient level
race
White                      3227
Asian                       123
Black                       119
Other/unknown               115
American Indian/Alaskan       4
Name: race, dtype: int64
White                      0.899387
Asian                      0.034281
Black                      0.033166
Other/unknown              0.032051
American Indian/Alaskan    0.001115
Name: race, dtype: float64


patient level
HISPANIC_IND
N    3504
Y      84
Name: HISPANIC_IND, dtype: int64
N    0.976589
Y    0.023411
Name: HISPANIC_IND, dtype: float64


patient level
agecat
60-69    1106
50-59     874
70-79     779
40-49     405
80+       248
30-39     153
< 30       23
Name: agecat, dtype: int64
60-69    0.308250
50-59    0.243590
70-79    0.217113
40-49    0.112876
80+      0.069119
30-39    0.042642
< 30     0.006410
Name: agecat, 