In [1]:
import scipy as sp
import pandas as pd
import numpy as np
import importlib
import lftmodels
import lftlib
import matplotlib.pyplot as plt
import math
import lmfit
import lftfit
import diagnosis
import re
import pathinc
import discharge
import endoscopy
import meds
import histandphysical
import entity_recognition_colon
import procedure
import encounters
import demographics
from lmfit import Model
import seaborn as sns

from collections import Counter
pd.set_option('display.max_columns', 350)
pd.set_option('display.max_rows', 250)
#pd.set_option('display.max_colwidth', None)
#pd.reset_option('^display.', silent=True)

In [995]:
pd.reset_option('^display.', silent=True)

# Load discharge file

In [2]:
importlib.reload(discharge)

<module 'discharge' from '/Users/pkc17/MGH/RPDR/discharge.py'>

In [3]:
dis_df_col = (
    discharge
    .load_RPDR_dis(
    '/Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Dis.txt', 
    delimiter='|', datetime_col='Report_Date_Time')
)

dis_df_col = dis_df_col.sort_values(['EMPI', 'datetime'])

Reading from : /Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Dis_multiline_corrected.txt


In [4]:
## Extract admit and discharge dates from discarge summary report text

def extract_admit_date(text):
    
    admit_date = re.findall(r'((?i)Admit Date:|(?i)Admission Date:|(?i)Arrival Date:|(?i)Admission:|(?i)Admit:|ADM:)\s*(\d+/\d+/\d+)',text)
    
    if len(admit_date)>0:
        return admit_date[0][1]
    else:
        np.nan
        
def extract_dis_date(text):
    
    dis_date = re.findall(r'((?i)Discharge Date:|(?i)Departure Date:|(?i)Discharge:|(?i)DIS. DATE:|Date of Discharge|D\/C:)\s*(\d+/\d+/\d+)',text)
    
    if len(dis_date)>0:
        return dis_date[0][1]
    else:
        np.nan

In [1]:
dis_df_col['Admit_Date'] = np.nan
dis_df_col['Discharge_Date'] = np.nan

dis_df_col['Admit_Date'] = dis_df_col['Report_Text'].apply(lambda x: extract_admit_date(x))
dis_df_col['Discharge_Date'] = dis_df_col['Report_Text'].apply(lambda x: extract_dis_date(x))

dis_df_col['Admit_Date'] = pd.to_datetime(dis_df_col['Admit_Date'], errors='coerce')
dis_df_col[(dis_df_col['Admit_Date']<'1975')|(dis_df_col['Admit_Date']>'2022')] = np.nan

dis_df_col['Discharge_Date'] = pd.to_datetime(dis_df_col['Discharge_Date'], errors='coerce')
dis_df_col[(
    (dis_df_col['Discharge_Date']<'1975')|(dis_df_col['Discharge_Date']>'2022')|
    ((dis_df_col['Discharge_Date']-dis_df_col['Admit_Date']).dt.days<0))] = np.nan

# Load history and physical file

In [4]:
importlib.reload(histandphysical)

<module 'histandphysical' from '/Users/pkc17/MGH/RPDR/histandphysical.py'>

In [10]:
hnp_df_col = histandphysical.load_RPDR_hnp('/Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Hnp.txt', delimiter='|', datetime_col='Report_Date_Time')
hnp_df_col = hnp_df_col.sort_values(['EMPI', 'datetime'])

# Load encounters file

In [None]:
enc_df_col = encounters.load_RPDR_enc(
    '/Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Enc.txt', delimiter='|')

enc_df_col['Admit_Date'] = pd.to_datetime(enc_df_col['Admit_Date'])
enc_df_col['Discharge_Date'] = pd.to_datetime(enc_df_col['Discharge_Date'])

enc_df_col.loc[enc_df_col['Discharge_Date']<'1900','Discharge_Date'] = None

enc_df_col = enc_df_col.sort_values(['EMPI', 'Admit_Date'])

In [None]:
uc_symptoms = ('diarrhea|blood|bleeding|abdominal pain|fever|nausea|vomiting|bloating|stool|'+
               'constipation|anemia|appetite|fatigue|cramp|flatulence|weight|erythema'+
              'renal|anal|rectal|rectum|bowel|polyp|colon|gastrointestinal|intestinal|colitis')

uc_icd_codes = pd.read_csv('/Users/pkc17/MGH/IBD_RPDR/Ulcerative_colitis_(UC).csv')
uc_icd_codes = uc_icd_codes[uc_icd_codes['ICD code'].str.contains('ICD', case=True, na=False)].copy()

uc_icd_codes_list = uc_icd_codes['ICD code'].str.split(':').apply(lambda x: x[1])

diagnosis_colnames = ['Diagnosis_1', 'Diagnosis_2', 
        'Diagnosis_3', 'Diagnosis_4', 'Diagnosis_5', 'Diagnosis_6', 
        'Diagnosis_7', 'Diagnosis_8', 'Diagnosis_9', 'Diagnosis_10']

fil_uc_icd_prncp = (enc_df_col['Principal_Diagnosis']
              .str.contains('|'.join(uc_icd_codes_list) + '|K51', case=True, na=False)
             )

fil_uc_icd_scndr_1 = (enc_df_col[diagnosis_colnames]
              .apply(lambda row: ', '.join(row.values.astype(str)), axis=1)
              .str.contains('|'.join(uc_icd_codes_list) + '|K51', case=True, na=False)
             )
fil_uc_icd_scndr_2 = enc_df_col['Principal_Diagnosis'].str.contains(uc_symptoms, case=False, na=False)

fil_uc_icd = (fil_uc_icd_prncp | (fil_uc_icd_scndr_1&fil_uc_icd_scndr_2))
# fil_uc_icd = fil_uc_icd_prncp


enc_df_col['UC_enc_diagnosis'] = np.nan
enc_df_col.loc[fil_uc_icd, 'UC_enc_diagnosis'] = True
enc_df_col_uc = enc_df_col[fil_uc_icd].copy()

# Load procedure file

In [7]:
importlib.reload(procedure)

<module 'procedure' from '/Users/pkc17/MGH/RPDR/procedure.py'>

In [None]:
prc_df_col = procedure.load_RPDR_prc('/Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Prc.txt')
prc_df_col = prc_df_col.sort_values(['EMPI', 'datetime'])

In [None]:
prc_df_col['Procedure'] = prc_df_col['Code_Type'] + '-' + prc_df_col['Code']

filter_colectomy = prc_df_col['Procedure_Name'].str.contains('colectomy', case=False, na=False)

In [None]:
prc_colectomy = pd.read_csv('/Users/pkc17/MGH/Colon_RPDR_analysis/prc_colectomy.csv', index_col=0)

prc_colectomy['Procedure'] = prc_colectomy['Code_Type'] + '-' + prc_colectomy['Code']

prc_df_col['Colectomy'] = np.nan
prc_df_col.loc[prc_df_col['Procedure'].isin(prc_colectomy['Procedure']), 'Colectomy'] = True

prc_df_col_uc = prc_df_col[prc_df_col['Colectomy']==True].copy()

prc_df_col_uc = prc_df_col_uc.sort_values(['EMPI', 'datetime'])

# Load diagnosis file

In [9]:
importlib.reload(diagnosis)

<module 'diagnosis' from '/Users/pkc17/MGH/RPDR/diagnosis.py'>

In [11]:
diag_df_col = diagnosis.load_RPDR_diag(
    '/Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Dia.txt')
diag_df_col = diag_df_col.sort_values(['EMPI', 'datetime'])

In [12]:
diag_df_col_ph = diagnosis.load_RPDR_diag(
    '/Users/pkc17/MGH/IBD_RPDR/data/UC_phenotype/processed/HK961_20210924_171115_Dia.txt')
diag_df_col_ph = diag_df_col_ph.sort_values(['EMPI', 'datetime'])

In [13]:
uc_icd_codes = pd.read_csv('/Users/pkc17/MGH/IBD_RPDR/Ulcerative_colitis_(UC).csv')
uc_icd_codes['Code_Type'] = uc_icd_codes['ICD code'].str.split(':').apply(lambda x: x[0])
uc_icd_codes['Code'] = uc_icd_codes['ICD code'].str.split(':').apply(lambda x: x[1])

fil_uc_icd9 = diag_df_col['Code'].str.contains('|'.join(uc_icd_codes.loc[uc_icd_codes['Code_Type']=='ICD9', 'Code']), case=True, na=False)
fil_uc_icd10 = diag_df_col['Code'].str.contains('|'.join(uc_icd_codes.loc[uc_icd_codes['Code_Type']=='ICD10', 'Code']) + '|K51', case=True, na=False)
fil_uc_lpa = diag_df_col['Code'].str.contains('|'.join(uc_icd_codes.loc[uc_icd_codes['Code_Type']=='LPA', 'Code']), case=True, na=False)
fil_uc_oda = diag_df_col['Code'].str.contains('|'.join(uc_icd_codes.loc[uc_icd_codes['Code_Type']=='ODA', 'Code']), case=True, na=False)

diag_df_col['UC_diagnosis'] = np.nan
diag_df_col.loc[((diag_df_col['Code_Type']=='ICD9')&(fil_uc_icd9)) |
           ((diag_df_col['Code_Type']=='ICD10')&(fil_uc_icd10)) |
           ((diag_df_col['Code_Type']=='LMR')&(fil_uc_lpa)) |
           ((diag_df_col['Code_Type']=='Oncall')&(fil_uc_oda)),'UC_diagnosis'] = True

diag_df_col_uc = diag_df_col[diag_df_col['UC_diagnosis']==True].copy()


# Load Operative Notes

In [16]:
opn_df_col = endoscopy.load_RPDR_endo('/Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Opn.txt', delimiter='|', datetime_col='Report_Date_Time')


Reading from : /Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Opn_multiline_corrected.txt


In [17]:
opn_df_col = opn_df_col.sort_values(['EMPI','datetime'])

# Load demographics

In [825]:
importlib.reload(demographics)

<module 'demographics' from '/Users/pkc17/MGH/RPDR/demographics.py'>

In [14]:
dem_df_col = demographics.load_RPDR_dem(
    '/Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Dem.txt', delimiter='|')

In [15]:
dem_df_col_ph = demographics.load_RPDR_dem(
    '/Users/pkc17/MGH/IBD_RPDR/data/UC_phenotype/processed/HK961_20210924_171115_Dem.txt', delimiter='|')

In [16]:
dem_df_col['Race'] = dem_df_col['Race'].str.lower()

filter_1 = (dem_df_col['Race'].str.startswith('african american', na=False) | 
            dem_df_col['Race'].str.startswith('black', na=False))
dem_df_col.loc[filter_1, 'Race'] = 'African American'

filter_2 = (dem_df_col['Race'].str.startswith('american indian', na=False))
dem_df_col.loc[filter_2, 'Race'] = 'American Indian'

filter_3 = (dem_df_col['Race'].str.startswith('asian', na=False) | 
            dem_df_col['Race'].str.startswith('indian', na=False))
dem_df_col.loc[filter_3, 'Race'] = 'Asian'

filter_4 = (dem_df_col['Race'].str.startswith('dominican', na=False) | 
            dem_df_col['Race'].str.startswith('hispanic', na=False))
dem_df_col.loc[filter_4, 'Race'] = 'Hispanic'

filter_5 = (dem_df_col['Race'].str.startswith('hawaiian', na=False))
dem_df_col.loc[filter_5, 'Race'] = 'Hawaiian or other pacific islander'

filter_6 = (dem_df_col['Race'].str.startswith('european', na=False) | 
            dem_df_col['Race'].str.startswith('white', na=False))
dem_df_col.loc[filter_6, 'Race'] = 'Caucasian'

filter_7 = (dem_df_col['Race'].str.startswith('not recorded', na=False))
dem_df_col.loc[filter_7, 'Race'] = 'Not recorded'

filter_8 = (dem_df_col['Race'].str.startswith('other', na=False) |
           dem_df_col['Race'].str.startswith('multiracial', na=False))
dem_df_col.loc[filter_8, 'Race'] = 'Other'

In [17]:
dem_df_col.loc[dem_df_col['Zip_code']=='@', 'Zip_code'] = np.nan

In [644]:
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=True)

def zip_city(x):
    srch = search.by_zipcode(int(x))
    if srch:
        city, state = srch.major_city, srch.state
        return city + ', ' + state
    else:
        return np.nan

def zip_county(x):
    srch = search.by_zipcode(int(x))
    if srch:
        city, state = srch.county, srch.state
        return city + ', ' + state
    else:
        return np.nan
    
def zip_lat(x):
    srch = search.by_zipcode(int(x))
    if srch:
        latitude = srch.lat
        return latitude
    else:
        return np.nan
    
def zip_lng(x):
    srch = search.by_zipcode(int(x))
    if srch:
        longitude = srch.lng
        return longitude
    else:
        return np.nan

In [645]:
# dem_df_col['City'] = dem_df_col['Zip_code'].fillna(0).astype(int).astype(str).apply(zip_city)
# dem_df_col['County'] = dem_df_col['Zip_code'].fillna(0).astype(int).astype(str).apply(zip_county)
# dem_df_col['latitude'] = dem_df_col['Zip_code'].fillna(0).astype(int).astype(str).apply(zip_lat)
# dem_df_col['longitude'] = dem_df_col['Zip_code'].fillna(0).astype(int).astype(str).apply(zip_lng)

# Load medications

In [643]:
importlib.reload(meds)

<module 'meds' from '/Users/pkc17/MGH/RPDR/meds.py'>

In [18]:
meds_df_col = meds.load_RPDR_meds('/Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Med.txt')
meds_df_col = meds_df_col.sort_values(['EMPI', 'datetime'])

# Load pathology reports

In [15]:
importlib.reload(pathinc)

<module 'pathinc' from '/Users/pkc17/MGH/RPDR/pathinc.py'>

In [19]:
# Load path reports

path_df_col = pathinc.load_RPDR_path(
    '/Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Pat.txt', 
    delimiter='|', datetime_col='Report_Date_Time')

path_df_col = path_df_col.sort_values(['EMPI', 'datetime'])

Reading from : /Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_Pat_multiline_corrected.txt


In [20]:
# Truncate final dx

path_df2_col = pathinc.truncate_finaldx(path_df_col.copy(), update=True)

Truncating to only final diagnosis...
Updating input path dataframe with truncated path reports


In [21]:
# Truncate extra details

path_df3_col = pathinc.truncate_lower(path_df2_col.copy(), update=True)

Updating input path dataframe with truncated path reports


In [608]:
importlib.reload(entity_recognition)

<module 'entity_recognition' from '/Users/pkc17/MGH/RPDR/entity_recognition.py'>

In [22]:
# Entity recognition

%time path_df4_col = entity_recognition.is_colitis(path_df3_col.copy(), update=True, only_truncated=True)

Updating input path dataframe
CPU times: user 1h 32min 15s, sys: 58.1 s, total: 1h 33min 13s
Wall time: 1h 34min 3s


# Load endoscopy notes file

In [212]:
importlib.reload(endoscopy)

<module 'endoscopy' from '/Users/pkc17/MGH/RPDR/endoscopy.py'>

In [23]:
endo_df_col = endoscopy.load_RPDR_endo(
    '/Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_End.txt', 
    delimiter='|', datetime_col='Report_Date_Time')

endo_df_col = endo_df_col.sort_values(['EMPI', 'datetime'])

Reading from : /Users/pkc17/MGH/IBD_RPDR/data/UC_all/processed/HK961_20210421_170204_End_multiline_corrected.txt


In [24]:
%time endo_df_col_2 = endoscopy.truncate_dx_start(endo_df_col.copy(), update=True)

Truncating to only final diagnosis...
CPU times: user 23.7 s, sys: 292 ms, total: 24 s
Wall time: 24.5 s


In [25]:
%time endo_df_col_3 = endoscopy.truncate_dx_end(endo_df_col_2.copy(), update=True)

Updating input path dataframe with truncated MGH, BWH path reports
CPU times: user 12.7 s, sys: 155 ms, total: 12.9 s
Wall time: 13.1 s


In [610]:
importlib.reload(entity_recognition)

<module 'entity_recognition' from '/Users/pkc17/MGH/RPDR/entity_recognition.py'>

In [26]:
%time endo_df_col_4 = entity_recognition.is_colitis(endo_df_col_3.copy(), update=True)

Updating input path dataframe
CPU times: user 1h 18min 42s, sys: 20.8 s, total: 1h 19min 3s
Wall time: 1h 28min 53s


In [27]:
endo_df_col_4 = endo_df_col_4.sort_values(['EMPI', 'datetime'])

## Crohns, Granuloma exclusions

In [29]:
# Excludes all patients with a history of crohns or granuloma

def nonuc_exclusions(df, endoscopy_df, pathology_df):
    
    '''
    Inputs:
    df: dataframe to be processed which has a unique identifier 'EMPI'
    endoscopy_df: endoscopy dataframe with the feature 'crohns' (specifying if the crohns is present)
    pathology_df: pathology dataframe with the feature 'granuloma' (specifying if the granuloma is present)
    
    Outputs:
    Subset of the original file with all crohns and granuloma patients exlcuded
    '''
    
    crohns_exclusion_empis = (
        endoscopy_df
        .loc[
            (endoscopy_df['crohns']==True)
            , 'EMPI']
        .drop_duplicates()
    )
    
    
    filter_mc_exc = pathology_df.loc[:,
                            ['microscopic_colitis', 'collagenous_colitis', 'lymphocytic_colitis']].sum(axis=1)>=1
    mc_exclusion_empis = (
        pathology_df
        .loc[
            filter_mc_exc
            , 'EMPI']
        .drop_duplicates()
    )
    
    granuloma_exclusion_empis = (
        pathology_df
        .loc[
            (pathology_df['granuloma']==True)
            , 'EMPI']
        .drop_duplicates()
    )
    
    print('Total number of patients in the original input data file:', df['EMPI'].nunique())
    
    print("\nCrohn's exclusions:", df.loc[df['EMPI'].isin(crohns_exclusion_empis), 'EMPI'].nunique())
    print("Granuloma's exclusions:", df.loc[df['EMPI'].isin(granuloma_exclusion_empis), 'EMPI'].nunique())
    
    print("Crohn's and Granuloma's overlap:", 
          df.loc[(
              df['EMPI'].isin(crohns_exclusion_empis) &
              df['EMPI'].isin(granuloma_exclusion_empis) 
          ), 'EMPI'].nunique())
    
    print("Microscopic Colitis exclusions:", df.loc[df['EMPI'].isin(mc_exclusion_empis), 'EMPI'].nunique())

    df = df[~df['EMPI'].isin(crohns_exclusion_empis)].copy()
    df = df[~df['EMPI'].isin(granuloma_exclusion_empis)].copy()
    df = df[~df['EMPI'].isin(mc_exclusion_empis)].copy()

    print('\nTotal number of patients after exclusions:', df['EMPI'].nunique())
    
    return df


In [30]:
endo_df_col_4 = endo_df_col_4.sort_values(['EMPI', 'datetime'])

# Only including those observations in the endoscopy file which are positive for Ulcerative Colitis 
uc_cols = ['ulcerative_colitis', 'pan_colitis', 'proctitis', 'proctosigmoiditis', 'left_sided_colitis']
filter_uc = (endo_df_col_4.loc[:,uc_cols]
    .sum(axis=1)>=1
)
endo_df_uc = endo_df_col_4[filter_uc].copy()
endo_df_uc['UC_any'] = True

# Excluding crohns and granuloma
endo_df_uc = nonuc_exclusions(endo_df_uc, endo_df_col_4, path_df4_col)

# Subset of UC cases containing only the first ever dx
endo_df_uc = endo_df_uc.sort_values(['EMPI', 'datetime'])
endo_df_uc_nodup = endo_df_uc.drop_duplicates('EMPI', keep='first').copy()

Total number of patients in the original input data file: 4232

Crohn's exclusions: 665
Granuloma's exclusions: 741
Crohn's and Granuloma's overlap: 241
Microscopic Colitis exclusions: 55

Total number of patients after exclusions: 3028


## ER Model

In [35]:
import spacy
from negspacy.negation import Negex
from negspacy.termsets import termset

ts = termset("en_clinical")

config={
        "neg_termset":{
            "pseudo_negations": ts.terms['pseudo_negations'] + ['not limited to', 'not excluded', 'needs to be ruled out', 'although not apparent'],
            "preceding_negations": ts.terms['preceding_negations'] + ['negative', 'insufficient', 'without evidence of', 'rather than', 'history'],
            "following_negations": ts.terms['following_negations'] + ['negative', 'unremarkable', 'ruled out', 'less likely', 'is not', 'are not', 'does not', 'have not', 'was not', 'were not', 'absent', 'not present'],
            "termination": ts.terms['termination'] + ['note:', ';', ', negative', ',negative']
        }
    }


corpus = 'en_core_sci_lg' # en_ner_bc5cdr_md, en_core_sci_md, en_core_sci_lg

nlp_2 = spacy.load(corpus) 

nlp_2.add_pipe(
    "negex",
    config = config
)


<negspacy.negation.Negex at 0x7f799ead3490>

In [36]:
text  = '''

Impression:   - Mild changes of colitis, infectious versus recurrent inactive
              ulcerative colitis, with edema, granularity and blunting of
              vascular pattern. Biopsies obtained in distal transverse
              colon, sigmoid and rectum. The inflammation was minor
'''.lower()

In [179]:
importlib.reload(entity_recognition_colon)

<module 'entity_recognition_colon' from '/Users/pkc17/MGH/RPDR/entity_recognition_colon.py'>

In [180]:
print(entity_recognition_colon.entities(text, nlp=nlp_2))

impression True
mild True
changes True
colitis (mild) (inactive) True
infectious True
recurrent False
ulcerativecolitis (mild) (inactive) False
edema False
granularity False
blunting False
vascular pattern False
biopsies True
distal transverse colon True
sigmoid True
rectum True
inflammation (minimal-inflammation) True



## Chart Review

In [804]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_colwidth', None)

In [None]:
pd.reset_option('^display.', silent=True)

In [765]:
endo_diag_col.loc[:,['EMPI','MRN']].drop_duplicates()[0:250]

Unnamed: 0,EMPI,MRN
0,100000623,1191043.0
26,100000623,151209.0
45,100002259,449454.0
51,100002259,
63,100003555,626754.0
80,100005154,824623.0
82,100005154,
100,100005214,52588.0
104,100005222,833244.0
105,100005222,


In [770]:
selected_empi = '100012050'

dem_df_col_ph['EMPI'].isin([selected_empi]).sum()

1

In [771]:
dem_df_col[dem_df_col['EMPI'].isin([selected_empi])]

Unnamed: 0,EMPI,EPIC_PMRN,MRN_Type,MRN,Gender,Date_of_Birth,Age,Language,Race,Marital_status,Religion,Is_a_veteran,Zip_code,Country,Vital_status,Date_Of_Death
12931,100012050,10020220000.0,"PMRN, MGH, BWH, FH, NWH","10020217104, 1400023, 01448844, 01078564, 0033...",Male,1950-05-06,70,English-ENGLISH,Caucasian,Married-MARRIED,Non-denominational-NON-DENOM,Yes,2492,United States-UNITED STATES,Not reported as deceased,


In [760]:
dis_df_col[dis_df_col['EMPI']==selected_empi]

Unnamed: 0_level_0,EMPI,EPIC_PMRN,MRN_Type,MRN,Report_Number,Report_Date_Time,Report_Description,Report_Status,Report_Type,Report_Text,datetime,Admit_Date,Discharge_Date
unique_report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100005154_3315282,100005154,10024663782,BWH,824623,3315282,10/6/2008 12:00:00 AM,Discharge Summary,F,BDISDIS,Admission Date: 10/06/2008 ...,2008-10-06,2008-10-06,2008-10-09
100005154_4075163,100005154,10024663782,BWH,824623,4075163,6/23/2010 12:00:00 AM,Discharge Summary,F,BDISDIS,Admission Date: 06/23/2010 ...,2010-06-23,2010-06-23,2010-06-25
100005154_BWH-12521-D-1,100005154,10024663782,BWH,824623,BWH-12521-D-1,6/26/2010 12:00:00 AM,Discharge Summary,F,BDISDIS,"ATTENDING: COUPER, GREGORY STEPHEN MD\n \n \nB...",2010-06-26,NaT,NaT


In [285]:
# print(dis_df_col.loc['100026460_1580409761', 'Report_Text'])

In [761]:
enc_df_col[enc_df_col['EMPI']==selected_empi]

Unnamed: 0,EMPI,EPIC_PMRN,MRN_Type,MRN,Encounter_number,Encounter_Status,Hospital,Inpatient_Outpatient,Service_Line,Attending_MD,Admit_Date,Discharge_Date,LOS_Days,Clinic_Name,Admit_Source,Discharge_Disposition,Payor,Admitting_Diagnosis,Principal_Diagnosis,Diagnosis_1,Diagnosis_2,Diagnosis_3,Diagnosis_4,Diagnosis_5,Diagnosis_6,Diagnosis_7,Diagnosis_8,Diagnosis_9,Diagnosis_10,DRG,Patient_Type,Referrer_Discipline


In [762]:
endo_df_col_4[endo_df_col_4['EMPI']==selected_empi]

Unnamed: 0_level_0,EMPI,EPIC_PMRN,MRN_Type,MRN,Report_Number,Report_Date_Time,Report_Description,Report_Status,Report_Type,Report_Text,datetime,has_dx_start,dx_start_line,has_dx_end,dx_end_line,dx_end_line_LAFD,erythema,marked_erythema,inflammation,mild_inflammation,moderate_inflammation,severe_inflammation,loss_vasculature,dec_vasculature,granularity,ulceration,friability,mild_friability,spont_bleeding,adherent_blood,erosion,congestion,edema,pseudopolyp,crohns,superficial_ulcer,shallow_ulcer,aphthous_ulcer,small_ulcer,large_ulcer,deep_ulcer,mild_ulcer,colitis,chronic_colitis,mild_colitis,moderate_colitis,severe_colitis,active_colitis,inactive_colitis,acute_colitis,ulcerative_colitis,pan_colitis,proctitis,proctosigmoiditis,left_sided_colitis,active_ileitis,chronic_ileitis,arch_distortion,basal_plasmacytosis,active_enteritis,chronic_enteritis,crypt_abscess,crypt_atrophy,cryptitis,lymphoid_agg,lamina_propria,granuloma,noncaseating_gran,nonnecrotizing_gran,paneth_cell,cdiff,cmv,mayo_score,disease_list
unique_report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1
100005154_1,100005154,10024663782,BWH,824623,1,11/11/1998 6:10:00 PM,ENDOSCOPY,Signed,NDO,FINDINGS:\nSURFACE & MUCOSA: 1 cm polypod mass...,1998-11-11 18:10:00,True,FINDINGS:,True,RECOMMENDATION: Follow-up with me in 5 days.,19.0,True,,,,,,,,,,True,,,,,,,True,,,,,,,,,True,True,,,,True,,,True,,,,True,,,,,,,,,,,,,,,,,,,"[pseudopolyp snared True, erythema True, friab..."
100005154_2,100005154,10024663782,BWH,824623,2,9/10/2001 11:24:00 AM,ENDOSCOPY,Signed,NDO,FINDINGS: Mucosa intact and normal aside from...,2001-09-10 11:24:00,True,FINDINGS: Mucosa intact and normal aside from...,True,RECOMMENDATION:,13.0,,,,,,,,,True,,,,,,,,,,,,,,,,,,True,,,,,,,,True,,,,,,,,,,,,,,,,,,,,,,,"[granularity True, ulcerativecolitis True]"
100005154_3,100005154,10024663782,BWH,824623,3,3/2/2004 2:28:00 AM,COLONOSCOPY,Signed,NDO,FINDINGS: Mucosa in rectum normal with normal...,2004-03-02 02:28:00,True,FINDINGS: Mucosa in rectum normal with normal...,True,PROCEDURE CODES:,12.0,False,,,,,,,,False,,False,,,,,,,,,,,,,,,,True,,,,,,,,True,,,,,,,,,,,,,,,,,,,,,,,[ulcerativecolitis True]
100005154_4,100005154,10024663782,BWH,824623,4,3/22/2005 8:53:00 AM,COLONOSCOPY,Signed,NDO,FINDINGS: Mucosa intact with some pallor; no ...,2005-03-22 08:53:00,True,FINDINGS: Mucosa intact with some pallor; no ...,True,RECOMMENDATION:,13.0,,,,,,,,,,,,,,,,,,,,,,,,,,,True,,,,,,,,True,,,,,,,,,,,,,,,,,,,,,,,[ulcerativecolitis True]
100005154_5,100005154,10024663782,BWH,824623,5,9/14/2006 5:54:00 PM,COLONOSCOPY,Signed,NDO,FINDINGS: Inactive healed colitis in the tran...,2006-09-14 17:54:00,True,FINDINGS: Inactive healed colitis in the tran...,True,RECOMMENDATION:,13.0,,,,,,,,,,,,,,,,,,,,,,,,,,,True,,,,,,True,,,,,,,,,,,,,,,,,,,,,,,,,[inactive_healed-colitis True]
100005154_15493,100005154,10024663782,BWH,824623,15493,9/11/2008 6:02:00 AM,COLONOSCOPY,Signed,NDO,Findings:\n The perianal and digital recta...,2008-09-11 06:02:00,True,Findings:,True,Recommendation: - Await pathology results.,26.0,,,,,,,,,,,,,,,,,,,,,,,,,,,True,,,,,,,,True,,,,,,,,,,,,,,,,,,,,,,,"[ulcerativecolitis True, ulcerativecolitis True]"
100005154_36045,100005154,10024663782,BWH,824623,36045,11/12/2009 1:09:00 PM,COLONOSCOPY,Signed,NDO,Findings:\n The perianal and digital recta...,2009-11-12 13:09:00,True,Findings:,True,Recommendation: - Await pathology results.,21.0,,,,,,,,,,,,,,,,,,,,,,,,,,,True,,,,,,,,True,,,,,,,,,,,,,,,,,,,,,,,"[ulcerativecolitis True, ulcerativecolitis True]"


In [749]:
print(endo_df_col_4.loc['100451944_1', 'Report_Text'])

FINDINGS:
SURFACE & MUCOSA: friable from anal verge to 4 cm-bipsied; normal 
from 4 to 45  cm with bipsies from 20-45 cm
MASSES & POLYPS: There were no masses or polyps found.  
LUMEN & VASCULAR: There were no vascular abnormalities noted.  
 
COMMENT: Active inflammation in rectum only; will start on rowasa 
supp qhs
 
there were no complications associated with the study.  
 
IMPRESSION: 
1.  Ulcerative colitis.  556.9.
 


In [750]:
path_df4_col[path_df4_col['EMPI']==selected_empi]

Unnamed: 0_level_0,EMPI,EPIC_PMRN,MRN_Type,MRN,Report_Number,Report_Date_Time,Report_Description,Report_Status,Report_Type,Report_Text,datetime,has_final_diagnosis,final_diagnosis_line,has_lowersec,lowersec_line,lowersec_start_LAFD,erythema,marked_erythema,inflammation,mild_inflammation,moderate_inflammation,severe_inflammation,loss_vasculature,dec_vasculature,granularity,ulceration,friability,mild_friability,spont_bleeding,adherent_blood,erosion,congestion,edema,pseudopolyp,crohns,superficial_ulcer,shallow_ulcer,aphthous_ulcer,small_ulcer,large_ulcer,deep_ulcer,mild_ulcer,colitis,chronic_colitis,mild_colitis,moderate_colitis,severe_colitis,active_colitis,inactive_colitis,acute_colitis,ulcerative_colitis,pan_colitis,proctitis,proctosigmoiditis,left_sided_colitis,active_ileitis,chronic_ileitis,arch_distortion,basal_plasmacytosis,active_enteritis,chronic_enteritis,crypt_abscess,crypt_atrophy,cryptitis,lymphoid_agg,lamina_propria,granuloma,noncaseating_gran,nonnecrotizing_gran,paneth_cell,cdiff,cmv,mayo_score,disease_list
unique_report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1
100451944_C9006087J,100451944,10064193062,BWH,8823700,C9006087J,2/15/1990 12:00:00 AM,Cytology,Final,PAT,Accession Number: C9006087J ...,1990-02-15,False,,False,,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100451944_G9033260J,100451944,10064193062,BWH,8823700,G9033260J,3/15/1990 12:00:00 AM,Surgical Pathology,Final,PAT,"DIAGNOSIS: by CIBAS,EDMUND SAULIUS,M.D.\n...",1990-03-15,True,"DIAGNOSIS: by CIBAS,EDMUND SAULIUS,M.D.",True,CLINICAL DATA:,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,[]
100451944_G9033551J,100451944,10064193062,BWH,8823700,G9033551J,3/22/1990 12:00:00 AM,Surgical Pathology,Final,PAT,"DIAGNOSIS: by CIBAS,EDMUND SAULIUS,M.D.\n...",1990-03-22,True,"DIAGNOSIS: by CIBAS,EDMUND SAULIUS,M.D.",True,CLINICAL DATA:,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,[]
100451944_C9028067J,100451944,10064193062,BWH,8823700,C9028067J,7/23/1990 12:00:00 AM,Cytology,Final,PAT,Accession Number: C9028067J ...,1990-07-23,False,,False,,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100451944_C9041862J,100451944,10064193062,BWH,8823700,C9041862J,10/22/1990 12:00:00 AM,Cytology,Final,PAT,Accession Number: C9041862J ...,1990-10-22,False,,False,,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100451944_C9102573J,100451944,10064193062,BWH,8823700,C9102573J,1/16/1991 12:00:00 AM,Cytology,Final,PAT,Accession Number: C9102573J ...,1991-01-16,False,,False,,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100451944_S9905176M,100451944,10064193062,BWH,8823700,S9905176M,2/24/1999 12:00:00 AM,Surgical Pathology,Final,PAT,"DIAGNOSIS: by GOLDMAN,HARVEY,M.D.\n#1. CO...",1999-02-24,True,"DIAGNOSIS: by GOLDMAN,HARVEY,M.D.",True,CLINICAL DATA:,8.0,,,,,,,,,,,,,,,,,,,,,,,,,,,True,True,,,,True,,,,,,,,,,,,,,,,,,,False,,,,,,,[chronic active-colitis True]


In [751]:
opn_df_col[opn_df_col['EMPI']==selected_empi]

Unnamed: 0_level_0,EMPI,EPIC_PMRN,MRN_Type,MRN,Report_Number,Report_Date_Time,Report_Description,Report_Status,Report_Type,Report_Text,datetime
unique_report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [419]:
# print(opn_df_col.loc['103652637_4303666637', 'Report_Text'])

In [752]:
prc_df_col[(prc_df_col['EMPI']==selected_empi)]

Unnamed: 0,EMPI,EPIC_PMRN,MRN_Type,MRN,Date,Procedure_Name,Code_Type,Code,Procedure_Flag,Quantity,Provider,Clinic,Hospital,Inpatient_Outpatient,Encounter_number,datetime,Procedure,Colectomy
7085858,100451944,10064193062,BWH,8823700.0,7/1/1997,Induction of labor by artificial rupture of me...,ICD9,73.01,,,,not recorded,BWH,Inpatient,TSI-BWH-83247100018,1997-07-01,ICD9-73.01,
7085862,100451944,10064193062,BWH,8823700.0,7/1/1997,Medical induction of labor,ICD9,73.4,,,"Monsein, Merle Ellen, MD",not recorded,BWH,Inpatient,TSI-BWH-83247100018,1997-07-01,ICD9-73.4,
7085891,100451944,10064193062,BWH,8823700.0,7/1/1997,"Fetal monitoring, not otherwise specified",ICD9,75.34,,,"Monsein, Merle Ellen, MD",not recorded,BWH,Inpatient,TSI-BWH-83247100018,1997-07-01,ICD9-75.34,
7085925,100451944,10064193062,BWH,8823700.0,7/1/1997,Repair of other current obstetric laceration,ICD9,75.69,Primary,,,not recorded,BWH,Inpatient,TSI-BWH-83247100018,1997-07-01,ICD9-75.69,
6925274,100451944,10064193062,BWH,8823700.0,12/10/1998,"Urinalysis, by dip stick or tablet reagent for...",CPT,81003,,1.0,BWH:0000000,One Brookline Place (124),BWH,Outpatient,TSI-BWH-83247100019,1998-12-10,CPT-81003,
6931939,100451944,10064193062,BWH,8823700.0,12/10/1998,Chloride; blood,CPT,82435,,1.0,BWH:0000000,One Brookline Place (124),BWH,Outpatient,TSI-BWH-83247100019,1998-12-10,CPT-82435,
6932542,100451944,10064193062,BWH,8823700.0,12/10/1998,"Cholesterol, serum or whole blood, total",CPT,82465,,1.0,BWH:0000000,One Brookline Place (124),BWH,Outpatient,TSI-BWH-83247100019,1998-12-10,CPT-82465,
6933909,100451944,10064193062,BWH,8823700.0,12/10/1998,Creatinine; blood,CPT,82565,,1.0,BWH:0000000,One Brookline Place (124),BWH,Outpatient,TSI-BWH-83247100019,1998-12-10,CPT-82565,
6937786,100451944,10064193062,BWH,8823700.0,12/10/1998,"Glucose; quantitative, blood (except reagent s...",CPT,82947,,1.0,BWH:0000000,One Brookline Place (124),BWH,Outpatient,TSI-BWH-83247100019,1998-12-10,CPT-82947,
6947801,100451944,10064193062,BWH,8823700.0,12/10/1998,Potassium; serum,CPT,84132,,1.0,BWH:0000000,One Brookline Place (124),BWH,Outpatient,TSI-BWH-83247100019,1998-12-10,CPT-84132,


## Function test

In [257]:
text = '''Findings:
     continuous erythema mucus linear erosions in distal 60 cm 
     thru the descending colon transverse , ascending and cecum 
     normal- biopsies of ascend-cecum, transv, descend, 
     rectosigmoid
Impression:          moderate active left side UC'''

In [67]:
bool(re.search(r'\b(?:(left\w*)\W+(?:\w+\W+){0,4}?(\w*colitis)|(\w*colitis)\W+(?:\w+\W+){0,3}?(left\w*))\b', text))

False

In [259]:
importlib.reload(entity_recognition)

<module 'entity_recognition' from '/Users/pkc17/MGH/RPDR/entity_recognition.py'>

In [261]:
print(entity_recognition.entities(text, nlp_2))

findings True
continuous erythema mucus linear erosions True
distal True
descending colon transverse True
ascending True
cecum True
biopsies True
ascend-cecum True
transv True
rectosigmoid True
impression True
moderate True
active True
ulcerativecolitis (left-sided colitis) (moderate) (active) True



In [1824]:
filter_sig = endo_df_col_4['Report_Text'].str.contains('sigmoiditis', case=False, na=False)
filter_proc = endo_df_col_4['Report_Text'].str.contains('proctitis', case=False, na=False)
filter_ulc = endo_df_col_4['Report_Text'].str.contains('Ulcerative', case=False, na=False)


In [2]:
for index, row in endo_df_col_4.loc[filter_sig].sample(50).iterrows():
    
    
#     bool_1 = bool(re.search(r'\b(?:biopsy\W+(?:\w+\W+){0,6}?(\w*colitis)|(\w*colitis)\W+(?:\w+\W+){0,6}?biopsy)\b', row['Report_Text'].lower()))
#     bool_2 = bool(re.search(r'\b(?:biopsy\W+(?:\w+\W+){0,3}?(\w*colitis)|(\w*colitis)\W+(?:\w+\W+){0,2}?biopsy)\b', row['Report_Text'].lower()))

#     if not ((bool_1 & (not bool_2))):
#         continue
#     if not bool_1:
#         continue
        
    print('#####################################', end='\n\n')
    print(index, end='\n\n')
    print(row.MRN_Type,  end='\n\n')
    print(row.Report_Description,  end='\n\n')
    print(row.Report_Status,  end='\n\n')
    print(row.Report_Date_Time,  end='\n\n')
    print(row['Report_Text'], end='\n\n')
    print(row['disease_list'], end='\n\n')
    print(entity_recognition_colon.entities(row['Report_Text'], nlp_2))
    print('#####################################', end='\n\n')



# Stats - All files

In [41]:
print('Number of reports in the dishcarge summary file:', dis_df_col['EMPI'].shape[0])
print('Number of patients in the dishcarge summary file:', dis_df_col['EMPI'].nunique(), end='\n\n')

print('Number of reports in the history and physical file:', hnp_df_col['EMPI'].shape[0])
print('Number of patients in the history and physical file:', hnp_df_col['EMPI'].nunique(), end='\n\n')

print('Number of reports in the encounters file:', enc_df_col['EMPI'].shape[0])
print('Number of patients in the encounters file:', enc_df_col['EMPI'].nunique(), end='\n\n')

print('Number of reports in the procedures file:', prc_df_col['EMPI'].shape[0])
print('Number of patients in the procedures file:', prc_df_col['EMPI'].nunique(), end='\n\n')

print('Number of reports in the pathology file:', path_df3_col['EMPI'].shape[0])
print('Number of patients in the pathology file:', path_df3_col['EMPI'].nunique(), end='\n\n')

print('Number of reports in the endoscopy file:', endo_df_col['EMPI'].shape[0])
print('Number of patients in the endoscopy file:', endo_df_col['EMPI'].nunique(), end='\n\n')

print('Number of reports in the endoscopy file:', meds_df_col['EMPI'].shape[0])
print('Number of patients in the endoscopy file:', meds_df_col['EMPI'].nunique())

Number of reports in the dishcarge summary file: 190796
Number of patients in the dishcarge summary file: 14242

Number of reports in the history and physical file: 31399
Number of patients in the history and physical file: 7569

Number of reports in the encounters file: 409237
Number of patients in the encounters file: 1400

Number of reports in the procedures file: 10650234
Number of patients in the procedures file: 14709

Number of reports in the pathology file: 176248
Number of patients in the pathology file: 14640

Number of reports in the endoscopy file: 75466
Number of patients in the endoscopy file: 14407

Number of reports in the endoscopy file: 7212333
Number of patients in the endoscopy file: 14673


# Merge

## Endoscopy Type - Manually corrected

In [38]:
# Manually labelled endoscopy type file
endo_description = pd.read_csv('/Users/pkc17/MGH/IBD_RPDR/data/UC_all/endoscopy.csv', index_col=0, dtype=str)

In [39]:
def update_endoscopy_type(df, manual_df, label_name):
    '''
    Input:
    df: Endoscopy file to be updated. Must have the columns EMPI, MRN, Report_Number, Report_Description
    manual_df: manually reviewed endoscopy file with corrected Report_Description. Must have EMPI, MRN, Report_Number
    label_name: The column with correct labels in the manual_df dataframe
    
    Output:
    Returns a modified version of original df after making corrections to Report_Description'''
    
    result_df = df.copy()
    
    result_df = pd.merge(
        df
        ,manual_df.loc[:,['EMPI', 'EPIC_PMRN', 'Report_Number']+[label_name]]
        ,how='left'
        ,on=['EMPI', 'EPIC_PMRN', 'Report_Number']
    ).copy()
    
    filter_corrections = ~(result_df[label_name].isna())
    
    print('Number of observations modified:', sum(filter_corrections))
    
    result_df.loc[filter_corrections, 'Report_Description'] = result_df.loc[filter_corrections, label_name]
    
    result_df.drop(columns=['Endoscopy Type'], inplace=True)
    
    return result_df
    

In [40]:
endo_df_uc = update_endoscopy_type(endo_df_uc, endo_description, label_name='Endoscopy Type')

Number of observations modified: 243


## Mayo scoring

In [41]:
def mayo_scoring(df):
    
    df = df.copy()
    
    # Mayo 3
    mayo3_cols = ['severe_inflammation', 'spont_bleeding', 'adherent_blood', 'large_ulcer',
                 'deep_ulcer', 'severe_colitis']
    
    mayo_stage3_filter = (df.loc[:, mayo3_cols].sum(axis=1)>=1)
    
    
     # Mayo 2
    mayo2_cols = ['moderate_inflammation', 'friability', 'erosion', 'loss_vasculature', 'marked_erythema',
                 'superficial_ulcer', 'shallow_ulcer', 'aphthous_ulcer', 'small_ulcer', 'mild_ulcer',
                 'ulceration', 'moderate_colitis']
    
    mayo_stage2_filter = (
        (df.loc[:, mayo2_cols].sum(axis=1)>=1) &
        (~mayo_stage3_filter)
    )
    
    
    # Mayo 1
    mayo1_cols = ['mild_inflammation', 'inflammation', 'dec_vasculature', 'congestion', 'edema',
                 'erythema', 'mild_friability', 'mild_colitis']
    
    mayo_stage1_filter = (
        (df.loc[:, mayo1_cols].sum(axis=1)>=1) &
        (~(mayo_stage3_filter|mayo_stage2_filter))
    )
    
    # Mayo 0
    mayo0_keywords = 'normal|remission|healed|inactive|quiescent|no evidence'

    mayo_stage0_filter_normal = df['Report_Text'].str.contains(mayo0_keywords, case=False, na='coerce')

    mayo_stage0_filter =(

        (mayo_stage0_filter_normal)
        &
        (~(mayo_stage3_filter|mayo_stage2_filter|mayo_stage1_filter))

    )
    
    # Creating a new column 'mayo_stage'
    df['mayo_stage'] = df['mayo_score']

    df.loc[(df['mayo_stage'].isna()) & (mayo_stage3_filter), 'mayo_stage'] = 3
    df.loc[(df['mayo_stage'].isna()) & (mayo_stage2_filter), 'mayo_stage'] = 2
    df.loc[(df['mayo_stage'].isna()) & (mayo_stage1_filter), 'mayo_stage'] = 1
    df.loc[(df['mayo_stage'].isna()) & (mayo_stage0_filter), 'mayo_stage'] = 0
    
    return df

In [42]:
# Generating mayo scores based on definitions

endo_df_uc = mayo_scoring(endo_df_uc)

In [43]:
print('Excluding reports with no mayo score in the Impression/Findings section n:', 
      sum(endo_df_uc['mayo_stage'].isna()))

endo_df_uc = endo_df_uc[~endo_df_uc['mayo_stage'].isna()].copy()

print('\nNumber of remaining reports n:', 
      endo_df_uc.shape[0])

Excluding reports with no mayo score in the Impression/Findings section n: 225

Number of remaining reports n: 6482


In [60]:
def report_count(df, col_name, feature_description, scores_list):
    
    for i in scores_list:
        print('Count of reports with {} {}:'.format(feature_description, i), (
            df.loc[df[col_name]==i, 'EMPI'].shape[0]))
    print('Count of reports with missing {}:'.format(feature_description), df[col_name].isna().sum())
        
def patient_count(df, col_name, feature_description, scores_list):
    
    for i in scores_list:
        print('Count of patients with {} {}:'.format(feature_description, i), (
            df.loc[df[col_name]==i, 'EMPI'].nunique()))
    print('Count of patients with missing {}:'.format(feature_description), 
          df.loc[df[col_name].isna(), 'EMPI'].nunique())

In [61]:
print('Reports:')
report_count(endo_df_uc, col_name='mayo_stage', feature_description='Mayo Score', scores_list=[0,1,2,3])
print('\nPatients:')
patient_count(endo_df_uc, col_name='mayo_stage', feature_description='Mayo Score', scores_list=[0,1,2,3])

Reports:
Count of reports with Mayo Score 0: 1493
Count of reports with Mayo Score 1: 1399
Count of reports with Mayo Score 2: 2805
Count of reports with Mayo Score 3: 769
Count of reports with missing Mayo Score: 0

Patients:
Count of patients with Mayo Score 0: 860
Count of patients with Mayo Score 1: 997
Count of patients with Mayo Score 2: 1820
Count of patients with Mayo Score 3: 629
Count of patients with missing Mayo Score: 0


In [63]:
print('Reports (direct mention only):')
report_count(endo_df_uc, col_name='mayo_score', feature_description='Mayo Score', scores_list=[0,1,2,3])
print('\nPatients (direct mention only):')
patient_count(endo_df_uc, col_name='mayo_score', feature_description='Mayo Score', scores_list=[0,1,2,3])

Reports (direct mention only):
Count of reports with Mayo Score 0: 107
Count of reports with Mayo Score 1: 140
Count of reports with Mayo Score 2: 265
Count of reports with Mayo Score 3: 170
Count of reports with missing Mayo Score: 5784

Patients (direct mention only):
Count of patients with Mayo Score 0: 83
Count of patients with Mayo Score 1: 126
Count of patients with Mayo Score 2: 222
Count of patients with Mayo Score 3: 146
Count of patients with missing Mayo Score: 2740


## Severity of Inflammation

In [79]:
def inflammation_scoring(df):
    
    df = df.copy()
    
    # Level 3 inflammation
    inflam_stage3_filter = (df['severe_inflammation']==True)
    
    
    # Level 2 inflammation
    inflam_stage2_filter = (
        (df['moderate_inflammation']==True)
        &
        (~inflam_stage3_filter)
    )
     

    # Level 1 inflammation
    inflam_stage1_filter = (
        ((df['mild_inflammation']==True))
        &
        (~(inflam_stage3_filter|inflam_stage2_filter))
    )
    
    
    # Level 0 inflammation
    level0_inflam_keywords = 'normal|remission|healed|quiescent|inactive|no evidence'
    inflam_stage0_filter_normal = df['Report_Text'].str.contains(level0_inflam_keywords, case=False, na='coerce')

    inflam_stage0_filter =(
        (df['inflammation']==True) 
        &
        (inflam_stage0_filter_normal)
        &
        (~(inflam_stage3_filter|inflam_stage2_filter|inflam_stage1_filter))
    )
    
    # Level 9 inflammation - No inflammation
    inflam_stage9_filter = ((
        ((df['inflammation']!=True))
        &
        (~(inflam_stage3_filter|inflam_stage2_filter|inflam_stage1_filter|inflam_stage0_filter))
    )) 
    
    
    df['inflammation_severity'] = np.nan

    df.loc[inflam_stage3_filter, 'inflammation_severity'] = 3
    df.loc[inflam_stage2_filter, 'inflammation_severity'] = 2
    df.loc[inflam_stage1_filter, 'inflammation_severity'] = 1
    df.loc[inflam_stage0_filter, 'inflammation_severity'] = 0
    
    df.loc[inflam_stage9_filter, 'inflammation_severity'] = 9
    
    return df

In [80]:
# Extracting severity of inflammation

endo_df_uc = inflammation_scoring(endo_df_uc)

In [81]:
print('Reports:')
report_count(endo_df_uc, col_name='inflammation_severity', 
             feature_description='Inflammation Level', scores_list=[0,1,2,3,9])
print('\nPatients:')
patient_count(endo_df_uc, col_name='inflammation_severity', 
              feature_description='Inflammation Level', scores_list=[0,1,2,3,9])

Reports:
Count of reports with Inflammation Level 0: 794
Count of reports with Inflammation Level 1: 791
Count of reports with Inflammation Level 2: 747
Count of reports with Inflammation Level 3: 322
Count of reports with Inflammation Level 9: 3565
Count of reports with missing Inflammation Level: 263

Patients:
Count of patients with Inflammation Level 0: 625
Count of patients with Inflammation Level 1: 598
Count of patients with Inflammation Level 2: 609
Count of patients with Inflammation Level 3: 286
Count of patients with Inflammation Level 9: 1887
Count of patients with missing Inflammation Level: 233


## Colectomy

In [181]:
# Drop duplicates in procedures file
prc_df_col_uc_nodup = prc_df_col_uc.drop_duplicates(subset=['EMPI', 'datetime', 'MRN_Type', 'MRN'], keep='first')
prc_df_col_uc_nodup = prc_df_col_uc_nodup[prc_df_col_uc_nodup['EMPI'].isin(endo_df_uc['EMPI'])]


# Create a an empty df with all available EMPI's and dates

endo_prc_col = (
    pd.concat
    (
        [
    endo_df_uc.loc[:,['EMPI', 'datetime', 'MRN', 'MRN_Type']].reset_index(drop=True),
    prc_df_col_uc_nodup.loc[:,['EMPI', 'datetime', 'MRN', 'MRN_Type']].reset_index(drop=True)
        ]
    )
    .drop_duplicates()
    .sort_values(['EMPI', 'datetime'])
)

In [182]:
print('Total UC patient count:', endo_df_uc['EMPI'].nunique(), end='\n\n')

# Merge the newly created df with endoscopy file
endo_prc_col = (
    pd.merge(
        endo_prc_col, 
        endo_df_uc.loc[:,['EMPI', 'MRN_Type', 'MRN', 'datetime'
                             ,'Report_Number', 'Report_Text', 'UC_any']]
        ,how='left', on=['EMPI', 'datetime', 'MRN_Type', 'MRN']
    )
).copy()


# Merge the file with procedures containing colectomy history
endo_prc_col = (
    pd.merge(
        endo_prc_col, 
        prc_df_col_uc_nodup.loc[:,['EMPI', 'datetime', 'MRN', 'MRN_Type'
                             ,'Procedure_Name', 'Inpatient_Outpatient', 'Colectomy']]
        , how='left', on=['EMPI', 'datetime', 'MRN_Type', 'MRN']
    )
).copy().sort_values(['EMPI', 'datetime'])
del prc_df_col_uc_nodup

# Exclude cases with colectomy before endoscopic diagnosis of UC
excluded_empis_colectomy = (
    endo_prc_col
    .groupby('EMPI')
    .apply( 
        lambda x: pd.Series( {'colectomy':
            (x.loc[x['UC_any']==True,'datetime'].min()
            <=x.loc[x['Colectomy']==True,'datetime'].min())
            |
            (sum(x['Colectomy']==True)==0)}
        )
    )
    .reset_index()
    .query("colectomy==False")['EMPI'].drop_duplicates()
)

print('Number of patients who had ‘colectomy’ after an endo confirmed UC diagnosis: ', 
      endo_prc_col.loc[endo_prc_col['Colectomy']==True, 'EMPI'].nunique() - excluded_empis_colectomy.shape[0])

print('Number of patients who had ‘inpatient colectomy’ after an endo confirmed UC diagnosis:',
     (
         endo_prc_col
         .loc[(
             (endo_prc_col['Colectomy']==True)
             &(~endo_prc_col['EMPI'].isin(excluded_empis_colectomy))
             &((endo_prc_col['Inpatient_Outpatient']=='Inpatient'))
         ), 'EMPI']
     ).nunique()
)

print('\nExcluding patients who had ‘colectomy’ on the same day or before an MGB wide endoscopy confirmed UC n: ', 
      excluded_empis_colectomy.shape[0])

# Excluding the patients whose colectomy was recorded before an endoscopy (80 cases)

endo_df_uc = endo_df_uc[~endo_df_uc['EMPI'].isin(excluded_empis_colectomy)].copy()

print('\nRemaining endo diagnosed UC patients after all the exclusion:', endo_df_uc['EMPI'].nunique())

print('\nRemaining endo diagnosed UC patients that are also in ML phenotype:', 
     endo_df_uc.loc[endo_df_uc['EMPI'].isin(dem_df_col_ph['EMPI']), 'EMPI'].nunique())

Total UC patient count: 2952

Number of patients who had ‘colectomy’ after an endo confirmed UC diagnosis:  464
Number of patients who had ‘inpatient colectomy’ after an endo confirmed UC diagnosis: 449

Excluding patients who had ‘colectomy’ on the same day or before an MGB wide endoscopy confirmed UC n:  107

Remaining endo diagnosed UC patients after all the exclusion: 2845

Remaining endo diagnosed UC patients that are also in ML phenotype: 2079


## Hospitilization - 'Inpatient status'

In [183]:
# Drop duplicates in procedures file
diag_df_col_uc_nodup = diag_df_col_uc.drop_duplicates(subset=['EMPI', 'datetime', 'MRN_Type', 'MRN'], keep='first')
diag_df_col_uc_nodup = diag_df_col_uc_nodup[diag_df_col_uc_nodup['EMPI'].isin(endo_df_uc['EMPI'])]

# Create a an empty df with all available EMPI's and dates

endo_diag_col = (
    pd.concat
    (
        [
    endo_df_uc.loc[:,['EMPI', 'datetime', 'MRN', 'MRN_Type']].reset_index(drop=True),
    diag_df_col_uc_nodup.loc[:,['EMPI', 'datetime', 'MRN', 'MRN_Type']].reset_index(drop=True)
        ]
    )
    .drop_duplicates()
    .sort_values(['EMPI', 'datetime'])
)

In [184]:
# Merge the newly created df with endoscopy file
endo_diag_col = (
    pd.merge(
        endo_diag_col, 
        endo_df_uc.loc[:,['EMPI', 'MRN_Type', 'MRN', 'datetime','Report_Number', 
                              'Report_Text', 'UC_any', 'mayo_score', 'mayo_stage', 
                              'disease_list']]
        ,how='left', on=['EMPI', 'datetime', 'MRN_Type', 'MRN']
    )
).copy()


# Merge the file with procedures containing colectomy history
endo_diag_col = (
    pd.merge(
        endo_diag_col, 
        diag_df_col_uc_nodup.loc[:,['EMPI', 'datetime', 'MRN', 'MRN_Type'
                             ,'Diagnosis_Name', 'Inpatient_Outpatient', 'UC_diagnosis']]
        , how='left', on=['EMPI', 'datetime', 'MRN_Type', 'MRN']
    )
).copy().sort_values(['EMPI', 'datetime'])
del diag_df_col_uc_nodup


In [185]:
print(
    'Number of patients from the UC cohort who were hospitalized due to UC at some point n (%): {} ({})'
    .format(
        endo_diag_col.loc[endo_diag_col['Inpatient_Outpatient'] == 'Inpatient',
                          'EMPI'].nunique(),
        round(
            endo_diag_col.loc[endo_diag_col['Inpatient_Outpatient'] ==
                              'Inpatient', 'EMPI'].nunique() * 100 /
            endo_diag_col['EMPI'].nunique(), 2)))

# Cases with inpatient UC hosptilization only before an endoscopic diagnosis of UC
excluded_empis_uc_hsp = (
    endo_diag_col.groupby('EMPI').apply(lambda x: pd.Series({
        'hospitalization': (x.loc[x['UC_any'] == True, 'datetime'].min(
        ) < x.loc[x['Inpatient_Outpatient'] == 'Inpatient', 'datetime'].max())
        | (sum(x['Inpatient_Outpatient'] == 'Inpatient') == 0)
    })).reset_index().query("hospitalization==False")['EMPI'])

print(
    'Number of patients who had UC related hospitilization on or after an endo confirmed UC diagnosis n (%): {} ({})'
    .format((
        endo_diag_col.loc[endo_diag_col['Inpatient_Outpatient'] == 'Inpatient',
                          'EMPI'].nunique() - excluded_empis_uc_hsp.shape[0]),
            round((endo_diag_col.loc[endo_diag_col['Inpatient_Outpatient'] ==
                                     'Inpatient', 'EMPI'].nunique() -
                   excluded_empis_uc_hsp.shape[0]) * 100 /
                  (endo_diag_col['EMPI'].nunique()), 2)))

Number of patients from the UC cohort who were hospitalized due to UC at some point n (%): 1877 (65.98)
Number of patients who had UC related hospitilization on or after an endo confirmed UC diagnosis n (%): 1494 (52.51)


In [659]:
endo_diag_col.to_csv('/Users/pkc17/MGH/Colon_RPDR_analysis/endo_diag_col.csv')

## Hospitilization - 'Length of stay'

In [186]:
# Keep only 'Regular' Inpatient records
enc_df_col_uc_nodup = (
    enc_df_col_uc[
        ((enc_df_col_uc['Inpatient_Outpatient']=='Inpatient')
        & (enc_df_col_uc['Encounter_Status']=='Regular'))
    ]).copy()



# Drop any entries with missing Admit Date or Discharge Date
enc_df_col_uc_nodup = (
    enc_df_col_uc_nodup
    .loc[:
        ,['EMPI', 'MRN_Type', 'MRN', 'Inpatient_Outpatient', 
          'Admit_Date', 'Discharge_Date', 'UC_enc_diagnosis']]
    .dropna(subset=['Admit_Date', 'Discharge_Date'])
)

# Drop duplicates based on EMPI, MRN and Admit Date
enc_df_col_uc_nodup['datetime'] = enc_df_col_uc_nodup['Admit_Date']
enc_df_col_uc_nodup = (
    enc_df_col_uc_nodup
    .drop_duplicates(subset=['EMPI', 'datetime', 'MRN_Type', 'MRN'], keep='first')
    .drop_duplicates(subset=['EMPI', 'Discharge_Date'], keep='first')
    .copy()
)

# Exlcuding all patient records that dont have an endo confirmed UC dx
enc_df_col_uc_nodup = enc_df_col_uc_nodup[enc_df_col_uc_nodup['EMPI'].isin(endo_df_uc['EMPI'])]


In [187]:
# Merge based on Admit Date from encounters file and datetime from endoscopy file

dup_df = (
    pd.merge_asof(
        (endo_df_uc
         .sort_values('datetime')
         .loc[:,['EMPI', 'MRN_Type', 'MRN', 'datetime', 'Report_Number', 'Report_Text'
                 ,'UC_any', 'mayo_score', 'mayo_stage', 'disease_list']]
        )
        ,(enc_df_col_uc_nodup
          .sort_values('datetime')
          )
        , on='datetime'
        ,by=['EMPI', 'MRN_Type', 'MRN']
        ,tolerance=pd.Timedelta("20d"), direction='backward')
)

dup_df.loc[dup_df['datetime']>dup_df['Discharge_Date'], 
           ['Inpatient_Outpatient', 'Admit_Date', 'Discharge_Date','UC_enc_diagnosis']] = np.nan

In [33]:
788584+615013

1403597

In [34]:
105405+150247

255652

In [35]:
682380+609637

1292017

In [36]:
111970+68595

180565

In [37]:
99638+87028

186666

In [40]:
1292017+180565+186666

1659248

In [188]:
# Create a an empty df with all available EMPI's and dates

endo_enc_col = (
    pd.concat
    (
        [
    dup_df.loc[:,['EMPI', 'datetime', 'MRN', 'MRN_Type']].reset_index(drop=True),
    enc_df_col_uc_nodup.loc[:,['EMPI', 'datetime', 'MRN', 'MRN_Type']].reset_index(drop=True)
        ]
    )
    .drop_duplicates()
    .sort_values(['EMPI', 'datetime'])
)

In [189]:
# Merge the newly created df with endoscopy file
endo_enc_col = (
    pd.merge(
        endo_enc_col
        ,dup_df
        ,how='left', on=['EMPI', 'datetime', 'MRN_Type', 'MRN']
    )
).copy()


# Merge the file with the encounter file
endo_enc_col = (
    pd.merge(
        endo_enc_col
        ,enc_df_col_uc_nodup
        ,how='left', on=['EMPI', 'datetime', 'MRN_Type', 'MRN']
    )
).copy().sort_values(['EMPI', 'datetime'])


# Combine and rename duplicate columns generated from previous merge
endo_enc_col.loc[endo_enc_col['Inpatient_Outpatient_x'].isna(), 'Inpatient_Outpatient_x'] = endo_enc_col.loc[
    endo_enc_col['Inpatient_Outpatient_x'].isna(), 'Inpatient_Outpatient_y']

endo_enc_col.loc[endo_enc_col['Admit_Date_x'].isna(), 'Admit_Date_x'] = endo_enc_col.loc[
    endo_enc_col['Admit_Date_x'].isna(), 'Admit_Date_y']

endo_enc_col.loc[endo_enc_col['Discharge_Date_x'].isna(), 'Discharge_Date_x'] = endo_enc_col.loc[
    endo_enc_col['Discharge_Date_x'].isna(), 'Discharge_Date_y']

endo_enc_col.loc[endo_enc_col['UC_enc_diagnosis_x'].isna(), 'UC_enc_diagnosis_x'] = endo_enc_col.loc[
    endo_enc_col['UC_enc_diagnosis_x'].isna(), 'UC_enc_diagnosis_y']

endo_enc_col = (
    endo_enc_col
    .drop(columns=['Inpatient_Outpatient_y', 'Admit_Date_y','Discharge_Date_y', 'UC_enc_diagnosis_y'])
    .rename(columns={'Inpatient_Outpatient_x':'Inpatient_Outpatient', 'Admit_Date_x':'Admit_Date',
                    'Discharge_Date_x':'Discharge_Date', 'UC_enc_diagnosis_x':'UC_enc_diagnosis'})
).copy()


# Calculate length of stay
# endo_enc_col.loc[endo_enc_col['Admit_Date']>endo_enc_col['Discharge_Date'], 'Discharge_Date'] = np.nan

endo_enc_col['Length_of_stay'] = ((endo_enc_col['Discharge_Date']-endo_enc_col['Admit_Date']).dt.days) + 1


# Drop duplicates from the encounters file that are already in dup_df
endo_enc_col = endo_enc_col.drop_duplicates(subset=['EMPI', 'Admit_Date', 'Discharge_Date'], keep='last').copy()

In [368]:
del dup_df, enc_df_col_uc_nodup

In [190]:
print('Total UC patients count:', endo_enc_col['EMPI'].nunique(), end='\n\n')

uc_hospitalized_empis = endo_enc_col.loc[endo_enc_col['Inpatient_Outpatient']=='Inpatient', 'EMPI'].drop_duplicates()

print('Number of patients from the UC cohort who were hospitalized due to UC at some point n (%): {} ({})'
      .format(
          uc_hospitalized_empis.shape[0]
          ,round(uc_hospitalized_empis.shape[0]*100
                 /endo_enc_col['EMPI'].nunique(), 2)
      )
)


# Patients with no inpatient hospitilizations on or after an endo confirmed UC diagnosis
excluded_empis_uc_hsp = (
    endo_enc_col
    .groupby('EMPI')
    .apply( 
        lambda x: pd.Series( {'hospitalization':
            (x.loc[x['UC_any']==True,'datetime'].min()
            <=x.loc[x['Inpatient_Outpatient']=='Inpatient','datetime'].max())
            |
            (sum(x['Inpatient_Outpatient']=='Inpatient')==0)}
        )
    )
    .reset_index()
    .query("hospitalization==False")['EMPI']
)

# Patients with at least one inpatient hospitilization on or after endo UC dx
uc_hospitalized_sub_empis = uc_hospitalized_empis[~uc_hospitalized_empis.isin(excluded_empis_uc_hsp)]

print('Number of patients with UC related hospitilization during or after an endo confirmed UC diagnosis n (%): {} ({})'
      .format(
          uc_hospitalized_sub_empis.shape[0]
          ,round((uc_hospitalized_sub_empis.shape[0])*100
                 /endo_enc_col['EMPI'].nunique(), 2)
      )
)

# Patients with inpatient hospitilization during endo dx
uc_hospitalized_dx_empis = (
    endo_enc_col
    .loc[(endo_enc_col['UC_any']==True)&(endo_enc_col['Inpatient_Outpatient']=='Inpatient')
         ,'EMPI']
    .drop_duplicates()
)

print('\nNumber of patients who were hospitalized on the day of endoscopy based UC diagnosis n (%): {} ({})'
      .format(
          uc_hospitalized_dx_empis.shape[0]
          ,round((uc_hospitalized_dx_empis.shape[0])*100
                 /endo_enc_col['EMPI'].nunique(), 2)
      )
)

print('Avg. length of stay for inpatient hospitalizations on the day of endoscopy based UC dx mean: {}'
      .format(
          round(
              endo_enc_col.loc[
                  (endo_enc_col['UC_any']==True)&(endo_enc_col['Inpatient_Outpatient']=='Inpatient')
                  ,'Length_of_stay'].mean()
              ,2)
      )
)


print('\nMedian length of stay (days):', endo_enc_col['Length_of_stay'].median())
print('Mean length of stay (days):', round(endo_enc_col['Length_of_stay'].mean(), 2))

Total UC patients count: 2845

Number of patients from the UC cohort who were hospitalized due to UC at some point n (%): 1122 (39.44)
Number of patients with UC related hospitilization during or after an endo confirmed UC diagnosis n (%): 858 (30.16)

Number of patients who were hospitalized on the day of endoscopy based UC diagnosis n (%): 476 (16.73)
Avg. length of stay for inpatient hospitalizations on the day of endoscopy based UC dx mean: 10.35

Median length of stay (days): 6.0
Mean length of stay (days): 8.36


In [195]:
~endo_diag_col['EMPI'].isin(excluded_empis_uc_hsp)

0        True
1        True
2        True
3        True
4        True
         ... 
98253    True
98254    True
98255    True
98256    True
98257    True
Name: EMPI, Length: 98258, dtype: bool

In [None]:
All endo diagnsosis - check for hospitilization

## Chart Review

In [490]:
# Hospitilization after endo confirmed UC but not on the same day

uc_hospitalized_sub_empis[~uc_hospitalized_sub_empis.isin(uc_hospitalized_dx_empis)].sample(10)

526     100544450
3981    114199745
340     100326134
637     100723112
1965    103440685
803     100934672
1327    102032167
1926    103355608
3941    113853248
3428    107386458
Name: EMPI, dtype: object

In [521]:
# Hospitilization before an endo confirmed UC but not on or after

excluded_empis_uc_hsp.sample(10)

1804    106711288
298     100443582
789     101962138
2157    111587455
1408    104780629
1162    103465190
2188    112935060
2127    111080350
2068    109952242
1759    106491509
Name: EMPI, dtype: object

In [527]:
random_empi = '103355608'

In [528]:
endo_enc_col[endo_enc_col['EMPI']==random_empi]

Unnamed: 0,EMPI,datetime,MRN,MRN_Type,Report_Number,Report_Text,ulcerative_colitis,mayo_score,mayo_stage,disease_list,Inpatient_Outpatient,Admit_Date,Discharge_Date,UC_enc_diagnosis,Length_of_stay
1926,103355608,2005-01-03 00:00:00,19752377,BWH,,,,,,,Inpatient,2005-01-03,2005-01-05,True,3.0
1927,103355608,2005-04-29 00:00:00,19752377,BWH,,,,,,,Inpatient,2005-04-29,2005-05-01,True,3.0
1928,103355608,2005-05-06 09:38:00,19752377,BWH,3.0,"FINDINGS: Moderate confluent erythema, granul...",True,,2.0,"[moderate_confluent erythema True, granularity...",,NaT,NaT,,
1929,103355608,2005-06-13 00:00:00,19752377,BWH,,,,,,,Inpatient,2005-06-13,2005-06-20,True,8.0
1930,103355608,2005-07-08 00:00:00,19752377,BWH,,,,,,,Inpatient,2005-07-08,2005-07-15,True,8.0


In [529]:
endo_df_col_4.loc[(endo_df_col_4['EMPI']==random_empi) & 
                  ((endo_df_col_4['ulcerative_colitis']==True)|(endo_df_col_4['pan_colitis']==True))]

Unnamed: 0_level_0,EMPI,EPIC_PMRN,MRN_Type,MRN,Report_Number,Report_Date_Time,Report_Description,Report_Status,Report_Type,Report_Text,datetime,has_dx_start,dx_start_line,has_dx_end,dx_end_line,dx_end_line_LAFD,erythema,marked_erythema,inflammation,mild_inflammation,moderate_inflammation,severe_inflammation,loss_vasculature,dec_vasculature,granularity,ulceration,friability,mild_friability,spont_bleeding,adherent_blood,erosion,congestion,edema,pseudopolyp,crohns,superficial_ulcer,shallow_ulcer,aphthous_ulcer,small_ulcer,large_ulcer,deep_ulcer,mild_ulcer,colitis,chronic_colitis,mild_colitis,moderate_colitis,severe_colitis,active_colitis,inactive_colitis,acute_colitis,ulcerative_colitis,pan_colitis,proctitis,proctosigmoiditis,left_sided_colitis,active_ileitis,chronic_ileitis,arch_distortion,basal_plasmacytosis,active_enteritis,chronic_enteritis,crypt_abscess,crypt_atrophy,cryptitis,lymphoid_agg,lamina_propria,granuloma,noncaseating_gran,nonnecrotizing_gran,paneth_cell,cdiff,cmv,mayo_score,disease_list
unique_report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1
103355608_3,103355608,10089077787,BWH,19752377,3,5/6/2005 9:38:00 AM,COLONOSCOPY,Signed,NDO,"FINDINGS: Moderate confluent erythema, granul...",2005-05-06 09:38:00,True,"FINDINGS: Moderate confluent erythema, granul...",True,RECOMMENDATION:,11.0,True,,,,,,,,True,False,True,,,,,,,,,,,,,,,,True,True,,True,,True,,,True,,,,,,,,,,,,,,,,,,,,,,,"[moderate_confluent erythema True, granularity..."


In [525]:
enc_df_col.loc[(enc_df_col['EMPI']==random_empi)&(enc_df_col['Inpatient_Outpatient']=='Inpatient')]

Unnamed: 0,EMPI,EPIC_PMRN,MRN_Type,MRN,Encounter_number,Encounter_Status,Hospital,Inpatient_Outpatient,Service_Line,Attending_MD,Admit_Date,Discharge_Date,LOS_Days,Clinic_Name,Admit_Source,Discharge_Disposition,Payor,Admitting_Diagnosis,Principal_Diagnosis,Diagnosis_1,Diagnosis_2,Diagnosis_3,Diagnosis_4,Diagnosis_5,Diagnosis_6,Diagnosis_7,Diagnosis_8,Diagnosis_9,Diagnosis_10,DRG,Patient_Type,Referrer_Discipline,UC_enc_diagnosis
2184515,103465190,10044427614,MGH,4285029,TSI-MGH-K911620987,Regular,MGH,Inpatient,GI,"Leeds, Naomi Rachel, MD",2006-02-12,2006-02-22,10.0,IV Storehouse (467),Walk In,Home Or Self Care,Bc-Blue Care Elect,780.6 - Fever,556.8 - Other ulcerative colitis,530.81 - Esophageal reflux,455.6 - Unspecified hemorrhoids without mentio...,565.0 - Anal fissure,300.01 - Panic disorder,276.51 - Dehydration,288.8 - Other specified disease of white blood...,"493.90 - Asthma, unspecified without mention o...","788.30 - Urinary incontinence, unspecified",,,"DRG:179 - Inflammatory Bowel Disease, APDRG:1...",,,True
2202527,103465190,10044427614,MGH,4285029,IDX-MGH-3-37837130,,MGH,Inpatient,,"Avery, Laura Louise, MD",2006-02-12,NaT,,Radiology (38),,,,,"556.9 - Ulcerative colitis, unspecified","789.00 - Abdominal pain, unspecified site",,,,,,,,,,,,,True
2202528,103465190,10044427614,MGH,4285029,IDX-MGH-3-37850050,,MGH,Inpatient,,"Avery, Laura Louise, MD",2006-02-12,NaT,,Radiology (38),,,,,780.6 - Fever,,,,,,,,,,,,,,
2194117,103465190,10044427614,MGH,4285029,IDX-MGH-3-37911667,,MGH,Inpatient,,"Richter, James Michael, MD",2006-02-13,NaT,,Gastroenterology (19),,,,,"556.9 - Ulcerative colitis, unspecified",,,,,,,,,,,,,,True
2194120,103465190,10044427614,MGH,4285029,IDX-MGH-3-37893371,,MGH,Inpatient,,"Murray, Megan Blanche, MD",2006-02-13,NaT,,Infectious Disease (23),,,,,558.9 - Other and unspecified noninfectious ga...,780.6 - Fever,,,,,,,,,,,,,
2196482,103465190,10044427614,MGH,4285029,IDX-MGH-3-37860125,,MGH,Inpatient,,"Leeds, Naomi Rachel, MD",2006-02-13,NaT,,MG Med-Hospital Based PC - Primary Care Assoc ...,,,,,558.9 - Other and unspecified noninfectious ga...,276.51 - Dehydration,680.9 - Carbuncle and furuncle of unspecified ...,780.6 - Fever,,,,,,,,,,,
2197657,103465190,10044427614,MGH,4285029,IDX-MGH-3-37816734,,MGH,Inpatient,,"Richter, James Michael, MD",2006-02-14,NaT,,Gastroenterology (19),,,,,569.82 - Ulceration of intestine,,,,,,,,,,,,,,
2194123,103465190,10044427614,MGH,4285029,IDX-MGH-3-37935619,,MGH,Inpatient,,"Gandhi, Rajesh Tim, MD",2006-02-15,NaT,,Infectious Disease (23),,,,,558.9 - Other and unspecified noninfectious ga...,,,,,,,,,,,,,,
2197658,103465190,10044427614,MGH,4285029,IDX-MGH-3-37878486,,MGH,Inpatient,,"Barry, Michael John, MD",2006-02-15,NaT,,MG Med-Hospital Based PC - Primary Care Assoc ...,,,,,"556.9 - Ulcerative colitis, unspecified",,,,,,,,,,,,,,True
2196480,103465190,10044427614,MGH,4285029,IDX-MGH-3-37878485,,MGH,Inpatient,,"Atlas, Steven Julius, MD",2006-02-16,NaT,,MG Med-Hospital Based PC - Primary Care Assoc ...,,,,,780.6 - Fever,"789.00 - Abdominal pain, unspecified site",,,,,,,,,,,,,


In [526]:
dis_df_col[dis_df_col['EMPI']==random_empi]

Unnamed: 0_level_0,EMPI,EPIC_PMRN,MRN_Type,MRN,Report_Number,Report_Date_Time,Report_Description,Report_Status,Report_Type,Report_Text,datetime,Admit_Date,Discharge_Date
unique_report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
103465190_555707,103465190,10044427614,MGH,4285029,555707,2/12/2006 12:00:00 AM,ED Discharge Summary,F,MDISDIS,"ED DISCHARGE NOTIFICATION\n\nFINKELSTEIN,JULIA...",2006-02-12 00:00:00,NaT,NaT
103465190_MGHPOE39972480,103465190,10044427614,MGH,4285029,MGHPOE39972480,2/12/2006 6:02:00 PM,Discharge Summary,F,MDISDIS,Massachusetts General Hospital \n 55...,2006-02-12 18:02:00,2006-02-12,2006-02-22
103465190_MGHPOE39956076,103465190,10044427614,MGH,4285029,MGHPOE39956076,2/12/2006 6:02:00 PM,Discharge Summary,F,MDISDIS,Massachusetts General Hospital \n 55...,2006-02-12 18:02:00,2006-02-12,2006-02-22
103465190_852096,103465190,10044427614,MGH,4285029,852096,2/22/2006 12:00:00 AM,Discharge Summary,F,MDISDIS,"DISCHARGE SUMMARY\n\nNAME: FINKELSTEIN, JULIA ...",2006-02-22 00:00:00,2006-02-12,2006-02-22
103465190_558973,103465190,10044427614,MGH,4285029,558973,2/27/2006 12:00:00 AM,ED Discharge Summary,F,MDISDIS,ED DISCHARGE NOTIFICATION/SUMMARY\n\nFINKELSTE...,2006-02-27 00:00:00,NaT,NaT
103465190_681111,103465190,10044427614,MGH,4285029,681111,8/11/2007 12:00:00 AM,ED Discharge Summary,F,MDISDIS,ED DISCHARGE NOTIFICATION/SUMMARY\n\nFINKELSTE...,2007-08-11 00:00:00,NaT,NaT


In [618]:
print('Mayo scores of patients who were ever hospitalized\n')

print('Count of patients with Mayo score 3 and an endo confirmed UC:', (
    (endo_enc_col.loc[
        (endo_enc_col['mayo_stage']==3)&(endo_enc_col['EMPI'].isin(uc_hospitalized_empis))
        ,'EMPI']
     .nunique())
)
     )

print('Count of patients with Mayo score 2 and an endo confirmed UC:', (
    (endo_enc_col.loc[
        (endo_enc_col['mayo_stage']==2)&(endo_enc_col['EMPI'].isin(uc_hospitalized_empis))
        ,'EMPI']
     .nunique())
)
     )
      
print('Count of patients with Mayo score 1 and an endo confirmed UC:', (
    (endo_enc_col.loc[
        (endo_enc_col['mayo_stage']==1)&(endo_enc_col['EMPI'].isin(uc_hospitalized_empis))
        ,'EMPI']
     .nunique())
)
     )
      
print('Count of patients with Mayo score 0 and an endo confirmed UC:', (
    (endo_enc_col.loc[
        (endo_enc_col['mayo_stage']==0)&(endo_enc_col['EMPI'].isin(uc_hospitalized_empis))
        ,'EMPI']
     .nunique())
)
     )

Mayo scores of patients who were ever hospitalized

Count of patients with Mayo score 3 and an endo confirmed UC: 126
Count of patients with Mayo score 2 and an endo confirmed UC: 613
Count of patients with Mayo score 1 and an endo confirmed UC: 141
Count of patients with Mayo score 0 and an endo confirmed UC: 162


In [619]:
print('Mayo scores of patients who were never hospitalized\n')

print('Count of patients with Mayo score 3 and an endo confirmed UC:', (
    (endo_enc_col.loc[
        (endo_enc_col['mayo_stage']==3)&(~endo_enc_col['EMPI'].isin(uc_hospitalized_empis))
        ,'EMPI']
     .nunique())
)
     )

print('Count of patients with Mayo score 2 and an endo confirmed UC:', (
    (endo_enc_col.loc[
        (endo_enc_col['mayo_stage']==2)&(~endo_enc_col['EMPI'].isin(uc_hospitalized_empis))
        ,'EMPI']
     .nunique())
)
     )
      
print('Count of patients with Mayo score 1 and an endo confirmed UC:', (
    (endo_enc_col.loc[
        (endo_enc_col['mayo_stage']==1)&(~endo_enc_col['EMPI'].isin(uc_hospitalized_empis))
        ,'EMPI']
     .nunique())
)
     )
      
print('Count of patients with Mayo score 0 and an endo confirmed UC:', (
    (endo_enc_col.loc[
        (endo_enc_col['mayo_stage']==0)&(~endo_enc_col['EMPI'].isin(uc_hospitalized_empis))
        ,'EMPI']
     .nunique())
)
     )

Mayo scores of patients who were never hospitalized

Count of patients with Mayo score 3 and an endo confirmed UC: 86
Count of patients with Mayo score 2 and an endo confirmed UC: 589
Count of patients with Mayo score 1 and an endo confirmed UC: 181
Count of patients with Mayo score 0 and an endo confirmed UC: 211


In [571]:
Counter(endo_enc_col.loc[:, 'MRN_Type'])

Counter({'MGH': 1753, 'BWH': 1312, 'FH': 108, 'NWH': 76, 'NSM': 65})

In [570]:
Counter(endo_enc_col.loc[endo_enc_col['EMPI'].isin(uc_hospitalized_dx_empis), 'MRN_Type'])

Counter({'BWH': 152, 'MGH': 242})

In [436]:
endo_enc_col['datetime']

Unnamed: 0,EMPI,datetime,MRN,MRN_Type,Report_Number,Report_Text,ulcerative_colitis,mayo_score,mayo_stage,disease_list,Inpatient_Outpatient,Admit_Date,Discharge_Date,UC_enc_diagnosis,Length_of_stay
0,100000623,2007-05-25 10:35:00,1191043,MGH,92149321,Findings:\n The perianal and digital recta...,True,,2.0,"[moderate_inflammation True, congestion True, ...",,NaT,NaT,,
1,100002259,2003-01-29 19:50:00,00449454,BWH,2,FINDINGS: There was evidence of ulcerative co...,True,,1.0,"[ulcerativecolitis True, erythematous True, ul...",,NaT,NaT,,
2,100003555,2001-02-02 07:44:00,00626754,BWH,1,FINDINGS: The colonic mucosa appeared entirel...,True,,0.0,[ulcerativecolitis True],,NaT,NaT,,
3,100005154,1998-11-11 18:10:00,00824623,BWH,1,FINDINGS:\nSURFACE & MUCOSA: 1 cm polypod mass...,True,,2.0,"[pseudopolyp snared True, erythema True, friab...",,NaT,NaT,,
4,100005214,1999-02-23 00:00:00,0052588,MGH,10020131,Findings: After obtaining informed consent...,True,,0.0,[ulcerativecolitis True],,NaT,NaT,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3403,115268025,2020-12-15 00:00:00,7129672,MGH,,,,,,,Inpatient,2020-12-15,2021-01-05,True,22.0
3404,115291216,2007-06-06 00:00:00,3262845,MGH,,,,,,,Inpatient,2007-06-06,2007-06-08,True,3.0
3405,115291216,2010-06-23 12:35:00,3262845,MGH,92279658,Findings:\n The perianal and digital recta...,True,,2.0,"[mild_inflammation True, erythema True, friabi...",,NaT,NaT,,
3407,115308849,2020-12-03 09:32:00,7173579,MGH,93154158,Findings:\n The perianal and digital recta...,True,2.0,2.0,"[inflammation True, marked-erythema True, abse...",Inpatient,2020-12-01,2020-12-04,True,4.0


## Demographic findings

In [254]:
endo_df_col_uc = pd.merge(
    endo_df_col_uc, 
    dem_df_col.loc[:,
                   ['EMPI', 'Gender', 'Race', 'Zip_code', 'Date_of_Birth', 'Date_Of_Death']
                  ]
    ,how='left', on=['EMPI']).copy()


endo_df_col_uc['age'] = (
    (endo_df_col_uc['datetime']-endo_df_col_uc['Date_of_Birth'])
    .dt.days.astype(int, errors='ignore')/365.25
)
endo_df_col_uc['dx_year'] = endo_df_col_uc['datetime'].dt.year

In [530]:
print('Mean age:', round(endo_df_col_uc['age'].mean(), 2))
print('Std age:', round(endo_df_col_uc['age'].std(), 2), end='\n\n')

print('Sex, Female n (%): {} ({})'.format((endo_df_col_uc['Gender']=='Female').sum(),
      round((endo_df_col_uc['Gender']=='Female').mean()*100, 2)), end='\n\n')

print('Race, Asians n (%): {} ({})'.format((endo_df_col_uc['Race']=='Asian').sum(),
      round((endo_df_col_uc['Race']=='Asian').mean()*100, 2)))
print('Race, African American n (%): {} ({})'.format((endo_df_col_uc['Race']=='African American').sum(),
      round((endo_df_col_uc['Race']=='African American').mean()*100, 2)))
print('Race, Hispanic n (%): {} ({})'.format((endo_df_col_uc['Race']=='Hispanic').sum(),
      round((endo_df_col_uc['Race']=='Hispanic').mean()*100, 2)))
print('Race, Whites n (%): {} ({})'.format((endo_df_col_uc['Race']=='Caucasian').sum(),
      round((endo_df_col_uc['Race']=='Caucasian').mean()*100, 2)))
print('Race, Others n (%): {} ({})'.format(endo_df_col_uc['Race'].isin(['Other', 'American Indian']).sum(),
      round(endo_df_col_uc['Race'].isin(['Other', 'American Indian']).mean()*100, 2)))
print('Race, Not recorded n (%): {} ({})'.format((endo_df_col_uc['Race']=='Not recorded').sum(),
      round((endo_df_col_uc['Race']=='Not recorded').mean()*100, 2)), end='\n\n')

print('Cases through 1995-2000 n (%): {} ({})'.format(endo_df_col_uc['dx_year'].between(1995, 2000).sum(),
      round(endo_df_col_uc['dx_year'].between(1995, 2000).mean()*100, 2)))
print('Cases through 2001-2005 n (%): {} ({})'.format(endo_df_col_uc['dx_year'].between(2001, 2005).sum(),
      round(endo_df_col_uc['dx_year'].between(2001, 2005).mean()*100, 2)))
print('Cases through 2006-2010 n (%): {} ({})'.format(endo_df_col_uc['dx_year'].between(2006, 2010).sum(),
      round(endo_df_col_uc['dx_year'].between(2006, 2010).mean()*100, 2)))
print('Cases through 2011-2015 n (%): {} ({})'.format(endo_df_col_uc['dx_year'].between(2011, 2015).sum(),
      round(endo_df_col_uc['dx_year'].between(2011, 2015).mean()*100, 2)))
print('Cases through 2016-2021 n (%): {} ({})'.format(endo_df_col_uc['dx_year'].between(2016, 2021).sum(),
      round(endo_df_col_uc['dx_year'].between(2016, 2021).mean()*100, 2)), end='\n\n')


Mean age: 44.26
Std age: 18.38

Sex, Female n (%): 1268 (55.52)

Race, Asians n (%): 67 (2.93)
Race, African American n (%): 102 (4.47)
Race, Hispanic n (%): 55 (2.41)
Race, Whites n (%): 1960 (85.81)
Race, Others n (%): 53 (2.32)
Race, Not recorded n (%): 47 (2.06)

Cases through 1995-2000 n (%): 315 (13.79)
Cases through 2001-2005 n (%): 349 (15.28)
Cases through 2006-2010 n (%): 465 (20.36)
Cases through 2011-2015 n (%): 559 (24.47)
Cases through 2016-2021 n (%): 596 (26.09)



In [531]:
filter_sigmoidoscopy = endo_df_col_uc['Report_Description'].str.contains('SIG', case=False, na=False)
filter_colonoscopy = endo_df_col_uc['Report_Description'].str.contains('COLON', case=False, na=False)

print('Sigmoidoscopy n (%): {} ({})'.format(filter_sigmoidoscopy.sum(),
                                            round(filter_sigmoidoscopy.mean()*100, 2)), end='\n\n')

print('Colonoscopy n (%): {} ({})'.format(filter_colonoscopy.sum(),
                                            round(filter_colonoscopy.mean()*100, 2)), end='\n\n')

print('Endoscopy (unspecified) n (%): {} ({})'.format((~(filter_sigmoidoscopy|filter_colonoscopy)).sum(),
                                            round((~(filter_sigmoidoscopy|filter_colonoscopy)).mean()*100, 2)))

Sigmoidoscopy n (%): 632 (27.67)

Colonoscopy n (%): 1614 (70.67)

Endoscopy (unspecified) n (%): 38 (1.66)


In [542]:
print('Mayo Scores (only direct mentions) N:\n\n',
      endo_df_col_uc['mayo_score'].sort_values().value_counts(),
      sep='', end='\n\n')

print('Mayo Scores (only direct mentions) %:\n\n',
      endo_df_col_uc['mayo_score'].sort_values().value_counts()*100/endo_df_col_uc['mayo_score'].dropna().shape[0],
      sep='', end='\n\n')

Mayo Scores (only direct mentions) N:

2.0    95
3.0    64
1.0    56
0.0    27
2.5     5
1.5     1
0.5     1
Name: mayo_score, dtype: int64

Mayo Scores (only direct mentions) %:

2.0    38.152610
3.0    25.702811
1.0    22.489960
0.0    10.843373
2.5     2.008032
1.5     0.401606
0.5     0.401606
Name: mayo_score, dtype: float64



In [533]:
print('Mayo Scores N:\n\n',
      endo_df_col_uc['mayo_stage'].sort_values().value_counts(),
      sep='', end='\n\n')

print('Mayo Scores %:\n\n',
      endo_df_col_uc['mayo_stage'].sort_values().value_counts()*100/endo_df_col_uc['mayo_stage'].dropna().shape[0],
      sep='', end='\n\n')

Mayo Scores N:

2.0    1269
0.0     371
1.0     329
3.0     308
2.5       5
1.5       1
0.5       1
Name: mayo_stage, dtype: int64

Mayo Scores %:

2.0    55.560420
0.0    16.243433
1.0    14.404553
3.0    13.485114
2.5     0.218914
1.5     0.043783
0.5     0.043783
Name: mayo_stage, dtype: float64



# ER Validation

In [142]:
def evaluation_binary(df, actual_cols, predicted_cols):
    
    from IPython.display import Markdown, display
    
    def printmd(string):
        display(Markdown(string))
        
    for i in range(len(actual_cols)):
        
        actual_col = df[actual_cols[i]]
        predicted_col = df[predicted_cols[i]]
        feat_name = actual_cols[i]
    
        printmd('**{}**'.format(feat_name))

        from sklearn.metrics import confusion_matrix

        missing_index = (actual_col.isna() | predicted_col.isna())

    #     if sum(missing_index)!=0:
    #         print ('Missing  %: {} \n'.format(round(np.mean(missing_index)*100, 2)))

        actual_col = actual_col[~missing_index].astype(int)
        predicted_col = predicted_col[~missing_index].astype(int)

        print('N :', sum(~missing_index))

        cm = confusion_matrix(actual_col, predicted_col)
        
        if cm.shape == (1,1):
            tn, fp, fn, tp = 0, 0, 0, cm[0][0]
            
            total=(tn+fp+fn+tp)
            accuracy=(tp + tn)*100/total
            sensitivity = tp*100 / (tp+fn)
            specificity = np.nan
            ppv = tp*100 / (tp+fp)
            npv = np.nan
            
        else: 
            tn, fp, fn, tp = cm.ravel()

            total=(tn+fp+fn+tp)
            accuracy=(tp + tn)*100/total
            sensitivity = tp*100 / (tp+fn)
            specificity = tn*100 / (tn+fp)
            ppv = tp*100 / (tp+fp)
            npv = tn*100 / (tn+fn)
            
        print('\nConfusion Matrix : \n\n', pd.crosstab(actual_col, predicted_col))

        print ('\nAccuracy %: ', round(accuracy, 2))

        print('Sensitivity/Recall %: ', round(sensitivity, 2) )

        print('Specificity %: ', round(specificity, 2))

        print('PPV %:', round(ppv, 2))

        print('NPV %:', round(npv, 2))
    
    
    
def evaluation_multiclass(actual_col, predicted_col, feat_name, labels):
    
    from IPython.display import Markdown, display
    
    eval_list = []
    
    def printmd(string):
        display(Markdown(string))
    
    printmd('**{}**'.format(feat_name))
        
    from sklearn.metrics import confusion_matrix
    
    missing_index = (actual_col.isna() | predicted_col.isna())
    
#     if sum(missing_index)!=0:
#         print ('Missing  %: {} \n'.format(round(np.mean(missing_index)*100, 2)))
        
    actual_col = actual_col[~missing_index].astype(float)
    predicted_col = predicted_col[~missing_index].astype(float)
    
    print('N :', sum(~missing_index))
    
    print('\nConfusion Matrix : \n\n', pd.crosstab(actual_col, predicted_col))
    
    for label in labels:
        cm = confusion_matrix(actual_col==label, predicted_col==label)
        
        if cm.shape == (1,1):
            tn, fp, fn, tp = 0, 0, 0, cm[0][0]
            
            total=(tn+fp+fn+tp)
            accuracy=(tp + tn)*100/total
            sensitivity = tp*100 / (tp+fn)
            specificity = np.nan
            ppv = tp*100 / (tp+fp)
            npv = np.nan
            
        else: 
            tn, fp, fn, tp = cm.ravel()

            total=(tn+fp+fn+tp)
            accuracy=(tp + tn)*100/total
            sensitivity = tp*100 / (tp+fn)
            specificity = tn*100 / (tn+fp)
            ppv = tp*100 / (tp+fp)
            npv = tn*100 / (tn+fn)
        
#         print('Label:{}, tn:{}, fp:{}, fn:{}, tp:{}'.format(label, tn,fp, fn, tp))
        
        eval_list.append([accuracy, sensitivity, specificity, ppv, npv])
        
    eval_list.append(np.mean(eval_list, axis=0))
    
    eval_df = pd.DataFrame(eval_list, index = labels+['overall'], columns=['Accuracy','Sensitivity/Recall','Specificity','PPV','NPV'])
    
    print('\n\n Evaluation metrics :')
    print(round(eval_df, 2))
#     return round(eval_df, 2)

## Sample validation

In [117]:
endo_df_col_uc_sample = endo_df_col_uc.sample(50).copy().reset_index(drop=True)

In [53]:
# endo_df_col_uc_sample.to_csv(
#     '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/sample_data/endo_df_col_uc_sample.csv')

In [123]:
Counter(endo_df_col_uc_sample['mayo_score'].dropna())

Counter({2.0: 2, 0.0: 1, 2.5: 1, 3.0: 4, 1.0: 1})

In [124]:
Counter(endo_df_col_uc_sample['mayo_stage'].dropna())

Counter({2.0: 24, 0.0: 8, 1.0: 9, 2.5: 1, 3.0: 6})

In [126]:
endo_df_col_uc_sample_unval = endo_df_col_uc_sample.loc[
    :,  ['EMPI', 'MRN', 'datetime', 'Report_Number', 'Report_Text']].copy()

endo_df_col_uc_sample_unval['UC_lab'] = ''
endo_df_col_uc_sample_unval['Mayo_Def_lab'] = ''
endo_df_col_uc_sample_unval['Mayo_Score_lab'] = ''

In [54]:
# endo_df_col_uc_sample_unval.to_csv(
#     '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/sample_data/endo_df_col_uc_sample_unval.csv')

In [49]:
endo_df_col_uc_sample_unval = pd.read_csv(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/sample_data/endo_df_col_uc_sample_unval.csv')

### Results

In [72]:
endo_df_col_uc_sample = pd.read_csv(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/sample_data/endo_df_col_uc_sample.csv', index_col=0)

In [79]:
endo_df_col_uc_sample.loc[endo_df_col_uc_sample['pan_colitis']==True, 'ulcerative_colitis'] = True

In [73]:
endo_df_col_uc_sample_val = pd.read_csv(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/sample_data/endo_df_col_uc_sample_val.csv', index_col=0)

In [377]:
endo_df_col_uc_sample_val

Unnamed: 0,EMPI,MRN,datetime,Report_Number,Report_Text,UC_lab,Mayo_Def_lab,Mayo_Score_lab
0,105909604,14883672,10/12/11 15:11,83157,Findings:\n The perianal and digital recta...,1,0.0,
1,105438680,23989569,4/5/10 15:33,42468,Findings:\n The perianal and digital recta...,1,1.0,
2,100540753,1693687,4/2/12 16:04,92421813,Findings:\n Scattered pseudopolyps were fo...,1,0.0,
3,100542050,10606333,3/28/00 2:33,3,IMPRESSION:Chronic ulcerative colitis.\n \nCOM...,1,,
4,101081400,4101123,4/12/18 15:05,92791914,Findings:\n Diffuse moderate inflammation ...,1,2.0,
5,101780118,754331,2/3/03 11:41,0203-0018,Findings: A continuous area of nonbleeding...,1,2.0,
6,100085608,1027333,12/5/06 0:00,92135892,Findings: The digital rectal exam was n...,1,2.0,
7,100171977,2467124,12/30/09 11:10,92258376,Findings:\n The digital rectal exam was no...,1,0.0,
8,112457081,33936535,12/8/16 10:57,184915,Findings:\n The perianal and digital recta...,1,0.0,
9,101240202,865456,4/15/15 11:14,92630251,Findings:\n The perianal and digital recta...,1,0.0,


In [151]:
endo_df_col_uc_sample_merge = pd.merge(
    endo_df_col_uc_sample_val,
    endo_df_col_uc_sample.loc[:, ['EMPI','Report_Number','ulcerative_colitis','mayo_stage','disease_list']],
    how='left', on=['EMPI', 'Report_Number'])

In [149]:
endo_df_col_uc_sample_val.columns

Index(['EMPI', 'MRN', 'datetime', 'Report_Number', 'Report_Text', 'UC_lab',
       'Mayo_Def_lab', 'Mayo_Score_lab'],
      dtype='object')

In [83]:
Counter(endo_df_col_uc_sample_merge==endo_df_col_uc_sample_merge)

Counter({True: 26})

In [152]:
evaluation_binary(endo_df_col_uc_sample_merge, 
                  actual_cols=['UC_lab'], predicted_cols=['ulcerative_colitis'])

**UC_lab**

N : 26

Confusion Matrix : 

 ulcerative_colitis   1
UC_lab                
1                   26

Accuracy %:  100.0
Sensitivity/Recall %:  100.0
Specificity %:  nan
PPV %: 100.0
NPV %: nan


In [153]:
evaluation_multiclass(
    endo_df_col_uc_sample_merge['Mayo_Def_lab'], endo_df_col_uc_sample_merge['mayo_stage'], 
    feat_name='Mayo stage by def', labels=[0,1,2,3])

**Mayo stage by def**

N : 22

Confusion Matrix : 

 mayo_stage    0.0  1.0  2.0  3.0
Mayo_Def_lab                    
0.0             5    2    1    0
1.0             0    2    3    0
2.0             0    0    8    0
3.0             0    0    0    1


 Evaluation metrics :
         Accuracy  Sensitivity/Recall  Specificity     PPV     NPV
0           86.36               62.50       100.00  100.00   82.35
1           77.27               40.00        88.24   50.00   83.33
2           81.82              100.00        71.43   66.67  100.00
3          100.00              100.00       100.00  100.00  100.00
overall     86.36               75.62        89.92   79.17   91.42


In [154]:
evaluation_multiclass(
    endo_df_col_uc_sample_merge['Mayo_Score_lab'], endo_df_col_uc_sample_merge['mayo_stage'], 
    feat_name='Mayo stage by def', labels=[0,1,2,3])

**Mayo stage by def**

N : 3

Confusion Matrix : 

 mayo_stage      0.0  2.0  2.5
Mayo_Score_lab               
0.0               1    0    0
2.0               0    1    0
2.5               0    0    1


 Evaluation metrics :
         Accuracy  Sensitivity/Recall  Specificity    PPV    NPV
0           100.0               100.0        100.0  100.0  100.0
1           100.0               100.0          NaN  100.0    NaN
2           100.0               100.0        100.0  100.0  100.0
3           100.0               100.0          NaN  100.0    NaN
overall     100.0               100.0          NaN  100.0    NaN


In [385]:
# endo_df_col_uc_sample_merge.loc[
#     (~endo_df_col_uc_sample_merge['Mayo_Def_lab'].isna())&
#     (endo_df_col_uc_sample_merge['Mayo_Def_lab']!=endo_df_col_uc_sample_merge['mayo_stage'])
# ]

In [157]:
endo_df_col_uc_sample_merge.loc[
    (~endo_df_col_uc_sample_merge['Mayo_Def_lab'].isna())&
    (endo_df_col_uc_sample_merge['Mayo_Def_lab']!=endo_df_col_uc_sample_merge['mayo_stage'])
].to_csv('/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/sample_data/sample_disagreement.csv', index=False)

## Validation

In [110]:
endo_df_uc.to_csv(
'/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/endo_df_uc.csv', index=False)

In [86]:
bins = [1990,2000,2005,2010,2015,2021]
labels = ['1991-2000', '2001-2005', '2006-2010', '2011-2015', '2016-2021']

endo_df_uc['dx_year_grp'] = pd.cut(endo_df_uc['datetime'].dt.year, bins=bins, labels=labels)

In [655]:
# #Mayo 0

# mayo_d0_empi = (
#     endo_df_uc_sub.loc[(endo_df_uc_sub['mayo_stage']==0)&(endo_df_uc_sub['mayo_score'].isna()), 'EMPI']
#     .sample(55, random_state=rndm_state)
# )
# combined_empi = mayo_d0_empi

# mayo_s0_empi = (
#     endo_df_uc_sub.loc[(endo_df_uc_sub['mayo_score']==0)&(~endo_df_uc_sub['EMPI'].isin(combined_empi)), 'EMPI']
#     .sample(15, random_state=rndm_state)
# )
# combined_empi = pd.concat([combined_empi, mayo_s0_empi])

# # Mayo 1
# mayo_d1_empi = (
#     endo_df_uc_sub.loc[(endo_df_uc_sub['mayo_stage']==1)&(~endo_df_uc_sub['EMPI'].isin(combined_empi)), 'EMPI']
#     .sample(70, random_state=rndm_state)
# )

# combined_empi = pd.concat([combined_empi, mayo_d1_empi])

# mayo_s2_empi = (
#     endo_df_uc_sub.loc[(endo_df_uc_sub['mayo_stage']==2)&(~endo_df_uc_sub['EMPI'].isin(combined_empi)), 'EMPI']
#     .sample(100, random_state=rndm_state)
# )

# combined_empi = pd.concat([combined_empi, mayo_s2_empi])
    
# mayo_s3_empi = (
#     endo_df_uc_sub.loc[(endo_df_uc_sub['mayo_stage']==3)&(~endo_df_uc_sub['EMPI'].isin(combined_empi)), 'EMPI']
#     .sample(70, random_state=rndm_state)
# )

# combined_empi = pd.concat([combined_empi, mayo_s3_empi])


In [125]:
def create_validationset(endoscopy_df, sample_size=[70,70,100,70], scored_sample_frac=0.15, random_state=222):
    
    filter_mayo_int = (endoscopy_df['mayo_stage'].isin([0,1,2,3]))
    df = endoscopy_df[filter_mayo_int].copy()

    df = df.sort_values(['EMPI', 'Report_Date_Time']).reset_index(drop=True).copy()

    print('Count of reports with decimal mayo scores that are being excluded from sampling:', sum(~filter_mayo_int))
    print('\nTotal number of reports in the population:', df.shape[0])
    print('Total number of patients in the population:', df['EMPI'].nunique())
    
    df = df.groupby('EMPI').sample(n=1, random_state=random_state).copy()
    
    combined_empi = pd.Series([], dtype=str)
    
    for i in range(4):
        
        score_samplesize = round(scored_sample_frac*sample_size[i]/5)*5
        def_samplesize = sample_size[i]-score_samplesize
        
        # Sampling scores based on definitions
        filter_def_empi = (
            (df['mayo_stage']==i) & (df['mayo_score'].isna())
            & ~(df['EMPI'].isin(combined_empi))
        )
        
        mayo_def_empi = (
            df.loc[filter_def_empi, 'EMPI']
            .sample(def_samplesize, random_state=random_state)
        )
        combined_empi = pd.concat([combined_empi, mayo_def_empi])
        
        # Sampling scores based on direct mention
        filter_score_empi = (
            (df['mayo_score']==i)
            & ~(df['EMPI'].isin(combined_empi))
        )
        
        mayo_score_empi = (
            df.loc[filter_score_empi, 'EMPI']
            .sample(score_samplesize, random_state=random_state)
        )
        combined_empi = pd.concat([combined_empi, mayo_score_empi])
        
    result_df = df[df['EMPI'].isin(combined_empi)].copy().reset_index(drop=True)
    
    print('Total number of reports to be validated:', result_df['EMPI'].nunique())
    
    return result_df
            

In [126]:
uc_validation_set = create_validationset(endo_df_uc, sample_size=[70,70,100,70], 
                                     scored_sample_frac=0.20, random_state=222)

Count of reports with decimal mayo scores that are being excluded from sampling: 16

Total number of reports in the population: 6466
Total number of patients in the population: 2948
Total number of reports to be validated: 310


In [104]:
def compare_distributions(validation_df, population_df, feature_col, feature_desc):
    
    print('{} N and % in sample population:\n\n'.format(feature_desc),
      validation_df[feature_col].sort_values().value_counts(),
      sep='', end='\n\n')

    print(validation_df[feature_col].sort_values().value_counts()*100/validation_df[feature_col].dropna().shape[0],
          sep='', end='\n\n')

    print('\n{} N and % in the entire cohort:\n\n'.format(feature_desc),
          population_df[feature_col].sort_values().value_counts(),
          sep='', end='\n\n')

    print(population_df[feature_col].sort_values().value_counts()*100/population_df[feature_col].dropna().shape[0],
          sep='', end='\n\n')

In [127]:
compare_distributions(uc_validation_set, endo_df_uc, feature_col='mayo_stage', feature_desc='Mayo Scores')

Mayo Scores N and % in sample population:

2.0    100
3.0     70
1.0     70
0.0     70
Name: mayo_stage, dtype: int64

2.0    32.258065
3.0    22.580645
1.0    22.580645
0.0    22.580645
Name: mayo_stage, dtype: float64


Mayo Scores N and % in the entire cohort:

2.0    2805
0.0    1493
1.0    1399
3.0     769
2.5       8
1.5       7
0.5       1
Name: mayo_stage, dtype: int64

2.0    43.273681
0.0    23.033015
1.0    21.582845
3.0    11.863622
2.5     0.123419
1.5     0.107991
0.5     0.015427
Name: mayo_stage, dtype: float64



In [128]:
compare_distributions(uc_validation_set, endo_df_uc, 
                      feature_col='mayo_score', feature_desc='Mayo Scores (directly mentioned)')

Mayo Scores (directly mentioned) N and % in sample population:

2.0    20
3.0    15
1.0    15
0.0    15
Name: mayo_score, dtype: int64

2.0    30.769231
3.0    23.076923
1.0    23.076923
0.0    23.076923
Name: mayo_score, dtype: float64


Mayo Scores (directly mentioned) N and % in the entire cohort:

2.0    265
3.0    170
1.0    140
0.0    107
2.5      8
1.5      7
0.5      1
Name: mayo_score, dtype: int64

2.0    37.965616
3.0    24.355301
1.0    20.057307
0.0    15.329513
2.5     1.146132
1.5     1.002865
0.5     0.143266
Name: mayo_score, dtype: float64



In [129]:
compare_distributions(uc_validation_set, endo_df_uc, 
                      feature_col='inflammation_severity', feature_desc='Severity of Inflammation')

Severity of Inflammation N and % in sample population:

9.0    154
0.0     50
1.0     35
3.0     24
2.0     22
Name: inflammation_severity, dtype: int64

9.0    54.035088
0.0    17.543860
1.0    12.280702
3.0     8.421053
2.0     7.719298
Name: inflammation_severity, dtype: float64


Severity of Inflammation N and % in the entire cohort:

9.0    3565
0.0     794
1.0     791
2.0     747
3.0     322
Name: inflammation_severity, dtype: int64

9.0    57.324329
0.0    12.767326
1.0    12.719087
2.0    12.011577
3.0     5.177681
Name: inflammation_severity, dtype: float64



In [130]:
compare_distributions(uc_validation_set, endo_df_uc, 
                      feature_col='MRN_Type', feature_desc='MRN Type')

MRN Type N and % in sample population:

MGH    154
BWH    121
FH      14
NSM     13
NWH      8
Name: MRN_Type, dtype: int64

MGH    49.677419
BWH    39.032258
FH      4.516129
NSM     4.193548
NWH     2.580645
Name: MRN_Type, dtype: float64


MRN Type N and % in the entire cohort:

MGH    3140
BWH    2665
FH      321
NWH     209
NSM     147
Name: MRN_Type, dtype: int64

MGH    48.441839
BWH    41.113854
FH      4.952175
NWH     3.224313
NSM     2.267819
Name: MRN_Type, dtype: float64



In [131]:
compare_distributions(uc_validation_set, endo_df_uc, 
                      feature_col='dx_year_grp', feature_desc='Diagnosis year group')

Diagnosis year group N and % in sample population:

2016-2021    111
2011-2015     87
2006-2010     43
2001-2005     43
1991-2000     26
Name: dx_year_grp, dtype: int64

2016-2021    35.806452
2011-2015    28.064516
2006-2010    13.870968
2001-2005    13.870968
1991-2000     8.387097
Name: dx_year_grp, dtype: float64


Diagnosis year group N and % in the entire cohort:

2016-2021    1911
2011-2015    1811
2006-2010    1259
2001-2005     890
1991-2000     611
Name: dx_year_grp, dtype: int64

2016-2021    29.481641
2011-2015    27.938908
2006-2010    19.423018
2001-2005    13.730330
1991-2000     9.426103
Name: dx_year_grp, dtype: float64



### Split into 5 parts

In [111]:
uc_validation_set.to_csv(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/data/uc_validation_set.csv', index=False)

In [156]:
uc_validation_set_unval = uc_validation_set.loc[:,['EMPI', 'MRN', 'Report_Number', 'Report_Text']].copy()

uc_validation_set_unval['UC_lab'] = ''
uc_validation_set_unval['Mayo_Def_lab'] = ''
uc_validation_set_unval['Mayo_Score_lab'] = ''
uc_validation_set_unval['Inflammation_severity'] = ''

In [157]:
np.random.seed(222)
uc_val_rand = np.random.choice(
    uc_validation_set_unval.shape[0], uc_validation_set_unval.shape[0], replace=False)
split_count = round((uc_validation_set_unval.shape[0]-20)/5)

uc_validation_set.loc[uc_val_rand[-20:], :].to_csv(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/data/uc_validation_set_overlap.csv', index=False)
uc_validation_set.loc[uc_val_rand[:-20], :].to_csv(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/data/uc_validation_set_nonoverlap.csv', index=False)


uc_validation_set_1 = (
    uc_validation_set_unval
    .loc[
        np.concatenate((uc_val_rand[0:split_count], uc_val_rand[-20:])),:]
    .copy().reset_index(drop=True)
)

uc_validation_set_2 = (
    uc_validation_set_unval
    .loc[
        np.concatenate((uc_val_rand[split_count:split_count*2], uc_val_rand[-20:])),:]
    .copy().reset_index(drop=True)
)

uc_validation_set_3 = (
    uc_validation_set_unval
    .loc[
        np.concatenate((uc_val_rand[split_count*2:split_count*3], uc_val_rand[-20:])),:]
    .copy().reset_index(drop=True)
)

uc_validation_set_4 = (
    uc_validation_set_unval
    .loc[
        np.concatenate((uc_val_rand[split_count*3:split_count*4], uc_val_rand[-20:])),:]
    .copy().reset_index(drop=True)
)

uc_validation_set_5 = (
    uc_validation_set_unval
    .loc[
        np.concatenate((uc_val_rand[split_count*4:split_count*5], uc_val_rand[-20:])),:]
    .copy().reset_index(drop=True)
)

In [161]:
uc_validation_set_1.to_excel(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/data/unvalidated/uc_validation_set_1.xlsx', index=False)
uc_validation_set_2.to_excel(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/data/unvalidated/uc_validation_set_2.xlsx', index=False)
uc_validation_set_3.to_excel(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/data/unvalidated/uc_validation_set_3.xlsx', index=False)
uc_validation_set_4.to_excel(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/data/unvalidated/uc_validation_set_4.xlsx', index=False)
uc_validation_set_5.to_excel(
    '/Users/pkc17/MGH/Colon_RPDR_analysis/Validation/data/unvalidated/uc_validation_set_5.xlsx', index=False)

In [173]:
uc_validation_set_5.shape

(78, 8)

# geospatial map

In [None]:
endo_colitis

In [960]:
endo_colitis['Zip_code'].nunique()

949

In [961]:
endo_colitis['City'].nunique()

788

In [962]:
endo_colitis['County'].nunique()

209

In [963]:
endo_colitis['latitude'].nunique()

917

In [1005]:
endo_map = (endo_colitis
            .groupby(['latitude', 'longitude', 'dx_year'])
            .count()['EMPI']
            .reset_index().rename(columns={'EMPI':'Count'})
            .astype({'dx_year': 'int32'})
            .sort_values('dx_year')
)

In [944]:
import folium
import folium.plugins as plugins

In [1001]:
np.random.seed(3141592)
initial_data = np.random.normal(size=(100, 2)) * np.array([[1, 1]]) + np.array(
    [[48, 5]]
)

move_data = np.random.normal(size=(100, 2)) * 0.01

data = [(initial_data + move_data * i).tolist() for i in range(100)]

weight = 1  # default value
for time_entry in data:
    for row in time_entry:
        row.append(weight)

In [696]:
# m = folium.Map([48.0, 5.0], tiles="stamentoner", zoom_start=6)

# hm = plugins.HeatMapWithTime(data)

# hm.add_to(m)

# m

In [1007]:
from datetime import datetime, timedelta

time_index = [
    (datetime.now() + k * timedelta(1)).strftime("%Y-%m-%d") for k in range(len(data))
]

In [1004]:
len(time_index)

100

In [697]:
m = folium.Map([48.0, 5.0], tiles="stamentoner", zoom_start=6)

hm = plugins.HeatMapWithTime(data, index=time_index, auto_play=True, max_opacity=0.3)

hm.add_to(m)

m

# Negspacy

In [123]:
import spacy
from negspacy.negation import Negex
from negspacy.termsets import termset

ts = termset("en_clinical")

config={
        "neg_termset":{
            "pseudo_negations": ts.terms['pseudo_negations'] + ['and stage', 'grade', 'active'],
            "preceding_negations": ts.terms['preceding_negations'] + ['negative'],
            "following_negations": ts.terms['following_negations'] + ['negative', 'unremarkable', 'is not', 'are not', 'does not', 'may not', 'have not', 'was not', 'were not', 'absent', 'not present'],
            "termination": ts.terms['termination'] + ['note:', 'moderate']
        }
    }


#from spacy.pipeline import EntityRuler

corpus = 'en_core_sci_lg' # en_ner_bc5cdr_md, en_core_sci_md, en_core_sci_lg

# en = phrases for general english language text
# en_clinical DEFAULT = adds phrases specific to clinical domain to general english
# en_clinical_sensitive = adds additional phrases to help rule out historical and possibly irrelevant entities

nlp_2 = spacy.load(corpus) 

# ruler = EntityRuler(nlp_2, overwrite_ents=True)

# patterns = [
#     {"label": "ENTITY", "pattern": [{"LOWER": "chronic inflammation"}]}
#         ]
# ruler.add_patterns(patterns)

nlp_2.add_pipe(
    "negex",
    config = config
)


<negspacy.negation.Negex at 0x7f82ec04be50>

In [467]:


def entity_recognition_2(text):
    
    text = text.lower()
    
    entity_result = ''
    
    for line in text.split('.'):
        
        #line = line.strip()
        line = " ".join(line.split())
        line = (line
#                 .replace('chronic active', 'chronic-active')
#                 .replace('active chronic', 'active-chronic')
#                 .replace('chronic inactive', 'chronic-inactive')
                .replace('severely', 'severe')
                .replace('moderately', 'moderate')
                .replace('mildly', 'mild').replace('mildl', 'mild')
                .replace('floridly', 'florid')
                .replace('severe pseudomembranous', 'severe-pseudomembranous')
                .replace('self limited', 'self-limited')
                .replace('moderate to severe', 'moderate&severe')
                .replace('moderate to focally severe', 'moderate&severe')
                .replace('mild to moderate', 'mild&moderate')
                .replace('mild to focally moderate', 'mild&moderate')
                .replace('mild to severe', 'mild&severe')
                .replace('mild to focally severe', 'mild&severe')
                .replace('chronic-inactive', 'chronic inactive')
                .replace('active chronic', 'active-chronic')
                .replace('acute and chronic', 'acute-chronic')
                .replace('acute on chronic', 'acute-chronic')
                .replace('severe active', 'severe-active')
                .replace('severe chronic', 'severe-chronic')
                .replace('active severe', 'active-severe')
                .replace('chronic severe', 'chronic-severe')
                .replace('pancolitis, moderate&severe', 'moderate&severe pancolitis')
                .replace('colitis, moderate&severe', 'moderate&severe colitis')
                .replace('colitis, severe', 'severe colitis')
                .replace('active-severe', 'severe-active')
                .replace('colitis, moderate', 'moderate colitis')
                .replace('active-moderate', 'moderate-active')
                .replace('severe ischemic', 'severe-ischemic')
                .replace('moderate active', 'moderate-active')
                .replace('moderate chronic', 'moderate-chronic')
                .replace('active moderate', 'active-moderate')
                .replace('chronic moderate', 'chronic-moderate')
                .replace('mild active', 'mild-active')
                .replace('mild chronic', 'mild-chronic')
                .replace(' active colitis', ' active-colitis')
                .replace('mild ', 'mild-')
                .replace('moderate ', 'moderate-')
                .replace('severe ', 'severe-')
                .replace('inactive ', 'inactive-')
                .replace('ulcerative colitis', 'ulcerative-colitis')
                .replace('healed colitis', 'healed-colitis')
                .replace('surveillance', 'not present')
#                 .replace('chronic active', 'chronic-active')
#                 .replace('inactive chronic', 'inactive-chronic')
# # #                 .replace('active colitis', 'active-colitis')
#                 .replace('inactive colitis', 'inactive-colitis')
               )
                
        
        global doc, e
            
        doc = nlp_2(line)
    
        for e in doc.ents:
            
            e_text = e.text
            e_text = re.sub(' +', ' ', e_text)
            e_bool = e._.negex
            
            # Replace negation words in the entity and adjust sentiment
            if e_text.startswith(('no ', 'non-', 'non ')):
                to_match = ['^no ', '^non-', '^non ']
                e_text = re.sub('|'.join(to_match), '', e_text)
                e_bool = not e_bool
            
#             chronic_col = bool(re.search(r'\b(?:chronic\W+(?:\w+\W+){0,1}?(colitis|pancolitis))\b', line))
#             active_col = bool(re.search(r'\b(?:active\W+(?:\w+\W+){0,1}?colitis)\b', line))
#             inactive_col = bool(re.search(r'\b(?:inactive\W+(?:\w+\W+){0,1}?colitis)\b', line))
            
#             if chronic_col and active_col and e_text=='colitis':
#                 entity_result = entity_result + 'chronic active ' + e_text + str(not e_bool) + '\n'
#             elif chronic_col and inactive_col and e_text=='colitis':
#                 entity_result = entity_result + 'chronic inactive ' + e_text + str(not e_bool) + '\n'
#             elif chronic_col and e_text=='colitis':
#                 entity_result = entity_result + 'chronic ' + e_text + str(not e_bool) + '\n'
#             elif active_col and e_text=='colitis':
#                 entity_result = entity_result + 'active ' + e_text + str(not e_bool) + '\n'
#             elif inactive_col and e_text=='colitis':
#                 entity_result = entity_result + 'inactive ' + e_text + str(not e_bool) + '\n'
#             else:

            e_text = " ".join(e_text.split())
            entity_result = entity_result + e_text + ' ' + str(not e_bool) + '\n'
                
        
    return entity_result

In [411]:
" ".join(foo.split())

'COLON REDIIT'

In [124]:
line = '''
Mild (Mayo Score 1) ulcerative colitis 
                       in one patch in the ascending colon. 
                       Otherwise, no active inflammation detected 
                       in the colon'''.lower()

In [125]:
print(entity_recognition_2(line))

NameError: name 'entity_recognition_2' is not defined

In [398]:
line = ''' 2. PARTIAL COLECTOMY, SIGMOID (65 CM), RECTUM (3.5 CM):
     Diffuse chronic inactive colitis with a focal polypoid area of HIGH GRADE
       DYSPLASIA (3.2 cm), extending to distal inked edge of specimen.
     No invasion is seen.
     No granulomas are noted.
     No lymph nodes are identified in the specimen.     
 
3. SPECIMEN LABELED "DISTAL MARGIN":
     Chronic inactive colitis, diffuse.
     No granulomas or dysplasia.  '''.lower()

In [400]:
test_result = entity_recognition_2(line)

In [None]:
for x in test_result.split('\n'):
    if 'colitis' in x and 'True' in x:
        colitis = True
    if 'chronic' in x and 'colitis' in x and 'True' in x:
        chronic_colitis = True
    if 'mild' in x and 'colitis' in x and 'True' in x:
        mild_colitis = True
    if 'moderate' in x and 'colitis' in x and 'True' in x:
        moderate_colitis = True
    if 'severe' in x and 'colitis' in x and 'True' in x:
        severe_colitis = True
    if 'active' in x and 'colitis' in x and 'True' in x:
        active_colitis = True
    if 'inactive' in x and 'colitis' in x and 'True' in x:
        inactive_colitis = True
    if 'acute' in x and 'colitis' in x and 'True' in x:
        acute_colitis = True
    

In [417]:
[x for x in test_result.split('\n') if ('colitis' in x and 'True' in x)]

['diffuse chronic inactive-colitis True', 'chronic inactive-colitis True']

In [None]:
[1 for x in test_result.split('\n') if ('colitis' in x and 'True' in x)]

In [399]:
print(entity_recognition_2(line))

partial True
sigmoid True
rectum True
diffuse chronic inactive-colitis True
focal polypoid True
high grade dysplasia True
distal True
specimen True
invasion False
granulomas False
lymph nodes False
identified False
specimen False
specimen True
labeled True
distal margin True
chronic inactive-colitis True
granulomas False
dysplasia False



In [407]:
filter_1 = path_merged['Visit']=='Inpatient'
filter_2 = ~path_merged['Endo_Report_Description'].isna()
filter_3 = path_merged['Report_Text'].str.contains('colitis', case=False, na=False)
filter_4 = path_merged['has_final_diagnosis']==True
filter_5 = path_merged['Report_Text'].str.contains('active', case=False, na=False)
filter_6 = path_merged['Report_Text'].str.contains('acute', case=False, na=False)
filter_7 = path_merged['Report_Text'].str.contains('severe', case=False, na=False)
filter_8 = path_merged['Report_Text'].str.contains('moderate', case=False, na=False)
filter_9 = path_merged['Report_Text'].str.contains('pancolitis', case=False, na=False)
filter_10 = path_merged['Report_Text'].str.contains('mild', case=False, na=False)
filter_11 = path_merged['Report_Text'].str.contains('inactive', case=False, na=False)


In [408]:
filter_13 = path_merged['Report_Text'].str.contains('COLECTOMY', case=False, na=False)


In [1061]:
filter_12 = path_merged['Report_Text'].str.contains('inactive colitis', case=False, na=False)


In [1107]:
filter_13 = path_merged['Report_Text'].str.contains('biops', case=False, na=False)
# filter_14 = path_merged['Report_Text'].str.contains('sigmoid', case=False, na=False)
# filter_15 = path_merged['Report_Text'].str.contains('endoscop', case=False, na=False)

In [1105]:
sum(filter_1&filter_3&filter_4&filter_15&~filter_13)

12

In [1110]:
path_merged[filter_1&filter_3&filter_4&filter_13].sample(300).to_csv('/Users/pkc17/MGH/IBD_RPDR/biopsies.csv')
path_merged[filter_1&filter_3&filter_4&~filter_13].sample(300).to_csv('/Users/pkc17/MGH/IBD_RPDR/non_biopsies.csv')

In [698]:
for index, row in path_merged[filter_1&filter_3&filter_4&filter_13].sample(200).iterrows():
    print('#####################################', end='\n\n')
    print(index, end='\n\n')
    print(row.MRN_Type,  end='\n\n')
    print(row.Report_Description,  end='\n\n')
    print(row.Report_Status,  end='\n\n')
    print(row.datetime,  end='\n\n')
    print(row.Endo_Report_Description,  end='\n\n')
    print(row.Visit,  end='\n\n')
    print(row['Report_Text'], end='\n\n')
    print(entity_recognition_2(row['Report_Text']), end='\n\n')
    print('#####################################', end='\n\n')

In [749]:
dem = pd.read_csv('/Users/pkc17/Downloads/Decrypted/HK961_20210611_114201_Dem.txt', delimiter='|')C.          COLON, SIGMOID, BIOPSY:
    COLONIC MUCOSA WITH FOCAL ACTIVE COLITIS, MODERATE TO FOCALLY SEVERE. 
SEENOTE.

In [751]:
dem.to_csv('/Users/pkc17/Downloads/Decrypted/HK961_20210611_114201_Dem.csv')

In [762]:
endo_df_col = endoscopy.load_RPDR_endo('/Users/pkc17/Downloads/Decrypted/HK961_20210611_114201_End.txt', delimiter='|', datetime_col='Report_Date_Time')

Reformatting path file to allow multi-line report text to be readable, saving as : /Users/pkc17/Downloads/Decrypted/HK961_20210611_114201_End_multiline_corrected.txt
Reading from : /Users/pkc17/Downloads/Decrypted/HK961_20210611_114201_End_multiline_corrected.txt


In [765]:
endo_df_col.reset_index(drop=True).to_csv('/Users/pkc17/Downloads/Decrypted/HK961_20210611_114201_End.csv')

In [766]:
rad_df_col = endoscopy.load_RPDR_endo('/Users/pkc17/Downloads/Decrypted/HK961_20210611_114201_Rad.txt', delimiter='|', datetime_col='Report_Date_Time')

Reformatting path file to allow multi-line report text to be readable, saving as : /Users/pkc17/Downloads/Decrypted/HK961_20210611_114201_Rad_multiline_corrected.txt
Reading from : /Users/pkc17/Downloads/Decrypted/HK961_20210611_114201_Rad_multiline_corrected.txt


In [768]:
rad_df_col.reset_index(drop=True).to_csv('/Users/pkc17/Downloads/Decrypted/HK961_20210611_114201_Rad.csv')

In [425]:
filter_1 = endo_df_col_3['Report_Text'].str.contains('colitis', case=False, na=False)

In [699]:
for index, row in endo_df_col_3[filter_1].sample(200).iterrows():
    print('#####################################', end='\n\n')
    print(index, end='\n\n')
    print(row.MRN_Type,  end='\n\n')
    print(row.Report_Description,  end='\n\n')
    print(row.Report_Status,  end='\n\n')
    print(row.datetime,  end='\n\n')
    print(row['Report_Text'], end='\n\n')
    print(entity_recognition_2(row['Report_Text']), end='\n\n')
    print('#####################################', end='\n\n')

#### Function design

In [422]:
endo_df_col_3['Report_Text']

Unnamed: 0_level_0,EMPI,EPIC_PMRN,MRN_Type,MRN,Report_Number,Report_Date_Time,Report_Description,Report_Status,Report_Type,Report_Text,datetime,has_dx_start,dx_start_line,has_dx_end,dx_end_line,dx_end_line_LAFD
unique_report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
100000055_1,100000055,10040000407,BWH,00009241,1,11/10/2005 12:46:00 PM,EGD,Signed,NDO,FINDINGS:\nHYPOPHARYNX: The hypopharynx appea...,2005-11-10 12:46:00,True,FINDINGS:,True,PROCEDURE CODES:,11.0
100000055_2,100000055,10040000407,BWH,00009241,2,11/11/2005 12:47:00 PM,COLONOSCOPY,Signed,NDO,FINDINGS: The colonic mucosa appeared entirel...,2005-11-11 12:47:00,True,FINDINGS: The colonic mucosa appeared entirel...,True,RECOMMENDATION:,11.0
100000055_25739,100000055,10040000407,BWH,00009241,25739,4/13/2009 9:36:00 AM,COLONOSCOPY,Signed,NDO,Findings:\n A sessile dimin polyp was foun...,2009-04-13 09:36:00,True,Findings:,True,Recommendation: - Discharge patient to ho...,12.0
100000055_43720,100000055,10040000407,BWH,00009241,43720,4/30/2010 3:58:00 PM,ERCP,Signed,NDO,Impression: - Choledocholithiasis was...,2010-04-30 15:58:00,True,Impression: - Choledocholithiasis was...,True,"Recommendation: - Watch for pancreatitis,...",3.0
100000055_122215,100000055,10040000407,BWH,00009241,122215,9/30/2013 8:36:00 AM,Upper GI endoscopy,Signed,NDO,Findings:\n The examined esophagus was nor...,2013-09-30 08:36:00,True,Findings:,True,Recommendation: - Return patient to hospi...,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115453324_92545347,115453324,10145099312,MGH,7294669,92545347,8/30/2013 11:20:00 AM,Upper GI endoscopy,Final,NDO,Findings:\n The esophagus was normal. Mult...,2013-08-30 11:20:00,True,Findings:,True,Recommendation: - Regular diet.,7.0
115453324_92545437,115453324,10145099312,MGH,7294669,92545437,9/1/2013 9:23:00 AM,Colonoscopy,Revised,NDO,Findings:\n The colon (entire examined por...,2013-09-01 09:23:00,True,Findings:,True,Recommendation: - Per GI team,9.0
115453324_92545809,115453324,10145099312,MGH,7294669,92545809,9/4/2013 8:23:00 AM,Video capsule endoscopy,Final,NDO,"Findings:\n Images of the esophagus, stoma...",2013-09-04 08:23:00,True,Findings:,False,,-1.0
115468050_146699,115468050,10145248109,BWH,44307320,146699,1/30/2015 10:21:00 AM,Flexible Sigmoidoscopy,Signed,NDO,Findings:\n Internal hemorrhoids were foun...,2015-01-30 10:21:00,True,Findings:,True,Recommendation: - Await pathology results...,11.0


In [434]:
temp_df = endo_df_col_3.sample(1000)

In [444]:
def is_colitis(pathdf, corpus='en_core_sci_lg', term_set='en_clinical', update=True):

    '''
    corpus: en_ner_bc5cdr_md, en_core_sci_md, en_core_sci_lg
    termset: en, en_clinical, en_clinical_sensitive

    '''
    
    fil_subset = pathdf.MRN_Type.isin(['MGH', 'BWH','NWH','FH','NSM'])
    df_path = pathdf[fil_subset].copy()


    import spacy
    from negspacy.negation import Negex
    from negspacy.termsets import termset
    #from spacy.pipeline import EntityRuler

    ts = termset(term_set)

    config={
        "neg_termset":{
            "pseudo_negations": ts.terms['pseudo_negations'] + ['and stage', 'grade'],
            "preceding_negations": ts.terms['preceding_negations'] + ['negative', 'grade 0'],
            "following_negations": ts.terms['following_negations'] + ['negative', 'not', 'absent', 'grade 0'],
            "termination": ts.terms['termination'] + ['note:', 'moderate']
        }
    }


    nlp_2 = spacy.load(corpus) 

    # ruler = EntityRuler(nlp_2, overwrite_ents=True)
    # patterns = [
    #     {"label": "ENTITY", "pattern": [{"LOWER": "chronic inflammation"}]}
    #         ]
    # ruler.add_patterns(patterns)

    nlp_2.add_pipe(
        "negex",
        config = config
    )

    num_reports = df_path.shape[0]
    colitis_col = []
    chronic_colitis_col = []
    mild_colitis_col = []
    moderate_colitis_col = []
    severe_colitis_col = []
    inactive_colitis_col = []
    active_colitis_col= []
    acute_colitis_col = []
    disease_list_col = []

    for i in range(0,num_reports):

        # extract path report for this entry
        disease_list = []
        report_text = df_path.iloc[i,:].Report_Text
        result_text = entity_recognition_2(report_text)


        colitis = False
        chronic_colitis = False
        mild_colitis = False
        moderate_colitis = False
        severe_colitis = False
        inactive_colitis = False
        active_colitis = False
        acute_colitis = False


        for x in result_text.split('\n'):
            if 'colitis' in x and 'True' in x:
                colitis = True
            if 'chronic' in x and 'colitis' in x and 'True' in x:
                chronic_colitis = True
            if 'mild' in x and 'colitis' in x and 'True' in x:
                mild_colitis = True
            if 'moderate' in x and 'colitis' in x and 'True' in x:
                moderate_colitis = True
            if 'severe' in x and 'colitis' in x and 'True' in x:
                severe_colitis = True
            if 'active' in x and 'colitis' in x and 'True' in x:
                active_colitis = True
            if 'inactive' in x and 'colitis' in x and 'True' in x:
                inactive_colitis = True
            if 'acute' in x and 'colitis' in x and 'True' in x:
                acute_colitis = True

            if 'colitis' in x and 'True' in x:
                disease_list.append(x)
        
        colitis_col.append(colitis)
        chronic_colitis_col.append(chronic_colitis)
        mild_colitis_col.append(mild_colitis)
        moderate_colitis_col.append(moderate_colitis)
        severe_colitis_col.append(severe_colitis)
        active_colitis_col.append(active_colitis)
        inactive_colitis_col.append(inactive_colitis)
        acute_colitis_col.append(acute_colitis)
        disease_list_col.append(disease_list)
        
    df_path['colitis'] = colitis_col
    df_path['chronic_colitis'] = chronic_colitis_col
    df_path['mild_colitis'] = mild_colitis_col
    df_path['moderate_colitis'] = moderate_colitis_col
    df_path['severe_colitis'] = severe_colitis_col
    df_path['active_colitis'] = active_colitis_col
    df_path['inactive_colitis'] = inactive_colitis_col
    df_path['acute_colitis'] = acute_colitis_col
    df_path['disease_list'] = disease_list_col
   
    if update:
        # re-merge with original data
        print('Updating input path dataframe')
        pathdf['colitis'] = np.nan
        pathdf['chronic_colitis'] = np.nan
        pathdf['mild_colitis'] = np.nan
        pathdf['moderate_colitis'] = np.nan
        pathdf['severe_colitis'] = np.nan
        pathdf['active_colitis'] = np.nan
        pathdf['inactive_colitis'] = np.nan
        pathdf['acute_colitis'] = np.nan
        pathdf['disease_list'] = np.nan
        pathdf.update(df_path)
        return_df = pathdf.copy()
    else:
        # return this mgh path only file
        print('Returning MGH, BWH only entries with truncated path reports')
        return_df = df_path
        

    return return_df

    

In [445]:
colitis_df = is_colitis(temp_df.copy(), update=True)

Updating input path dataframe


## Function test

In [241]:
import spacy
from negspacy.negation import Negex
from negspacy.termsets import termset

ts = termset("en_clinical")

config={
        "neg_termset":{
            "pseudo_negations": ts.terms['pseudo_negations'] + ['and stage', 'grade', 'active'],
            "preceding_negations": ts.terms['preceding_negations'] + ['negative'],
            "following_negations": ts.terms['following_negations'] + ['negative', 'unremarkable', 'is not', 'are not', 'does not', 'may not', 'have not', 'was not', 'were not', 'absent', 'not present'],
            "termination": ts.terms['termination'] + ['note:', 'moderate']
        }
    }


#from spacy.pipeline import EntityRuler

corpus = 'en_core_sci_lg' # en_ner_bc5cdr_md, en_core_sci_md, en_core_sci_lg

# en = phrases for general english language text
# en_clinical DEFAULT = adds phrases specific to clinical domain to general english
# en_clinical_sensitive = adds additional phrases to help rule out historical and possibly irrelevant entities

nlp_2 = spacy.load(corpus) 

# ruler = EntityRuler(nlp_2, overwrite_ents=True)

# patterns = [
#     {"label": "ENTITY", "pattern": [{"LOWER": "chronic inflammation"}]}
#         ]
# ruler.add_patterns(patterns)

nlp_2.add_pipe(
    "negex",
    config = config
)


<negspacy.negation.Negex at 0x7f802ad7e820>

In [353]:
text = '''Mayo grade 1 in rectum, grade 1-2 in 
                     sigmoid, grade 1 in descending colon
'''.lower()

In [351]:
re.findall(r'grade.*?(\d+(?:,\d+)*(?:\.\d+)?)', text)

['1', '1', '1']

In [356]:
importlib.reload(entity_recognition)

<module 'entity_recognition' from '/Users/pkc17/MGH/RPDR/entity_recognition.py'>

In [358]:
print(entity_recognition_colon.entities(text, nlp_2))

rectum True
grade True
sigmoid True
grade True
descending colon True
colitis mayo-1.5 True



# End

In [50]:
importlib.reload(entity_recognition)

<module 'entity_recognition' from '/Users/pkc17/MGH/RPDR/entity_recognition.py'>

In [3]:
div_dem = demographics.load_RPDR_dem('/Users/pkc17/Downloads/diverticulitis/DCC3_20210901_114721_Dem.txt', delimiter='|')

In [11]:
div_dem.to_csv('/Users/pkc17/Downloads/diverticulitis/DCC3_20210901_114721_Dem_processed.csv')

In [20]:
div_endo = endoscopy.load_RPDR_endo('/Users/pkc17/Downloads/diverticulitis/DCC3_20210901_114721_End.txt', delimiter='|', datetime_col='Report_Date_Time')
# div_endo = div_endo.sort_values(['EMPI', 'datetime'])

div_endo_2 = endoscopy.truncate_dx_start(div_endo.copy(), update=True)

div_endo_3 = endoscopy.truncate_dx_end(div_endo_2.copy(), update=True)

Reading from : /Users/pkc17/Downloads/diverticulitis/DCC3_20210901_114721_End_multiline_corrected.txt
Truncating to only final diagnosis...
Updating input path dataframe with truncated MGH, BWH path reports


In [22]:
div_endo_3.reset_index().drop(['dx_end_line_LAFD', 'unique_report_id'], axis=1).to_csv('/Users/pkc17/Downloads/diverticulitis/DCC3_20210901_114721_End_processed.csv')

In [29]:
div_rad = endoscopy.load_RPDR_endo('/Users/pkc17/Downloads/diverticulitis/DCC3_20210901_114721_Rad.txt', delimiter='|', datetime_col='Report_Date_Time')
# div_endo = div_endo.sort_values(['EMPI', 'datetime'])

div_rad_2 = endoscopy.truncate_dx_start(div_rad.copy(), update=True)

div_rad_3 = endoscopy.truncate_dx_end(div_rad_2.copy(), update=True)

Reading from : /Users/pkc17/Downloads/diverticulitis/DCC3_20210901_114721_Rad_multiline_corrected.txt
Truncating to only final diagnosis...
Updating input path dataframe with truncated MGH, BWH path reports


In [30]:
div_rad_3.reset_index().drop(['dx_end_line_LAFD', 'unique_report_id'], axis=1).to_csv('/Users/pkc17/Downloads/diverticulitis/DCC3_20210901_114721_Rad_processed.csv')

In [33]:
div_radt = pd.read_csv('/Users/pkc17/Downloads/diverticulitis/DCC3_20210901_114721_Rdt.txt', delimiter='|')

In [35]:
div_radt.to_csv('/Users/pkc17/Downloads/diverticulitis/DCC3_20210901_114721_Rdt_processed.csv')