In [1]:
# import relevant packages
from collections import defaultdict
import pandas as pd
import os
import numpy as np

In [2]:
# read data dictionary 
data_dict = pd.read_csv("data_dictionary.csv")

In [3]:
filepath = "/Volumes/SRUTI/release_08-19-2020/training/"

In [4]:
# list files in training set
os.listdir(filepath)

['observation.csv',
 'device_exposure.csv',
 'visit_occurrence.csv',
 'goldstandard.csv',
 'procedure_occurrence.csv',
 'person.csv',
 'measurement.csv',
 'location.csv',
 'drug_exposure.csv',
 'condition_occurrence.csv',
 'observation_period.csv',
 '._observation.csv',
 '._device_exposure.csv',
 '._visit_occurrence.csv',
 '._goldstandard.csv',
 '._procedure_occurrence.csv',
 '._person.csv',
 '._measurement.csv',
 '._location.csv',
 '._drug_exposure.csv',
 '._condition_occurrence.csv',
 '._observation_period.csv']

In [17]:
# read all csv files
# observation = pd.read_csv(filepath+"observation.csv")
# device_exposure = pd.read_csv(filepath+"device_exposure.csv")
# visit_occurrence = pd.read_csv(filepath+"visit_occurrence.csv")
# goldstandard = pd.read_csv(filepath+"goldstandard.csv")
# procedure_occurrence = pd.read_csv(filepath+"procedure_occurrence.csv")
# person = pd.read_csv(filepath+"person.csv")
measurement = pd.read_csv(filepath+"measurement.csv")
# location = pd.read_csv(filepath+"location.csv")
# drug_exposure = pd.read_csv(filepath+"drug_exposure.csv")
# condition_occurrence = pd.read_csv(filepath+"condition_occurrence.csv")
# observation_period = pd.read_csv(filepath+"observation_period.csv")

### Exploratory data analysis

In [6]:
data_dict.head()

Unnamed: 0,concept_id,concept_name,table
0,22274,Neoplasm of uncertain behavior of larynx,condition_occurrence
1,22281,Sickle cell-hemoglobin SS disease,condition_occurrence
2,22288,Hereditary elliptocytosis,condition_occurrence
3,22340,Esophageal varices without bleeding,condition_occurrence
4,22350,Edema of larynx,condition_occurrence


In [7]:
data_dict.table.unique()

array(['condition_occurrence', 'observation', 'drug_exposure',
       'measurement', 'visit_occurrence', 'procedure_occurrence',
       'device_exposure'], dtype=object)

In [8]:
# convert to a dict for easy lookup
concept_id_map = dict(zip(data_dict.concept_id,data_dict.concept_name))

In [9]:
# data_dictionary has concept_name and concept_id for 7 tables

In [10]:
# tables that do not have a concept_id column are: goldstandard, person, location and observation_period

In [11]:
from datetime import datetime

def total_hours(s1, s2, FMT):
    tdelta = datetime.strptime(s2, FMT) - datetime.strptime(s1, FMT)
    # return time difference in hours
    return tdelta.total_seconds()/3600

#### Observation

Description: The OBSERVATION table captures clinical facts about a Person obtained in the context of examination, questioning or a procedure. Any data that cannot be represented by any other domains, such as social and lifestyle facts, medical history, family history, etc. are recorded here.

Number of Rows: 1,865,000
    
Vocabulary: SNOMED, HCPCS

In [12]:
observation.head(10)

Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value
0,1,33941,4196147,2015-04-16,2015-04-16,3028553,100.0,,,,,,,,,4196147,,
1,2,26808,4196147,2019-01-22,2019-01-22,3028553,93.0,,,,,,,,,4196147,,
2,3,8964,4196147,2010-06-23,2010-06-23,3028553,92.0,,,,,,,,,4196147,,
3,4,15753,3003798,2011-12-10,2011-12-10,3028553,,,,,,,,,,3003798,,
4,5,2316,37208405,2014-07-24,2014-07-24,38000280,,Never,763692.0,,,,,,,37208405,,
5,6,32570,4196147,2017-04-17,2017-04-17,3028553,95.0,,,,8554.0,,,,,4196147,%,
6,7,11842,37208405,2020-04-21,2020-04-21,38000280,,No,4188540.0,,,,,,,37208405,,
7,8,37199,3003798,2012-03-01,2012-03-01,3028553,,,,,,,,,,3003798,,
8,9,33758,37208405,2018-10-27,2018-10-27,38000280,,Yes,4188539.0,,,,,,,37208405,,
9,10,14274,37208405,2010-10-05,2010-10-05,38000280,,No,4188540.0,,,,,,,37208405,,


In [54]:
# example observations
print(concept_id_map[4196147])
print(concept_id_map[3003798])
print(concept_id_map[37208405])

Peripheral oxygen saturation
Blood pressure method
History of alcohol use


In [55]:
observation.columns

Index(['observation_id', 'person_id', 'observation_concept_id',
       'observation_date', 'observation_datetime',
       'observation_type_concept_id', 'value_as_number', 'value_as_string',
       'value_as_concept_id', 'qualifier_concept_id', 'unit_concept_id',
       'provider_id', 'visit_occurrence_id', 'visit_detail_id',
       'observation_source_value', 'observation_source_concept_id',
       'unit_source_value', 'qualifier_source_value'],
      dtype='object')

In [56]:
# drop columns that are not important
observation.drop(columns=['observation_id', 'observation_date', 'observation_datetime', 
                          'value_as_concept_id', 'qualifier_concept_id', 'unit_concept_id', 
                          'provider_id', 'visit_occurrence_id', 'visit_detail_id', 'observation_type_concept_id',
                          'unit_source_value', 'qualifier_source_value', 'observation_source_value', 'observation_source_concept_id'], inplace=True)

In [14]:
observation['observation_concept_id'] = [concept_id_map[i] for i in observation.observation_concept_id]

In [58]:
observation.rename(columns={"observation_concept_id": "observation_type"}, errors="raise", inplace=True)

In [59]:
observation.head()

Unnamed: 0,person_id,observation_type,value_as_number,value_as_string
0,33941,Peripheral oxygen saturation,100.0,
1,26808,Peripheral oxygen saturation,93.0,
2,8964,Peripheral oxygen saturation,92.0,
3,15753,Blood pressure method,,
4,2316,History of alcohol use,,Never


In [18]:
# observation[(observation.person_id==33941)&(observation.observation_concept_id=="Peripheral oxygen saturation")]

--------------

#### Device Exposure

Description: The DEVICE_EXPOSURE table captures information about a person's exposure to a foreign physical object or instrument which is used for diagnostic or therapeutic purposes through a mechanism beyond chemical action.

Number of Rows: 800

Vocabulary: SNOMED

In [12]:
device_exposure.head(10)

Unnamed: 0,device_exposure_id,person_id,device_concept_id,device_exposure_start_date,device_exposure_start_datetime,device_exposure_end_date,device_exposure_end_datetime,device_type_concept_id,unique_device_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,device_source_value,device_source_concept_id
0,1,3615,45768197,2018-07-08,2018-07-08 13:15:00,2018-07-08,2018-07-08 13:15:00,44818707,,1,,,,,45768197
1,2,921,45768197,2016-06-09,2016-06-09 13:15:00,2016-06-12,2016-06-12 13:15:00,44818707,,1,,,,,45768197
2,3,3736,45768197,2018-12-16,2018-12-16 13:15:00,2018-12-24,2018-12-24 13:15:00,44818707,,1,,,,,45768197
3,4,20374,45768197,2017-09-16,2017-09-16 13:15:00,2017-09-23,2017-09-23 13:15:00,44818707,,1,,,,,45768197
4,5,43281,45768197,2010-10-06,2010-10-06 13:15:00,2010-10-09,2010-10-09 13:15:00,44818707,,1,,,,,45768197
5,6,6275,45768197,2018-04-06,2018-04-06 13:15:00,2018-04-09,2018-04-09 13:15:00,44818707,,1,,,,,45768197
6,7,38369,45768197,2015-03-15,2015-03-15 13:15:00,2015-03-23,2015-03-23 13:15:00,44818707,,1,,,,,45768197
7,8,6449,45768197,2020-02-03,2020-02-03 13:15:00,2020-02-03,2020-02-03 13:15:00,44818707,,1,,,,,45768197
8,9,19224,45768197,2013-07-23,2013-07-23 13:15:00,2013-07-30,2013-07-30 13:15:00,44818707,,1,,,,,45768197
9,10,38719,45768197,2012-01-16,2012-01-16 13:15:00,2012-01-19,2012-01-19 13:15:00,44818707,,1,,,,,45768197


In [13]:
device_exposure.device_concept_id.unique()

array([45768197])

In [14]:
# only 1 device here
print(concept_id_map[45768197])

Ventilator


In [16]:
device_exposure.drop(columns=['device_exposure_id', 'device_exposure_start_date', 'device_exposure_end_date',
       'device_type_concept_id', 'unique_device_id', 'quantity', 'provider_id',
       'visit_occurrence_id', 'visit_detail_id', 'device_source_value',
       'device_source_concept_id'], inplace=True)

In [18]:
device_exposure['ventilator_time'] = [total_hours(s1,s2, FMT='%Y-%m-%d %H:%M:%S') for s1,s2 in zip(device_exposure.device_exposure_start_datetime, device_exposure.device_exposure_end_datetime)]

In [20]:
# drop datetime columns
device_exposure.drop(columns=['device_concept_id', 'device_exposure_start_datetime', 'device_exposure_end_datetime'], inplace=True)

In [21]:
device_exposure.head()

Unnamed: 0,person_id,ventilator_time
0,3615,0.0
1,921,72.0
2,3736,192.0
3,20374,168.0
4,43281,72.0


--------------

#### Visit Occurance

Description: The VISIT_OCCURRENCE table contains the spans of time a Person continuously receives medical services from one or more providers at a Care Site in a given setting within the health care system. Visits are classified into 4 settings: outpatient care, inpatient confinement, emergency room, and long-term care. Persons may transition between these settings over the course of an episode of care (for example, treatment of a disease onset).

Number of Rows: 740,000

In [36]:
visit_occurrence.head()

Unnamed: 0,visit_occurrence_id,person_id,visit_concept_id,visit_start_date,visit_start_datetime,visit_end_date,visit_end_datetime,visit_type_concept_id,provider_id,care_site_id,visit_source_value,visit_source_concept_id,admitting_source_concept_id,admitting_source_value,discharge_to_concept_id,discharge_to_source_value,preceding_visit_occurrence_id
0,1,35780,9203,2013-11-07,2013-11-07 16:40:00,2013-11-07,2013-11-07 16:40:00,44818518,,,,9203,32209,,32209,,
1,2,40235,5083,2004-06-22,2004-06-22 16:40:00,2004-06-29,2004-06-29 16:40:00,44818518,,,,5083,32209,,32209,,
2,3,476,581479,1995-10-06,1995-10-06 16:40:00,1995-10-10,1995-10-10 16:40:00,44818518,,,,581479,32209,,32209,,
3,4,6064,44790889,1989-09-10,1989-09-10 16:40:00,1989-09-20,1989-09-20 16:40:00,44818518,,,,44790889,32209,,32209,,
4,5,43007,9203,1989-05-25,1989-05-25 16:40:00,1989-05-28,1989-05-28 16:40:00,44818518,,,,9203,32209,,32209,,


In [37]:
visit_occurrence.drop(columns=['visit_occurrence_id',
       'visit_start_date',
       'visit_end_date', 'visit_type_concept_id', 'provider_id',
       'care_site_id', 'visit_source_value', 
       'admitting_source_concept_id', 'admitting_source_value',
       'discharge_to_concept_id', 'discharge_to_source_value',
       'preceding_visit_occurrence_id'], inplace=True)

In [38]:
visit_occurrence['visit_duration'] = [total_hours(s1, s2, FMT='%Y-%m-%d %H:%M:%S') for s1,s2 in zip(visit_occurrence.visit_start_datetime, visit_occurrence.visit_end_datetime)]

In [40]:
visit_occurrence

Unnamed: 0,person_id,visit_concept_id,visit_start_datetime,visit_end_datetime,visit_source_concept_id,visit_duration
0,35780,9203,2013-11-07 16:40:00,2013-11-07 16:40:00,9203,0.0
1,40235,5083,2004-06-22 16:40:00,2004-06-29 16:40:00,5083,168.0
2,476,581479,1995-10-06 16:40:00,1995-10-10 16:40:00,581479,96.0
3,6064,44790889,1989-09-10 16:40:00,1989-09-20 16:40:00,44790889,240.0
4,43007,9203,1989-05-25 16:40:00,1989-05-28 16:40:00,9203,72.0
...,...,...,...,...,...,...
3121641,20337,9201,1993-07-02 16:40:00,1993-07-12 16:40:00,9201,240.0
3121642,9101,32037,1995-09-13 16:40:00,1995-09-14 16:40:00,32037,24.0
3121643,14454,5083,1987-08-08 16:40:00,1987-08-08 16:40:00,5083,0.0
3121644,8210,32037,2013-04-25 16:40:00,2013-05-04 16:40:00,32037,216.0


In [41]:
visit_occurrence['visit_concept_id'] = [concept_id_map[i] for i in visit_occurrence.visit_concept_id]

In [43]:
visit_occurrence.drop(columns=['visit_start_datetime', 'visit_end_datetime', 'visit_source_concept_id'], inplace=True)

In [45]:
visit_occurrence.rename(columns={"visit_concept_id": "visit_type"}, errors="raise", inplace=True)

In [46]:
visit_occurrence.head()

Unnamed: 0,person_id,visit_type,visit_duration
0,35780,Emergency Room Visit,0.0
1,40235,Telehealth,168.0
2,476,Ambulatory Rehabilitation Visit,96.0
3,6064,Telephone call to a patient,240.0
4,43007,Emergency Room Visit,72.0


--------------

#### Gold Standard

Description: The goldstandard file will be available to models while they train and can be used designate true positive and true negative patients. This file stores the true status of the patients in relation to the question being asked. For instance, for question 1, the status is a binary declaring whether a patient tested positive (1.0) or negative (0.0).

Number of Rows: Same as person table

In [21]:
goldstandard.head()

Unnamed: 0,person_id,status
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0


In [108]:
goldstandard.status.value_counts()

0.0    44947
1.0     4944
Name: status, dtype: int64

In [112]:
# this will be target y to predict
y = np.array(goldstandard.status)

------------

#### Procedure Occurance

Description: The PROCEDURE_OCCURRENCE table contains records of activities or processes ordered by, or carried out by, a healthcare provider on the patient to have a diagnostic or therapeutic purpose.

Number of Rows: 34,000

Vocabulary: CPT4

In [22]:
procedure_occurrence.head()

Unnamed: 0,procedure_occurrence_id,person_id,procedure_concept_id,procedure_date,procedure_datetime,procedure_type_concept_id,modifier_concept_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,procedure_source_value,procedure_source_concept_id,modifier_source_value
0,1,162,2747010,2011-07-24,2011-07-24,257,,,,,,,2747010,
1,2,10568,2788032,2019-09-01,2019-09-01,257,,,,,,,2788032,
2,3,9675,2752623,2018-02-01,2018-02-01,257,,,,,,,2752623,
3,4,4385,4148375,2017-05-02,2017-05-02,257,,,,,,,2001538,
4,5,40273,42894832,2010-07-25,2010-07-25,257,,,,,,,42894832,


In [92]:
procedure_occurrence.drop(columns=['procedure_occurrence_id',
       'procedure_date', 'procedure_type_concept_id',
       'modifier_concept_id', 'quantity', 'provider_id', 'visit_occurrence_id',
       'visit_detail_id', 'procedure_source_value',
       'procedure_source_concept_id', 'modifier_source_value'], inplace=True)

In [94]:
procedure_concept_id = [concept_id_map[i] for i in procedure_occurrence.procedure_concept_id]

In [96]:
procedure_occurrence['procedure_concept_id'] = procedure_concept_id

In [None]:
procedure_occurrence.rename(columns={"procedure_concept_id": "procedure_type"}, errors="raise", inplace=True)

In [97]:
procedure_occurrence.head()

Unnamed: 0,person_id,procedure_concept_id,procedure_datetime
0,162,"Excision of Ileum, Open Approach",2011-07-24
1,10568,"Performance of Cardiac Output, Continuous",2019-09-01
2,9675,"Repair Anal Sphincter, Open Approach",2018-02-01
3,4385,Catheterization of both left and right heart,2017-05-02
4,40273,"Replacement of Thoracic Aorta, Ascending/Arch ...",2010-07-25


------------

#### Person

Description: The PERSON table stores demographic information about all the patients in the repository. The Person table contains records that uniquely identify each patient in the data.

Number of Rows: 9,000

In [126]:
person.head()

Unnamed: 0,person_id,gender_concept_id,year_of_birth,month_of_birth,day_of_birth,birth_datetime,race_concept_id,ethnicity_concept_id,location_id,provider_id,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id
0,0,8507,1948,3,7,1948-03-07,8527,38003563,6977,,,,M,0,,30,,28
1,1,8507,1932,9,25,1932-09-25,8515,38003564,1605,,,,M,0,,10,,29
2,2,8532,1963,6,2,1963-06-02,8527,38003564,2242,,,,F,0,,30,,29
3,3,8532,1989,1,5,1989-01-05,8527,38003564,97,,,,F,0,,30,,29
4,4,8507,1951,7,25,1951-07-25,8527,38003563,3915,,,,M,0,,30,,28


In [127]:
person.gender_concept_id.value_counts()

8532    27130
8507    22761
Name: gender_concept_id, dtype: int64

In [129]:
person.drop(columns=['gender_concept_id', 'year_of_birth', 'month_of_birth',
       'day_of_birth', 'race_concept_id',
       'ethnicity_concept_id', 'provider_id', 'care_site_id',
       'person_source_value',
       'gender_source_concept_id', 'race_source_value',
       'ethnicity_source_value'], inplace=True)

In [130]:
person.head()

Unnamed: 0,person_id,birth_datetime,location_id,gender_source_value,race_source_concept_id,ethnicity_source_concept_id
0,0,1948-03-07,6977,M,30,28
1,1,1932-09-25,1605,M,10,29
2,2,1963-06-02,2242,F,30,29
3,3,1989-01-05,97,F,30,29
4,4,1951-07-25,3915,M,30,28


In [131]:
# what are all the races?
person.race_source_concept_id.value_counts()

30    22582
13     9134
10     9124
26     9051
Name: race_source_concept_id, dtype: int64

In [132]:
# what are all the genders?
person.gender_source_value.value_counts()

F    27130
M    22761
Name: gender_source_value, dtype: int64

In [133]:
# what are all the ethnicities?
person.ethnicity_source_concept_id.value_counts()

29    27326
19    13590
28     8975
Name: ethnicity_source_concept_id, dtype: int64

In [134]:
person.rename(columns={"gender_source_value": "gender", "race_source_concept_id": "race", "ethnicity_source_concept_id": "ethnicity", "birth_datetime":"age", "location_id":"location"}, errors="raise", inplace=True)

In [135]:
person['age'] = [round(total_hours(s1=s1,s2='2021-02-04',FMT='%Y-%m-%d')/(24*365)) for s1 in person.age] # in years

In [137]:
person.head()

Unnamed: 0,person_id,age,location,gender,race,ethnicity
0,0,73,6977,M,30,28
1,1,88,1605,M,10,29
2,2,58,2242,F,30,29
3,3,32,97,F,30,29
4,4,70,3915,M,30,28


-----------

#### Measurement

Description: The MEASUREMENT table contains records of Measurement, i.e. structured values (numerical or categorical) obtained through systematic and standardized examination or testing of a Person or Person's sample. The MEASUREMENT table contains both orders and results of such Measurements as laboratory tests, vital signs, quantitative findings from pathology reports, etc.

Number of Rows: 6,376,000

Vocabulary: LOINC, SNOMED

In [51]:
measurement.head()

Unnamed: 0,person_id,measurement_type,measurement_date,value_as_number,range_low,range_high,unit_source_value
0,43584,Creatine kinase [Enzymatic activity/volume] in...,2014-07-19,106.0,62.0,325.0,
1,23570,Triglyceride [Mass/volume] in Serum or Plasma,2011-10-24,137.0,,150.0,mg/dL
2,15111,Kappa light chains/Lambda light chains [Mass R...,2018-08-15,0.01,0.26,1.65,
3,37555,Aspartate aminotransferase [Enzymatic activity...,2018-11-10,16.0,9.0,38.0,U/L
4,37443,CD3+CD8+ (T8 suppressor cells) cells/100 cells...,2011-06-03,2.03,0.9,2.5,


In [21]:
concept_id_map[3042531] = 'Unknown measurement'

In [22]:
measurement['measurement_concept_id'] = [concept_id_map[i] for i in measurement.measurement_concept_id]

In [35]:
measurement.drop(columns=['measurement_id', 
       'measurement_datetime', 'measurement_time',
       'measurement_type_concept_id', 'operator_concept_id',
       'value_as_concept_id', 'unit_concept_id',
       'provider_id', 'visit_occurrence_id', 'visit_detail_id',
       'measurement_source_value', 'measurement_source_concept_id', 'value_source_value'], inplace=True)

In [36]:
measurement.rename(columns={"measurement_concept_id": "measurement_type"}, errors="raise", inplace=True)

In [37]:
measurement.head()

Unnamed: 0,person_id,measurement_type,measurement_date,value_as_number,range_low,range_high,unit_source_value
0,43584,Creatine kinase [Enzymatic activity/volume] in...,2014-07-19,106.0,62.0,325.0,
1,23570,Triglyceride [Mass/volume] in Serum or Plasma,2011-10-24,137.0,,150.0,mg/dL
2,15111,Kappa light chains/Lambda light chains [Mass R...,2018-08-15,0.01,0.26,1.65,
3,37555,Aspartate aminotransferase [Enzymatic activity...,2018-11-10,16.0,9.0,38.0,U/L
4,37443,CD3+CD8+ (T8 suppressor cells) cells/100 cells...,2011-06-03,2.03,0.9,2.5,


In [35]:
measurement[(measurement.person_id==23570)&(measurement.measurement_concept_id=='Heart rate')]

Unnamed: 0,measurement_id,person_id,measurement_concept_id,measurement_date,measurement_datetime,measurement_time,measurement_type_concept_id,operator_concept_id,value_as_number,value_as_concept_id,unit_concept_id,range_low,range_high,provider_id,visit_occurrence_id,visit_detail_id,measurement_source_value,measurement_source_concept_id,unit_source_value,value_source_value
22271584,22271585,23570,Heart rate,2013-10-11,2013-10-11 00:27:00,,3028553,,104.0,,8483.0,70.0,110.0,,,,,3027018,bpm,104.0
22762536,22762537,23570,Heart rate,2010-11-26,2010-11-26 00:27:00,,3028553,,78.0,,8483.0,80.0,150.0,,,,,3027018,bpm,78.0


-----------

#### Location

Description: The LOCATION table represents a generic way to capture address information of Persons. In this challenge, only state and truncated zip code are available.

Number of Rows: 7800

In [25]:
location.head()

Unnamed: 0,location_id,address_1,address_2,city,state,zip,county,location_source_value
0,1,,,,WA,981,,
1,2,,,,WA,981,,
2,3,,,,WA,981,,
3,5,,,,WA,981,,
4,7,,,,WA,981,,


In [97]:
location.zip.value_counts()

981    49891
Name: zip, dtype: int64

In [None]:
# all entries are the same, this dataframe is not important for now

------------

#### Drug exposure

Description: The DEVICE_EXPOSURE table captures information about a person's exposure to a foreign physical object or instrument which is used for diagnostic or therapeutic purposes through a mechanism beyond chemical action.

Number of Rows: 800
    
Vocabulary: SNOMED

In [9]:
drug_exposure.head()

Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,...,sig,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value
0,1,2633,713852,2011-09-01,2011-09-01,2011-09-10,2011-09-10,,38000177,,...,,,,,,,,713852,Oral,
1,2,27038,1157460,2011-06-09,2011-06-09,2011-06-12,2011-06-12,,38000177,,...,,,,,,,,1157460,Oral,
2,3,37342,1308216,2014-12-14,2014-12-14,2014-12-14,2014-12-14,,38000177,,...,,,,,,,,1308216,,
3,4,10275,19077343,2014-08-15,2014-08-15,2014-08-16,2014-08-16,,38000177,,...,,,,,,,,19077343,Oral,
4,5,27345,40223112,2016-03-20,2016-03-20,2016-03-20,2016-03-20,,38000177,,...,,,,,,,,40223112,Oral,


In [10]:
drug_exposure.columns

Index(['drug_exposure_id', 'person_id', 'drug_concept_id',
       'drug_exposure_start_date', 'drug_exposure_start_datetime',
       'drug_exposure_end_date', 'drug_exposure_end_datetime',
       'verbatim_end_date', 'drug_type_concept_id', 'stop_reason', 'refills',
       'quantity', 'days_supply', 'sig', 'route_concept_id', 'lot_number',
       'provider_id', 'visit_occurrence_id', 'visit_detail_id',
       'drug_source_value', 'drug_source_concept_id', 'route_source_value',
       'dose_unit_source_value'],
      dtype='object')

In [13]:
drug_concept_id = []
for i in drug_exposure.drug_concept_id:
    try: drug_concept_id.append(concept_id_map[i])
    except KeyError: drug_concept_id.append("unknown drug")

In [14]:
drug_exposure['drug_concept_id'] = drug_concept_id

In [27]:
drug_exposure['drug_exposure_duration'] = [total_hours(s1,s2,FMT='%Y-%m-%d') for s1,s2 in zip(drug_exposure.drug_exposure_start_date,drug_exposure.drug_exposure_end_date)]

In [29]:
drug_exposure.drop(columns=['drug_exposure_id','drug_exposure_start_datetime', 'drug_exposure_end_datetime',
       'verbatim_end_date', 'drug_type_concept_id', 'stop_reason', 'sig', 'route_concept_id', 'lot_number',
       'provider_id', 'visit_occurrence_id', 'visit_detail_id', 'days_supply',
       'drug_source_value', 'drug_source_concept_id', 'drug_exposure_start_date', 'drug_exposure_end_date',
       'dose_unit_source_value'], errors="ignore", inplace=True)

In [30]:
drug_exposure.head()

Unnamed: 0,person_id,drug_concept_id,refills,quantity,route_source_value,drug_exposure_duration
0,2633,ropinirole 0.5 MG Oral Tablet,0.0,90.0,Oral,216.0
1,27038,unknown drug,,,Oral,72.0
2,37342,Lisinopril,6.0,60.0,,0.0
3,10275,ferrous sulfate 325 MG Delayed Release Oral Ta...,3.0,28.0,Oral,24.0
4,27345,Codeine sulfate 30 MG Oral Tablet,0.0,120.0,Oral,0.0


In [None]:
drug_exposure.rename(columns={"drug_concept_id": "drug_name"}, errors="raise", inplace=True)

-----

#### Condition Occurance

Description: The 'Drug' domain captures records about the utilization of a Drug when ingested or otherwise introduced into the body. A Drug is a biochemical substance formulated in such a way that when administered to a Person it will exert a certain physiological effect. Drugs include prescription and over-the-counter medicines, vaccines, and large-molecule biologic therapies. Radiological devices ingested or applied locally do not count as Drugs. Drug Exposure is inferred from clinical events associated with orders, prescriptions written, pharmacy dispensings, procedural administrations, and other patient-reported information.

Number of Rows: 6,582,000

Vocabulary: RxNorm

RxNorm is the attempt to unify the reference standard for medications and drugs. RxNorm incorporates other vocabularies, linking them together to unify drug referencing. 

In [22]:
condition_occurrence.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_value,condition_source_concept_id,condition_status_source_value,condition_status_concept_id
0,1,36715,436659,2017-01-08,2017-01-08,2017-01-17,2017-01-17,32020,,,,,,35206695,ENCOUNTER/ORDER SUMMARY,4033240
1,2,21628,4115171,2016-03-18,2016-03-18,2016-03-23,2016-03-23,32019,,,,,,45533939,CHARGES,4230359
2,3,41547,78232,2011-02-13,2011-02-13,2011-02-22,2011-02-22,43542353,,,,,,45591807,ORDER,4033240
3,4,11546,439846,2017-10-04,2017-10-04,2017-10-05,2017-10-05,32019,,,,,,35207792,CHARGES,4230359
4,5,43031,78097,2017-10-01,2017-10-01,2017-10-10,2017-10-10,32019,,,,,,45552285,BILLING,4230359


In [23]:
condition_occurrence['condition_concept_id'] = [concept_id_map[i] for i in condition_occurrence.condition_concept_id]

In [24]:
condition_occurrence['condition_duration'] = [total_hours(s1,s2,FMT='%Y-%m-%d') for s1,s2 in zip(condition_occurrence.condition_start_date, condition_occurrence.condition_end_date)]

In [25]:
condition_occurrence.drop(columns=['condition_occurrence_id',
       'condition_start_date', 'condition_start_datetime',
       'condition_end_date', 'condition_end_datetime',
       'condition_type_concept_id', 'stop_reason', 'provider_id',
       'visit_occurrence_id', 'visit_detail_id', 'condition_source_value',
       'condition_source_concept_id', 'condition_status_source_value',
       'condition_status_concept_id'], inplace=True)

In [26]:
condition_occurrence.rename(columns={"condition_concept_id": "condition_type"}, errors="raise", inplace=True)

In [29]:
condition_occurrence.head()

Unnamed: 0,person_id,condition_type,condition_duration
0,36715,Iron deficiency anemia,216.0
1,21628,Pain in right lower limb,120.0
2,41547,Shoulder joint pain,216.0
3,11546,Left heart failure,24.0
4,43031,Secondary malignant neoplasm of bone,216.0


---------

#### Observation Period

Description: The OBSERVATION_PERIOD table contains records which uniquely define the spans of time for which a Person is at-risk to have clinical events recorded within the source systems, even if no events in fact are recorded (healthy patient with no healthcare interactions).

Number of Rows: 9,000

In [13]:
observation_period.head()

Unnamed: 0,observation_period_id,person_id,observation_period_start_date,observation_period_end_date,period_type_concept_id
0,1,0,1960-05-05,2019-05-09,44814724
1,2,1,1961-10-15,2019-05-03,44814724
2,3,2,1963-06-05,2019-12-15,44814724
3,4,3,1989-05-12,2020-03-21,44814724
4,5,4,1960-06-27,2019-11-04,44814724


In [31]:
observation_time = [total_hours(str(s1), str(s2), FMT='%Y-%m-%d')/24 for s1,s2 in zip(observation_period.observation_period_start_date, observation_period.observation_period_end_date)]

In [32]:
observation_period['observation_time'] = observation_time #in days

In [105]:
observation_period.period_type_concept_id.value_counts()

44814724    49357
Name: period_type_concept_id, dtype: int64

In [34]:
observation_period.drop(columns=['observation_period_id','observation_period_start_date',
       'observation_period_end_date', 'period_type_concept_id'], inplace=True)

In [35]:
observation_period.head()

Unnamed: 0,person_id,observation_time
0,0,21553.0
1,1,21019.0
2,2,20647.0
3,3,11271.0
4,4,21679.0


observation time might be an indicator of exisiting health related problems in a patient

---