# Force-reinstall the newest plotly release. The same command is repeated in the
# "1.0 Setup" install cell below, so this line is a duplicate of that step.
!pip install --upgrade --force-reinstall plotly

# 1.0 Setup

In [None]:
#import importlib
#plotly_spec = importlib.util.find_spec('plotly')
#print(plotly_spec)
#if plotly_spec is None:
#    print('Installing plotly...')
!pip install --upgrade --force-reinstall plotly
!pip install geopandas==0.3.0
!pip install pyshp==1.2.10
!pip install shapely==1.6.3
!pip install xlrd

In [None]:
# Data handling and static plotting.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# Render matplotlib/seaborn figures inline in the notebook output.
%matplotlib inline
# Plotly: `ff` supplies create_choropleth and `iplot` renders it further down.
# NOTE(review): `py` is not referenced anywhere else in the visible notebook, and
# `plotly.plotly` reportedly moved to the chart-studio package in plotly 4 --
# confirm the installed plotly version still provides this module.
import plotly.plotly as py
import plotly.figure_factory as ff
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout
from IPython.display import display, HTML
# Initialise plotly's notebook integration before any iplot() call.
init_notebook_mode(connected=True)
# Filesystem helpers used to locate and list the data directory.
import os
from os import listdir

In [None]:
# List the data directory relative to the current working directory
# (the command reports an error if dswg/data has not been cloned yet).
!ls -la dswg/data

In [None]:
# Show the working directory; the next cell uses it to decide where the data lives.
!pwd

In [None]:
# Work out where the dswg data directory lives for the current environment.
cwd = os.getcwd()
if cwd == '/home/jovyan':  # We are in Jupyter Docker
    datadir = '/home/jovyan/work/dswg/data'
elif cwd == '/content':    # We are in Google Colab
    datadir = '/content/dswg/data'
else:
    # Any other environment: fall back to ./dswg/data under the working directory
    # (the same layout the Colab branch uses) so `datadir` is always defined
    # instead of raising NameError on the print below.
    datadir = os.path.join(cwd, 'dswg', 'data')

# Report the chosen directory and its contents, or flag that it is missing.
print(datadir,':')
if os.path.exists(datadir):
    print(listdir(datadir))
else:
    print("DOESN'T EXIST")

In [None]:
if not os.path.exists(datadir + '/patients.csv'): 
    # Checkout the data from Github
    !git clone https://github.com/stevejohnson2001/dswg
    !mkdir /content/dswg/data/temp

## 1.1 Read in the data

In [None]:
# Data dictionary describing each CSV extract.
#
# dd[table][column] is a small spec dict:
#   'type'           -- dtype to load the column as; np.datetime64 marks a date
#                       column that the read loop parses via `parse_dates`
#   'required'       -- metadata only; not enforced by the read loop in this cell
#   'primarykey_col' -- position of the column to use as the DataFrame index
#
# The builtin `str` is used instead of `np.str`: the alias was identical to `str`
# and has been removed from recent NumPy releases.
dd = {}

dd['patients'] = {'pat_id':     {'type': str, 'required':True, 'primarykey_col':0},
                  'birth_date': {'type': np.datetime64, 'required':True},
                  'death_date': {'type': np.datetime64},
                  'ssn':        {'type': str},
                  'drivers':    {'type': str},
                  'passport':   {'type': str},
                  'prefix':     {'type': str},
                  'first':      {'type': str, 'required':True},
                  'last':       {'type': str, 'required':True},
                  'suffix':     {'type': str},
                  'maiden':     {'type': str},
                  'marital':    {'type': str},
                  'race':       {'type': str},
                  'ethnicity':  {'type': str},
                  'gender':     {'type': str, 'required':True},
                  'birthplace': {'type': str},
                  'address':    {'type': str, 'required':True}
                  }
dd['encounters'] = {'enc_id':                 {'type': str, 'required':True, 'primarykey_col':0},
                    'enc_date':               {'type': np.datetime64, 'required':True},
                    'enc_pat_id':             {'type': str, 'required':True},
                    'enc_code':               {'type': str, 'required':True},
                    'enc_description':        {'type': str, 'required':True},
                    'enc_reason_code':        {'type': str},
                    'enc_reason_description': {'type': str}
                   }
dd['observations'] = {'obs_date':        {'type': np.datetime64, 'required':True},
                      'obs_pat_id':      {'type': str, 'required':True},
                      'obs_enc_id':      {'type': str, 'required':True},
                      'obs_code':        {'type': str, 'required':True},
                      'obs_description': {'type': str, 'required':True},
                      'obs_value':       {'type': str},
                      'obs_units':       {'type': str}
                     }
# NOTE(review): med_stop_date is loaded as a plain string while med_start_date is
# parsed as a date -- confirm this asymmetry is intentional.
dd['medications'] = {'med_start_date':         {'type': np.datetime64, 'required':True},
                     'med_stop_date':          {'type': str, 'required':False},
                     'med_pat_id':             {'type': str, 'required':True},
                     'med_enc_id':             {'type': str, 'required':True},
                     'med_code':               {'type': str, 'required':True},
                     'med_description':        {'type': str, 'required':True},
                     'med_reason_code':        {'type': str},
                     'med_reason_description': {'type': str}
                     }


# Read every table described in `dd` from <datadir>/<table>.csv into `data`.
data = {}
for table_name, spec in dd.items():
    # The CSV's own header row is replaced by the dictionary's column names.
    col_names = list(spec.keys())
    # Date columns are declared as strings here and converted by `parse_dates`.
    # Builtin `str` replaces the removed `np.str` alias.
    data_types = {col: (str if meta['type'] == np.datetime64 else meta['type'])
                  for col, meta in spec.items()}
    date_cols = [col for col, meta in spec.items() if meta['type'] == np.datetime64]
    # Index column positions; None keeps the default integer index when the
    # table declares no primary key.
    key_cols = [meta['primarykey_col'] for meta in spec.values() if 'primarykey_col' in meta] or None
    print('{}:'.format(table_name))
    print('  keys:  {}'.format(key_cols))
    print('  types: {}'.format(data_types))
    print('  cols:  {}'.format(col_names))
    print('  dates: {}'.format(date_cols))
    data[table_name] = pd.read_csv(datadir + '/{}.csv'.format(table_name), index_col=key_cols,
                                   dtype=data_types, header=0, parse_dates=date_cols, names=col_names)
    # Quick visual check of the first rows and the resulting dtypes.
    display(data[table_name].head(5))
    print(data[table_name].dtypes)

In [None]:
# Bind each loaded table to its own top-level name for the cells that follow.
patients, encounters, observations, medications = (
    data[table] for table in ('patients', 'encounters', 'observations', 'medications')
)

## 1.2 Change Column Names

In [None]:
# Change the column names (nothing to do here: the read loop already replaces them via `names=col_names`)


## 1.3 Inject Data Quality Errors

In [None]:
# TODO: inject data quality errors (placeholder -- no code in this cell yet)

In [None]:
# Identify the overdose cohort: every patient with at least one encounter whose
# reason code is '55680006'. (Days-supply information is not added in this cell;
# it only finds the patients and their medications.)
is_overdose_encounter = encounters['enc_reason_code'] == '55680006'
overdose_patients = set(encounters.loc[is_overdose_encounter, 'enc_pat_id'])
print(len(overdose_patients))

# All encounters belonging to those patients.
cohort_encounters = encounters[encounters['enc_pat_id'].isin(overdose_patients)]
display(cohort_encounters.head(5))

# All medications prescribed to those patients.
is_cohort_med = medications['med_pat_id'].isin(overdose_patients)
overdose_meds = medications[is_cohort_med]
print(overdose_meds.shape)
display(overdose_meds.head(5))

In [None]:
# Show medication rows whose code is one of three hand-picked opioid codes.
opioids_rxnorm = ['1049369', '1310197', '1049544']
is_opioid = medications['med_code'].isin(opioids_rxnorm)
op1 = medications.loc[is_opioid]
print(op1.shape)
display(op1.head(10))

In [None]:
# Pickle the data so later sections can reload it without re-parsing the CSVs.
# Create the scratch directory first: to_pickle() fails if <datadir>/temp is missing.
os.makedirs(os.path.join(datadir, 'temp'), exist_ok=True)
for name, frame in data.items():
    print('Pickling {} of shape: {}'.format(name, frame.shape))
    frame.to_pickle('{}/temp/{}.tmp'.format(datadir, name))

# 3.0 Exploratory Data Analysis

In [None]:
# Build synthetic demo data for the visualizations (nothing is loaded from disk).
# A dedicated, seeded generator makes the plots identical on every
# Restart-and-Run-All without touching NumPy's global random state.
rng = np.random.RandomState(42)

# Odd codes 27001..27173, one per entry plotted on the Minnesota county map.
fips = [27000+i for i in range(1,174,2)]
# One random integer in [1, 100) per FIPS code.
pop = rng.randint(1, 100, size=len(fips)).tolist()
# 300 standard-normal samples used for the distribution plot.
prevalance = rng.normal(size=300)



## 3.1 Load the Data

Load the patient, encounter, observation (lab) and medication data. These tables were read from the .csv files we received from the extract and pickled during setup; dates and other fields were converted to the proper format when the files were first read.

Displaying the first 5 rows of the data is a good way to look for obvious issues before working with the data in more detail.

In [None]:
# 2.0 Load Data

# Reload the four tables from the pickles under <datadir>/temp.
# Add display(<table>.head(5)) here to eyeball any of them.
patients, encounters, observations, medications = [
    pd.read_pickle('{}/temp/{}.tmp'.format(datadir, table))
    for table in ('patients', 'encounters', 'observations', 'medications')
]







### 3.1.1 Find all of the overdose encounters

In [None]:
# Keep only the encounters whose reason code is '55680006' (the overdose code
# this notebook filters on), report how many there are and preview the first rows.
is_overdose = encounters['enc_reason_code'] == '55680006'
overdose_encounters = encounters.loc[is_overdose]
print(len(overdose_encounters))
overdose_encounters.head(5)



## 3.1 Data Visualization


Use Matplotlib, Seaborn and Plotly to explore the data through rich visualizations, including graphs, distributions and maps.

The Plotly graphs are interactive: hover over or select elements to inspect them. Graphics can also be exported for inclusion in other documents.

In [None]:
# 3.0 Exploratory Data Analysis

# County choropleth of Minnesota coloured by the per-county `pop` values.
values = pop
# Four evenly spaced bin edges from the smallest to the largest value (both
# inclusive); np.linspace yields the same points as np.mgrid[lo:hi:4j].
endpts = list(np.linspace(min(values), max(values), 4))

# Green / Yellow palette. Five colours for the four bin edges above --
# presumably one per resulting bin; TODO confirm against create_choropleth.
# (A ten-step Blues palette was previously assigned here and then immediately
# overwritten by this one; that dead assignment has been dropped.)
colorscale = [
    'rgb(68.0, 1.0, 84.0)',
    'rgb(66.0, 64.0, 134.0)',
    'rgb(38.0, 130.0, 142.0)',
    'rgb(63.0, 188.0, 115.0)',
    'rgb(216.0, 226.0, 25.0)'
]

fig = ff.create_choropleth(
    fips=fips, values=values, scope=['Minnesota'], show_state_data=True,
    colorscale=colorscale,
    binning_endpoints=endpts,
    round_legend_values=True,
    plot_bgcolor='rgb(229,229,229)',
    paper_bgcolor='rgb(229,229,229)',
    legend_title='Prevalence by County',  # spelling corrected in the user-facing legend
    county_outline={'color': 'rgb(0,0,0)', 'width': 0.5},
    state_outline={'color': 'rgb(0,0,0)', 'width': 1},
    exponent_format=True,
)

iplot(fig, filename='choropleth_MN')

# Patient counts per race. The column is lower-case 'race' (column names come
# from the data dictionary), so the previous x='RACE' referenced a column that
# does not exist.
sns.countplot(x='race', data=patients)

# Patient counts per ethnicity, with tilted tick labels so they do not overlap.
g = sns.countplot(x='ethnicity', data=patients)
z = plt.xticks(rotation=-45)

# Distribution of the `prevalance` sample array.
sns.distplot(prevalance)



## 3.2 Transform the Data

Use the power of Pandas Dataframes to transform the data.  Add new columns as calculations from existing columns, join the data together and get it into the format you need for analysis.



In [None]:
# 3.0 Transform the Data

# Attach patient demographics to every observation (left join on the patient id;
# `indicator=True` adds a `_merge` column recording whether a patient matched).
obs = observations.merge(patients, how='left', left_on='obs_pat_id', right_index=True, indicator=True)

# Age in whole years at the time of the observation, using 365-day years.
days_since_birth = (pd.to_datetime(obs['obs_date']) - pd.to_datetime(obs['birth_date'])).dt.days
obs['age_at_visit'] = (days_since_birth / 365).round()
# Adult flag: True from 18 years up; a missing age compares False.
obs['adult'] = (obs['age_at_visit'] >= 18).values

display(obs.head(5))

# Turn the encounter index (enc_id) back into a regular column for the join below.
enc2 = encounters.reset_index()
print(enc2.columns)
enc2.head(5)


obs.head(5)

# Outer join: keeps observations without an encounter and encounters without
# any observation.
obs2 = pd.merge(obs, enc2, how='outer', left_on='obs_enc_id', right_on='enc_id')

# From here on `obs` is the joined frame; `obs2` is the same object.
obs = obs2

print(obs.shape)
print(obs2.shape)
display(obs.head(5))
display(obs2.head(5))

obs[obs['enc_reason_code'].notnull()].head(5)





### 3.2.1 Explore Body Weight by Gender and Adult Status

In [None]:
# Weight observations (code '29463-7'; values are plotted as kilograms below).
w = obs[obs['obs_code']=='29463-7']
# Note: len(w) counts observation rows, so a patient weighed twice is counted twice.
print('Number of patients: {:,}'.format(len(w)))
# Builtin float replaces the `np.float` alias, which recent NumPy releases removed.
weights = w['obs_value'].astype(float)
mean = np.mean(weights)
print('Avg weight: ',mean)
sns.distplot(weights)
plt.xlabel("WEIGHT (kg)")
plt.show()

# Per-observation frame for faceting. `weights` only covers the weight rows, so
# index alignment leaves 'value' as NaN on every other observation.
weight_obs = pd.DataFrame()
weight_obs['gender'] = obs['gender']
weight_obs['value'] = weights
weight_obs['adult'] = obs['adult']
sns.set(style="darkgrid")
g = sns.FacetGrid(weight_obs, row= 'adult', col='gender', margin_titles=True)
# NOTE(review): `bins` is computed but never passed to the histogram -- confirm
# whether bins=bins was intended.
bins = np.linspace(0, 60, 13)
# `density=True` replaces the `normed=True` keyword that Matplotlib removed.
g.map(plt.hist, 'value', color="steelblue", lw=0, density=True)



## 3.3 Save the Dataframes


Write all of the dataframes to disk for the next step

In [None]:
### 3.1 Save the Dataframes

# Persist the joined observation frame under <datadir>/temp for the next stage.
obs_pickle_path = '{}/temp/obs.tmp'.format(datadir)
obs.to_pickle(obs_pickle_path)

# 4.0 Feature Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier

#obs = pd.read_pickle(datadir+'/temp/obs.tmp')

# Patients with a '69453-9' observation whose value mentions 'overdose'.
# `na=False` makes rows with a missing obs_value count as "no match" rather than
# putting NaN into the boolean mask.
# NOTE(review): this list is not used again in the visible notebook.
mentions_overdose = obs['obs_value'].str.contains('overdose', na=False)
overdose_patient_deaths = list(obs[(obs['obs_code'] == '69453-9') & mentions_overdose]['enc_pat_id'])
# Patients with at least one encounter whose reason code is '55680006'.
overdose_patients = set(obs[obs['enc_reason_code']=='55680006']['enc_pat_id'])

print(len(overdose_patients))

# Every observation belonging to an overdose patient.
overdose_obs = obs[obs.obs_pat_id.isin(overdose_patients)]
print(overdose_obs.shape)
display(overdose_obs.head(10))

# Row-level target: True when the row's encounter patient is in the overdose set.
obs['overdose'] = (obs['enc_pat_id'].isin(overdose_patients))
display(obs[obs['overdose']].head(10))




In [None]:
# Prepare the model inputs: label-encode the categorical columns and append the
# numeric age column. (No model is fitted in this cell.)

noncat_col_names = ['age_at_visit']
cat_col_names = ['obs_code', 'marital', 'race', 'ethnicity', 'gender', 'adult', 'overdose']
pred_col_names = ['obs_code', 'marital', 'race', 'ethnicity', 'gender', 'adult', 'age_at_visit']

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per column, created on first access and kept in `d` so the
# same mapping can later be reused via d[col].transform(...) or inverted via
# d[col].inverse_transform(...).
d = defaultdict(LabelEncoder)
df = obs[cat_col_names]


def _encode_column(column):
    """Fit this column's encoder on its string form and return the integer codes."""
    # Casting to str means a missing value becomes the literal category 'nan'.
    return d[column.name].fit_transform(column.astype(str))


# Encoding the variable
fit = df.apply(_encode_column)
display(fit.head(20))
print(d)

# Encoded categoricals side by side with the untouched numeric column(s).
df2 = pd.concat([fit, obs[noncat_col_names]], axis=1)
display(df2.head(5))



In [None]:
# Rows with at least one missing value.
missing_data = df2[df2.isnull().any(axis=1)]
print("Shape = ",missing_data.shape)
display(missing_data.head(10))

# Rows with no missing values: the exact complement of `missing_data`.
# This must be .all(axis=1); .any(axis=1) kept every row that had at least one
# non-null column, i.e. it let incomplete rows through.
good_data = df2[df2.notnull().all(axis=1)]
print("Shape = ",good_data.shape)
display(good_data.head(10))
print(good_data.dtypes)

# Explicit guard on the age column, kept from the original cell (a no-op once
# the filter above is correct).
good_data = good_data[good_data['age_at_visit'].notnull()]
print("Shape = ",good_data.shape)
# Sanity check: every remaining column holds only finite numbers.
print(np.isfinite(good_data).all())



## 4.2.2 Display the relative importance of each variable

In [None]:
# Fit a random forest that predicts the overdose flag from the predictor columns.
# A fixed random_state makes the importances reproducible across runs.
model = RandomForestClassifier(random_state=0)
model.fit(good_data[pred_col_names], good_data['overdose'])
# display the relative importance of each attribute, labelled by predictor name
# and sorted so the strongest predictors come first
importances = pd.Series(model.feature_importances_, index=pred_col_names)
print(importances.sort_values(ascending=False))

# Overdose counts by race (with totals), then the overdose share within each race
display(pd.crosstab(obs['race'],obs['overdose'],margins=True))
pd.crosstab(obs['race'],obs['overdose'],normalize='index')



### 4.2.3 Display Correlations between all the variables

In [None]:
# Pairwise correlation matrix over every column of `good_data`
# (the label-encoded categoricals plus age_at_visit).
display(good_data.corr())


In [None]:
# Get Opioid code list from VSAC
# oid 1.3.6.1.4.1.6997.4.1.2.234.999.3.2
xl = pd.ExcelFile(datadir + '/AllPrescribableOpioidsUsedForPainControlIncludingInactiveMedications.xlsx')
# The "Code List" sheet is read with its first 12 rows skipped, which makes the
# next row the column header.
df = xl.parse("Code List", skiprows=12)
display(df.head(10))
# Codes are converted to strings so they compare equal to the string-typed
# `med_code` column. Builtin `str` replaces the removed `np.str` alias.
opioids_rxnorm = list(df['Code'].astype(str))
#opioids_rxnorm.extend(['1049369','1310197','1049544'])
print(opioids_rxnorm)



In [None]:
# Inspect the medications table: column types and the first few rows.
print(medications.dtypes)
display(medications.head(5))

# Number of medication rows whose code is in the opioid code list.
opioid_rows = medications.loc[medications['med_code'].isin(opioids_rxnorm)]
print(opioid_rows.shape)

# Spot check a single code.
spot_check = medications.loc[medications['med_code'] == '1049544']
display(spot_check.head(10))



In [None]:
# Patients with at least one medication whose code is in the opioid code list.
has_opioid_code = medications['med_code'].isin(opioids_rxnorm)
patients_prescribed_opioids = set(medications.loc[has_opioid_code, 'med_pat_id'])
print(len(patients_prescribed_opioids))

# Patients who were both prescribed an opioid and are in the overdose set.
d2 = patients_prescribed_opioids.intersection(overdose_patients)
print(len(d2))
print(d2)



In [None]:
# Every medication row belonging to a patient in the overdose set.
belongs_to_overdose_patient = medications['med_pat_id'].isin(overdose_patients)
meds_prescribed_to_overdose_patients = medications.loc[belongs_to_overdose_patient]
display(meds_prescribed_to_overdose_patients.head(10))




In [None]:
# Print a text timeline for one patient: each encounter, followed by the
# medications and lab observations recorded against it.
pt_id = '3eaed230-1c60-4221-a96c-f6af5d871072'
# `patients` is indexed by pat_id, so .loc returns that patient's row.
pt = patients.loc[pt_id]

header_fields = [pt[field] for field in ('first', 'last', 'marital', 'race', 'gender', 'ethnicity')]
print('PATIENT: {} {} ({} {} {} {})'.format(*header_fields))

encs = encounters[encounters.enc_pat_id == pt_id]
# The row label yielded by iterrows() is the encounter id (the frame's index).
for enc_id, enc in encs.iterrows():
    print('  {}: {} ({}) ({})'.format(
        enc['enc_date'], enc['enc_description'], enc['enc_code'], enc['enc_reason_description']))

    meds = medications[medications['med_enc_id'] == enc_id]
    for _, med in meds.iterrows():
        print('     MED: {} to {}: {} ({})'.format(
            med['med_start_date'], med['med_stop_date'], med['med_description'], med['med_code']))

    labs = observations[observations['obs_enc_id'] == enc_id]
    for _, lab in labs.iterrows():
        print('     LAB: {:%Y-%m-%d %H:%M}: {} ({}) {} {}'.format(
            lab['obs_date'], lab['obs_description'], lab['obs_code'], lab['obs_value'], lab['obs_units']))