In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Set up plotly's offline mode so figures can be rendered inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the installed plotly version.
init_notebook_mode(connected=True)  

In [None]:
# Read the raw encounter export.
enc = pd.read_csv('EncounterData.csv', low_memory=False)

# The row labelled 0 carries the real column names: use its values as the
# header, then remove that row from the data. The index is not reset, so the
# remaining row labels start at 1.
header_row = enc.loc[0]
enc = enc.rename(columns=header_row).drop(index=0)

# Independent copy of the frame as it stands right after the header fix.
enc_condition = enc.copy()


# changing the dashes in the icd9 column to be nan values for consistency
enc['icd9encounterdiagdescr'] = enc['icd9encounterdiagdescr'].replace('-', np.nan)


def _diagnosis_sets(column):
    """Collapse one diagnosis column to a set of values per patient encounter.

    Rows where `column` is null are ignored. The remaining rows of `enc` are
    grouped by ('patientid', 'cln enc date') and the values of `column` in
    each group are gathered into a Python set (so duplicates collapse).

    Returns a DataFrame with columns 'patientid', 'cln enc date' and `column`.
    """
    has_value = enc[column].notnull()
    per_encounter = enc[has_value].groupby(['patientid', 'cln enc date'])[column]
    return per_encounter.apply(set).reset_index()


# condensing diagnosis rows to a set of diagnoses for each encounter
enc_icd10 = _diagnosis_sets('icd10encounterdiagdescr')
enc_icd9 = _diagnosis_sets('icd9encounterdiagdescr')
enc_codedesc = _diagnosis_sets('patientsnomedproblemcodedesc')
enc_diag = _diagnosis_sets('enc srv diag')

# One row per (patient, encounter date): keep the first occurrence of each
# pair, then remove the row-level diagnosis columns (they are re-attached
# below in condensed form).
# NOTE(review): reset_index() is called without drop=True, so the previous row
# labels are kept as an extra column that travels into `cond` — confirm that
# this is intended before changing it, later cells may rely on that column.
enc1 = (
    enc
    .drop_duplicates(['patientid', 'cln enc date'])
    .reset_index()
    .drop(columns=[
        'icd10encounterdiagdescr',
        'icd9encounterdiagdescr',
        'patientsnomedproblemcodedesc',
        'enc srv diag',
    ])
)

# Left-join each condensed diagnosis frame back onto the one-row-per-encounter
# frame; encounters without a diagnosis of a given kind get a null there.
cond = (
    enc1
    .merge(enc_icd10, on=['patientid', 'cln enc date'], how='left')
    .merge(enc_icd9, on=['patientid', 'cln enc date'], how='left')
    .merge(enc_codedesc, on=['patientid', 'cln enc date'], how='left')
    .merge(enc_diag, on=['patientid', 'cln enc date'], how='left')
)

# Print the dtype pandas infers for every column of `cond`, one line per
# column, to spot columns holding mixed value types.
from pandas.api.types import infer_dtype

columns = cond.columns
for col in columns:
    print(' - '.join([col, infer_dtype(cond[col])]))

# making the patientid and zip code columns all strings
# (str() is applied to every element, so a NaN becomes the text 'nan')
cond['patientid'] = cond['patientid'].map(str)
cond['patient zip'] = cond['patient zip'].map(str)

# replace incorrect zip code entries (exact, whole-value matches only)
zip_corrections = {'2472': '37184', '37355-1424': '37355'}
cond['patient zip'] = cond['patient zip'].replace(zip_corrections)

# replace incorrect city entry
cond['patient city'] = cond['patient city'].replace({'TULLAHOMATULLAHOMA': 'TULLAHOMA'})

# replace incorrect city and zip for patient 2421
# NOTE(review): 'Winchester' is mixed-case while the city fixed above is
# upper-case — confirm which casing the rest of the column uses.
is_patient_2421 = cond['patientid'] == '2421'
cond.loc[is_patient_2421, 'patient city'] = 'Winchester'
cond.loc[is_patient_2421, 'patient zip'] = '37398'

# Replace missing federal poverty level (FPL) values with the column mean.

# Convert every entry with float(); a non-numeric entry raises here, a NaN
# stays NaN.
fpl = cond['ptnt  fpl'].map(float)

# Mean of the FPL values that are present (NaN entries are skipped by mean()).
mean_fpl = fpl.mean()

# Write the numeric column back with the gaps filled by that mean.
cond['ptnt  fpl'] = fpl.fillna(mean_fpl)

# Placeholder text written into null cells, per column:
#   - registration date                      -> 'unknown'
#   - inactive reason, race, ethnicity, lang -> 'unspecified'
#   - deceased flag     -> 'still alive'  (assumes null means not deceased ??)
#   - outgoing referral -> 'no referral'  (assumes null means no referral ??)
placeholder_by_column = {
    'patientregd': 'unknown',
    'ptnt rsn fr nctv stts': 'unspecified',
    'race': 'unspecified',
    'ethnicity': 'unspecified',
    'patient lang': 'unspecified',
    'ptnt dcsd ysn': 'still alive',
    'auth refto prvdr': 'no referral',
}
for column_name, placeholder in placeholder_by_column.items():
    cond[column_name] = cond[column_name].fillna(placeholder)

# filling in null patient county of residence values based on patient city

# City breakdown of the rows whose county is null. This expression sits in the
# middle of the cell, so its result is computed but not displayed.
# (original author's finding: all such patients live in Watertown, Wilson county)
cond.loc[cond['ptnt cnty f rsdnc'].isnull(), 'patient city'].value_counts()

# filling all null counties with Wilson county
# NOTE(review): this fills every null county regardless of city — re-check the
# breakdown above whenever the input data changes.
cond['ptnt cnty f rsdnc'] = cond['ptnt cnty f rsdnc'].fillna('Wilson')

# CREATING THE AGE COL
## description: a column that represents the patient's age, in completed years,
## at the time of the clinic encounter
from datetime import datetime

# Parse both date columns; values must be in mm/dd/YYYY form
dob = pd.to_datetime(cond['patientdob'], format='%m/%d/%Y')
encdate = pd.to_datetime(cond['cln enc date'], format='%m/%d/%Y')

# Elapsed days between birth and encounter (kept available for inspection)
age_days = (encdate - dob).dt.days

# Completed years are computed from the calendar rather than from
# `age_days // 365.25`: the day-count form under-counts by one on a birthday
# whenever the span holds no extra leap day (01/01/2001 -> 01/01/2002 is 365
# days, and 365 // 365.25 == 0 although the patient has turned 1).
# A birthday has been reached in the encounter year when the encounter's
# (month, day) is on or after the birth (month, day).
birthday_reached = (encdate.dt.month > dob.dt.month) | (
    (encdate.dt.month == dob.dt.month) & (encdate.dt.day >= dob.dt.day)
)
age_years = encdate.dt.year - dob.dt.year - (~birthday_reached).astype(int)

# Create the new column as integers. As before, a missing date makes this
# step raise instead of silently producing a null age.
cond['age'] = age_years.astype('int64')

# CREATING THE YEAR COL
## description: a column that shows what year the patient encounter took place

# function that returns the year from a date format of 'mm/dd/year'
def dayToYear(day):
    """Return the text after the last '/' in `day` (the year of 'mm/dd/year').

    A string without any '/' is returned unchanged.
    """
    _, _, year = day.rpartition('/')
    return year

# creating the new column: year part of each encounter date, kept as text
cond['enc year'] = cond['cln enc date'].map(dayToYear)

def dayToMonth(day):
    """Return the text before the first '/' in `day` (the month of 'mm/dd/year').

    A string without any '/' is returned unchanged.
    """
    month, _, _ = day.partition('/')
    return month

# creating the new column: month part of each encounter date, kept as text
cond['enc month'] = cond['cln enc date'].map(dayToMonth)

def dayToDay(day):
    """Return the second '/'-separated field of `day` (the day of 'mm/dd/year').

    Raises IndexError when `day` contains no '/'.
    """
    fields = day.split('/', 2)
    return fields[1]

# creating the new column: day-of-month part of each encounter date, kept as text
cond['enc day'] = cond['cln enc date'].map(dayToDay)


# CREATING THE DISTANCE COL
## description: a column that represents the distance between patients and the clinic (calculated by zip code)

import pgeocode

# function that gets the distance between two zip codes using the pgeocode package
def get_distance(x, y):
    """Query pgeocode for the distance between postal code `x` and each code in `y`.

    Parameters
    ----------
    x : a single US postal code.
    y : an object exposing `.values` (e.g. a pandas Series of postal codes);
        the underlying array is what gets passed to pgeocode.

    Returns
    -------
    Whatever `GeoDistance('us').query_postal_code` returns for these inputs.
    The original code named this result `distance_in_kms`, so the unit is
    presumably kilometres — confirm against the pgeocode documentation.

    A new `GeoDistance('us')` object is built on every call.
    """
    return pgeocode.GeoDistance('us').query_postal_code(x, y.values)

# creating the new column: distance from the clinic's zip code to each patient zip
clinic_zip = '37388'
cond['distance'] = get_distance(clinic_zip, cond['patient zip'])

# store the encounter year as an integer instead of text
cond['enc year'] = cond['enc year'].astype(int)

In [None]:
## CREATING THE AGE COL (for enc_condition)
# description: a column that represents the patient's age, in completed years,
# at the time of the clinic encounter
from datetime import datetime
# Parse both date columns; values must be in mm/dd/YYYY form
dob = pd.to_datetime(enc_condition['patientdob'], format='%m/%d/%Y')
encdate = pd.to_datetime(enc_condition['cln enc date'], format='%m/%d/%Y')
# Elapsed days between birth and encounter (kept available for inspection)
age_days = (encdate - dob).dt.days
# Completed years are computed from the calendar rather than from
# `age_days // 365.25`: the day-count form under-counts by one on a birthday
# whenever the span holds no extra leap day (01/01/2001 -> 01/01/2002 is 365
# days, and 365 // 365.25 == 0 although the patient has turned 1).
# A birthday has been reached in the encounter year when the encounter's
# (month, day) is on or after the birth (month, day).
birthday_reached = (encdate.dt.month > dob.dt.month) | (
    (encdate.dt.month == dob.dt.month) & (encdate.dt.day >= dob.dt.day)
)
age_years = encdate.dt.year - dob.dt.year - (~birthday_reached).astype(int)
# Create the new column as integers. As before, a missing date makes this
# step raise instead of silently producing a null age.
enc_condition['age'] = age_years.astype('int64')
## CREATING THE YEAR COL (for enc_condition)
# year = the text after the last '/' of the encounter date (still a string here)
enc_condition['year'] = enc_condition['cln enc date'].map(lambda date_text: date_text.rsplit('/', 1)[-1])

# encc: a separate frame derived from enc_condition where
#   - the icd10 description column is renamed to 'icd10',
#   - '-' entries in that column are turned into NaN for consistency,
#   - rows without an icd10 value are dropped.
encc = (
    enc_condition
    .rename(columns={'icd10encounterdiagdescr': 'icd10'})
    .assign(icd10=lambda frame: frame['icd10'].replace('-', np.nan))
    .dropna(subset=['icd10'])
)

# make year column numerical
# NOTE(review): encc was derived before this cast, so encc['year'] keeps the
# string values; only enc_condition['year'] becomes an integer column.
enc_condition['year'] = enc_condition['year'].astype(int)

In [None]:
# create ctrends dataframe
# NOTE: this binds a second name to the same DataFrame object as `cond` (no
# copy is made), so in-place changes made through either name are visible
# through both.
ctrends = cond

In [None]:
#ctrends.to_csv('EncounterCondition.csv')

In [None]:
#enc_condition.to_csv('Enc_Condition.csv')

In [None]:
#cond.to_csv('Encounter_Conditions.csv')

In [None]:
#encc.to_csv('Encounter_Cond_Data.csv')