## Setup

### Imports

In [None]:
import json
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Reading the Data

In [None]:
# Load the raw encounter export into `enc`. low_memory = False asks pandas to
# parse the file in one pass instead of chunks, which avoids per-chunk mixed
# dtype inference.
enc = pd.read_csv('encounter_data.csv', low_memory = False)
# Preview the first rows of the raw frame.
enc.head()

## Data Cleaning

### Changing the column keys to be the column names

In [None]:
# The row labelled 0 carries the real column names: map each current column
# label to the value found in that row, then remove the row from the data.
header_row = enc.loc[0]
enc.rename(columns=header_row, inplace=True)
enc.drop(index=0, inplace=True)

In [None]:
# Summarise dtypes and non-null counts of `enc` after the header row was promoted.
enc.info()

In [None]:
# List the column labels now in use on `enc`.
enc.columns

### Editing the icd9 column

In [None]:
# A lone dash in the icd9 column stands in for "no value"; store it as NaN so
# missing entries are represented consistently.
icd9_col = 'icd9encounterdiagdescr'
enc[icd9_col] = enc[icd9_col].replace({'-': np.nan})
enc.head()

### Fixing merge issue

Merge issue: there are multiple rows for each patient encounter, one per diagnosis recorded for that encounter. This was noticed because some patients had an unusually large number of rows.

In [None]:
# An encounter is identified by the patient and the clinic encounter date.
encounter_keys = ['patientid', 'cln enc date']


def _collapse_diagnoses(frame, column, keys):
    """Return one row per encounter holding the set of non-null `column` values.

    frame:  DataFrame with one row per (encounter, diagnosis) combination.
    column: name of the diagnosis column to condense.
    keys:   list of columns that together identify an encounter.
    """
    has_value = frame[column].notnull()
    return frame[has_value].groupby(keys)[column].apply(set).reset_index()


# condensing diagnosis rows to a set of diagnoses for each encounter
enc_icd10 = _collapse_diagnoses(enc, 'icd10encounterdiagdescr', encounter_keys)
enc_icd9 = _collapse_diagnoses(enc, 'icd9encounterdiagdescr', encounter_keys)
enc_codedesc = _collapse_diagnoses(enc, 'patientsnomedproblemcodedesc', encounter_keys)
enc_diag = _collapse_diagnoses(enc, 'enc srv diag', encounter_keys)

# removing duplicates so there is only one row per patient encounter.
# drop=True discards the old row labels; without it reset_index() would add
# them to the data as a stray 'index' column that then ends up in `cond`.
enc1 = enc.drop_duplicates(encounter_keys).reset_index(drop=True)

# dropping the per-row diagnosis columns from the de-duplicated dataset
enc1 = enc1.drop(columns=[
    'icd10encounterdiagdescr',
    'icd9encounterdiagdescr',
    'patientsnomedproblemcodedesc',
    'enc srv diag',
])

# merging the condensed diagnosis columns back in, one left join per column so
# encounters without any value for a column are kept (as NaN)
cond = enc1
for condensed in (enc_icd10, enc_icd9, enc_codedesc, enc_diag):
    cond = pd.merge(cond, condensed, on=encounter_keys, how='left')

In [None]:
# Sanity check that the merges lost nothing: both frames should report the same
# number of rows, and cond should have 4 more columns (the condensed diagnoses).
for frame in (enc1, cond):
    print(frame.shape)

### Fixing mixed datatypes

In [None]:
# Summarise dtypes and non-null counts of the one-row-per-encounter frame.
cond.info()

In [None]:
# Preview the first rows of `cond`.
cond.head()

In [None]:
# checking to see which columns have mixed datatypes

from pandas.api.types import infer_dtype

# Print "<column> - <inferred type>" for every column; a "mixed" result marks a
# column holding more than one kind of value.
columns = cond.columns
for column_name in columns:
    print(' - '.join([column_name, infer_dtype(cond[column_name])]))

In [None]:
# Store patient ids and zip codes as text in every row.
# Note: str() is applied to every value, so a missing value becomes the text 'nan'.
for id_column in ('patientid', 'patient zip'):
    cond[id_column] = cond[id_column].map(str)

In [None]:
# Known-bad zip code entries and their corrected values
zip_corrections = {'2472': '37184', '37355-1424': '37355'}
cond['patient zip'] = cond['patient zip'].replace(zip_corrections)

# City name that was entered twice in a row
cond['patient city'] = cond['patient city'].replace({'TULLAHOMATULLAHOMA': 'TULLAHOMA'})

# Patient 2421 has both a wrong city and a wrong zip; overwrite both
is_patient_2421 = cond['patientid'] == '2421'
cond.loc[is_patient_2421, 'patient city'] = 'Winchester'
cond.loc[is_patient_2421, 'patient zip'] = '37398'

In [None]:
# Replace missing federal poverty level (FPL) values with the column average.
# (The column name really does contain two consecutive spaces.)
fpl_col = 'ptnt  fpl'

# Step 1: convert every entry with float(); an entry that is not numeric raises
cond[fpl_col] = cond[fpl_col].map(float)

# Step 2: average of the available values (NaN entries are skipped by mean())
mean_fpl = cond[fpl_col].mean()

# Step 3: put that average wherever the value is missing
cond[fpl_col] = cond[fpl_col].fillna(mean_fpl)

In [None]:
# Text written in place of missing values, per column.
null_placeholders = {
    # registration date
    'patientregd': 'unknown',
    # reason for inactive status, race, ethnicity, language
    'ptnt rsn fr nctv stts': 'unspecified',
    'race': 'unspecified',
    'ethnicity': 'unspecified',
    'patient lang': 'unspecified',
    # assumption still to be confirmed: no deceased flag means the patient is alive
    'ptnt dcsd ysn': 'still alive',
    # assumption still to be confirmed: no outgoing referral entry means none was made
    'auth refto prvdr': 'no referral',
}

for column_name, placeholder in null_placeholders.items():
    cond[column_name] = cond[column_name].fillna(placeholder)

In [None]:
# filling in null patient county of residence values based on patient city

# Show which cities the rows with a null county belong to. This is printed
# explicitly because a notebook only displays a cell's LAST bare expression, so
# as a plain expression in the middle of the cell the check was never shown.
missing_county = cond['ptnt cnty f rsdnc'].isnull()
print(cond.loc[missing_county, 'patient city'].value_counts())

# Per the check above, all patients with a null county live in Watertown,
# which is in Wilson county, so every null county is filled with 'Wilson'.
cond['ptnt cnty f rsdnc'] = cond['ptnt cnty f rsdnc'].fillna('Wilson')

In [None]:
# Re-check dtypes and non-null counts after the missing-value fills.
cond.info()

In [None]:
# fixing null values for the diagnosis columns (icd9, problem description, srv diagnosis)

# NOTE: will do this once we talk to Emilie

### Adding columns

In [None]:
# CREATING THE AGE COL
## description: a column that represents the patients age at the time of the clinic encounter

from datetime import datetime

# Parse both date columns, which are stored as 'mm/dd/yyyy' text
dob = pd.to_datetime(cond['patientdob'], format='%m/%d/%Y')
encdate = pd.to_datetime(cond['cln enc date'], format='%m/%d/%Y')

# Age in completed years = difference in calendar years, minus one when the
# encounter falls before that year's birthday.
# Dividing the day count by 365.25 is off by one on some birthdays: e.g.
# 01/01/2001 -> 01/01/2002 is 365 days and 365 // 365.25 == 0, not 1.
before_birthday = (encdate.dt.month < dob.dt.month) | (
    (encdate.dt.month == dob.dt.month) & (encdate.dt.day < dob.dt.day)
)
age_years = encdate.dt.year - dob.dt.year - before_birthday.astype(int)

# Store the ages as ints; this raises if either date is missing for any row
cond['age'] = age_years.astype(int)

In [None]:
# CREATING THE YEAR COL
## description: a column that shows what year the patient encounter took place

def dayToYear(day):
    """Return the text after the last '/' of `day`, i.e. the year of an 'mm/dd/year' date.

    A string without any '/' is returned unchanged.
    """
    _, _, year = day.rpartition('/')
    return year

# new column: the year part of each encounter date, kept as text
cond['enc year'] = cond['cln enc date'].map(dayToYear)

In [None]:
# CREATING THE DISTANCE COL
## description: a column that represents the distance between patients and the clinic (calculated by zip code)

import pgeocode

def get_distance(x, y):
    """Distance between zip code `x` and each zip code in `y`, using pgeocode.

    x: a US postal code.
    y: an object exposing `.values` (e.g. a pandas Series) holding postal codes.
    Returns the result of pgeocode's GeoDistance.query_postal_code, which the
    original code treats as distances in kilometres.
    """
    us_distance = pgeocode.GeoDistance('us')
    return us_distance.query_postal_code(x, y.values)

# new column: distance from zip code 37388 to each patient's zip code
cond['distance'] = get_distance(x='37388', y=cond['patient zip'])

## Visualizations

### Heatmap of patients of the clinic


In [None]:
# Separate dataframe holding only the encounter columns needed for the map.
# The heat map only looks at usage, so the patient id is the column that matters.
map_columns = ['patientid', 'patientsex', 'patient zip']
test = cond[map_columns]

test.head()

In [None]:
# Importing geopandas to create a heatmap and reading .shp file of US zipcodes

import geopandas as gpd

# Read the US zip-code shapefile and rename its zip column to 'patient zip' so
# it can be merged with the dataframe of patient ids on that column.
tn_map = gpd.read_file("tl_2022_us_zcta520.shp").rename(
    columns={"ZCTA5CE20": "patient zip"}
)

tn_map.head()

In [None]:
### Look up zip code 37388 to confirm the shapefile's zip values match the format used in the patient data

tn_map[tn_map['patient zip'].eq('37388')]

In [None]:
# removing rows that aren't TN specific
## The unique patient zip codes are used to cut the nationwide zip-code dataframe down to the relevant areas.
## (This expression is not the last one in the cell, so its result is only shown when it is run on its own.)

cond["patient zip"].unique()

## Zip codes used to filter the large dataframe of US zip codes

specific_values = [
    '37388', '37355', '37020', '37360', '37398',
    '37318', '37018', '37366', '37324', '37342',
    '37183', '37330', '37345', '37352', '37376',
    '37160', '37306', '37334', '37184', '37339',
    '37382', '37110', '37359', '37357', '37356',
    '37301', '37348', '37349', '37375', '37313',
    '37149',
]

filtered_data = tn_map.loc[tn_map['patient zip'].isin(specific_values)]

filtered_data.head()

In [None]:
## Attach the patient rows to the zip-code shapes (inner join on 'patient zip')

map_and_stats = filtered_data.merge(right=test, how="inner", on="patient zip")

In [None]:
# Plotting heatmap

# map_and_stats has one row per encounter (each carrying a copy of its zip-code
# shape), so first collapse it to one row per zip code with the number of
# encounters from that zip. Colouring by the raw 'patientid' text would make
# geopandas treat every id as its own category instead of showing usage.
# NOTE(review): this counts encounters; use
# map_and_stats.groupby('patient zip')['patientid'].nunique() for distinct patients.
encounters_per_zip = map_and_stats.groupby('patient zip').size()
zip_usage = map_and_stats.drop_duplicates(subset='patient zip').copy()
zip_usage['encounters'] = zip_usage['patient zip'].map(encounters_per_zip)

fig, ax = plt.subplots(1, figsize=(10, 10))
ax.tick_params(axis='x', labelrotation=90)

# One normalisation shared by the map and the colour bar, so the bar describes
# the plotted values instead of a hard-coded 0-8000 range.
usage_norm = plt.Normalize(vmin=0, vmax=zip_usage['encounters'].max())

zip_usage.plot(column="encounters", cmap="Reds", norm=usage_norm,
               linewidth=0.4, ax=ax, edgecolor=".4")
ax.set_title("Clinic encounters by patient zip code")

## add colour bar to the side of the map

bar_info = plt.cm.ScalarMappable(cmap="Reds", norm=usage_norm)
bar_info.set_array([])  # public replacement for assigning the private _A attribute
# ax is passed explicitly: recent matplotlib cannot infer which axes to attach
# the bar to for a mappable that was never drawn on one
cbar = fig.colorbar(bar_info, ax=ax)
cbar.set_label("Number of encounters")

# ----------------------------------------------------------------------------------------------

In [None]:
### trying plotly
import plotly.express as px

In [None]:
# Download the Tennessee zip-code GeoJSON and parse it into `zipcodes`.
# The response must be bound with `as response`; without it `response` is undefined.
# `urlopen` and `json` are imported in the notebook's import cell.
with urlopen('https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/tn_tennessee_zip_codes_geo.min.json') as response:
    zipcodes = json.load(response)

In [None]:
# test