## Imports

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load the cleaned TN Health dataset; index_col = 0 uses the CSV's first column as the DataFrame index.
df = pd.read_csv('filtered_tn_data.csv', index_col = 0)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

## Graphs

In [None]:
# Urban TN: share of all urban records held by each of the ten most common primary diagnoses

# ICD-10 code -> short display name used for the x-axis labels
diagnosis_mapping = dict(
    A419='Sepsis',
    N390='UTI',
    R079='Chest pain',
    R0789='Other chest pain',
    J209='Acute bronchitis',
    R112='Nausea with vomiting',
    R109='Unspecified abdominal pain',
    M545='Low back pain',
    R51='Headache',
    J069='Acute URI',
    Z1211='Colon screening',
    Z3800='Liveborn infant',
)

# Rows flagged as not rural form the denominator population
urban_records = df[df['Rural'] == False]
total_population = urban_records.shape[0]

# Frequency of each primary diagnosis, trimmed to the ten largest and expressed as a percentage
diagnosis_counts = urban_records['Diag1'].value_counts()
top_10_diagnoses = diagnosis_counts.nlargest(10)
diagnosis_percentages = top_10_diagnoses.div(total_population).mul(100)

# Use the readable name where one is known, otherwise fall back to the raw code
diagnosis_labels = list(map(lambda code: diagnosis_mapping.get(code, code), top_10_diagnoses.index))

# Bar chart of the percentages
fig = px.bar(
    x=diagnosis_labels,
    y=diagnosis_percentages.values,
    labels=dict(x='Diagnosis', y='Percentage'),
)
fig.update_layout(**{
    'title': "Top 10 Diagnoses Percentage for Urban TN",
    'xaxis_title': "Diagnosis",
    'yaxis_title': "Percentage",
})
fig.show()

In [None]:
# Rural TN: share of all rural records held by each of the ten most common primary diagnoses

# ICD-10 code -> short display name used for the x-axis labels
diagnosis_mapping = dict(
    A419='Sepsis',
    N390='UTI',
    R079='Chest pain',
    R0789='Other chest pain',
    J209='Acute bronchitis',
    R112='Nausea with vomiting',
    R109='Unspecified abdominal pain',
    M545='Low back pain',
    R51='Headache',
    J069='Acute URI',
    Z1211='Colon screening',
    Z3800='Liveborn infant',
)

# Rows flagged as rural form the denominator population
rural_records = df[df['Rural'] == True]
total_population = rural_records.shape[0]

# Frequency of each primary diagnosis, trimmed to the ten largest and expressed as a percentage
diagnosis_counts = rural_records['Diag1'].value_counts()
top_10_diagnoses = diagnosis_counts.nlargest(10)
diagnosis_percentages = top_10_diagnoses.div(total_population).mul(100)

# Use the readable name where one is known, otherwise fall back to the raw code
diagnosis_labels = list(map(lambda code: diagnosis_mapping.get(code, code), top_10_diagnoses.index))

# Bar chart of the percentages
fig = px.bar(
    x=diagnosis_labels,
    y=diagnosis_percentages.values,
    labels=dict(x='Diagnosis', y='Percentage'),
)
fig.update_layout(**{
    'title': "Top 10 Diagnoses Percentage for Rural TN",
    'xaxis_title': "Diagnosis",
    'yaxis_title': "Percentage",
})
fig.show()

In [None]:
#Graph for percentages of people who self-paid for each diagnosis

# Ten most frequent primary diagnoses, computed once and reused for the
# calculation and for both bar traces (previously recomputed three times).
top_diagnoses = df['Diag1'].value_counts().head(10).index

# Restrict to rows carrying one of those diagnoses so the per-diagnosis counts
# come from a single grouped pass instead of re-filtering the full frame in a loop.
top_rows = df[df['Diag1'].isin(top_diagnoses)]
is_self_pay = top_rows['Primary_Payer_Class_Cd'] == 'P'

# Per-diagnosis totals and self-pay counts, re-aligned to the frequency order above
total_counts = top_rows.groupby('Diag1').size().reindex(top_diagnoses)
self_pay_counts = is_self_pay.groupby(top_rows['Diag1']).sum().reindex(top_diagnoses)
self_pay_share = (self_pay_counts / total_counts) * 100

# Percentages for payer class 'P' and for every other payer class combined
percentages_p = self_pay_share.tolist()
percentages_other = (100 - self_pay_share).tolist()

# Create the bar graph: one trace per payer grouping, shown side by side per diagnosis
fig = go.Figure()
fig.add_trace(go.Bar(x=top_diagnoses, y=percentages_p, name="Self-Pay"))
fig.add_trace(go.Bar(x=top_diagnoses, y=percentages_other, name="Other Payer Classes"))
# Customize the layout
fig.update_layout(
    title="Percentage of Payer Class by Diagnosis",
    xaxis_title="Diagnosis",
    yaxis_title="Percentage",
    yaxis_tickformat=".1f"
)
# Show the graph
fig.show()

In [None]:
#Graph for preventable diagnoses by county

# Zip codes are compared against string keys below, so store the column as strings.
# NOTE(review): if the CSV parsed zips as floats this yields values like '37318.0'
# that match no key — confirm the column dtype.
df['Patient_Zip'] = df['Patient_Zip'].astype(str)

#Dictionary for zip code to its corresponding county
zip_county_mapping = {
    '37318': 'Franklin County',
    '37398': 'Franklin County',
    '37330': 'Franklin County',
    '37306': 'Franklin County',
    '37345': 'Franklin County',
    '37324': 'Franklin County',
    '37375': 'Franklin County',
    '37376': 'Franklin County',
    '37383': 'Franklin County',
    '37355': 'Coffee County',
    '37388': 'Coffee County',
    '37342': 'Coffee County',
    '37349': 'Coffee County',
    '37382': 'Coffee County',
    '37389': 'Coffee County',
    '37018': 'Coffee County',
    '37361': 'Coffee County',
    '37352': 'Moore County',
    '37020': 'Bedford County',
    '37160': 'Bedford County',
    '37161': 'Bedford County',
    '37162': 'Bedford County',
    '37180': 'Bedford County',
    '37183': 'Bedford County',
    '37110': 'Warren County',
    '37111': 'Warren County',
    '37357': 'Warren County',
    '37378': 'Warren County',
    '37394': 'Warren County',
    '38550': 'Warren County',
    '38581': 'Warren County',
    '37016': 'Cannon County',
    '37026': 'Cannon County',
    '37190': 'Cannon County',
    '37301': 'Grundy County',
    '37305': 'Grundy County',
    '37313': 'Grundy County',
    '37339': 'Grundy County',
    '37356': 'Grundy County',
    '37365': 'Grundy County',
    '37366': 'Grundy County',
    '37387': 'Grundy County',
    '37340': 'Marion County',
    '37347': 'Marion County',
    '37374': 'Marion County',
    '37379': 'Marion County',
    '37396': 'Marion County',
    '37397': 'Marion County'
}

#list of preventable ICD codes; a mix of category-level entries (e.g. 'J20') and
#more specific codes (e.g. 'E10641')
preventable_diagnoses = [
    'I20', 'I240', 'I248', 'I249', 'J45', 'J13', 'J14', 'J153', 'J154', 'J157', 'J159', 'J16', 'J18', 'L03', 'L04',
    'L08', 'L88', 'L980', 'J20', 'J40', 'J41', 'J42', 'J43', 'J44', 'J47', 'I50', 'I110', 'J810', 'R56', 'E86', 'E109',
    'E119', 'E101', 'E131', 'E110', 'E130', 'E10641', 'E11641', 'E106', 'E116', 'E108', 'E118', 'K529', 'K5289', 'G40',
    'I10', 'I119', 'E162', 'N10', 'N11', 'N12', 'N70', 'N73', 'A150', 'A155', 'A159', 'H66', 'J02', 'J03', 'J06',
    'J312', 'A154', 'A156', 'A158', 'A17', 'A18', 'A19'
]

# Match by prefix rather than exact equality: Diag1 values used elsewhere in this
# notebook are full codes such as 'J209', which an exact isin() test against the
# category entry 'J20' would never match. str.match anchors the pattern at the
# start of each value; na=False treats missing diagnoses as non-matches.
is_preventable = df['Diag1'].str.match('|'.join(preventable_diagnoses), na=False)
in_mapped_zip = df['Patient_Zip'].isin(list(zip_county_mapping))

# .assign returns a new frame, so the County column is not written into a slice
# of df (which raised SettingWithCopyWarning).
filtered_data = df[in_mapped_zip & is_preventable].assign(
    County=lambda rows: rows['Patient_Zip'].map(zip_county_mapping)
)

# One bar per county, ordered from most to fewest preventable-diagnosis records
fig = px.histogram(filtered_data, x='County', title='Count of Counties for Preventable Diagnoses')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [None]:
#Graph for preventable diagnoses by zip code

# specify the zip codes
specific_zips = ['37342','37349','37355', '37388','37389','37306','37318','37324','37330','37345','37372','37375', '37398','37352']

# Specify the preventable diagnoses; a mix of category-level entries (e.g. 'J20')
# and more specific codes (e.g. 'E10641')
preventable_diagnoses = [
    'I20', 'I240', 'I248', 'I249', 'J45', 'J13', 'J14', 'J153', 'J154', 'J157', 'J159', 'J16', 'J18', 'L03', 'L04',
    'L08', 'L88', 'L980', 'J20', 'J40', 'J41', 'J42', 'J43', 'J44', 'J47', 'I50', 'I110', 'J810', 'R56', 'E86', 'E109',
    'E119', 'E101', 'E131', 'E110', 'E130', 'E10641', 'E11641', 'E106', 'E116', 'E108', 'E118', 'K529', 'K5289', 'G40',
    'I10', 'I119', 'E162', 'N10', 'N11', 'N12', 'N70', 'N73', 'A150', 'A155', 'A159', 'H66', 'J02', 'J03', 'J06',
    'J312', 'A154', 'A156', 'A158', 'A17', 'A18', 'A19'
]

# Match by prefix rather than exact equality: Diag1 values used elsewhere in this
# notebook are full codes such as 'J209', which an exact isin() test against the
# category entry 'J20' would never match. str.match anchors at the start of each
# value; na=False treats missing diagnoses as non-matches.
is_preventable = df['Diag1'].str.match('|'.join(preventable_diagnoses), na=False)

# Compare zips as strings here so this cell does not depend on another cell
# having already converted the column.
in_specific_zip = df['Patient_Zip'].astype(str).isin(specific_zips)

# Filter the DataFrame to preventable diagnoses within the selected zip codes
filtered_data = df[is_preventable & in_specific_zip]

# Create a histogram of Patient_Zip counts for the preventable diagnoses
fig = px.histogram(filtered_data, x='Patient_Zip', title='Count of Preventable Diagnoses by Zip Code')
fig.update_layout(xaxis={'categoryorder':'total descending'})
# Display the interactive histogram
fig.show()

In [None]:
# Top 10 diagnoses within each age band

# Define the age groups as inclusive (low, high) bounds
age_groups = [(18, 32), (33, 48), (49, 64)]

# pd.cut bins are right-closed by default, so place the edges half a year outside
# the integer bounds: (17.5, 32.5], (32.5, 48.5], (48.5, 64.5]. Using the lower
# bounds directly as edges would drop age 18 and put ages 33 and 49 into the
# younger band's label.
age_bin_edges = [low - 0.5 for low, _ in age_groups] + [age_groups[-1][1] + 0.5]
age_labels = [f"{low}-{high}" for low, high in age_groups]

# create a copy of the dataset so the shared frame is left untouched
filtered_data = df.copy()
# Create a new column 'AgeGroup' in the dataframe based on age ranges;
# ages outside every band become NaN and are excluded from the grouping below
filtered_data['AgeGroup'] = pd.cut(filtered_data['Age'], bins=age_bin_edges, labels=age_labels)

# Create a dictionary to map ICD codes to diagnosis names
diagnosis_names = {
    "R109": "Abdominal pain",
    "R112": "Nausea with vomiting",
    "N390": "UTI",
    "R079": "Chest pain",
    "M545": "Low back pain",
    "J069": "Acute URI",
    "R0789": "Other chest pain",
    "R51": "Headache",
    "J209": "Acute bronchitis",
    "A419": "Sepsis",
    "K047": "Periapical abscess",
    "K029": "Dental caries",
    "G43909": "Migraine",
    "Z1211": "Colon screening",
    "J441": "COPD",
    "E11621": "Type 2 diabetes"
}

# Replace the ICD codes with diagnosis names. Codes without a name are kept as-is:
# a plain .map(dict) would turn them into NaN and silently drop them from the
# ranking, so the "top 10" would only ever be chosen among the named codes.
filtered_data['Diag1'] = filtered_data['Diag1'].map(lambda code: diagnosis_names.get(code, code))

# Count records per (age group, diagnosis); observed=True keeps only combinations
# that actually occur instead of expanding every categorical age group.
diagnosis_counts = (
    filtered_data.groupby(['AgeGroup', 'Diag1'], observed=True)
    .size()
    .reset_index(name='Count')
)

# Keep the ten largest counts in each age group, ordered by group then descending count
diagnosis_counts = (
    diagnosis_counts.sort_values(['AgeGroup', 'Count'], ascending=[True, False])
    .groupby('AgeGroup', observed=True)
    .head(10)
    .reset_index(drop=True)
)

# Create the bar graph using Plotly: bars grouped side by side within each age group
fig = px.bar(diagnosis_counts, x='AgeGroup', y='Count', color='Diag1', title='Top 10 Diagnoses by Age Group')
fig.update_layout(barmode='group')

fig.show()

In [None]:
# Overall ten most frequent primary diagnoses, broken down by age band

# Codes of the ten most common values in Diag1, most frequent first
top_10_diagnoses = df['Diag1'].value_counts().nlargest(10).index

# Inclusive (low, high) age bands
age_ranges = [(18,32), (33, 48), (49, 64)]

# Bin edges sit half a year outside the integer bounds so each right-closed bin
# covers exactly the ages named in its label
lower_edges = [low - 0.5 for low, _ in age_ranges]
upper_edge = age_ranges[-1][1] + 0.5
range_labels = [f'{low}-{high}' for low, high in age_ranges]

# Adds the 'Age Range' column to the shared frame
df['Age Range'] = pd.cut(df['Age'], bins=lower_edges + [upper_edge], labels=range_labels)

# Keep only the rows whose primary diagnosis is one of the top ten
filtered_df = df.loc[df['Diag1'].isin(top_10_diagnoses)]

# Histogram of diagnosis counts, coloured by age band, with fixed category ordering
fig = px.histogram(
    filtered_df,
    x='Diag1',
    color='Age Range',
    title='Overall Top 10 Diagnoses by Age Range',
    labels={'Diag1': 'Diagnosis', 'Age Range': 'Age Range'},
    category_orders={'Diag1': top_10_diagnoses, 'Age Range': range_labels},
)
# 'group' draws the age bands as side-by-side bars for each diagnosis
fig.update_layout(barmode='group')
fig.show()

In [None]:
# Specify the zip codes and their respective counties
zip_county_mapping = {
    '37318': 'Franklin County',
    '37398': 'Franklin County',
    '37330': 'Franklin County',
    '37306': 'Franklin County',
    '37345': 'Franklin County',
    '37324': 'Franklin County',
    '37375': 'Franklin County',
    '37376': 'Franklin County',
    '37355': 'Coffee County',
    '37388': 'Coffee County',
    '37342': 'Coffee County',
    '37349': 'Coffee County',
    '37352': 'Moore County'
}
# Specify the preventable diagnoses; a mix of category-level entries (e.g. 'J20')
# and more specific codes (e.g. 'E10641')
preventable_diagnoses = [
    'I20', 'I240', 'I248', 'I249', 'J45', 'J13', 'J14', 'J153', 'J154', 'J157', 'J159', 'J16', 'J18', 'L03', 'L04',
    'L08', 'L88', 'L980', 'J20', 'J40', 'J41', 'J42', 'J43', 'J44', 'J47', 'I50', 'I110', 'J810', 'R56', 'E86', 'E109',
    'E119', 'E101', 'E131', 'E110', 'E130', 'E10641', 'E11641', 'E106', 'E116', 'E108', 'E118', 'K529', 'K5289', 'G40',
    'I10', 'I119', 'E162', 'N10', 'N11', 'N12', 'N70', 'N73', 'A150', 'A155', 'A159', 'H66', 'J02', 'J03', 'J06',
    'J312', 'A154', 'A156', 'A158', 'A17', 'A18', 'A19'
]

# Match by prefix rather than exact equality: Diag1 values used elsewhere in this
# notebook are full codes such as 'J209', which an exact isin() test against the
# category entry 'J20' would never match. str.match anchors at the start of each
# value; na=False treats missing diagnoses as non-matches.
is_preventable = df['Diag1'].str.match('|'.join(preventable_diagnoses), na=False)

# Compare zips as strings here so this cell does not depend on another cell
# having already converted the column.
patient_zip = df['Patient_Zip'].astype(str)
in_mapped_zip = patient_zip.isin(list(zip_county_mapping))

# Filter on zip codes and preventable diagnoses, then assign each row its county.
# .assign returns a new frame, so the County column is not written into a slice
# of df (which raised SettingWithCopyWarning).
filtered_data = df[in_mapped_zip & is_preventable].assign(
    County=lambda rows: rows['Patient_Zip'].astype(str).map(zip_county_mapping)
)

# Create an interactive histogram
fig = px.histogram(filtered_data, x='County', title='Count of Counties for Preventable Diagnoses')
# Display the interactive histogram
fig.show()

In [None]:
#Add additional counties/zipcodes
zip_county_mapping = {
    '37318': 'Franklin County',
    '37398': 'Franklin County',
    '37330': 'Franklin County',
    '37306': 'Franklin County',
    '37345': 'Franklin County',
    '37324': 'Franklin County',
    '37375': 'Franklin County',
    '37376': 'Franklin County',
    '37383': 'Franklin County',
    '37355': 'Coffee County',
    '37388': 'Coffee County',
    '37342': 'Coffee County',
    '37349': 'Coffee County',
    '37382': 'Coffee County',
    '37389': 'Coffee County',
    '37018': 'Coffee County',
    '37361': 'Coffee County',
    '37352': 'Moore County',
    '37020': 'Bedford County',
    '37160': 'Bedford County',
    '37161': 'Bedford County',
    '37162': 'Bedford County',
    '37180': 'Bedford County',
    '37183': 'Bedford County',
    '37110': 'Warren County',
    '37111': 'Warren County',
    '37357': 'Warren County',
    '37378': 'Warren County',
    '37394': 'Warren County',
    '38550': 'Warren County',
    '38581': 'Warren County',
    '37016': 'Cannon County',
    '37026': 'Cannon County',
    '37190': 'Cannon County',
    '37301': 'Grundy County',
    '37305': 'Grundy County',
    '37313': 'Grundy County',
    '37339': 'Grundy County',
    '37356': 'Grundy County',
    '37365': 'Grundy County',
    '37366': 'Grundy County',
    '37387': 'Grundy County',
    '37340': 'Marion County',
    '37347': 'Marion County',
    '37374': 'Marion County',
    '37379': 'Marion County',
    '37396': 'Marion County',
    '37397': 'Marion County'
}
# Preventable ICD codes; a mix of category-level entries (e.g. 'J20') and more
# specific codes (e.g. 'E10641')
preventable_diagnoses = [
    'I20', 'I240', 'I248', 'I249', 'J45', 'J13', 'J14', 'J153', 'J154', 'J157', 'J159', 'J16', 'J18', 'L03', 'L04',
    'L08', 'L88', 'L980', 'J20', 'J40', 'J41', 'J42', 'J43', 'J44', 'J47', 'I50', 'I110', 'J810', 'R56', 'E86', 'E109',
    'E119', 'E101', 'E131', 'E110', 'E130', 'E10641', 'E11641', 'E106', 'E116', 'E108', 'E118', 'K529', 'K5289', 'G40',
    'I10', 'I119', 'E162', 'N10', 'N11', 'N12', 'N70', 'N73', 'A150', 'A155', 'A159', 'H66', 'J02', 'J03', 'J06',
    'J312', 'A154', 'A156', 'A158', 'A17', 'A18', 'A19'
]

# Match by prefix rather than exact equality: Diag1 values used elsewhere in this
# notebook are full codes such as 'J209', which an exact isin() test against the
# category entry 'J20' would never match. str.match anchors at the start of each
# value; na=False treats missing diagnoses as non-matches.
is_preventable = df['Diag1'].str.match('|'.join(preventable_diagnoses), na=False)

# Compare zips as strings here so this cell does not depend on another cell
# having already converted the column.
patient_zip = df['Patient_Zip'].astype(str)
in_mapped_zip = patient_zip.isin(list(zip_county_mapping))

# .assign returns a new frame, so the County column is not written into a slice
# of df (which raised SettingWithCopyWarning).
filtered_data = df[in_mapped_zip & is_preventable].assign(
    County=lambda rows: rows['Patient_Zip'].astype(str).map(zip_county_mapping)
)

# One bar per county, ordered from most to fewest preventable-diagnosis records
fig = px.histogram(filtered_data, x='County', title='Count of Counties for Preventable Diagnoses')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [None]:
# Raw record counts for the ten most frequent primary diagnoses among rural rows
df_filtered = df.loc[df['Rural'] == True]
top_10_diagnoses = df_filtered['Diag1'].value_counts().nlargest(10)

# Bar chart: diagnosis code on the x-axis, number of records on the y-axis
fig = px.bar(
    x=top_10_diagnoses.index,
    y=top_10_diagnoses.values,
    labels=dict(x='Diagnosis', y='Count'),
)
fig.update_layout(**{
    'title': "Top 10 Diagnoses for Rural TN",
    'xaxis_title': "Diagnosis",
    'yaxis_title': "Count",
})
fig.show()

In [None]:
# Raw record counts for the ten most frequent primary diagnoses among non-rural rows
df_filtered = df.loc[df['Rural'] == False]
top_10_diagnoses = df_filtered['Diag1'].value_counts().nlargest(10)

# Bar chart: diagnosis code on the x-axis, number of records on the y-axis
fig = px.bar(
    x=top_10_diagnoses.index,
    y=top_10_diagnoses.values,
    labels=dict(x='Diagnosis', y='Count'),
)
fig.update_layout(**{
    'title': "Top 10 Diagnoses for Urban TN",
    'xaxis_title': "Diagnosis",
    'yaxis_title': "Count",
})
fig.show()