In [10]:
!pip install dash pandas plotly



In [39]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from io import BytesIO
import base64

# Load the dataset
df = pd.read_csv('us_chronic_disease_indicators.csv')

# EDA visualizations
# 1. Question: Which Disease has the highest datavalue in our dataset?
# Bar chart of chronic diseases prevalence
plt.figure(figsize=(10, 6))
sns.barplot(x='topic', y='datavalue', data=df)
plt.title('Prevalence of Chronic Diseases')
plt.xlabel('Topic')
plt.ylabel('Data Value')
plt.xticks(rotation=45, ha='right')

# Save the Seaborn plot to a BytesIO object
buffer1 = BytesIO()
plt.savefig(buffer1, format='png')
buffer1.seek(0)
image_string1 = base64.b64encode(buffer1.read()).decode('utf-8')
plt.close()

# 2. Question: What is the prevalence and distribution of "chronic obstructive pulmonary disease" and "Cancer" across different states?
# Filter for 'cancer', 'chronic obstructive pulmonary disease'
cancer_chronic_data = df[df['topic'].isin(['Cancer', 'Chronic Obstructive Pulmonary Disease'])]

sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))
ax = sns.countplot(x='locationdesc', data=cancer_chronic_data, hue='topic', palette='viridis',
                   order=df['locationdesc'].value_counts().index)
plt.title('Count of Diseases by State')
plt.xlabel('State')
plt.ylabel('Count')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.legend(title='Disease', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

buffer2 = BytesIO()
plt.savefig(buffer2, format='png')
buffer2.seek(0)
image_string2 = base64.b64encode(buffer2.read()).decode('utf-8')
plt.close()

# 3. Time series chart depicting the incidence of cancer over the yearstart
cancer_df = df[df['topic'] == 'Cancer']
cancer_count_by_year = cancer_df.groupby('yearstart').size()

plt.figure(figsize=(10, 6))
plt.plot(cancer_count_by_year.index, cancer_count_by_year.values, marker='o')
plt.title('Incidence of Cancer Over the Years')
plt.xlabel('yearstart')
plt.ylabel('Number of Cases')
plt.grid(True)

buffer3 = BytesIO()
plt.savefig(buffer3, format='png')
buffer3.seek(0)
image_string3 = base64.b64encode(buffer3.read()).decode('utf-8')
plt.close()

# 4. Distribution of cancer by Ethnicity using pie chart
cancer_race_df = df[(df['topic'] == 'Cancer') & (df['stratificationcategory1'] == 'Race/Ethnicity')]
cancer_by_race = cancer_race_df.groupby('stratification1').size()

plt.figure(figsize=(8, 8))
plt.pie(cancer_by_race, labels=cancer_by_race.index, autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired.colors)
plt.title('Distribution of Cancer Among Different Race/Ethnicity Categories')

buffer4 = BytesIO()
plt.savefig(buffer4, format='png')
buffer4.seek(0)
image_string4 = base64.b64encode(buffer4.read()).decode('utf-8')
plt.close()

# 5. Time series depicting the trends observed in the distribution of cancer cases within the different Race/Ethnicity.
cancer_race_df = cancer_df[cancer_df['stratificationcategory1'] == 'Race/Ethnicity']
cancer_race_by_year = cancer_race_df.groupby(['yearstart', 'stratification1']).size().unstack()

plt.figure(figsize=(12, 8))
for ethnicity in cancer_race_by_year.columns:
    plt.plot(cancer_race_by_year.index, cancer_race_by_year[ethnicity], marker='o', linestyle='-', label=ethnicity)
plt.title('Progression and Distribution of Cancer Among Different Race/Ethnicity Categories')
plt.xlabel('Year')
plt.ylabel('Number of Cases')
plt.legend(title='Race/Ethnicity', loc='upper left', bbox_to_anchor=(1, 1))
plt.grid(True)

buffer5 = BytesIO()
plt.savefig(buffer5, format='png')
buffer5.seek(0)
image_string5 = base64.b64encode(buffer5.read()).decode('utf-8')
plt.close()

# 6. Question: What is the distribution of Cancer over different race?
cancer_data_other_races = df[(df['topic'] == 'Cancer') &
                              (df['stratification1'] != 'Overall') &
                              (~df['stratification1'].isin(['Male', 'Female']))]

sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='stratification1', y='datavalue', data=cancer_data_other_races, hue='stratification1', palette='colorblind')
plt.title('Prevalence of Cancer in Other Races')
plt.xlabel('Race')
plt.ylabel('Data Value')
plt.legend(title='Race', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

buffer6 = BytesIO()
plt.savefig(buffer6, format='png')
buffer6.seek(0)
image_string6 = base64.b64encode(buffer6.read()).decode('utf-8')
plt.close()

# 7. Distribution of cancer by Gender
cancer_gender_df = df[(df['topic'] == 'Cancer') & (df['stratificationcategory1'] == 'Gender')]
cancer_by_gender = cancer_gender_df.groupby('stratification1').size()

plt.figure(figsize=(10, 6))
cancer_by_gender.plot(kind='bar', color='skyblue')
plt.title('Distribution of Cancer Among Different Gender Categories')
plt.xlabel('Gender')
plt.ylabel('Number of Cases')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability

buffer7 = BytesIO()
plt.savefig(buffer7, format='png')
buffer7.seek(0)
image_string7 = base64.b64encode(buffer7.read()).decode('utf-8')
plt.close()

# 8 . Time series depicting the trends observed in the distribution of cancer cases within the different Gender.
cancer_df = df[df['topic'] == 'Cancer']
cancer_gender_df = cancer_df[cancer_df['stratificationcategory1'] == 'Gender']
cancer_gender_by_year = cancer_gender_df.groupby(['yearstart', 'stratification1']).size().unstack()

plt.figure(figsize=(12, 8))

# Plot each Gender category
for gender in cancer_gender_by_year.columns:
    plt.plot(cancer_gender_by_year.index, cancer_gender_by_year[gender], marker='o', linestyle='-', label=gender)

plt.title('Progression and Distribution of Cancer Among Different Gender Categories')
plt.xlabel('Year')
plt.ylabel('Number of Cases')
plt.legend(title='Gender', loc='upper left', bbox_to_anchor=(1, 1))
plt.grid(True)

buffer8 = BytesIO()
plt.savefig(buffer8, format='png')
buffer8.seek(0)
image_string8 = base64.b64encode(buffer8.read()).decode('utf-8')
plt.close()

# 9. Consider the distribution with respect to oral cavity cancer over years
oral_pharynx_mortality_df = df[(df['topic'] == 'Cancer') & (df['question'] == 'Cancer of the oral cavity and pharynx, mortality')]

# Plotting the bar chart
plt.figure(figsize=(8, 6))
plt.bar(oral_pharynx_mortality_df['yearstart'], oral_pharynx_mortality_df['datavalue'], color='skyblue')

# Setting plot attributes
plt.title('Mortality Rate for Cancer of the Oral Cavity and Pharynx')
plt.xlabel('Year')
plt.ylabel('Data Value')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability

# Save the Seaborn plot to a BytesIO object
buffer9 = BytesIO()
plt.savefig(buffer9, format='png')
buffer9.seek(0)

# Convert the BytesIO object to base64 for displaying in Dash app
image_string9 = base64.b64encode(buffer9.read()).decode('utf-8')

# Close the plot to prevent it from being displayed in the Jupyter Notebook
plt.close()

# 10. Consider the distribution with respect to oral cavity cancer across ethnicity
questions = ['Cancer of the oral cavity and pharynx, mortality']

# Create a figure and axis
fig, ax = plt.subplots(figsize=(10, 8))

# Loop through each question and plot a stacked bar
for question in questions:
    # Filter rows for the current question
    current_question_df = df[(df['topic'] == 'Cancer') & (df['question'] == question)]

    # Plot a stacked bar for each question
    ax.bar(current_question_df['stratification1'], current_question_df['datavalue'], label=question)

# Setting plot attributes
plt.title('Mortality Rate for Different Types of Cancer')
plt.xlabel('Stratification')
plt.ylabel('Data Value')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.legend(title='Questions', bbox_to_anchor=(1.05, 1), loc='upper left')  # Add legend

# Save the Seaborn plot to a BytesIO object
buffer10 = BytesIO()
plt.savefig(buffer10, format='png')
buffer10.seek(0)

# Convert the BytesIO object to base64 for displaying in Dash app
image_string10 = base64.b64encode(buffer10.read()).decode('utf-8')

# Close the plot to prevent it from being displayed in the Jupyter Notebook
plt.close()

# 11. Question: What is the cancer distribution across different states?
# FILTER CANCER DATA WITHOUT "UNITED STATES" STATE
cancer_data_noUS = df[(df['topic'] == 'Cancer') & (df['locationdesc'] != 'United States')]

# Distribution of 'datavalue' for cancer across states (box plot)
plt.figure(figsize=(14, 6))
sns.barplot(x='locationdesc', y='datavalue', data=cancer_data_noUS, palette='pastel')
plt.title('Distribution of Cancer Data Value Across States')
plt.xlabel('States')
plt.ylabel('datavalue')
plt.xticks(rotation=45, ha='right')

# Save the Seaborn plot to a BytesIO object
buffer11 = BytesIO()
plt.savefig(buffer11, format='png')
buffer11.seek(0)

# Convert the BytesIO object to base64 for displaying in Dash app
image_string11 = base64.b64encode(buffer11.read()).decode('utf-8')

# Close the plot to prevent it from being displayed in the Jupyter Notebook
plt.close()


# Dash app
app = dash.Dash(__name__)

app.layout = html.Div(children=[
    html.H1("US Chronic Disease Indicators"),

    # Visualization 1
    html.Img(src=f'data:image/png;base64,{image_string1}'),

    # Visualization 2
    html.Img(src=f'data:image/png;base64,{image_string2}'),

    # Visualization 3
    html.Img(src=f'data:image/png;base64,{image_string3}'),

    # Visualization 4
    html.Img(src=f'data:image/png;base64,{image_string4}'),

    # Visualization 5
    html.Img(src=f'data:image/png;base64,{image_string5}'),

    # Visualization 6
    html.Img(src=f'data:image/png;base64,{image_string6}'),

    # Visualization 7
    html.Img(src=f'data:image/png;base64,{image_string7}'),

    # Visualization 8
    html.Img(src=f'data:image/png;base64,{image_string8}'),
    
    # Visualization 9
    html.Img(src=f'data:image/png;base64,{image_string9}'),
    
    # Visualization 10
    html.Img(src=f'data:image/png;base64,{image_string10}'),
    
    # Visualization 11
    html.Img(src=f'data:image/png;base64,{image_string11}'),
    
])

if __name__ == '__main__':
    app.run_server(debug=True)


set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


