# Fry's Seven Stages of Visualization

**Stages**
1. Acquire
2. Parse
3. Filter
4. Mine
5. Represent
6. Refine
7. Interact

In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import ipywidgets as widgets

In [None]:
'''
Stage 1: Acquire
Load the raw dataset from a CSV file; any source (for example Kaggle) works.
Ex: https://www.kaggle.com/datasets/shekharkoirala/nepal-covid19-dataset
'''
# The path is relative to the directory the notebook is run from.
file_path = "../data/nepal-covid.csv"
covid_dataframe = pd.read_csv(filepath_or_buffer=file_path)

# Keep the frame as the cell's last expression so the notebook renders it.
covid_dataframe

In [None]:
'''
Stage 2: Parse data from the dataset
'''
# Preview the data.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
covid_dataframe.info()

# First few rows, rendered as a table.
display(covid_dataframe.head())

The Age column contains age ranges (like '31 - 40') and an '80+' category, so it's stored as an object.

To convert this to a numeric format, we can:

Take the midpoint of each age range (e.g., '31 - 40' → 35.5)

Convert '80+' to a reasonable estimate like 85

In [None]:
def parse_age(age_str):
    """Convert an age-bracket label into a single representative number.

    Args:
        age_str: An age label such as ``'31 - 40'``, ``'31-40'`` or ``'80+'``.
            Non-string values (for example NaN from an empty cell) are
            accepted and treated as unparseable.

    Returns:
        ``85`` for an open-ended label containing ``'+'``, the midpoint of the
        two bounds (a float) for a ``low - high`` range, or ``None`` when the
        value is not a string or does not match either form.
    """
    if not isinstance(age_str, str):
        return None  # Return None for non-string entries

    if '+' in age_str:
        return 85  # Approx for '80+'

    # Split on the bare hyphen so both '31 - 40' and '31-40' are accepted;
    # int() tolerates the whitespace left around each bound.
    lower, separator, upper = age_str.partition('-')
    if not separator:
        return None  # a single value or free text, not a range

    try:
        return (int(lower) + int(upper)) / 2
    except ValueError:
        # Only non-numeric bounds are expected here; anything else should
        # surface instead of being silently swallowed.
        return None

# Derive a numeric age for every row by running parse_age over the raw labels.
age_labels = covid_dataframe["Age"]
covid_dataframe["Age_numeric"] = age_labels.apply(parse_age)

# Keep the frame as the cell's last expression so the notebook renders it.
covid_dataframe

After parsing, we now have a numeric age column, `Age_numeric`.

In [None]:
'''
Stage 3: Filter
Find the rows that are missing a Value or an Age so they can be removed.
'''
print("Rows with missing/null values")

# True for every row in which at least one of the two columns is null.
has_missing = covid_dataframe[['Value', 'Age']].isna().any(axis="columns")
display(covid_dataframe.loc[has_missing])


We see that some rows have NaN in the `Value` or `Age` column. We need to drop (remove) them before we proceed.

In [None]:
# Keep only the rows that have both a Value and an Age.
required_columns = ["Value", "Age"]
covid_dataframe_filtered = covid_dataframe.dropna(subset=required_columns)

# Sanity check: the same null test on the filtered frame should select no rows.
still_missing = covid_dataframe_filtered[required_columns].isna().any(axis="columns")
display(covid_dataframe_filtered.loc[still_missing])

Now there are no rows with NaN in the `Value` or `Age` columns.

In [None]:
'''
Stage 4: Mine
We can now extract information from the data
'''
# Convert 'Period' to datetime so the time series is grouped and plotted by
# date rather than by raw text. assign() returns a new frame whose 'Period'
# column takes the datetime dtype; assigning through .loc[:, "Period"] writes
# into the existing column and, on pandas 2.x, may leave it as object dtype.
covid_dataframe_filtered = covid_dataframe_filtered.assign(
    Period=pd.to_datetime(covid_dataframe_filtered["Period"])
)

# Total cases by province, largest first
cases_by_province = (
    covid_dataframe_filtered.groupby("Province")["Value"]
    .sum()
    .sort_values(ascending=False)
)

# Case trends over time
cases_over_time = covid_dataframe_filtered.groupby("Period")["Value"].sum()

# Case distribution by age, largest first
age_group_cases = (
    covid_dataframe_filtered.groupby("Age")["Value"]
    .sum()
    .sort_values(ascending=False)
)

# Case distribution by gender
gender_cases = covid_dataframe_filtered.groupby("Sex")["Value"].sum()

print("cases_by_province\n", cases_by_province, "\n")
print("gender_cases\n", gender_cases, "\n")
print("cases_over_time\n", cases_over_time, "\n")
print("age_group_cases\n", age_group_cases, "\n")


In [None]:

'''
Stage 5 and 6: Represent and refine
5. Create several visualizations from the mined data so the results are easier to see.
6. Add proper labels, colors and markings so each visualization is easy to read.
'''

# Bar chart: total cases by province, with the actual value printed above each bar
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(cases_by_province.index, cases_by_province.values)
ax.set_title("Total COVID-19 Cases by Province")
ax.set_ylabel("Cases")
plt.xticks(rotation=45)

# Add value labels above each bar, centred horizontally on the bar
for rect in bars:
    bar_top = rect.get_height()
    bar_centre = rect.get_x() + rect.get_width()/2
    ax.text(bar_centre, bar_top, f'{int(bar_top):,}',
            ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

# Line chart: summed cases per period
trend_ax = cases_over_time.plot(title="COVID-19 Cases Over Time")
trend_ax.set_ylabel("Cases")
trend_ax.set_xlabel("Date")
plt.tight_layout()
plt.show()


# Bar chart: summed cases per age group (bars follow the order of age_group_cases)
age_ax = age_group_cases.plot.bar(title="Cases by Age Group")
age_ax.set_ylabel("Cases")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Pie chart of cases by gender; categories are identified by a legend rather
# than by labels drawn on the slices.
plt.figure(figsize=(6, 6))
colors = plt.get_cmap("Set2").colors  # use a pleasant color palette

# Draw the slices with percentage text only (labels=None keeps names off the pie).
# Only the wedge handles are needed afterwards, for the legend.
patches, _, _ = plt.pie(
    gender_cases,
    labels=None,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)

# Legend maps each wedge to its gender category
plt.legend(patches, gender_cases.index, title="Gender", loc="best")
plt.title("COVID-19 Cases by Gender")
plt.tight_layout()
plt.show()

In [None]:
'''
Stage 7: Interact
Create an interactive dropdown that filters the data by province.
'''
import ipywidgets as widgets
from IPython.display import display

# Dropdown listing every distinct province found in the filtered data
province_dropdown = widgets.Dropdown(
    description="Province:",
    options=covid_dataframe_filtered["Province"].unique(),
)


def show_province_data(province):
    """Display the first rows of the filtered data for the given province."""
    in_province = covid_dataframe_filtered["Province"] == province
    display(covid_dataframe_filtered.loc[in_province].head())


# Re-run show_province_data whenever the dropdown selection changes
widgets.interact(show_province_data, province=province_dropdown)