In [126]:
# data processing
import polars as pl
import numpy as np

#visualisation
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [127]:
# Locations of the two input datasets, relative to the notebook.
# NOTE(review): both constants are suffixed _DIR but hold file paths (a .parquet
# and a .csv file), not directories. They are left unrenamed here because the
# loading cell refers to these exact names.
INFECTIOUS_DISEASE_DATA_AI_CLEANED_DIR = "../datasets/weekly-infectious-disease-bulletin-cases-AI-cleaned.parquet"
INFECTIOUS_DISEASE_DATA_CLEANED_DIR = "../datasets/weekly-infectious-disease-bulletin-aligned-with-ai-cleaned.csv"

In [128]:
# Load both datasets into polars DataFrames: the AI-cleaned one from Parquet and
# the original cleaned one from CSV.
# NOTE(review): per the schemas displayed below, week_start_date is read as `str`
# from the CSV but is a `date` in the Parquet file — consider parsing dates when
# reading the CSV so the two frames agree.
df_ai_cleaned = pl.read_parquet(INFECTIOUS_DISEASE_DATA_AI_CLEANED_DIR)
df_cleaned = pl.read_csv(INFECTIOUS_DISEASE_DATA_CLEANED_DIR)

In [129]:
# Preview the original cleaned dataset (polars renders a truncated head/tail view)
df_cleaned

epi_week,disease,no._of_cases,year,week,week_start_date,transmission_mode,burden_tier
str,str,i64,i64,i64,str,str,str
"""2012-W15""","""Haemophilus influenzae type b""",0,2012,15,"""2012-04-09""","""Other""","""Low"""
"""2013-W08""","""Nipah virus infection""",0,2013,8,"""2013-02-18""","""Other""","""Low"""
"""2019-W34""","""Rubella""",0,2019,34,"""2019-08-19""","""Vaccine-preventable""","""Medium"""
"""2020-W10""","""Campylobacter enteritis""",8,2020,10,"""2020-03-02""","""Other""","""High"""
"""2017-W07""","""Zika Virus Infection""",1,2017,7,"""2017-02-13""","""Other""","""Medium"""
…,…,…,…,…,…,…,…
"""2020-W07""","""Poliomyelitis""",0,2020,7,"""2020-02-10""","""Vaccine-preventable""","""Low"""
"""2015-W25""","""Dengue Haemorrhagic Fever""",0,2015,25,"""2015-06-15""","""Vector-borne""","""Medium"""
"""2012-W46""","""Dengue Fever""",87,2012,46,"""2012-11-12""","""Vector-borne""","""High"""
"""2020-W29""","""Plague""",0,2020,29,"""2020-07-13""","""Other""","""Low"""


In [130]:
# Preview the AI-cleaned dataset (polars renders a truncated head/tail view)
df_ai_cleaned

epidemiological_week,year,week,week_start_date,week_end_date,disease_name,case_count,is_outlier,transmission_mode,burden_tier
str,i32,i32,date,date,str,i64,bool,str,str
"""2018-W34""",2018,34,2018-08-20,2018-08-26,"""Mumps""",7,false,"""Vaccine-preventable""","""High"""
"""2012-W36""",2012,36,2012-09-03,2012-09-09,"""Paratyphoid""",1,false,"""Foodborne""","""Medium"""
"""2020-W10""",2020,10,2020-03-02,2020-03-08,"""Rubella""",0,false,"""Vaccine-preventable""","""Medium"""
"""2020-W50""",2020,50,2020-12-07,2020-12-13,"""Yellow Fever""",0,false,"""Other""","""Low"""
"""2019-W10""",2019,10,2019-03-04,2019-03-10,"""Campylobacter enteritis""",8,false,"""Other""","""High"""
…,…,…,…,…,…,…,…,…,…
"""2016-W01""",2016,1,2016-01-04,2016-01-10,"""Diphtheria""",0,false,"""Vaccine-preventable""","""Low"""
"""2012-W04""",2012,4,2012-01-23,2012-01-29,"""Chikungunya Fever""",0,false,"""Vector-borne""","""High"""
"""2013-W04""",2013,4,2013-01-21,2013-01-27,"""Avian Influenza""",0,false,"""Other""","""Low"""
"""2016-W48""",2016,48,2016-11-28,2016-12-04,"""Acute Viral hepatitis B""",2,false,"""Other""","""High"""


In [131]:
# Total cases per disease for both the original and the AI-processed datasets,
# largest totals first.
# Only the last expression of a cell is rendered automatically, so display() is
# called explicitly; otherwise the first table would be computed and discarded.
display(
    df_cleaned.group_by("disease").agg(pl.sum("no._of_cases")).sort("no._of_cases", descending=True),
    df_ai_cleaned.group_by("disease_name").agg(pl.sum("case_count")).sort("case_count", descending=True),
)

disease_name,case_count
str,i64
"""Hand, Foot and Mouth Disease""",235409
"""Dengue Fever""",126642
"""Salmonellosis(non-enteric feve…",16497
"""Mumps""",4213
"""Campylobacterenterosis""",2138
…,…
"""Ebola Virus Disease""",0
"""Japanese Encephalitis""",0
"""Poliomyelitis""",0
"""Nipah virus infection""",0


In [132]:
# NOTE(review): this repeats the df_cleaned preview already shown in an earlier
# cell; consider removing it if it is not needed for the narrative.
df_cleaned

epi_week,disease,no._of_cases,year,week,week_start_date,transmission_mode,burden_tier
str,str,i64,i64,i64,str,str,str
"""2012-W15""","""Haemophilus influenzae type b""",0,2012,15,"""2012-04-09""","""Other""","""Low"""
"""2013-W08""","""Nipah virus infection""",0,2013,8,"""2013-02-18""","""Other""","""Low"""
"""2019-W34""","""Rubella""",0,2019,34,"""2019-08-19""","""Vaccine-preventable""","""Medium"""
"""2020-W10""","""Campylobacter enteritis""",8,2020,10,"""2020-03-02""","""Other""","""High"""
"""2017-W07""","""Zika Virus Infection""",1,2017,7,"""2017-02-13""","""Other""","""Medium"""
…,…,…,…,…,…,…,…
"""2020-W07""","""Poliomyelitis""",0,2020,7,"""2020-02-10""","""Vaccine-preventable""","""Low"""
"""2015-W25""","""Dengue Haemorrhagic Fever""",0,2015,25,"""2015-06-15""","""Vector-borne""","""Medium"""
"""2012-W46""","""Dengue Fever""",87,2012,46,"""2012-11-12""","""Vector-borne""","""High"""
"""2020-W29""","""Plague""",0,2020,29,"""2020-07-13""","""Other""","""Low"""


# Checking if there are any discrepancies in transmission mode categorization

In [133]:
# Number of rows for every (disease, transmission mode) pair, most frequent first.
# A disease that shows up with more than one transmission mode would appear on
# several rows of this table.
(
    df_ai_cleaned
    .group_by(["disease_name", "transmission_mode"])
    .agg(pl.col("transmission_mode").count().alias("count"))
    .sort("count", descending=True)
)

disease_name,transmission_mode,count
str,str,u32
"""Pertussis""","""Vaccine-preventable""",470
"""Paratyphoid""","""Foodborne""",470
"""Encephalitis""","""Other""",470
"""Rubella""","""Vaccine-preventable""",470
"""Haemophilus influenzae type b""","""Other""",470
…,…,…
"""Botulism""","""Other""",209
"""Ebola Virus Disease""","""Other""",209
"""Campylobacter enteritis""","""Other""",209
"""Japanese Encephalitis""","""Vector-borne""",209


#### Graph helper functions

In [134]:
def create_line_graph(df: pl.DataFrame, x: str, y: str, hover_data: dict, title: str, color: str = "") -> None:
    """
    Plot a line graph of the summed y values over x.

    The dataframe is aggregated before plotting so that every x value (or every
    x/color pair when `color` is given) contributes exactly one summed y value.
    Only the grouping columns and y survive this aggregation, so `hover_data`
    can only refer to those columns.

    Args:
        df : pl.DataFrame
            -> dataframe that x, y (and color, if given) exist in
        x : str
            -> column name present in dataframe used for x-axis
        y : str
            -> column name present in dataframe whose values are summed and
               used for y-axis
        hover_data : dict
            -> columns to show on hover
        title : str
            -> title of the graph
        color : str
            -> column name to color and group lines by; an empty string
               (the default) draws a single line
    Output:
        None
        -> Shows the figure of line graph for y over x
    """
    # Aggregate data by x (and color) to avoid messy overlapping points
    group_keys = [x, color] if color else [x]

    # group_by does not preserve row order, so sort on every grouping key rather
    # than on x alone: rows sharing an x value would otherwise come back in an
    # arbitrary order, and the order in which plotly first meets each color
    # value (hence legend order and line colors) could change between runs.
    df_grouped = (
        df.group_by(group_keys)
        .agg(pl.sum(y).alias(y))
        .sort(group_keys)
    )

    fig = px.line(
        df_grouped,
        x=x,
        y=y,
        color=color if color else None,
        hover_data=hover_data,
        title=title
    )
    fig.show()

def hover_bar_chart(df : pl.DataFrame, x: str, y: str, hover_data: dict, title: str, color: str = "", barmode: str = ""):
    """
    Plot a bar chart of y over x with hover information
    Args:
        df : pl.DataFrame
            -> dataframe passed to plotly as-is (no aggregation is done here)
        x : str
            -> column name used for x-axis
        y : str
            -> column name used for y-axis
        hover_data : dict
            -> columns to show on hover
        title : str
            -> title of the graph
        color : str
            -> column name to color bars by; empty string means no coloring column
        barmode : str
            -> forwarded to px.bar as `barmode`; empty string means None is passed
    Output:
        None
        -> Shows the figure of the bar chart
    """
    # An empty string stands for "not provided"; plotly is handed None in that case
    colour_column = None if color == "" else color  # same as hue
    bar_layout = None if barmode == "" else barmode

    chart = px.bar(
        df,
        x=x,
        y=y,
        color=colour_column,
        hover_data=hover_data,
        barmode=bar_layout,
        title=title,
    )
    chart.show()

### Analyse change in disease case counts from 2012 to 2020, segmented by disease

In [135]:
# Weekly case counts per disease, plotted once for each dataset. The titles name
# the dataset so that the two figures can be told apart.
create_line_graph(df_ai_cleaned, "week_start_date", "case_count", {"week_start_date": True, "case_count": True}, "Cases from 2012 to 2020 (AI-cleaned dataset)", "disease_name")
create_line_graph(df_cleaned, "week_start_date", "no._of_cases", {"no._of_cases": True}, "Cases from 2012 to 2020 (original cleaned dataset)", "disease")

## Analyse burden tier by year and by disease

In [136]:
# Total cases per burden tier and year in the original dataset, largest totals first
grouped_values = (
    df_cleaned
    .group_by("burden_tier", "year")
    .agg(pl.col("no._of_cases").sum().alias("total_cases"))
    .sort(by="total_cases", descending=True)
)

# One group of bars per year, one bar per burden tier
hover_bar_chart(
    df=grouped_values,
    x="year",
    y="total_cases",
    hover_data={"burden_tier": True, "total_cases": True, "year": True},
    title="Burden Tier distribution",
    color="burden_tier",
    barmode="group",
)

In [137]:
# Inspect the rows for Hand, Foot and Mouth Disease in the original cleaned dataset
df_cleaned.filter(pl.col("disease") == "Hand, Foot and Mouth Disease")

epi_week,disease,no._of_cases,year,week,week_start_date,transmission_mode,burden_tier
str,str,i64,i64,i64,str,str,str
"""2019-W30""","""Hand, Foot and Mouth Disease""",0,2019,30,"""2019-07-22""","""Other""","""High"""
"""2019-W01""","""Hand, Foot and Mouth Disease""",0,2019,1,"""2018-12-31""","""Other""","""High"""
"""2018-W43""","""Hand, Foot and Mouth Disease""",561,2018,43,"""2018-10-22""","""Other""","""High"""
"""2020-W14""","""Hand, Foot and Mouth Disease""",0,2020,14,"""2020-03-30""","""Other""","""High"""
"""2018-W12""","""Hand, Foot and Mouth Disease""",983,2018,12,"""2018-03-19""","""Other""","""High"""
…,…,…,…,…,…,…,…
"""2018-W14""","""Hand, Foot and Mouth Disease""",1010,2018,14,"""2018-04-02""","""Other""","""High"""
"""2018-W26""","""Hand, Foot and Mouth Disease""",889,2018,26,"""2018-06-25""","""Other""","""High"""
"""2017-W31""","""Hand, Foot and Mouth Disease""",872,2017,31,"""2017-07-31""","""Other""","""High"""
"""2018-W47""","""Hand, Foot and Mouth Disease""",607,2018,47,"""2018-11-19""","""Other""","""High"""


In [138]:
# Total cases per (disease, burden tier) pair, computed the same way for both
# datasets and sorted with the largest totals first.
grouped_ai_values = (
    df_ai_cleaned
    .group_by(["disease_name", "burden_tier"])
    .agg(pl.sum("case_count").alias("total_cases"))
    .sort("total_cases", descending=True)
)

grouped_orig_values = (
    df_cleaned
    .group_by(["disease", "burden_tier"])
    .agg(pl.sum("no._of_cases").alias("total_cases"))
    .sort("total_cases", descending=True)
)

# The two charts show the same breakdown, so they share a title and differ only
# in the dataset named in parentheses.
hover_bar_chart(grouped_ai_values, "disease_name", "total_cases", {"total_cases": True, "disease_name": True, "burden_tier": True}, "Total Cases by Disease and Burden Tier (AI-cleaned dataset)", "burden_tier")
hover_bar_chart(grouped_orig_values, "disease", "total_cases", {"total_cases": True, "disease": True, "burden_tier": True}, "Total Cases by Disease and Burden Tier (original cleaned dataset)", "burden_tier")

## Analysing transmission mode

In [139]:
# NOTE(review): another repeat of the df_cleaned preview shown in earlier cells;
# the transmission-mode analysis announced by the heading above is not visible here.
df_cleaned

epi_week,disease,no._of_cases,year,week,week_start_date,transmission_mode,burden_tier
str,str,i64,i64,i64,str,str,str
"""2012-W15""","""Haemophilus influenzae type b""",0,2012,15,"""2012-04-09""","""Other""","""Low"""
"""2013-W08""","""Nipah virus infection""",0,2013,8,"""2013-02-18""","""Other""","""Low"""
"""2019-W34""","""Rubella""",0,2019,34,"""2019-08-19""","""Vaccine-preventable""","""Medium"""
"""2020-W10""","""Campylobacter enteritis""",8,2020,10,"""2020-03-02""","""Other""","""High"""
"""2017-W07""","""Zika Virus Infection""",1,2017,7,"""2017-02-13""","""Other""","""Medium"""
…,…,…,…,…,…,…,…
"""2020-W07""","""Poliomyelitis""",0,2020,7,"""2020-02-10""","""Vaccine-preventable""","""Low"""
"""2015-W25""","""Dengue Haemorrhagic Fever""",0,2015,25,"""2015-06-15""","""Vector-borne""","""Medium"""
"""2012-W46""","""Dengue Fever""",87,2012,46,"""2012-11-12""","""Vector-borne""","""High"""
"""2020-W29""","""Plague""",0,2020,29,"""2020-07-13""","""Other""","""Low"""
