In [2]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sb

# Import

In [3]:
# Cleaned and prepared consultation data for RW.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
rw_data = pd.read_pickle(
    filepath_or_buffer="data/rw-cleaned-prepared-dynamic-data-phase1.pickle"
)

# Cluster result of the "best clustering" determined in notebook: rw-clustering.ipynb
cluster_data = pd.read_pickle(
    filepath_or_buffer="data/dash/rw_clustering_result.pickle"
)

# Data preparation

In [4]:
# Columns of rw_data that carry the temporal and spatial dimensions.
spatio_temporal_columns = (
    # time
    ["medical_case_consultation_date_day"]
    # spatial variant 1 = health facility location
    + ["health_facility_name", "health_facility_longitude", "health_facility_latitude"]
    # spatial variant 2 = patient's origin
    + ["District", "Sector", "Cell", "Village", "longitude_village", "latitude_village"]
)

# Attach the spatial and temporal dimensions to the selected cluster result.
# Rows are matched on the index of both frames (default inner join).
spatio_temporal_cluster_data = cluster_data.merge(
    rw_data[spatio_temporal_columns],
    left_index=True,
    right_index=True,
)

# preprocess selected cluster result for spatial temporal analysis:
# add time units day, week, month, year

def getWeek(date: str) -> str:
    """Return the year-week label of a calendar date.

    Parameters
    ----------
    date : str
        Date in the format "YYYY-MM-DD", e.g. "2022-01-01".

    Returns
    -------
    str
        Label "<year>-<week>", e.g. "2022-01". The week is the "%U" week
        number (Sunday as first day of the week); days before the first
        Sunday of a year fall into week "00".

    Raises
    ------
    ValueError
        If `date` does not match the format "%Y-%m-%d".
    """
    dateObj = datetime.strptime(date, "%Y-%m-%d")
    # "%U" is always zero-padded to two digits, so no manual padding is required.
    return f"{dateObj.year}-{dateObj.strftime('%U')}"

# Extract the year ("2022"), month ("2022-01") and week ("2022-01") labels from the
# consultation day.
# note: week starts from week 0 (see getWeek)
# The day string is split once into its "-"-separated parts (vectorised) instead of building
# a tuple per row and expanding it with apply(pd.Series), which is very slow on large frames.
consultation_date_parts = spatio_temporal_cluster_data["medical_case_consultation_date_day"].str.split("-", expand=True)
spatio_temporal_cluster_data["medical_case_consultation_date_year"] = consultation_date_parts[0]
spatio_temporal_cluster_data["medical_case_consultation_date_month"] = consultation_date_parts[0] + "-" + consultation_date_parts[1]
spatio_temporal_cluster_data["medical_case_consultation_date_week"] = (
    consultation_date_parts[0] + "-" + consultation_date_parts[1] + "-" + consultation_date_parts[2]
).map(getWeek)

spatio_temporal_cluster_data

Unnamed: 0,patient_age,patient_gender,PE212 - Respiratory rate (breaths/min) - 8469,S39 - Cough - 7817,PE18 - Chest indrawing - 7811,S180 - How is the infant feeding currently? - 7516,S46 - Convulsions in present illness - 8355,BC - Axillary temperature - 7823 categorical,PE125 - Observation of movement - 8388,S96 - Unable to drink or breastfeed - 7871,...,health_facility_latitude,District,Sector,Cell,Village,longitude_village,latitude_village,medical_case_consultation_date_year,medical_case_consultation_date_month,medical_case_consultation_date_week
0,0.0,male,36.0,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,gihundwe,shagasha,karambo,28.940574,-2.497709,2021,2021-12,2021-48
0,0.0,male,36.0,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,nyakarenzo,kabuye,nyamagana,28.903823,-2.552700,2021,2021-12,2021-48
0,5.0,female,32.0,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,gihundwe,shagasha,karambo,28.940574,-2.497709,2021,2021-12,2021-48
0,5.0,female,32.0,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,nyakarenzo,kabuye,nyamagana,28.903823,-2.552700,2021,2021-12,2021-48
1,2.0,female,34.0,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,gihundwe,shagasha,karambo,28.940574,-2.497709,2021,2021-12,2021-48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46370,9.0,female,,,,,,,,,...,-2.0,nyamasheke,rangiro,jurwe,kibavu,29.142684,-2.406713,2023,2023-02,2023-05
46371,2.0,male,,No,,,No,>= 38°C,,No,...,-2.0,nyamasheke,mahembe,kagarama,mikingo,29.272525,-2.210330,2023,2023-02,2023-06
46372,5.0,male,,No,,,No,< 38 °C,,No,...,-2.0,rusizi,nyakabuye,nyabintare,barenga,29.062348,-2.558666,2023,2023-02,2023-06
46373,4.0,male,,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,nyakabuye,nyabintare,barenga,29.062348,-2.558666,2023,2023-02,2023-06


In [5]:
# Persist the prepared dataset so that the dash app can load it.
spatio_temporal_cluster_data.to_pickle(
    path="data/dash/rw-spatio-temporal-cluster-data.pickle"
)

# Select space and time unit 
... for spatio-temporal analysis and outlier detection
- space = patient's origin or hf location
- time = consultation date (day, week, month, year)

In [7]:
# Time unit of the analysis (week is the default). Alternatives:
# "medical_case_consultation_date_day", "medical_case_consultation_date_month",
# "medical_case_consultation_date_year"
time = "medical_case_consultation_date_week"

# Space unit of the analysis (patient's origin is the default).
# Alternative: ["health_facility_longitude", "health_facility_latitude"]
space = [
    "District",
    "Sector",
    "Cell",
    "Village",
    "longitude_village",
    "latitude_village",
]

# Coordinate columns belonging to the selected space unit.
# Alternative: ["health_facility_name", "health_facility_longitude", "health_facility_latitude"]
space_coordinates = [
    "longitude_village",
    "latitude_village",
]

# Spatio-temporal analysis

In [None]:
# see "rw-spatio-temporal-clustering.ipynb"

# Outlier detection for selected feature at certain space time point

In [74]:
# Display the column labels of the spatio-temporal cluster dataset.
spatio_temporal_cluster_data.columns

Index(['patient_age', 'patient_gender',
       'PE212 - Respiratory rate (breaths/min) - 8469', 'S39 - Cough - 7817',
       'PE18 - Chest indrawing - 7811',
       'S180 - How is the infant feeding currently? - 7516',
       'S46 - Convulsions in present illness - 8355',
       'BC - Axillary temperature - 7823 categorical',
       'PE125 - Observation of movement - 8388',
       'S96 - Unable to drink or breastfeed - 7871',
       'S157 - Vomiting everything - 8026',
       'OS9 - Unconscious or Lethargic (Unusually sleepy) - 7378',
       'PE63 - Stiff neck - 8391', 'PE19 - Stridor in calm child - 7812',
       'S42 - Duration of fever (days) - 7819 categorical',
       'S124 - Significant weight loss - 7539',
       'S118 - Significant hemoptysis (>1 episode) - 7941',
       'S176 - Cough duration (days) - 7731 categorical',
       'E46 - Recent close contact with somebody with TB - 7820',
       'PE220 - Identifiable source of fever? - 8399',
       'A55 - Urinary dipstick - 7648'

In [78]:
# Check whether the feature column is stored with object dtype.
# NOTE(review): the recorded output is False, so this Yes/No column is presumably not
# object-typed (e.g. pandas "category") — confirm.
spatio_temporal_cluster_data['S39 - Cough - 7817'].dtype == "O" # "O" for object = categorical or binary non-numerical feature

False

In [98]:
# Feature whose values are checked for outliers at each space-time point.
feature_to_be_checked_for_outliers = 'S39 - Cough - 7817' # example of a numeric feature: patient_age

In [99]:
# Compute z-scores of the selected feature per space point over time and flag upper outliers.
# - non-numeric feature (object, category, string, ...): z-score of the COUNT of each category
#   per space-time point, relative to the counts of that category at the same space point
# - numeric feature: z-score of the raw value relative to all values at the same space point
# is_numeric_dtype is used instead of `dtype == "O"` because non-numeric columns that are not
# object-typed (e.g. pandas "category") would otherwise be sent to the numeric mean/std branch.
if not pd.api.types.is_numeric_dtype(spatio_temporal_cluster_data[feature_to_be_checked_for_outliers]):

    # counting appearances of categories
    temp = spatio_temporal_cluster_data.copy()
    # cast to object first: a categorical column rejects values that are not yet a category
    temp[feature_to_be_checked_for_outliers] = temp[feature_to_be_checked_for_outliers].astype(object).fillna("NaN") # make NaN an additional category
    temp = temp.groupby(space+[time, feature_to_be_checked_for_outliers]).size().reset_index(name="count") # contains count of feature's category over space and time

    # calculate mean and std of the count at each space point over time
    mean_std_feature_over_time = temp.groupby(space+[feature_to_be_checked_for_outliers])["count"].agg(["mean", "std"]).reset_index()

    # prepare df to compute z-score
    z_score_feature = pd.merge(temp, mean_std_feature_over_time, on=space+[feature_to_be_checked_for_outliers])

    # compute z score
    z_score_feature["z_score"] = (z_score_feature["count"] - z_score_feature["mean"]) / z_score_feature["std"]
else: # numeric dtype
    # calculate mean and std of the feature values at each space point over time
    mean_std_feature_over_time = spatio_temporal_cluster_data.groupby(space)[feature_to_be_checked_for_outliers].agg(["mean", "std"]).reset_index()

    # prepare df to compute z-score
    z_score_feature = pd.merge(spatio_temporal_cluster_data[space+[time, feature_to_be_checked_for_outliers]], mean_std_feature_over_time, on=space)

    # compute z score
    z_score_feature["z_score"] = (z_score_feature[feature_to_be_checked_for_outliers] - z_score_feature["mean"]) / z_score_feature["std"]

# outliers? A z-score of 3 is a commonly chosen upper bound threshold.
# Groups with a single observation (std is NaN) or zero variance yield NaN z-scores and are not flagged.
z_score_feature["outlier"] = z_score_feature["z_score"] > 3

In [100]:
# visualise outliers of the selected feature over space and time
# The previous version passed category_orders built from `outlier_detection_data`, a variable
# that only exists after the map loop further down has run (NameError on a fresh kernel) and
# that referred to a "cluster" column which this frame does not contain; it is removed here.
# NOTE(review): the "stamen-terrain" tiles may require a Stadia Maps token in newer plotly
# versions — confirm that the base map still renders.
fig = px.scatter_mapbox(
    z_score_feature[z_score_feature["outlier"]],
    lon=z_score_feature.filter(like="longitude").columns[0],
    lat=z_score_feature.filter(like="latitude").columns[0],
    color=feature_to_be_checked_for_outliers,
    # the "count" column only exists when a non-numeric feature was analysed
    size="count" if "count" in z_score_feature.columns else None,
    hover_data=z_score_feature.columns,
    center=dict(lon=30, lat=-2.2), zoom=7,
    mapbox_style="stamen-terrain",
    animation_frame=time,
    title=f"Detection of outliers in '{feature_to_be_checked_for_outliers}'",
).update_layout(
    showlegend=True,
    # the colour encodes the feature, not a cluster
    legend_title_text=feature_to_be_checked_for_outliers
)
fig.show()

# Outlier detection for number of consultations per cluster at certain space time point
- detection of outliers of number of consultations in clusters at certain space time points != outbreak detection (but it sure alerts the domain experts to further investigate)
- possible approaches:
    - annie's suggestion: working with the distribution/considering the STD:
        - does she mean [HBOS](https://www.dfki.de/fileadmin/user_upload/import/6431_HBOS-poster.pdf)
    - according to this [post](https://docs.oracle.com/cd/E40248_01/epm.1112/cb_statistical/frameset.htm?ch07s02s10s01.html):
        - Median and Median Absolute Deviation Method (MAD) is best (only if the majority of observation are having the same value)
        - Median and Interquartile Deviation Method (IQD) good alternative (this is the way how boxplots show outliers)
    - [review of outlier detection methods](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=4f5844c9e7db68af7c2c5b918082636c3307cef9)
- to do proper outbreak detection, consider the following (potential future work of this project):
    - [anomaly detection for outbreak detection](https://www.researchgate.net/profile/Mohamad-Farhan-Mohamad-Mohsin/publication/281465876_A_review_on_anomaly_detection_in_disease_outbreak_detection/links/56a57bd908aeef24c58bdfa3/A-review-on-anomaly-detection-in-disease-outbreak-detection.pdf)

In [39]:
# Number of consultations per cluster at every space-time point.
consultations_per_cluster_over_space_time = (
    spatio_temporal_cluster_data
    .groupby(space + [time, "cluster"])
    .size()
    .reset_index(name="cluster_size")
)

# Share of each cluster among all consultations at the same space-time point
# (a fraction between 0 and 1).
consultations_per_cluster_over_space_time["percentage_cluster_size"] = (
    consultations_per_cluster_over_space_time["cluster_size"].div(
        consultations_per_cluster_over_space_time
        .groupby(space + [time])["cluster_size"]
        .transform("sum")
    )
)

In [40]:
# Column of consultations_per_cluster_over_space_time that is checked for outliers.
variable_to_be_checked_for_outliers = "percentage_cluster_size" #or "cluster_size"

## Outlier detection based on z-score

In [41]:
# Mean and standard deviation of the checked variable over time,
# per space point and cluster.
z_score_threshold_of_cluster_at_space = (
    consultations_per_cluster_over_space_time
    .groupby(space + ["cluster"])[variable_to_be_checked_for_outliers]
    .agg(["mean", "std"])
    .reset_index()
)

# Attach the statistics to every space-time observation and flag outliers.
z_score_outlier_detection = consultations_per_cluster_over_space_time.merge(
    z_score_threshold_of_cluster_at_space,
    on=space + ["cluster"],
)
z_score_outlier_detection["z_score"] = (
    z_score_outlier_detection[variable_to_be_checked_for_outliers]
    .sub(z_score_outlier_detection["mean"])
    .div(z_score_outlier_detection["std"])
)
# a z-score above 3 is a commonly chosen upper bound threshold
z_score_outlier_detection["outlier"] = z_score_outlier_detection["z_score"].gt(3)

## Outlier detection based on IQR (=Inter Quartile Range)

In [42]:
# Quartiles, IQR and the resulting upper bound (q3 + 1.5 * IQR) of the checked
# variable over time, per space point and cluster.
iqr_threshold_of_cluster_at_space = (
    consultations_per_cluster_over_space_time
    .groupby(space + ["cluster"])
    .agg(
        q1=(variable_to_be_checked_for_outliers, lambda values: np.percentile(values, 25, method="midpoint")),
        q3=(variable_to_be_checked_for_outliers, lambda values: np.percentile(values, 75, method="midpoint")),
    )
    .reset_index()
    .assign(
        iqr=lambda bounds: bounds["q3"] - bounds["q1"],
        upper_bound=lambda bounds: bounds["q3"] + 1.5 * bounds["iqr"],
    )
)

# Attach the bounds to every space-time observation; values above the upper bound are outliers.
iqr_outlier_detection = consultations_per_cluster_over_space_time.merge(
    iqr_threshold_of_cluster_at_space,
    on=space + ["cluster"],
)
iqr_outlier_detection["outlier"] = (
    iqr_outlier_detection[variable_to_be_checked_for_outliers]
    .gt(iqr_outlier_detection["upper_bound"])
)

## Outlier detection based on Histogram-based Outlier Score (HBOS) of distribution of "number of consultations per space and time point"
Since space and time change so does the distribution.

follow this overview: https://www.dfki.de/fileadmin/user_upload/import/6431_HBOS-poster.pdf
alternatively check out: CBLOF [5] or LDCOF [1] which can be used after K-Means

In [43]:
# Frequency of each observed "percentage_cluster_size" value per space point and cluster
# (i.e. how often that value occurs over time).
distribution_percentage_cluster_size_at_space_point_over_time = consultations_per_cluster_over_space_time.groupby(space+["cluster", "percentage_cluster_size"]).size().reset_index(name="frequence")

# TODO: inspect the distribution visually. The draft below was previously kept inside a stray
# string literal, which was evaluated and echoed as the cell output.
# fig = px.line(distribution_percentage_cluster_size_at_space_point_over_time, x="percentage_cluster_size", y="frequence", color="Village")
# fig.show()

'"fig = px.line(distribution_percentage_cluster_size_at_space_point_over_time, x="percentage_cluster_size", y="frequence", color="Village")\nfig.show()'

##  Summarize outlier detection results

In [None]:
# Parallel lists: the i-th method name labels the i-th result frame.
outlier_detection_methods, outlier_detection_datasets = map(
    list,
    zip(
        ("z-score", z_score_outlier_detection),
        ("IQR", iqr_outlier_detection),
    ),
)

## Visualize outlier detection results over space and time

### Difference in outlier detection results

In [52]:
#iqr_outlier_detection.groupby(space+[time, "cluster"])["outlier"].sum().reset_index(name="number_outliers")
# Put the z-score and IQR results side by side for every space-time point and cluster.
difference_outlier_detection_results = z_score_outlier_detection.merge(
    iqr_outlier_detection,
    on=space + [time, "cluster"],
    suffixes=("_z_score", "_iqr"),
)

# Space and time points where the two outlier detection methods disagree.
difference_outlier_detection_results.loc[
    difference_outlier_detection_results["outlier_z_score"]
    != difference_outlier_detection_results["outlier_iqr"]
]

Unnamed: 0,District,Sector,Cell,Village,longitude_village,latitude_village,medical_case_consultation_date_week,cluster,cluster_size_z_score,percentage_cluster_size_z_score,...,std,z_score,outlier_z_score,cluster_size_iqr,percentage_cluster_size_iqr,q1,q3,iqr,upper_bound,outlier_iqr
62,nyamasheke,bushekeri,buvungira,bushekeri,29.091001,-2.404073,2022-37,2,1,1.000000,...,0.273861,1.704026,False,1,1.000000,0.333333,0.500000,0.166667,0.750000,True
96,nyamasheke,bushekeri,buvungira,buvungira,29.092868,-2.417103,2022-35,3,1,0.500000,...,0.117851,0.707107,False,1,0.500000,0.416667,0.416667,0.000000,0.416667,True
101,nyamasheke,bushekeri,buvungira,buvungira,29.092868,-2.417103,2022-42,1,1,1.000000,...,0.262966,2.105608,False,1,1.000000,0.250000,0.500000,0.250000,0.875000,True
142,nyamasheke,bushekeri,buvungira,gasebeya,29.110375,-2.447158,2023-05,2,2,0.666667,...,0.175006,2.073787,False,2,0.666667,0.200000,0.333333,0.133333,0.533333,True
167,nyamasheke,bushekeri,buvungira,gisakura,29.090935,-2.434478,2022-45,1,2,1.000000,...,0.247348,2.428143,False,2,1.000000,0.250000,0.500000,0.250000,0.875000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32316,rusizi,rwimbogo,rubugu,rukombe,28.958270,-2.621800,2022-25,2,1,1.000000,...,0.471405,0.707107,False,1,1.000000,0.666667,0.666667,0.000000,0.666667,True
32335,rusizi,rwimbogo,ruganda,cyunguriro,28.989045,-2.641548,2022-45,2,1,1.000000,...,0.471405,0.707107,False,1,1.000000,0.666667,0.666667,0.000000,0.666667,True
32380,rusizi,rwimbogo,ruganda,rubuye,29.003471,-2.647140,2022-29,0,1,1.000000,...,0.353553,0.707107,False,1,1.000000,0.750000,0.750000,0.000000,0.750000,True
32411,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,2023-05,3,2,1.000000,...,0.267447,1.713737,False,2,1.000000,0.416667,0.583333,0.166667,0.833333,True


In [58]:
# comparison of number of outliers per cluster
# (columns are selected with a list: tuple-style selection on a groupby is deprecated
# and rejected by newer pandas versions)
difference_outlier_detection_results.groupby(["cluster"])[["outlier_z_score", "outlier_iqr"]].sum().reset_index()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,cluster,outlier_z_score,outlier_iqr
0,0,4,145
1,1,1,117
2,2,9,254
3,3,3,212


In [57]:
# comparison of number of outliers per cluster at time point
# (columns are selected with a list: tuple-style selection on a groupby is deprecated
# and rejected by newer pandas versions)
difference_outlier_detection_results.groupby([time, "cluster"])[["outlier_z_score", "outlier_iqr"]].sum().reset_index()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,medical_case_consultation_date_week,cluster,outlier_z_score,outlier_iqr
0,2021-48,0,0,0
1,2021-48,1,0,1
2,2021-48,2,0,2
3,2021-48,3,0,0
4,2021-49,0,0,1
...,...,...,...,...
247,2023-05,3,0,5
248,2023-06,0,0,2
249,2023-06,1,0,3
250,2023-06,2,0,10


In [55]:
# comparison of number of outliers per cluster at space point
# (columns are selected with a list: tuple-style selection on a groupby is deprecated
# and rejected by newer pandas versions)
difference_outlier_detection_results.groupby(space+["cluster"])[["outlier_z_score", "outlier_iqr"]].sum().reset_index()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,District,Sector,Cell,Village,longitude_village,latitude_village,cluster,outlier_z_score,outlier_iqr
0,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,0,0,0
1,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,1,0,0
2,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2,0,0
3,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,3,0,0
4,nyamasheke,bushekeri,buvungira,bushekeri,29.091001,-2.404073,0,0,0
...,...,...,...,...,...,...,...,...,...
3622,rusizi,rwimbogo,ruganda,rubuye,29.003471,-2.647140,3,0,0
3623,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,0,0,0
3624,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,1,0,0
3625,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,2,0,1


In [54]:
# comparison of number of outliers per cluster at space time point
# (columns are selected with a list: tuple-style selection on a groupby is deprecated
# and rejected by newer pandas versions)
difference_outlier_detection_results.groupby(space+[time, "cluster"])[["outlier_z_score", "outlier_iqr"]].sum().reset_index()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,District,Sector,Cell,Village,longitude_village,latitude_village,medical_case_consultation_date_week,cluster,outlier_z_score,outlier_iqr
0,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2022-03,1,0,0
1,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2022-35,0,0,0
2,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2022-35,1,0,0
3,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2022-36,1,0,0
4,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2022-37,1,0,0
...,...,...,...,...,...,...,...,...,...,...
32415,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,2022-52,1,0,0
32416,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,2022-52,2,0,0
32417,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,2023-04,1,0,0
32418,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,2023-05,3,0,1


### Outliers per cluster over space and time

In [73]:
# For every outlier detection method: keep only the rows flagged as outliers and
# draw them on an animated map (one frame per time unit).
# NOTE(review): the "stamen-terrain" tiles may require a Stadia Maps token in newer plotly
# versions — confirm that the base map still renders.
for outlier_detection_data, method_name in zip(outlier_detection_datasets, outlier_detection_methods):
    flagged_rows = outlier_detection_data.loc[outlier_detection_data["outlier"]]

    # first column whose name contains "longitude" / "latitude"
    longitude_column = outlier_detection_data.filter(like="longitude").columns[0]
    latitude_column = outlier_detection_data.filter(like="latitude").columns[0]

    fig = px.scatter_mapbox(
        flagged_rows,
        lon=longitude_column,
        lat=latitude_column,
        color="cluster",
        size="cluster_size",
        hover_data=outlier_detection_data.columns,
        center=dict(lon=30, lat=-2.2),
        zoom=7,
        mapbox_style="stamen-terrain",
        animation_frame=time,
        title="Detection outliers in 'consultation numbers per cluster' based on method: " + method_name,
        category_orders={"cluster": sorted(outlier_detection_data["cluster"].unique())},
    )
    fig.update_layout(showlegend=True, legend_title_text="Cluster")
    fig.show()

# Get composition of symptoms, clusters and diagnoses for a selected space-time point
Such an interactive plot will be of high utility for domain experts to further examine the outliers and thus validate them! 

In [None]:
# TODO bar chart for cluster_size/top 10 diagnoses/top 10 binary features/boxplot of continuous features/top 10 categorical features with animation frame for time