In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sb

# Import

In [3]:
# Cleaned and prepared data of consultations in RW.
# Note: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
rw_data_path = "data/rw-cleaned-prepared-dynamic-data-phase1.pickle"
rw_data = pd.read_pickle(rw_data_path)

# Cluster result of the "best clustering" determined in notebook: rw-clustering.ipynb
cluster_data_path = "data/dash/rw_clustering_result.pickle"
cluster_data = pd.read_pickle(cluster_data_path)

# Data preparation

In [4]:
# Construct the spatio-temporal cluster dataset.
# Columns of rw_data that carry the temporal and spatial dimensions:
spatio_temporal_columns = [
    # time
    "medical_case_consultation_date_day",
    # spatial variant 1 = hf location
    "health_facility_name",
    "health_facility_longitude",
    "health_facility_latitude",
    # spatial variant 2 = patient's origin
    "District",
    "Sector",
    "Cell",
    "Village",
    "longitude_village",
    "latitude_village",
]

# Add the spatial and temporal dimensions to the selected cluster result by
# joining both frames on their index (inner join, which is the pd.merge default).
# NOTE(review): the displayed result repeats index 0 four times (two patients x
# two villages), which suggests that duplicate index labels on both sides are
# multiplied by this merge -- confirm that both indices are unique per consultation.
spatio_temporal_cluster_data = pd.merge(
    left=cluster_data,
    right=rw_data[spatio_temporal_columns],
    how="inner",
    left_index=True,
    right_index=True,
)

# preprocess selected cluster result for spatial temporal analysis:
# add time units day, week, month, year
from datetime import datetime

def getWeek(date):
    """Return the year and week of the year of a date as a "YYYY-WW" string.

    Parameters
    ----------
    date : str
        Date in the format "%Y-%m-%d", e.g. "2022-01-01".

    Returns
    -------
    str
        Year and two-digit week number joined by "-", e.g. "2022-01".
        The week number comes from strftime("%U"): weeks start on Sunday and
        all days before the first Sunday of the year fall into week "00".

    Raises
    ------
    ValueError
        If ``date`` does not match the format "%Y-%m-%d".
    """
    dateObj = datetime.strptime(date, "%Y-%m-%d")
    # "%U" is always zero-padded to two digits, so no manual padding is needed.
    return str(dateObj.year) + "-" + dateObj.strftime("%U")

# Extract the year ("2022"), month ("2022-01") and week ("2022-00") from the
# day of the medical case consultation (a "YYYY-MM-DD" string).
# note: week starts from week 0
# The .str accessors work on the whole column at once; this avoids building a
# per-row list of tuples, a temporary column, and the slow apply(pd.Series).
consultation_date_parts = spatio_temporal_cluster_data["medical_case_consultation_date_day"].str.split("-")
consultation_year = consultation_date_parts.str[0]
consultation_month = consultation_year + "-" + consultation_date_parts.str[1]
consultation_day = consultation_month + "-" + consultation_date_parts.str[2]

spatio_temporal_cluster_data = spatio_temporal_cluster_data.assign(
    medical_case_consultation_date_year=consultation_year,
    medical_case_consultation_date_month=consultation_month,
    medical_case_consultation_date_week=consultation_day.map(getWeek),
)

spatio_temporal_cluster_data

Unnamed: 0,patient_age,patient_gender,PE212 - Respiratory rate (breaths/min) - 8469,S39 - Cough - 7817,PE18 - Chest indrawing - 7811,S180 - How is the infant feeding currently? - 7516,S46 - Convulsions in present illness - 8355,BC - Axillary temperature - 7823 categorical,PE125 - Observation of movement - 8388,S96 - Unable to drink or breastfeed - 7871,...,health_facility_latitude,District,Sector,Cell,Village,longitude_village,latitude_village,medical_case_consultation_date_year,medical_case_consultation_date_month,medical_case_consultation_date_week
0,0.0,male,36.0,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,gihundwe,shagasha,karambo,28.940574,-2.497709,2021,2021-12,2021-48
0,0.0,male,36.0,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,nyakarenzo,kabuye,nyamagana,28.903823,-2.552700,2021,2021-12,2021-48
0,5.0,female,32.0,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,gihundwe,shagasha,karambo,28.940574,-2.497709,2021,2021-12,2021-48
0,5.0,female,32.0,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,nyakarenzo,kabuye,nyamagana,28.903823,-2.552700,2021,2021-12,2021-48
1,2.0,female,34.0,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,gihundwe,shagasha,karambo,28.940574,-2.497709,2021,2021-12,2021-48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46370,9.0,female,,,,,,,,,...,-2.0,nyamasheke,rangiro,jurwe,kibavu,29.142684,-2.406713,2023,2023-02,2023-05
46371,2.0,male,,No,,,No,>= 38°C,,No,...,-2.0,nyamasheke,mahembe,kagarama,mikingo,29.272525,-2.210330,2023,2023-02,2023-06
46372,5.0,male,,No,,,No,< 38 °C,,No,...,-2.0,rusizi,nyakabuye,nyabintare,barenga,29.062348,-2.558666,2023,2023-02,2023-06
46373,4.0,male,,Yes,No,,No,< 38 °C,,No,...,-2.0,rusizi,nyakabuye,nyabintare,barenga,29.062348,-2.558666,2023,2023-02,2023-06


In [5]:
# Store the spatio-temporal cluster dataset for dash
dash_pickle_path = "data/dash/rw-spatio-temporal-cluster-data.pickle"
spatio_temporal_cluster_data.to_pickle(dash_pickle_path)

# Select space and time unit 
... for spatio-temporal analysis and outlier detection
- space = patient's origin or hf location
- time = consultation date (day, week, month, year)

In [7]:
# Time unit used for the analysis (week is default).
# Alternatives: "medical_case_consultation_date_day",
#               "medical_case_consultation_date_month",
#               "medical_case_consultation_date_year"
time = "medical_case_consultation_date_week"

# Space unit used for the analysis (patient's origin is default).
# Alternative (hf location): ["health_facility_longitude", "health_facility_latitude"]
space = [
    "District",
    "Sector",
    "Cell",
    "Village",
    "longitude_village",
    "latitude_village",
]

# Coordinate columns of the selected space unit.
# Alternative (hf location): ["health_facility_name", "health_facility_longitude", "health_facility_latitude"]
# NOTE(review): the two hf alternatives look swapped -- by analogy with the
# defaults, "health_facility_name" presumably belongs to `space` and not to
# `space_coordinates`; confirm before switching.
space_coordinates = [
    "longitude_village",
    "latitude_village",
]

# Spatio-temporal analysis

In [None]:
# see "rw-spatio-temporal-clustering.ipynb"

# Outlier detection (for number of consultations per cluster at certain space time)
- detecting outliers in the number of consultations per cluster at certain space-time points is not the same as outbreak detection (but it does alert the domain experts to investigate further)
- possible approaches:
    - Annie's suggestion: working with the distribution / considering the standard deviation (STD):
        - does she mean [HBOS](https://www.dfki.de/fileadmin/user_upload/import/6431_HBOS-poster.pdf)?
    - according to this [post](https://docs.oracle.com/cd/E40248_01/epm.1112/cb_statistical/frameset.htm?ch07s02s10s01.html):
        - Median and Median Absolute Deviation Method (MAD) is best (only if the majority of observations have the same value)
        - Median and Interquartile Deviation Method (IQD) good alternative (this is the way how boxplots show outliers)
    - [review of outlier detection methods](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=4f5844c9e7db68af7c2c5b918082636c3307cef9)
- to do proper outbreak detection, consider the following (potential future work of this project)
    - [anomaly detection for outbreak detection](https://www.researchgate.net/profile/Mohamad-Farhan-Mohamad-Mohsin/publication/281465876_A_review_on_anomaly_detection_in_disease_outbreak_detection/links/56a57bd908aeef24c58bdfa3/A-review-on-anomaly-detection-in-disease-outbreak-detection.pdf)

In [13]:
# summarize cluster data over space and time
consultations_per_cluster_over_space_time = spatio_temporal_cluster_data.groupby(space+[time, "cluster"]).size().reset_index(name="cluster_size")

# compute percentage of consultations per cluster over space and time
consultations_per_cluster_over_space_time["percentage_cluster_size"] = consultations_per_cluster_over_space_time["cluster_size"] / consultations_per_cluster_over_space_time.groupby(space+[time])["cluster_size"].transform('sum')
consultations_per_cluster_over_space_time 

Unnamed: 0,District,Sector,Cell,Village,longitude_village,latitude_village,medical_case_consultation_date_week,cluster,cluster_size,percentage_cluster_size
0,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2022-03,1,1,1.000000
1,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2022-35,0,2,0.666667
2,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2022-35,1,1,0.333333
3,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2022-36,1,1,1.000000
4,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,2022-37,1,2,1.000000
...,...,...,...,...,...,...,...,...,...,...
32415,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,2022-52,1,1,0.333333
32416,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,2022-52,2,2,0.666667
32417,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,2023-04,1,3,1.000000
32418,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,2023-05,3,2,1.000000


## Outlier detection based on Histogram-based Outlier Score (HBOS) of distribution of "number of consultations per space and time point"
Since space and time change, so does the distribution.

follow this overview: https://www.dfki.de/fileadmin/user_upload/import/6431_HBOS-poster.pdf
alternatively check out: CBLOF [5] or LDCOF [1] which can be used after K-Means

In [22]:
# For every space point and cluster, count how many time points show each
# observed value of percentage_cluster_size (stored in column "frequence").
distribution_keys = space + ["cluster", "percentage_cluster_size"]
distribution_percentage_cluster_size_at_space_point_over_time = (
    consultations_per_cluster_over_space_time
    .groupby(distribution_keys)
    .size()
    .reset_index(name="frequence")
)
distribution_percentage_cluster_size_at_space_point_over_time

Unnamed: 0,District,Sector,Cell,Village,longitude_village,latitude_village,cluster,percentage_cluster_size,frequence
0,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,0,0.250000,1
1,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,0,0.333333,3
2,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,0,0.500000,5
3,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,0,0.555556,1
4,nyamasheke,bushekeri,buvungira,buhinga,29.080169,-2.418294,0,0.625000,1
...,...,...,...,...,...,...,...,...,...
13171,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,3,0.250000,1
13172,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,3,0.333333,1
13173,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,3,0.500000,2
13174,rusizi,rwimbogo,ruganda,ruhinga,28.984867,-2.648469,3,0.666667,1


In [23]:
# Frequency distribution of the cluster share, one line per village.
# The frame holds one series per (village, cluster). Without splitting by
# cluster, all clusters of a village are joined into a single zig-zag line,
# so every cluster gets its own panel.
# NOTE(review): traces are keyed by village name only; villages that share a
# name across different cells would still be drawn as one line -- confirm
# whether names are unique.
fig = px.line(
    distribution_percentage_cluster_size_at_space_point_over_time,
    x="percentage_cluster_size",
    y="frequence",
    color="Village",
    facet_col="cluster",
    labels={
        "percentage_cluster_size": "share of consultations in cluster",
        "frequence": "number of time points",
    },
    title="Distribution of the cluster share per village over time",
)
fig.show()

In [None]:
def outlier_detector(space, time):
    """Placeholder for the outlier detection; not implemented yet.

    Parameters
    ----------
    space
        Currently unused. Presumably the space unit columns, like the
        notebook-level ``space`` selection -- TODO confirm.
    time
        Currently unused. Presumably the time unit column, like the
        notebook-level ``time`` selection -- TODO confirm.

    Returns
    -------
    str
        Always the empty string.
    """
    # TODO: implement the actual outlier detection
    placeholder_result = ""
    return placeholder_result