# **Spatio-temporal clustering for syndromic surveillance**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned and prepared RW consultation data (phase 1 file).
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted source.
rw_data = pd.read_pickle(
    "data/rw-cleaned-prepared-dynamic-data-phase1.pickle"
)

# Data selection
Select only the columns that are relevant for the spatio-temporal clustering of symptoms. The relevant columns are:
1. time = "medical_case_consultation_date"
2. space
    - hf location = "health_facility_longitude" & "health_facility_latitude"
    - location of patient's home: "BD9 - Village (Umudugudu) - 8062" (cleaned in columns: ....)
3. demographic: 
    - age = "patient_birthdate"
    - sex = "patient_gender"
    - columns whose names start with BD
    - columns whose names start with BM
    - diagnosis (= potential labels for clusters) = columns whose names start with DF (and possibly CH, A)
4. symptoms = columns whose names start with CC, PE and (S, OS, A)

In [None]:
# Patient attributes used as demographic features.
demographic_columns = ["patient_birthdate", "patient_gender"]

# When (consultation date) and where (health facility) a consultation happened.
spatio_temporal_columns = [
    "medical_case_consultation_date",
    "health_facility_longitude",
    "health_facility_latitude",
    "health_facility_name",
]

# Identifier, demographic and health-facility columns to retain.
columns_to_keep = [
    "medical_case_id",
    "medical_case_consultation_date",
    "patient_id",
    *demographic_columns,
    "health_facility_id",
    "health_facility_group_id",
    "health_facility_longitude",
    "health_facility_latitude",
    "health_facility_name",
]

In [None]:
# Select the symptom data: every column whose name begins with the
# two-character prefix "CC" or "PE".
symptom_columns = [name for name in rw_data.columns if name[:2] in ("CC", "PE")]
rw_symptoms = rw_data[symptom_columns]
rw_symptoms

# Quality control of selected data for clustering
- missingness
- low variance

## Low variance

In [None]:
# Summary statistics of the symptom columns.
# NOTE(review): this runs before the "Yes"/"No" values are mapped to numbers
# further down, so presumably only already-numeric columns contribute a "std"
# value here - confirm.
description_rw_symptoms = rw_symptoms.describe()
description_rw_symptoms

# check for low variance columns
# NOTE: despite its name, var_rw_data holds the standard deviation ("std" row
# of describe()) per column, sorted ascending, with NaN entries removed.
low_var_threshold = 10 # TODO should be different for binary columns
std_row = description_rw_symptoms.loc[["std"]]
var_rw_data = std_row.transpose().dropna().sort_values(by=["std"])

# plot low variance columns (std at or below the threshold)
is_low_variance = var_rw_data["std"] <= low_var_threshold
var_rw_data[is_low_variance].plot(kind="bar", rot=90, title="Low variance columns")

## Missingness

In [None]:
# number of nan per column, most incomplete columns first
nnan_rw_symptoms = rw_symptoms.isnull().sum(axis = 0).to_frame(name="nnan").sort_values("nnan", ascending=False)
# share of rows of rw_symptoms that are NaN in each column, in percent
nnan_rw_symptoms["nnan_share_in_%"] = nnan_rw_symptoms["nnan"] / rw_symptoms.shape[0] * 100
# bucket the shares into right-closed intervals; include_lowest puts 0 % into the first bucket
nnan_rw_symptoms["group"] = pd.cut(nnan_rw_symptoms["nnan_share_in_%"], bins=[0, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], right=True, include_lowest=True)
# "group" is categorical: observed=False keeps empty buckets (count 0) in the
# table and the plot on every pandas version, instead of relying on the
# deprecated implicit default. The result is computed once and reused.
columns_per_group = nnan_rw_symptoms.groupby("group", observed=False).size()
print(columns_per_group)
columns_per_group.plot(kind="bar", title="Number of columns per NAN value amount category")

In [None]:
# Box plot summarising the per-column share of NaN values (in percent).
plt.boxplot(nnan_rw_symptoms.loc[:, "nnan_share_in_%"])
plt.title("Share of nan values for each column")
plt.show()

In [None]:
# Histogram (matplotlib's default binning) of the per-column share of NaN
# values (in percent).
nan_share_per_column = nnan_rw_symptoms["nnan_share_in_%"]
plt.hist(nan_share_per_column)
plt.title("Histogram of share of nan values for each column")
plt.show()

In [None]:
# Symptom columns that contain no NaN at all: print their share, then list them.
has_no_nan = nnan_rw_symptoms["nnan"] == 0
complete_columns = nnan_rw_symptoms[has_no_nan]
# .size counts cells (rows x columns); both frames have the same columns, so
# the ratio equals the ratio of row counts, i.e. the share of symptom columns.
print(complete_columns.size/nnan_rw_symptoms.size*100, "%", "of columns contain no NaN values!")
complete_columns.index

In [None]:
# Report and plot the columns whose share of NaN values is at most nnan_threshold.
nnan_threshold = 20 # in %
# The NaN share must be relative to the number of consultations (rows of
# rw_symptoms), which is what "nnan_share_in_%" already holds. Dividing "nnan"
# by nnan_rw_symptoms.shape[0] would divide by the number of symptom columns.
within_threshold = nnan_rw_symptoms["nnan_share_in_%"] <= nnan_threshold
columns_within_threshold = nnan_rw_symptoms[within_threshold]
# Share of symptom columns that pass the filter (here the column count is the right denominator).
print(columns_within_threshold.shape[0] / nnan_rw_symptoms.shape[0] * 100, "%", "of columns contain less or equal than", nnan_threshold,"%", "of NaN values!")
columns_within_threshold.plot(kind="bar", rot=90, title=f"Columns with less or equal than {nnan_threshold}% of NaN values")

# Data Cleaning

In [None]:
# convert "yes"/"no" columns to floats ("Yes" -> 1.0, "No" -> 0.0)
# The mapping is element-wise, so it is applied column by column rather than
# with one Python call per consultation row; astype(float) pins the dtype.
# NOTE: Series.map with a dict turns every value that is not exactly "Yes" or
# "No" into NaN, including any other text or numeric values a column may hold.
rw_symptoms = rw_symptoms.apply(lambda column: column.map({"Yes": 1, "No": 0})).astype(float)

In [None]:
# drop columns with too many NaN values based on nnan_threshold

In [None]:
# replace NaN values with 0 = False = No (TODO how to handle missingness needs to be discussed with ML and domain expert)

# Clustering

## K-Means

In [None]:
!pip install yellowbrick

In [None]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

### Configure data for K-means


In [None]:
# Switches that control how the K-Means feature matrix `data` is built.
KMeans_with_NAN_values = True  # True: keep every symptom column and fill NaN with 0; False: drop columns containing NaN
KMeans_with_symptoms_and_demographic = True  # True: add demographic features to the symptom features

if KMeans_with_NAN_values:
    # convert NAN to 0 indicating that the symptom was not present
    data = rw_symptoms.fillna(0)
else:
    # drop columns which contain NAN values
    data = rw_symptoms.dropna(axis=1)
    print("Dropping columns with NAN, leads to", rw_symptoms.shape[1]-data.shape[1],"less columns.")
    print(data.shape[1], "columns are used for clustering.")

if KMeans_with_symptoms_and_demographic:
    # Columns are joined with pd.concat: DataFrame.append only adds rows, has
    # no `axis` argument and no longer exists in pandas 2.x.
    # K-Means needs numeric input, so the raw demographic columns are encoded:
    # age at consultation (years) from the birthdate, sex as one-hot columns.
    # Assumes both date columns are parseable as dates - TODO confirm.
    # utc=True makes naive and timezone-aware timestamps subtractable.
    consultation_date = pd.to_datetime(rw_data["medical_case_consultation_date"], utc=True)
    birthdate = pd.to_datetime(rw_data["patient_birthdate"], utc=True)
    age = ((consultation_date - birthdate).dt.days / 365.25).rename("patient_age_in_years")
    gender = pd.get_dummies(rw_data["patient_gender"], prefix="patient_gender", dtype=float)
    # NOTE(review): a missing date yields a NaN age, which KMeans rejects, and
    # age is on a much larger scale than the 0/1 symptom columns. Imputation
    # and scaling still need to be decided with the ML and domain experts.
    data = pd.concat([data, age, gender], axis=1)

In [None]:
# Finding the optimal number of clusters using the Elbow method with yellowbrick
# random_state is fixed so the elbow search is reproducible and uses the same
# seed as the final KMeans fit.
model = KMeans(n_init=10, random_state=0)
# k=(1,11) is a (start, stop) range, i.e. k = 1..10 is evaluated.
visualizer = KElbowVisualizer(model, k=(1,11))
# When this cell is re-run after the clustering cell, `data` already carries a
# "cluster" label column; exclude it so labels are never used as a feature.
visualizer.fit(data.drop(columns="cluster", errors="ignore"))
n_clusters = visualizer.elbow_value_ # optimal number of clusters (None if yellowbrick detects no elbow)
visualizer.show()

In [None]:
# Performing clustering with the optimal number of clusters
if n_clusters is None:
    # Fail with a clear message instead of an opaque parameter error from KMeans.
    raise ValueError("No elbow was detected, so n_clusters is undefined; set n_clusters manually.")

# Fit on the feature columns only: when this cell is re-run, `data` already
# contains the "cluster" column from the previous run, and it must not be
# fed back in as a feature.
features = data.drop(columns="cluster", errors="ignore")
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(features)

# Adding the column 'cluster' to the dataframe
data["cluster"] = kmeans.labels_

In [None]:
# Bar chart of how many rows of `data` fall into each cluster.
cluster_sizes = data.groupby("cluster").size()
cluster_sizes.plot(kind="bar", title="Number of consultations per cluster")