# 1- Importing packages:

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

# 2- Loading dataset:

In [2]:
# Path to the dataset CSV, resolved relative to the notebook's working directory.
data_path = "Covid19HDDT.csv"

In [3]:
# Read the CSV into a DataFrame and preview the first rows.
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,sex,age,country,province,city,infection_case,infection_order,elementary_school_count,kindergarten_count,university_count,...,elderly_alone_ratio,nursing_home_count,avg_temp,min_temp,max_temp,precipitation,max_wind_speed,most_wind_direction,avg_relative_humidity,label
0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,1,1,0,1,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1
2,0,2,0,0,2,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,14.0,2.0,1.0,0
3,0,1,0,0,3,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,14.0,3.0,1.0,0
4,1,3,0,0,4,3,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,14.0,4.0,1.0,1


In [4]:
# Summarize column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50729 entries, 0 to 50728
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   sex                       50729 non-null  int64  
 1   age                       50729 non-null  int64  
 2   country                   50729 non-null  int64  
 3   province                  50729 non-null  int64  
 4   city                      50729 non-null  int64  
 5   infection_case            50729 non-null  int64  
 6   infection_order           50729 non-null  float64
 7   elementary_school_count   50729 non-null  float64
 8   kindergarten_count        50729 non-null  float64
 9   university_count          50729 non-null  float64
 10  academy_ratio             50729 non-null  float64
 11  elderly_population_ratio  50729 non-null  float64
 12  elderly_alone_ratio       50729 non-null  float64
 13  nursing_home_count        50729 non-null  float64
 14  avg_te

In [5]:
# Count the distinct values in each column.
df.nunique()

sex                          2
age                          7
country                      2
province                     2
city                        19
infection_case               7
infection_order              4
elementary_school_count      2
kindergarten_count           2
university_count             2
academy_ratio                2
elderly_population_ratio     2
elderly_alone_ratio          2
nursing_home_count           2
avg_temp                     2
min_temp                     2
max_temp                     2
precipitation                2
max_wind_speed              14
most_wind_direction         17
avg_relative_humidity        2
label                        3
dtype: int64

In [10]:
# Print every column name followed by the distinct values it takes.
cols = df.columns.tolist()
for col_name in cols:
    distinct_values = df[col_name].unique()
    print(col_name, list(distinct_values))

sex [np.int64(0), np.int64(1)]
age [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]
country [np.int64(0), np.int64(1)]
province [np.int64(0), np.int64(1)]
city [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18)]
infection_case [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]
infection_order [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0)]
elementary_school_count [np.float64(0.0), np.float64(1.0)]
kindergarten_count [np.float64(0.0), np.float64(1.0)]
university_count [np.float64(0.0), np.float64(1.0)]
academy_ratio [np.float64(0.0), np.float64(1.0)]
elderly_population_ratio [np.float64(0.0), np.float64(1.0)]
elderly_alone_ratio [np.float64(0.0), np.float64(1.0)]
nursing_home_count 

## 2.1- Number of samples per class label

In [11]:
# Row counts for labels 1, 2 and 0 (in that order), looked up from value_counts().
class_1_counts, class_2_counts, class_0_counts = (
    df['label'].value_counts().get(label) for label in (1, 2, 0)
)

# Kept as a one-element list holding the (label 1, label 2, label 0) tuple.
class_label_counts = [(class_1_counts, class_2_counts, class_0_counts)]
print(class_label_counts)

[(np.int64(12891), np.int64(4000), np.int64(33838))]


# 3- Two-class HDDT:

In [12]:
class TwoClassHDDT:
    """Binary Hellinger Distance Decision Tree (HDDT).

    Splits are chosen by maximizing the Hellinger distance between the
    class-conditional branch distributions, i.e. between "fraction of the
    positive samples sent left" and "fraction of the negative samples sent
    left". Because each class is normalized by its own size, the criterion
    does not depend on the class ratio, which is the property that makes HDDT
    suitable for imbalanced data.

    Labels must be encoded as 0 (negative) and 1 (positive).

    The fitted tree is stored in ``self.tree`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``; leaves are
    plain floats holding the fraction of positive training samples that
    reached them.

    Parameters
    ----------
    max_depth : int, default 5
        Maximum number of split levels below the root.
    """

    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        self.tree = None

    def _hellinger_distance(self, p_left, p_right):
        """Hellinger distance between Bernoulli(p_left) and Bernoulli(p_right).

        The split search calls this with the class-conditional "goes left"
        rates of the positive and the negative class; the ``1 - p`` terms then
        cover the right branch, giving the two-branch HDDT criterion.
        The result lies in ``[0, sqrt(2)]``.
        """
        return np.sqrt((np.sqrt(p_left) - np.sqrt(p_right))**2 +
                       (np.sqrt(1 - p_left) - np.sqrt(1 - p_right))**2)

    def _find_best_split(self, X, y):
        """Return the ``(feature, threshold)`` with the largest Hellinger distance.

        ``X`` is a 2-D array and ``y`` a 1-D array of 0/1 labels. Samples with
        ``X[:, feature] <= threshold`` go left. Returns ``(None, None)`` when
        no valid split exists (only one class present, or every feature is
        constant).
        """
        best_feature, best_threshold = None, None
        best_distance = -1

        pos_mask = (y == 1)
        neg_mask = ~pos_mask
        n_pos = np.sum(pos_mask)
        n_neg = np.sum(neg_mask)
        # The class-conditional rates are undefined if a class is absent.
        if n_pos == 0 or n_neg == 0:
            return best_feature, best_threshold

        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                n_left = np.sum(left_mask)

                # Skip degenerate splits that leave one branch empty.
                if n_left == 0 or n_left == len(y):
                    continue

                # Fraction of each class routed to the left branch. Normalizing
                # per class (rather than per branch) is what makes the
                # criterion insensitive to class imbalance.
                pos_left_rate = np.sum(left_mask & pos_mask) / n_pos
                neg_left_rate = np.sum(left_mask & neg_mask) / n_neg

                distance = self._hellinger_distance(pos_left_rate, neg_left_rate)
                if distance > best_distance:
                    best_distance = distance
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the tree; returns a split tuple or a float leaf."""
        if depth >= self.max_depth or len(np.unique(y)) == 1:
            # Leaf node: probability of class 1. Stored as a builtin float so
            # leaf detection does not depend on the dtype of ``y``.
            return float(np.mean(y))

        feature, threshold = self._find_best_split(X, y)
        if feature is None:
            return float(np.mean(y))

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        left_subtree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return (feature, threshold, left_subtree, right_subtree)

    def fit(self, X, y):
        """Fit the tree on features ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        Returns ``self`` so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` disagree on the number of
            samples, or if no samples are given.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if y.shape[0] == 0:
            raise ValueError("Cannot fit on an empty dataset")

        self.tree = self._build_tree(X, y)
        return self

    def predict_proba(self, X):
        """Return, for each row of ``X``, the estimated probability of class 1."""
        return np.array([self._predict_single(x) for x in np.asarray(X)])

    def _predict_single(self, x, node=None):
        """Route one sample down the tree and return the leaf probability.

        ``node`` defaults to the root of the fitted tree.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        if node is None:
            node = self.tree
            if node is None:
                raise RuntimeError("TwoClassHDDT is not fitted yet; call fit() first")

        # Internal nodes are tuples; anything else is a leaf probability.
        while isinstance(node, tuple):
            feature, threshold, left, right = node
            node = left if x[feature] <= threshold else right
        return node

    def predict(self, X, threshold=0.5):
        """Return 0/1 predictions: 1 where the class-1 probability is >= ``threshold``."""
        return (self.predict_proba(X) >= threshold).astype(int)

# 4- Find the minority class label:

In [13]:
# Pick the label with the fewest rows. Counting only labels that actually occur
# avoids np.bincount's pitfall of reporting an absent label (count 0) as the
# minority when label values have gaps, and also works for non-integer labels.
# sort_index() makes ties resolve to the smallest label.
minority_label = df['label'].value_counts().sort_index().idxmin()
minority_label

np.int64(2)

In [14]:
# Feature matrix: every column except the target, as a NumPy array.
X = df.drop(columns='label').values
# Target vector: the original 'label' column as a NumPy array.
y = df['label'].values

In [15]:
# Binary target for the two-class model: 1 for the minority label, 0 for every other label.
y_bin = np.where(y==minority_label, 1, 0)
y_bin

array([0, 0, 0, ..., 1, 1, 1], shape=(50729,))

# 5- Performance evaluation:

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [18]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified. With a rare positive class, consider
# passing stratify=y_bin so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y_bin, test_size=0.3, random_state=23)

## 5.1- Train:

In [19]:
# Fit a two-class HDDT with at most 10 split levels on the training split.
model = TwoClassHDDT(max_depth=10).fit(X_train, y_train)

In [20]:
# Hard 0/1 predictions (default 0.5 cut-off) and class-1 probabilities on the test split.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

# Metrics computed from the hard predictions.
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

# AUC is threshold-free, so it is computed from the probabilities instead.
# NOTE(review): if every metric comes out as exactly 1.0 on held-out data,
# check for duplicate rows shared between train and test or a feature that
# encodes the label before trusting the result.
print("AUC-ROC:", roc_auc_score(y_test, y_proba))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-Score: 1.0
AUC-ROC: 1.0


In [21]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 and support for the binary test predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14027
           1       1.00      1.00      1.00      1192

    accuracy                           1.00     15219
   macro avg       1.00      1.00      1.00     15219
weighted avg       1.00      1.00      1.00     15219



In [24]:
# Class counts in the test split (index 0 = all other labels, index 1 = minority label).
np.bincount(y_test)

array([14027,  1192])

In [26]:
# Class counts in the training split (index 0 = all other labels, index 1 = minority label).
np.bincount(y_train)

array([32702,  2808])

# 6- OVO (One Versus One) and OVA (One Versus All):

### 6.1- One vs One

In [32]:
from itertools import combinations

class OVO_HDDT:
    """One-vs-One multiclass wrapper around ``TwoClassHDDT``.

    One binary tree is trained for every unordered pair of classes, using only
    the samples of those two classes. At prediction time each tree casts one
    vote per sample and the class with the most votes wins (ties go to the
    class that sorts first).

    Parameters
    ----------
    max_depth : int, default 5
        Depth limit passed to every pairwise ``TwoClassHDDT``.

    Attributes set by ``fit``
    -------------------------
    classes : sorted array of the distinct labels seen in ``y``.
    models  : list of ``(cls1, cls2, model)``; ``model`` predicts 1 for
              ``cls1`` and 0 for ``cls2``.
    """

    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        self.models = []

    def fit(self, X, y):
        """Train one binary tree per pair of classes and return ``self``."""
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes = np.unique(y)
        # Start from a clean slate so that refitting does not stack a second
        # set of pairwise models on top of the previous one.
        self.models = []
        for cls1, cls2 in combinations(self.classes, 2):
            mask = np.isin(y, [cls1, cls2])
            X_pair = X[mask]
            y_pair = np.where(y[mask] == cls1, 1, 0)
            model = TwoClassHDDT(max_depth=self.max_depth).fit(X_pair, y_pair)
            self.models.append((cls1, cls2, model))
        return self

    def predict(self, X):
        """Return the majority-vote class label for each row of ``X``."""
        X = np.asarray(X)
        # Map each class label to its vote column. Labels are not assumed to
        # be the integers 0..k-1, so they cannot be used as indices directly.
        column_of = {cls: col for col, cls in enumerate(self.classes)}
        votes = np.zeros((X.shape[0], len(self.classes)))
        for cls1, cls2, model in self.models:
            first_wins = np.asarray(model.predict(X)) == 1
            votes[first_wins, column_of[cls1]] += 1
            votes[~first_wins, column_of[cls2]] += 1
        # Translate the winning column back into the original class label.
        return self.classes[np.argmax(votes, axis=1)]

### 6.2- One vs All:

In [33]:
class OVA_HDDT:
    """One-vs-All multiclass wrapper around ``TwoClassHDDT``.

    One binary tree is trained per class, with that class as the positive
    label and every other class as negative. A sample is assigned to the class
    whose tree gives it the highest positive-class probability.

    Parameters
    ----------
    max_depth : int, default 5
        Depth limit passed to every per-class ``TwoClassHDDT``.

    Attributes set by ``fit``
    -------------------------
    classes : sorted array of the distinct labels seen in ``y``.
    models  : dict mapping each class label to its fitted binary tree.
    """

    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        self.models = {}

    def fit(self, X, y):
        """Train one "this class vs the rest" tree per class and return ``self``."""
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes = np.unique(y)
        # Reset so that a refit on data with fewer classes leaves no stale models.
        self.models = {}
        for cls in self.classes:
            y_binary = np.where(y == cls, 1, 0)
            model = TwoClassHDDT(max_depth=self.max_depth).fit(X, y_binary)
            self.models[cls] = model
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of per-class scores.

        Column ``i`` is the positive-class probability from the tree trained
        for ``self.classes[i]``. The trees are independent, so rows are not
        normalized and need not sum to 1.
        """
        X = np.asarray(X)
        probas = np.zeros((X.shape[0], len(self.classes)))
        for i, cls in enumerate(self.classes):
            probas[:, i] = self.models[cls].predict_proba(X)
        return probas

    def predict(self, X):
        """Return the class label with the highest score for each row of ``X``."""
        # argmax yields a column index; translate it back to the actual label
        # so that labels other than 0..k-1 are reported correctly.
        return self.classes[np.argmax(self.predict_proba(X), axis=1)]

### 6.3- Evaluate Performance:

In [34]:
# Rebuild features/target from the original multi-class label column and re-split 70/30.
X = df.drop(columns=["label"]).values
y = df["label"].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One-vs-All: one binary tree per class.
ova_model = OVA_HDDT(max_depth=5)
ova_model.fit(X_train, y_train)
y_pred_ova = ova_model.predict(X_test)
print("OVA Accuracy:", accuracy_score(y_test, y_pred_ova))

# One-vs-One: one binary tree per pair of classes.
ovo_model = OVO_HDDT(max_depth=5)
ovo_model.fit(X_train, y_train)
y_pred_ovo = ovo_model.predict(X_test)
print("OVO Accuracy:", accuracy_score(y_test, y_pred_ovo))

OVA Accuracy: 1.0
OVO Accuracy: 1.0
