Kaggle Dataset Link: https://www.kaggle.com/datasets/zhonglifr/thyroid-disease-unsupervised-anomaly-detection

In [52]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import KernelDensity

from numpy import quantile

from sklearn.metrics import classification_report

In [36]:
raw_df = pd.read_csv("../../../dataset/annthyroid_unsupervised_anomaly_detection.csv", sep=";")

In [37]:
# remove unnamed column which adds to noise
raw_df.drop(['Unnamed: 22', 'Unnamed: 23'], axis=1, inplace=True)

In [38]:
# format column names
column_names = raw_df.columns.tolist()
column_names = [col_name.strip().lower() for col_name in column_names]
raw_df.columns = column_names

In [39]:
raw_df.drop_duplicates(inplace=True)

In [40]:
target_name = "outlier_label"
target = raw_df["outlier_label"]
features = raw_df.drop(target_name, axis=1)

In [53]:
target.value_counts()

n    6595
o     250
Name: outlier_label, dtype: int64

In [54]:
250 / 6595

0.03790750568612585

In [41]:
# categorical cols
cat_col = []
for col in features.columns:
    val_count = features[col].value_counts()
    if len(val_count) < 10:
        cat_col.append(col)

In [42]:
# numerical cols
num_col = [col for col in features.columns.tolist() if col not in cat_col]

In [43]:
# check for missing values
features.isna().sum()

age                          0
sex                          0
on_thyroxine                 0
query_on_thyroxine           0
on_antithyroid_medication    0
sick                         0
pregnant                     0
thyroid_surgery              0
i131_treatment               0
query_hypothyroid            0
query_hyperthyroid           0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
tsh                          0
t3_measured                  0
tt4_measured                 0
t4u_measured                 0
fti_measured                 0
dtype: int64

In [46]:
# check data type
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6845 entries, 0 to 6915
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        6845 non-null   float64
 1   sex                        6845 non-null   float64
 2   on_thyroxine               6845 non-null   float64
 3   query_on_thyroxine         6845 non-null   float64
 4   on_antithyroid_medication  6845 non-null   float64
 5   sick                       6845 non-null   float64
 6   pregnant                   6845 non-null   float64
 7   thyroid_surgery            6845 non-null   float64
 8   i131_treatment             6845 non-null   float64
 9   query_hypothyroid          6845 non-null   float64
 10  query_hyperthyroid         6845 non-null   float64
 11  lithium                    6845 non-null   float64
 12  goitre                     6845 non-null   float64
 13  tumor                      6845 non-null   float

In [47]:
# scale the numeric cols
scaler = StandardScaler()
features[num_col] = scaler.fit_transform(features[num_col])

In [59]:
# Isolation Forest
IF = IsolationForest(n_estimators=200, contamination=0.04)
IF.fit(features)
pred_IF = IF.predict(features)
pred_IF = [1 if value == -1 else 0 for value in pred_IF]


# OneClassSVM Forest
OC = OneClassSVM(nu=0.04)
OC.fit(features)
pred_OC = OC.predict(features)
pred_OC = [1 if value == -1 else 0 for value in pred_OC]


# Kernel Density
KD = KernelDensity()
KD.fit(features)
scores = KD.score_samples(features)
threshold = quantile(scores, 0.05)
pred_KD = [1 if score < threshold else 0 for score in scores]



In [60]:
target = target.map(lambda val: 1 if val == 'o' else 0)

In [61]:
print(f"For Isolation Forest:\n{classification_report(pred_IF, target)}\n\n")
print(f"For OneClassSVM Forest:\n{classification_report(pred_OC, target)}\n\n")
print(f"For Kernel Density:\n{classification_report(pred_KD, target)}\n\n")

For Isolation Forest:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      6571
           1       0.08      0.08      0.08       274

    accuracy                           0.93      6845
   macro avg       0.52      0.52      0.52      6845
weighted avg       0.93      0.93      0.93      6845



For OneClassSVM Forest:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      6570
           1       0.18      0.16      0.17       275

    accuracy                           0.94      6845
   macro avg       0.57      0.56      0.57      6845
weighted avg       0.93      0.94      0.93      6845



For Kernel Density:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      6502
           1       0.23      0.17      0.20       343

    accuracy                           0.93      6845
   macro avg       0.59      0.57      0.58      6845
weig

Kernel Density gives the best outlier estimates