**Kaggle Dataset Link:** https://www.kaggle.com/datasets/zhonglifr/thyroid-disease-unsupervised-anomaly-detection

# Step 1: Import Libraries

In [1]:
# import pandas for reading the data
import pandas as pd

# sklearn for standardising numerical data
from sklearn.preprocessing import StandardScaler

# sklearn to create IsolationForst, OneClassSVM and KernelDensity
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import KernelDensity

# sklearn to import classification_report
from sklearn.metrics import classification_report

# numpy to import quantile
from numpy import quantile

# Step 2: Read and Explore Dataset

In [3]:
CSV_PATH = "../../../dataset/annthyroid_unsupervised_anomaly_detection.csv"
raw_df = pd.read_csv(CSV_PATH, sep=";")

raw_df.head(3)

Unnamed: 0,Age,Sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,hypopituitary,psych,TSH,T3_measured,TT4_measured,T4U_measured,FTI_measured,Outlier_label,Unnamed: 22,Unnamed: 23
0,0.45,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,61.0,6.0,23.0,87.0,26.0,o,,
1,0.61,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,29.0,15.0,61.0,96.0,64.0,o,,
2,0.16,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,29.0,19.0,58.0,103.0,56.0,o,,


# Step 3: Data Proprocessing

In [4]:
# Remove unwanted columns and make the columns names in similar format

raw_df.drop(["Unnamed: 22", "Unnamed: 23"], axis=1, inplace=True)

In [5]:
# Remove duplicate rows

raw_df.drop_duplicates(inplace=True)

In [6]:
# Check for missing values

raw_df.isna().sum()

Age                          0
Sex                          0
on_thyroxine                 0
query_on_thyroxine           0
on_antithyroid_medication    0
sick                         0
pregnant                     0
thyroid_surgery              0
I131_treatment               0
query_hypothyroid            0
query_hyperthyroid           0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
TSH                          0
T3_measured                  0
TT4_measured                 0
T4U_measured                 0
FTI_measured                 0
Outlier_label                0
dtype: int64

In [8]:
# Conver column names to lower case format

raw_df.columns = [item.strip().lower() for item in raw_df.columns]

In [10]:
raw_df.columns

Index(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
       'i131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'tsh', 't3_measured',
       'tt4_measured', 't4u_measured', 'fti_measured', 'outlier_label'],
      dtype='object')

In [12]:
# Create feature and target dataframes

target_name = 'outlier_label'
feature_names = [col_name for col_name in raw_df.columns if col_name != target_name]

features = raw_df[feature_names].copy()
target = raw_df[target_name].copy()

In [13]:
features.head(3)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,i131_treatment,query_hypothyroid,...,lithium,goitre,tumor,hypopituitary,psych,tsh,t3_measured,tt4_measured,t4u_measured,fti_measured
0,0.45,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,61.0,6.0,23.0,87.0,26.0
1,0.61,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,29.0,15.0,61.0,96.0,64.0
2,0.16,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,29.0,19.0,58.0,103.0,56.0


In [15]:
target = target.map(lambda label: -1 if label == 'o' else 1)

In [16]:
target

0      -1
1      -1
2      -1
3      -1
4      -1
       ..
6911    1
6912    1
6913    1
6914    1
6915    1
Name: outlier_label, Length: 6845, dtype: int64

In [None]:
# Check impurity as target column is already given

# If target column is not given, we assume impurity and adjust the impurity as hyperparameter to get the optimum results.

# impurity is used in IsolationForest as contamination, OneClassSVM as nu and quantile threshold in Kernel Density.

In [17]:
target.value_counts()

 1    6595
-1     250
Name: outlier_label, dtype: int64

In [18]:
impurity = (250 / (250 + 6595))

### On reading the database information on Kaggle and analysing the dataset, we form list of categorical and numerical columns

* **Categorical Columns:** 'sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery', 'i131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych'
<br><br>
* **Numerical Columns:** 'age', 'tsh', 't3_measured', 'tt4_measured', 't4u_measured', 'fti_measured'

In [19]:
# Form categorical and numerical columns
categorical_col = ['sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery', 'i131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych']
numerical_col = ['age', 'tsh', 't3_measured', 'tt4_measured', 't4u_measured', 'fti_measured']

In [20]:
# scale the numerical columns
scaler = StandardScaler()
features[numerical_col] = scaler.fit_transform(features[numerical_col])

In [21]:
# Checking the data types of columns
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6845 entries, 0 to 6915
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        6845 non-null   float64
 1   sex                        6845 non-null   float64
 2   on_thyroxine               6845 non-null   float64
 3   query_on_thyroxine         6845 non-null   float64
 4   on_antithyroid_medication  6845 non-null   float64
 5   sick                       6845 non-null   float64
 6   pregnant                   6845 non-null   float64
 7   thyroid_surgery            6845 non-null   float64
 8   i131_treatment             6845 non-null   float64
 9   query_hypothyroid          6845 non-null   float64
 10  query_hyperthyroid         6845 non-null   float64
 11  lithium                    6845 non-null   float64
 12  goitre                     6845 non-null   float64
 13  tumor                      6845 non-null   float

In [22]:
# Convert categorical column data types to category
for col in categorical_col:
    features[col] = features[col].astype('category')

In [23]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6845 entries, 0 to 6915
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   age                        6845 non-null   float64 
 1   sex                        6845 non-null   category
 2   on_thyroxine               6845 non-null   category
 3   query_on_thyroxine         6845 non-null   category
 4   on_antithyroid_medication  6845 non-null   category
 5   sick                       6845 non-null   category
 6   pregnant                   6845 non-null   category
 7   thyroid_surgery            6845 non-null   category
 8   i131_treatment             6845 non-null   category
 9   query_hypothyroid          6845 non-null   category
 10  query_hyperthyroid         6845 non-null   category
 11  lithium                    6845 non-null   category
 12  goitre                     6845 non-null   category
 13  tumor                      6845 n

# Step 4: Build Machine Learning Model

In [24]:
# Isolation Forest
IF = IsolationForest(n_estimators=300, contamination=impurity)
IF.fit(features)
IF_pred = IF.predict(features)

# OneClassSVM Forest
OC = OneClassSVM(nu=impurity)
OC.fit(features)
OC_pred = OC.predict(features)

# Kernel Density
KD = KernelDensity()
KD.fit(features)
scores = KD.score_samples(features)
threshold = quantile(scores, impurity)
KD_pred = [-1 if score < threshold else 1 for score in scores]



In [25]:
print(f"For Isolation Forest:\n{classification_report(IF_pred, target)}\n\n")
print(f"For OneClassSVM:\n{classification_report(OC_pred, target)}\n\n")
print(f"For Kernel Density:\n{classification_report(KD_pred, target)}\n\n")

For Isolation Forest:
              precision    recall  f1-score   support

          -1       0.06      0.06      0.06       250
           1       0.96      0.96      0.96      6595

    accuracy                           0.93      6845
   macro avg       0.51      0.51      0.51      6845
weighted avg       0.93      0.93      0.93      6845



For OneClassSVM:
              precision    recall  f1-score   support

          -1       0.17      0.17      0.17       252
           1       0.97      0.97      0.97      6593

    accuracy                           0.94      6845
   macro avg       0.57      0.57      0.57      6845
weighted avg       0.94      0.94      0.94      6845



For Kernel Density:
              precision    recall  f1-score   support

          -1       0.22      0.22      0.22       250
           1       0.97      0.97      0.97      6595

    accuracy                           0.94      6845
   macro avg       0.60      0.60      0.60      6845
weighted av

Kernel Density gives the best outlier estimates

# Author
Name: Shounak Deshpande <br> Email: shounak.python@gmail.com