# Hypothyroid Dataset Preparation

The input features of the `hypothyroid` dataset are a mix of categorical and numerical values. We want to break it down into purely binary input features.

Columns with a lot of missing values are completely dropped. Other gaps are filled in with random values according to the distribution of the column.

The remaining numerical values are then transformed into binary values via reasonable thresholds.

In [4]:
import os

import numpy as np
import pandas as pd

In [5]:
data_dir = "../data/kliegr"
dataset_file_path = os.path.join(data_dir, "hypothyroid.csv")

dataset_df = pd.read_csv(dataset_file_path)
print(f"Dataset size: {dataset_df.shape}")
dataset_df.head()

Dataset size: (3772, 30)


Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,Class
0,41.0,F,f,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,negative
1,23.0,F,f,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,negative
3,70.0,F,t,f,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,negative


In [7]:
def count_nan_none(data):
    nan_counts = data.map(lambda x: x is np.nan).sum()
    none_counts = data.map(lambda x: x is None).sum()
    return nan_counts, none_counts

nan_counts, none_counts = count_nan_none(dataset_df)

total_entries = len(dataset_df)
total_missing = dataset_df.isnull().sum()
percentage_missing = (total_missing / total_entries) * 100

detailed_missing_report = pd.DataFrame({
    'Total Missing': total_missing,
    'Percentage Missing': percentage_missing,
    'NaN Counts': nan_counts,
    'None Counts': none_counts,
    'Data Type': dataset_df.dtypes
})

detailed_missing_report

Unnamed: 0,Total Missing,Percentage Missing,NaN Counts,None Counts,Data Type
age,1,0.026511,0,0,float64
sex,150,3.97667,150,0,object
on thyroxine,0,0.0,0,0,object
query on thyroxine,0,0.0,0,0,object
on antithyroid medication,0,0.0,0,0,object
sick,0,0.0,0,0,object
pregnant,0,0.0,0,0,object
thyroid surgery,0,0.0,0,0,object
I131 treatment,0,0.0,0,0,object
query hypothyroid,0,0.0,0,0,object


In [8]:
# dropping redundant columns and ones with too many missing values

dataset_df = dataset_df.drop(columns=['T3', 'TBG', 'TSH measured', 'T3 measured', 'TT4 measured', 'T4U measured', 'FTI measured', 'TBG measured', 'referral source'])
dataset_df = dataset_df.dropna(subset=['FTI', 'T4U', 'TT4', 'TSH'])
dataset_df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,lithium,goitre,tumor,hypopituitary,psych,TSH,TT4,T4U,FTI,Class
0,41.0,F,f,f,f,f,f,f,f,f,...,f,f,f,f,f,1.3,125.0,1.14,109.0,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,f,f,f,f,f,0.98,109.0,0.91,120.0,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,f,f,f,f,f,0.72,61.0,0.87,70.0,negative
5,18.0,F,t,f,f,f,f,f,f,f,...,f,f,f,f,f,0.03,183.0,1.3,141.0,negative
7,80.0,F,f,f,f,f,f,f,f,f,...,f,f,f,f,f,2.2,80.0,0.7,115.0,negative


In [9]:
# inserting missing age and sex values according to their distributions

age_mean = dataset_df['age'].mean()
age_std = dataset_df['age'].std()
missing_age_count = dataset_df['age'].isnull().sum()
age_replacements = np.random.normal(age_mean, age_std, size=missing_age_count).round()
dataset_df.loc[dataset_df['age'].isnull(), 'age'] = age_replacements

sex_distribution = dataset_df['sex'].dropna().value_counts(normalize=True)
missing_sex_count = dataset_df['sex'].isnull().sum()
sex_replacements = np.random.choice(sex_distribution.index, size=missing_sex_count, p=sex_distribution.values)
dataset_df.loc[dataset_df['sex'].isnull(), 'sex'] = sex_replacements

dataset_df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,lithium,goitre,tumor,hypopituitary,psych,TSH,TT4,T4U,FTI,Class
0,41.0,F,f,f,f,f,f,f,f,f,...,f,f,f,f,f,1.3,125.0,1.14,109.0,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,f,f,f,f,f,0.98,109.0,0.91,120.0,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,f,f,f,f,f,0.72,61.0,0.87,70.0,negative
5,18.0,F,t,f,f,f,f,f,f,f,...,f,f,f,f,f,0.03,183.0,1.3,141.0,negative
7,80.0,F,f,f,f,f,f,f,f,f,...,f,f,f,f,f,2.2,80.0,0.7,115.0,negative


In [10]:
# Hypothyroidism Thresholds (Values suggesting potential hypothyroidism)
age_hypo_threshold = 60     # Age threshold for hypothyroidism; above this suggests hypothyroidism
tsh_hypo_threshold = 4.5    # Elevated TSH suggesting hypothyroidism
tt4_hypo_threshold = 5.0    # Lower limit for Total T4; below this suggests hypothyroidism

# Typically for hyperthyroidism, the following thresholds may not be of any meaning
t4u_hypo_threshold = 0.8    # Lower threshold for Thyroxine-Binding Globulin; below this may suggest hypothyroidism
fti_hypo_threshold = 1.2    # Lower limit for Free Thyroxine Index; below this may suggest hypothyroidism

dataset_df['old'] = dataset_df['age'] > age_hypo_threshold
dataset_df['male'] = dataset_df['sex'] == 'M'
dataset_df['high_tsh'] = dataset_df['TSH'] > tsh_hypo_threshold
dataset_df['low_tt4'] = dataset_df['TT4'] < tt4_hypo_threshold
dataset_df['low_t4u'] = dataset_df['T4U'] < t4u_hypo_threshold
dataset_df['low_fti'] = dataset_df['FTI'] < fti_hypo_threshold
dataset_df['hypothyroid'] = dataset_df['Class'] != 'negative'

dataset_df = dataset_df.map(lambda x: False if x == 'f' else True if x == 't' else x)

dataset_df.columns = dataset_df.columns.str.replace(' ', '_')

dataset_df = dataset_df.drop(columns=['age', 'sex', 'TSH', 'TT4', 'T4U', 'FTI', 'Class'])
dataset_df.head()

Unnamed: 0,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,...,tumor,hypopituitary,psych,old,male,high_tsh,low_tt4,low_t4u,low_fti,hypothyroid
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
5,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False


In [12]:
dataset_df.to_csv(os.path.join(data_dir, "hypothyroid_bin.csv"), index=False)

In [14]:
# investigate label balance
label_counts = dataset_df['hypothyroid'].value_counts()
label_counts

hypothyroid
False    2958
True      266
Name: count, dtype: int64