In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Dataset identifier; the processed output directory and file name are built from it.
dataset_name = "thyroid_disease"

In [3]:
# Where the raw CSV is read from, and where the processed CSV is written to.
input_dir = './raw/'
inp_fname = 'Thyroid Disease.csv'
output_dir = './../../processed/{}/'.format(dataset_name)
outp_fname = os.path.join(output_dir, '{}.csv'.format(dataset_name))


## Read Data

In [4]:
# Load the raw CSV and preview its first rows.
data = pd.read_csv(filepath_or_buffer=os.path.join(input_dir, inp_fname))
data.head(n=5)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,41.0,b'F',b'f',b'f',b'f',b'f',b'f',b'f',b'f',b'f',...,b't',125.0,b't',1.14,b't',109.0,b'f',,b'SVHC',b'negative'
1,23.0,b'F',b'f',b'f',b'f',b'f',b'f',b'f',b'f',b'f',...,b't',102.0,b'f',,b'f',,b'f',,b'other',b'negative'
2,46.0,b'M',b'f',b'f',b'f',b'f',b'f',b'f',b'f',b'f',...,b't',109.0,b't',0.91,b't',120.0,b'f',,b'other',b'negative'
3,70.0,b'F',b't',b'f',b'f',b'f',b'f',b'f',b'f',b'f',...,b't',175.0,b'f',,b'f',,b'f',,b'other',b'negative'
4,70.0,b'F',b'f',b'f',b'f',b'f',b'f',b'f',b'f',b'f',...,b't',61.0,b't',0.87,b't',70.0,b'f',,b'SVI',b'negative'


In [5]:
# Names of the row-identifier column and of the target column.
id_col, target_col = "id", "Class"

## Insert Id Column

In [6]:
# Add a sequential 0..N-1 identifier as the first column, unless the frame
# already has a column with that name (which keeps the cell safe to re-run).
if id_col not in data.columns:
    N = len(data.index)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id   age   sex on_thyroxine query_on_thyroxine on_antithyroid_medication  \
0   0  41.0  b'F'         b'f'               b'f'                      b'f'   
1   1  23.0  b'F'         b'f'               b'f'                      b'f'   
2   2  46.0  b'M'         b'f'               b'f'                      b'f'   
3   3  70.0  b'F'         b't'               b'f'                      b'f'   
4   4  70.0  b'F'         b'f'               b'f'                      b'f'   

   sick pregnant thyroid_surgery I131_treatment  ... TT4_measured    TT4  \
0  b'f'     b'f'            b'f'           b'f'  ...         b't'  125.0   
1  b'f'     b'f'            b'f'           b'f'  ...         b't'  102.0   
2  b'f'     b'f'            b'f'           b'f'  ...         b't'  109.0   
3  b'f'     b'f'            b'f'           b'f'  ...         b't'  175.0   
4  b'f'     b'f'            b'f'           b'f'  ...         b't'   61.0   

  T4U_measured   T4U FTI_measured    FTI TBG_measured TBG  referral_

## Convert byte strings to strings

In [7]:
# Collect the object-dtype columns; in the preview above these are the ones
# whose values are shown as b'...' text.
byte_string_columns = data.select_dtypes(include='object').columns
list(byte_string_columns)

['sex',
 'on_thyroxine',
 'query_on_thyroxine',
 'on_antithyroid_medication',
 'sick',
 'pregnant',
 'thyroid_surgery',
 'I131_treatment',
 'query_hypothyroid',
 'query_hyperthyroid',
 'lithium',
 'goitre',
 'tumor',
 'hypopituitary',
 'psych',
 'TSH_measured',
 'T3_measured',
 'TT4_measured',
 'T4U_measured',
 'FTI_measured',
 'TBG_measured',
 'referral_source',
 'Class']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Decode the textual form of a bytes literal into the string it encodes.

    Values such as "b'F'" (the ``repr`` of a ``bytes`` object that was written
    out as text) are parsed back into bytes and decoded as UTF-8, giving "F".
    Both quoting styles that ``repr`` can produce are recognised: ``b'...'``
    and ``b"..."``.

    Parameters
    ----------
    entry : object
        A single cell value. Anything that is not a ``str`` (numbers, NaN,
        None, ...) is returned untouched.

    Returns
    -------
    object
        The decoded ``str`` when ``entry`` is a well-formed bytes literal
        holding valid UTF-8; otherwise ``entry`` itself, unchanged.

    Notes
    -----
    The function never raises for malformed input: every failure path falls
    back to returning the original value.
    """
    if not isinstance(entry, str):
        return entry

    single_quoted = entry.startswith("b'") and entry.endswith("'")
    double_quoted = entry.startswith('b"') and entry.endswith('"')
    if not (single_quoted or double_quoted):
        return entry

    try:
        # literal_eval only evaluates Python literals, never arbitrary code.
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError, TypeError):
        return entry

    # The prefix/suffix test is only a heuristic: text like "b'x', 'y'" passes
    # it but evaluates to a tuple, which has no .decode().
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8: keep the original text rather than guess an encoding.
        return entry

# Decode every object column element by element. Series.map is applied per
# column because DataFrame.applymap is deprecated since pandas 2.1.
decoded_columns = data[byte_string_columns].apply(
    lambda column: column.map(convert_byte_string_repr)
)
# Normalise the present values to str but keep missing entries missing:
# a bare .astype(str) would turn NaN into the literal text 'nan', which would
# then count as a real category and would not be written out as an empty cell.
data[byte_string_columns] = decoded_columns.astype(str).where(decoded_columns.notna())
data.head()

Unnamed: 0,id,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,0,41.0,F,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,negative
1,1,23.0,F,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,negative
2,2,46.0,M,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,negative
3,3,70.0,F,t,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,negative
4,4,70.0,F,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,negative


## Drop uninformative columns (constant or entirely empty)

In [9]:
# A column is uninformative when it has at most one distinct non-missing value.
# nunique() ignores NaN, so a column that is entirely missing reports 0.
unique_columns = data.columns[data.nunique().le(1).to_numpy(dtype=bool)].tolist()
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

['TBG_measured', 'TBG']


## Convert ? to NaN

In [10]:
# Cells whose whole value is the '?' placeholder become proper missing values.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# Create the destination folder first: to_csv does not create missing parent
# directories, so on a fresh checkout the write would otherwise fail.
os.makedirs(output_dir, exist_ok=True)
# index=False: the frame already carries its own id column.
data.to_csv(outp_fname, index=False)