In [6]:
import numpy as np
import pandas as pd
import os

In [7]:
dataset_name = 'sick_euthyroid'

In [8]:
input_dir = './raw/'
inp_fname = 'sick_euthyroid.csv'
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [9]:
data = pd.read_csv(os.path.join(input_dir, inp_fname))
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,target
0,72.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,83.0,0.0,1.0,0.95,0.0,1.0,87.0,1.0,0.0,1
1,45.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,82.0,0.0,1.0,0.73,0.0,1.0,112.0,1.0,0.0,1
2,64.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,101.0,0.0,1.0,0.82,0.0,1.0,123.0,1.0,0.0,1
3,56.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,76.0,0.0,1.0,0.77,0.0,1.0,99.0,1.0,0.0,1
4,78.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,87.0,0.0,1.0,0.95,0.0,1.0,91.0,1.0,0.0,1


In [10]:
n_features = data.shape[1] - 1
data.columns = [f'f{i}' for i in range(n_features)] + ['target']

In [11]:
id_col = "id"
target_col = "target"

## Insert Id Column

In [12]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())


   id    f0   f1   f2   f3   f4   f5   f6   f7   f8  ...    f33  f34  f35  \
0   0  72.0  0.0  1.0  1.0  0.0  1.0  0.0  1.0  0.0  ...   83.0  0.0  1.0   
1   1  45.0  1.0  0.0  1.0  0.0  1.0  0.0  1.0  0.0  ...   82.0  0.0  1.0   
2   2  64.0  1.0  0.0  1.0  0.0  1.0  0.0  1.0  0.0  ...  101.0  0.0  1.0   
3   3  56.0  0.0  1.0  1.0  0.0  1.0  0.0  1.0  0.0  ...   76.0  0.0  1.0   
4   4  78.0  1.0  0.0  0.0  1.0  1.0  0.0  1.0  0.0  ...   87.0  0.0  1.0   

    f36  f37  f38    f39  f40  f41  target  
0  0.95  0.0  1.0   87.0  1.0  0.0       1  
1  0.73  0.0  1.0  112.0  1.0  0.0       1  
2  0.82  0.0  1.0  123.0  1.0  0.0       1  
3  0.77  0.0  1.0   99.0  1.0  0.0       1  
4  0.95  0.0  1.0   91.0  1.0  0.0       1  

[5 rows x 44 columns]


## Convert byte strings to strings

In [13]:
byte_string_columns = data.select_dtypes(include=['O']).columns
byte_string_columns.tolist()

[]

In [14]:
import ast
def convert_byte_string_repr(entry):
    try:
        # Check if the entry looks like a byte string representation
        if isinstance(entry, str) and entry.startswith("b'") and entry.endswith("'"):
            byte_value = ast.literal_eval(entry)
            return byte_value.decode('utf-8')
    except (ValueError, SyntaxError):
        pass
    return entry  # Return the original entry if conversion fails

data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f33,f34,f35,f36,f37,f38,f39,f40,f41,target
0,0,72.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,83.0,0.0,1.0,0.95,0.0,1.0,87.0,1.0,0.0,1
1,1,45.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,82.0,0.0,1.0,0.73,0.0,1.0,112.0,1.0,0.0,1
2,2,64.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,101.0,0.0,1.0,0.82,0.0,1.0,123.0,1.0,0.0,1
3,3,56.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,76.0,0.0,1.0,0.77,0.0,1.0,99.0,1.0,0.0,1
4,4,78.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,87.0,0.0,1.0,0.95,0.0,1.0,91.0,1.0,0.0,1


## Drop unuseful columns

In [15]:
unique_columns = [col for col in data.columns if data[col].nunique() == 1]
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [16]:
data.replace('?', np.nan, inplace=True)

## Save Main Data File

In [17]:
data.to_csv(outp_fname, index=False)