In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Dataset name; interpolated into the output directory and output file name below.
dataset_name = 'Thoracic Surgery'

In [3]:
# Raw input CSV location (relative to the current working directory).
input_dir = './raw/'
inp_fname = 'thoracic-surgery.csv'
# Processed output goes into a per-dataset folder; the CSV is named after the dataset.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [4]:
# Resolve the raw CSV path, load it into a DataFrame, and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,Class
0,b'2',2.88,2.16,b'2',b'1',b'1',b'1',b'2',b'2',b'4',b'1',b'1',b'1',b'2',b'1',60.0,b'2'
1,b'3',3.4,1.88,b'1',b'1',b'1',b'1',b'1',b'1',b'2',b'1',b'1',b'1',b'2',b'1',51.0,b'2'
2,b'3',2.76,2.08,b'2',b'1',b'1',b'1',b'2',b'1',b'1',b'1',b'1',b'1',b'2',b'1',59.0,b'2'
3,b'3',3.68,3.04,b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',54.0,b'2'
4,b'3',2.44,0.96,b'3',b'1',b'2',b'1',b'2',b'2',b'1',b'1',b'1',b'1',b'2',b'1',73.0,b'1'


In [5]:
# Name of the row-identifier column (inserted below when the data lacks one).
id_col = "id"
# Presumably the prediction-target column; it is not referenced in the cells shown here.
target_col = "Class"

## Insert Id Column

In [6]:
# Prepend a sequential integer id (0..N-1) as the first column, unless one already exists.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10   V11   V12  \
0   0  b'2'  2.88  2.16  b'2'  b'1'  b'1'  b'1'  b'2'  b'2'  b'4'  b'1'  b'1'   
1   1  b'3'  3.40  1.88  b'1'  b'1'  b'1'  b'1'  b'1'  b'1'  b'2'  b'1'  b'1'   
2   2  b'3'  2.76  2.08  b'2'  b'1'  b'1'  b'1'  b'2'  b'1'  b'1'  b'1'  b'1'   
3   3  b'3'  3.68  3.04  b'1'  b'1'  b'1'  b'1'  b'1'  b'1'  b'1'  b'1'  b'1'   
4   4  b'3'  2.44  0.96  b'3'  b'1'  b'2'  b'1'  b'2'  b'2'  b'1'  b'1'  b'1'   

    V13   V14   V15   V16 Class  
0  b'1'  b'2'  b'1'  60.0  b'2'  
1  b'1'  b'2'  b'1'  51.0  b'2'  
2  b'1'  b'2'  b'1'  59.0  b'2'  
3  b'1'  b'1'  b'1'  54.0  b'2'  
4  b'1'  b'2'  b'1'  73.0  b'1'  


## Convert byte strings to strings

In [7]:
# Every object-dtype column is treated as a candidate for b'...' decoding.
byte_string_columns = data.select_dtypes(include=[object]).columns
list(byte_string_columns)

['V1',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'Class']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Decode the textual repr of a bytes literal (e.g. ``"b'2'"``) to plain text.

    Parameters
    ----------
    entry : object
        A cell value. Only ``str`` values that look like a bytes literal
        (``b'...'`` or ``b"..."``) are converted.

    Returns
    -------
    object
        The UTF-8 decoded text when ``entry`` is the repr of a bytes literal;
        otherwise ``entry`` itself, unchanged. The function never raises on
        malformed input; it falls back to returning the original value.
    """
    # Anything that is not a string, or does not look like b'...' / b"...",
    # passes through untouched (numbers, NaN, None, ordinary text).
    if not isinstance(entry, str):
        return entry
    if not (entry.startswith(("b'", 'b"')) and entry.endswith(("'", '"'))):
        return entry

    try:
        parsed = ast.literal_eval(entry)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid Python literal after all.
        return entry

    # literal_eval may yield something other than bytes (e.g. "b'1', 'x'" is a
    # tuple); calling .decode on that would raise AttributeError.
    if not isinstance(parsed, bytes):
        return entry

    try:
        return parsed.decode('utf-8')
    except UnicodeDecodeError:
        # Bytes that are not valid UTF-8: keep the original representation.
        return entry

# Decode the b'...' reprs column by column. Series.map (through DataFrame.apply)
# is used instead of DataFrame.applymap, which is deprecated since pandas 2.1.
decoded = data[byte_string_columns].apply(
    lambda col: col.map(convert_byte_string_repr)
)
# Cast values to str, but keep missing cells as NaN instead of letting
# astype(str) turn them into the literal text 'nan'.
data[byte_string_columns] = decoded.astype(str).where(decoded.notna())
data.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,Class
0,0,2,2.88,2.16,2,1,1,1,2,2,4,1,1,1,2,1,60.0,2
1,1,3,3.4,1.88,1,1,1,1,1,1,2,1,1,1,2,1,51.0,2
2,2,3,2.76,2.08,2,1,1,1,2,1,1,1,1,1,2,1,59.0,2
3,3,3,3.68,3.04,1,1,1,1,1,1,1,1,1,1,1,1,54.0,2
4,4,3,2.44,0.96,3,1,2,1,2,2,1,1,1,1,2,1,73.0,1


## Drop uninformative (single-valued) columns

In [9]:
# Columns with exactly one distinct non-null value carry no information:
# list them, then remove them from the frame.
unique_columns = data.columns[
    (data.nunique() == 1).to_numpy()
].tolist()
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [10]:
# Treat the '?' placeholder as a missing value throughout the frame.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing.
os.makedirs(output_dir, exist_ok=True)
# index=False: the row index is not written to the file.
data.to_csv(outp_fname, index=False)