In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
# Dataset identifier; reused below to build the processed output folder and file name.
dataset_name = "blood_transfusion"

In [4]:
# Where the raw CSV is read from.
input_dir, inp_fname = './raw/', 'blood_transfusion.csv'

# Where the processed CSV is written to; both parts are derived from dataset_name.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(
    output_dir,
    f'{dataset_name}.csv',
)


## Read Data

In [5]:
# Build the raw file path once, load it, then preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,V1,V2,V3,V4,Class
0,2.0,50.0,12500.0,98.0,b'2'
1,0.0,13.0,3250.0,28.0,b'2'
2,1.0,16.0,4000.0,35.0,b'2'
3,2.0,20.0,5000.0,45.0,b'2'
4,1.0,24.0,6000.0,77.0,b'1'


In [6]:
# Names of the row-identifier column and the target column.
id_col, target_col = "id", "Class"

## Insert Id Column

In [7]:
# Prepend a sequential integer id column (0..N-1) unless the frame already has one.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())



   id   V1    V2       V3    V4 Class
0   0  2.0  50.0  12500.0  98.0  b'2'
1   1  0.0  13.0   3250.0  28.0  b'2'
2   2  1.0  16.0   4000.0  35.0  b'2'
3   3  2.0  20.0   5000.0  45.0  b'2'
4   4  1.0  24.0   6000.0  77.0  b'1'


## Convert byte-string literals (e.g. `b'2'`) to plain strings

In [8]:
# Object-dtype columns are the candidates that may hold b'...'-style text.
object_columns_frame = data.select_dtypes(include=['O'])
byte_string_columns = object_columns_frame.columns
list(byte_string_columns)

['Class']

In [9]:
import ast
def convert_byte_string_repr(entry):
    """Turn the text form of a bytes literal into the decoded string.

    An entry such as ``"b'2'"`` becomes ``"2"``. Both quoting styles that
    Python's ``repr`` of a bytes object can produce are recognised:
    ``b'...'`` and ``b"..."`` (the latter is used when the payload itself
    contains a single quote).

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values are inspected.

    Returns
    -------
    object
        The UTF-8 decoded text when ``entry`` is a valid bytes literal;
        otherwise ``entry`` itself, unchanged. The function never raises for
        malformed input.
    """
    if not isinstance(entry, str):
        return entry

    # Must open with b' or b" and close with the same quote character.
    if not (entry.startswith(("b'", 'b"')) and entry[-1] == entry[1]):
        return entry

    try:
        parsed = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        # Looked like a bytes literal but is not valid Python syntax.
        return entry

    # Text such as "b'a', 'c'" passes the prefix/suffix check yet evaluates to
    # a tuple; without this guard .decode would raise AttributeError.
    if not isinstance(parsed, bytes):
        return entry

    try:
        return parsed.decode('utf-8')
    except UnicodeDecodeError:
        # Payload is not valid UTF-8: keep the original text.
        return entry

# Decode every object column element-wise. The conversion goes through
# Series.map per column because DataFrame.applymap is deprecated since
# pandas 2.1 (renamed to DataFrame.map), whereas Series.map is available in
# every pandas version.
# NOTE(review): astype(str) also stringifies any missing values in these
# columns (presumably NaN -> 'nan' on object dtype) — confirm this is intended.
data[byte_string_columns] = (
    data[byte_string_columns]
    .apply(lambda column: column.map(convert_byte_string_repr))
    .astype(str)
)
data.head()

Unnamed: 0,id,V1,V2,V3,V4,Class
0,0,2.0,50.0,12500.0,98.0,2
1,1,0.0,13.0,3250.0,28.0,2
2,2,1.0,16.0,4000.0,35.0,2
3,3,2.0,20.0,5000.0,45.0,2
4,4,1.0,24.0,6000.0,77.0,1


In [10]:
# Columns holding exactly one distinct value (nunique ignores NaN by default)
# are listed, printed, and then removed from data in place.
distinct_counts = data.nunique()
unuseful_columns = distinct_counts[distinct_counts == 1].index.tolist()
print(unuseful_columns)
data.drop(columns=unuseful_columns, inplace=True)

[]


In [11]:
# Preview the first five rows of the frame at this stage.
data.head(n=5)

Unnamed: 0,id,V1,V2,V3,V4,Class
0,0,2.0,50.0,12500.0,98.0,2
1,1,0.0,13.0,3250.0,28.0,2
2,2,1.0,16.0,4000.0,35.0,2
3,3,2.0,20.0,5000.0,45.0,2
4,4,1.0,24.0,6000.0,77.0,1


## Convert `?` placeholders to NaN

In [12]:
# Cells equal to the literal '?' become NaN; data is modified in place.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File 

In [13]:
# Create the destination folder first: to_csv does not create missing parent
# directories and raises OSError when output_dir is absent.
os.makedirs(output_dir, exist_ok=True)
# index=False keeps the pandas row index out of the written file.
data.to_csv(outp_fname, index=False)