In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Identifier of the dataset; reused below to build the output folder and file name.
dataset_name = "banknote_authentication"

In [3]:
# Where the raw CSV is read from.
input_dir = "./raw/"
inp_fname = "banknote authentication.csv"

# Where the processed CSV is written: one folder per dataset, file named after the dataset.
output_dir = f"./../../processed/{dataset_name}/"
outp_fname = os.path.join(output_dir, f"{dataset_name}.csv")


## Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,V1,V2,V3,V4,Class
0,3.6216,8.6661,-2.8073,-0.44699,b'1'
1,4.5459,8.1674,-2.4586,-1.4621,b'1'
2,3.866,-2.6383,1.9242,0.10645,b'1'
3,3.4566,9.5228,-4.0112,-3.5944,b'1'
4,0.32924,-4.4552,4.5718,-0.9888,b'1'


In [5]:
# Column names used by the rest of the notebook: the row identifier and the label column.
id_col, target_col = "id", "Class"

## Insert Id Column

In [6]:
# Prepend a sequential 0..N-1 identifier column, unless the data already has one
# (the guard keeps the cell safe to re-run).
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())



   id       V1      V2      V3       V4 Class
0   0  3.62160  8.6661 -2.8073 -0.44699  b'1'
1   1  4.54590  8.1674 -2.4586 -1.46210  b'1'
2   2  3.86600 -2.6383  1.9242  0.10645  b'1'
3   3  3.45660  9.5228 -4.0112 -3.59440  b'1'
4   4  0.32924 -4.4552  4.5718 -0.98880  b'1'


## Convert byte-string representations (e.g. `b'1'`) to plain strings

In [7]:
# Object-dtype columns are the candidates for holding byte-string reprs such as "b'1'".
object_typed = data.select_dtypes(include=['O'])
byte_string_columns = object_typed.columns
list(byte_string_columns)

['Class']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Turn the textual repr of a bytes literal (e.g. "b'1'") into its decoded text ("1").

    Args:
        entry: A single cell value of any type.

    Returns:
        The UTF-8 decoded string when ``entry`` is a ``str`` that parses to a
        ``bytes`` literal; otherwise ``entry`` itself, unchanged. This function
        never raises for malformed or look-alike input.
    """
    # Only strings shaped like b'...' or b"..." are candidates; repr() of bytes
    # switches to double quotes when the payload contains a single quote.
    if not isinstance(entry, str) or len(entry) < 3:
        return entry
    if not (entry.startswith(("b'", 'b"')) and entry[-1] == entry[1]):
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        return entry  # not a valid Python literal after all

    # Look-alike text can parse to something other than bytes
    # (e.g. "b'a', 'b'" evaluates to a tuple); leave such entries untouched.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        return entry  # bytes that are not valid UTF-8 stay in their original form

# DataFrame.applymap is deprecated since pandas 2.1 (renamed to DataFrame.map) and emits a
# FutureWarning; mapping each column with Series.map is the same element-wise operation
# and works on old and new pandas versions alike.
# NOTE(review): .astype(str) also turns missing values into the literal text 'nan' --
# confirm that is acceptable for datasets whose object columns contain NaN.
data[byte_string_columns] = (
    data[byte_string_columns]
    .apply(lambda column: column.map(convert_byte_string_repr))
    .astype(str)
)
data.head()

Unnamed: 0,id,V1,V2,V3,V4,Class
0,0,3.6216,8.6661,-2.8073,-0.44699,1
1,1,4.5459,8.1674,-2.4586,-1.4621,1
2,2,3.866,-2.6383,1.9242,0.10645,1
3,3,3.4566,9.5228,-4.0112,-3.5944,1
4,4,0.32924,-4.4552,4.5718,-0.9888,1


In [9]:
# Columns with exactly one distinct value carry no information: list them, then drop them in place.
unuseful_columns = [
    name for name, n_distinct in data.nunique().items() if n_distinct == 1
]
print(unuseful_columns)
data.drop(columns=unuseful_columns, inplace=True)

[]


In [10]:
# Preview the first five rows after the single-valued-column drop.
data.head(n=5)

Unnamed: 0,id,V1,V2,V3,V4,Class
0,0,3.6216,8.6661,-2.8073,-0.44699,1
1,1,4.5459,8.1674,-2.4586,-1.4621,1
2,2,3.866,-2.6383,1.9242,0.10645,1
3,3,3.4566,9.5228,-4.0112,-3.5944,1
4,4,0.32924,-4.4552,4.5718,-0.9888,1


## Convert `?` placeholders to NaN

In [11]:
# Swap every cell that is exactly '?' for NaN, modifying the frame in place.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File 

In [12]:
# DataFrame.to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise a run on a fresh checkout fails with an OSError.
os.makedirs(output_dir, exist_ok=True)
# index=False keeps the DataFrame's row index out of the written file.
data.to_csv(outp_fname, index=False)