In [21]:
import numpy as np
import pandas as pd
import os

In [22]:
dataset_name = 'abalone_binarized'

In [23]:
input_dir = './raw/'
inp_fname = 'abalone.csv'
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [24]:
data = pd.read_csv(os.path.join(input_dir, inp_fname))
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,target
0,0.0,0.0,1.0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,-1
1,0.0,0.0,1.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,1
2,1.0,0.0,0.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,-1
3,0.0,0.0,1.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,-1
4,0.0,1.0,0.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,1


In [25]:
data.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'target'], dtype='object')

In [26]:
n_features = data.shape[1] - 1
data.columns = [f'f{i}' for i in range(n_features)] + ['target']

In [27]:
id_col = "id"
target_col = "target"

## Insert Id Column

In [28]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())


   id   f0   f1   f2     f3     f4     f5      f6      f7      f8     f9  \
0   0  0.0  0.0  1.0  0.455  0.365  0.095  0.5140  0.2245  0.1010  0.150   
1   1  0.0  0.0  1.0  0.350  0.265  0.090  0.2255  0.0995  0.0485  0.070   
2   2  1.0  0.0  0.0  0.530  0.420  0.135  0.6770  0.2565  0.1415  0.210   
3   3  0.0  0.0  1.0  0.440  0.365  0.125  0.5160  0.2155  0.1140  0.155   
4   4  0.0  1.0  0.0  0.330  0.255  0.080  0.2050  0.0895  0.0395  0.055   

   target  
0      -1  
1       1  
2      -1  
3      -1  
4       1  


## Convert byte strings to strings

In [29]:
byte_string_columns = data.select_dtypes(include=['O']).columns
byte_string_columns.tolist()

[]

In [30]:
import ast
def convert_byte_string_repr(entry):
    try:
        # Check if the entry looks like a byte string representation
        if isinstance(entry, str) and entry.startswith("b'") and entry.endswith("'"):
            byte_value = ast.literal_eval(entry)
            return byte_value.decode('utf-8')
    except (ValueError, SyntaxError):
        pass
    return entry  # Return the original entry if conversion fails

data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,target
0,0,0.0,0.0,1.0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,-1
1,1,0.0,0.0,1.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,1
2,2,1.0,0.0,0.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,-1
3,3,0.0,0.0,1.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,-1
4,4,0.0,1.0,0.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,1


## Drop unuseful columns

In [31]:
unique_columns = [col for col in data.columns if data[col].nunique() == 1]
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [32]:
data.replace('?', np.nan, inplace=True)

## Save Main Data File

In [33]:
data.to_csv(outp_fname, index=False)