In [26]:
import numpy as np
import pandas as pd
import os

In [27]:
# Short dataset identifier; the path cell below uses it to build the
# processed output folder and the output file name.
dataset_name = "auction"

In [28]:
# Where the raw file is read from.
input_dir = './raw/'
inp_fname = 'auction_verification.csv'

# Where the processed file is written to; both the folder and the file
# are named after the dataset.
output_dir = './../../processed/' + dataset_name + '/'
outp_fname = os.path.join(output_dir, dataset_name + '.csv')


## Read Data

In [29]:
# Load the raw CSV into the working frame and preview the first rows.
data = pd.read_csv(filepath_or_buffer=os.path.join(input_dir, inp_fname))
data.head()

Unnamed: 0,process.b1.capacity,process.b2.capacity,process.b3.capacity,process.b4.capacity,property.price,property.product,property.winner,verification.result,verification.time
0,0,0,2,1,59,1,0,False,163.316667
1,0,0,2,1,59,2,0,False,200.86
2,0,0,2,1,59,4,0,False,154.888889
3,0,0,2,1,59,6,0,False,108.64
4,0,0,2,1,60,1,0,True,85.466667


In [30]:
# id_col: name of the row-identifier column (inserted below when absent).
# target_col: name of the target column (not referenced by the visible cells).
id_col, target_col = "id", "verification.result"

## Insert Id Column

In [31]:
# Prepend a sequential 0..N-1 identifier when the frame has no id column yet.
# The membership guard makes the cell safe to re-run.
if id_col not in data.columns:
    row_count = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(row_count))
    print(data.head())



   id  process.b1.capacity  process.b2.capacity  process.b3.capacity  \
0   0                    0                    0                    2   
1   1                    0                    0                    2   
2   2                    0                    0                    2   
3   3                    0                    0                    2   
4   4                    0                    0                    2   

   process.b4.capacity  property.price  property.product  property.winner  \
0                    1              59                 1                0   
1                    1              59                 2                0   
2                    1              59                 4                0   
3                    1              59                 6                0   
4                    1              60                 1                0   

   verification.result  verification.time  
0                False         163.316667  
1               

## Convert byte-string representations (e.g. `b'text'`) to plain strings

In [32]:
# Only object-dtype columns are candidates for the byte-string conversion below.
byte_string_columns = data.select_dtypes(include=['O']).columns
list(byte_string_columns)

[]

In [33]:
import ast
def convert_byte_string_repr(entry):
    """Decode a value that is the textual repr of a bytes literal.

    A string shaped like ``b'...'`` or ``b"..."`` (Python uses the double-quoted
    form when the bytes contain an apostrophe) is parsed with
    ``ast.literal_eval`` and the resulting bytes are decoded as UTF-8.

    Parameters
    ----------
    entry : object
        A single cell value of any type.

    Returns
    -------
    object
        The decoded text when ``entry`` is a valid bytes-literal repr holding
        valid UTF-8; otherwise ``entry`` itself, unchanged. The function never
        raises for malformed input.
    """
    if not isinstance(entry, str):
        return entry

    # Need at least b + opening quote + closing quote, with matching quotes.
    looks_like_bytes_repr = (
        len(entry) >= 3
        and entry[0] == "b"
        and entry[1] in ("'", '"')
        and entry[-1] == entry[1]
    )
    if not looks_like_bytes_repr:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        return entry

    # Text such as "b'a', 'b'" passes the shape check but evaluates to a tuple;
    # only genuine bytes objects can be decoded.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8: keep the original text rather than guessing.
        return entry

# Decode column by column with Series.map: DataFrame.applymap is deprecated
# since pandas 2.1, whereas this form runs on both older and newer versions.
decoded = data[byte_string_columns].apply(
    lambda col: col.map(convert_byte_string_repr)
)
# Cast to str but keep missing values missing: a bare astype(str) would turn
# NaN into the literal text 'nan', which later steps would treat as a value.
data[byte_string_columns] = decoded.astype(str).where(decoded.notna(), np.nan)
data.head()

Unnamed: 0,id,process.b1.capacity,process.b2.capacity,process.b3.capacity,process.b4.capacity,property.price,property.product,property.winner,verification.result,verification.time
0,0,0,0,2,1,59,1,0,False,163.316667
1,1,0,0,2,1,59,2,0,False,200.86
2,2,0,0,2,1,59,4,0,False,154.888889
3,3,0,0,2,1,59,6,0,False,108.64
4,4,0,0,2,1,60,1,0,True,85.466667


In [34]:
# Columns with exactly one distinct non-null value are listed, printed for
# the record, and removed from the frame.
unuseful_columns = data.columns[(data.nunique() == 1).to_numpy()].tolist()
print(unuseful_columns)
data.drop(labels=unuseful_columns, axis='columns', inplace=True)

[]


In [35]:
# Remove the verification.time column, then preview the result.
# NOTE(review): re-running this cell raises KeyError because the column is
# already gone; restart the kernel and run all cells instead.
data.drop(labels='verification.time', axis='columns', inplace=True)
data.head()


Unnamed: 0,id,process.b1.capacity,process.b2.capacity,process.b3.capacity,process.b4.capacity,property.price,property.product,property.winner,verification.result
0,0,0,0,2,1,59,1,0,False
1,1,0,0,2,1,59,2,0,False
2,2,0,0,2,1,59,4,0,False
3,3,0,0,2,1,59,6,0,False
4,4,0,0,2,1,60,1,0,True


## Convert `?` placeholders to NaN

In [36]:
# Cells that are exactly the string '?' become NaN (whole-value match only).
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File 

In [37]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# processed-data directory exists before writing (no-op when already there).
os.makedirs(output_dir, exist_ok=True)
# index=False: the row index is not written.
data.to_csv(outp_fname, index=False)