In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Dataset identifier; reused below to build the processed output folder and file name.
dataset_name = 'QSAR_biodegradation'

In [3]:
# Folder and file name of the raw input CSV.
input_dir = './raw/'
inp_fname = 'QSAR biodegradation.csv'
# The processed file is written to a dataset-specific folder as <dataset_name>.csv.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [4]:
# Build the full path to the raw CSV, load it, and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head(5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V33,V34,V35,V36,V37,V38,V39,V40,V41,Class
0,3.919,2.6909,0.0,0.0,0.0,0.0,0.0,31.4,2.0,0.0,...,0.0,0.0,0.0,2.949,1.591,0.0,7.253,0.0,0.0,b'2'
1,4.17,2.1144,0.0,0.0,0.0,0.0,0.0,30.8,1.0,1.0,...,0.0,0.0,0.0,3.315,1.967,0.0,7.257,0.0,0.0,b'2'
2,3.932,3.2512,0.0,0.0,0.0,0.0,0.0,26.7,2.0,4.0,...,0.0,0.0,1.0,3.076,2.417,0.0,7.601,0.0,0.0,b'2'
3,3.0,2.7098,0.0,0.0,0.0,0.0,0.0,20.0,0.0,2.0,...,0.0,0.0,1.0,3.046,5.0,0.0,6.69,0.0,0.0,b'2'
4,4.236,3.3944,0.0,0.0,0.0,0.0,0.0,29.4,2.0,4.0,...,0.0,0.0,0.0,3.351,2.405,0.0,8.003,0.0,0.0,b'2'


In [5]:
# Name of the row-identifier column; it is inserted later if the data lacks it.
id_col = "id"
# Name of the target column; not referenced by the other cells visible here.
target_col = "Class"

## Insert Id Column

In [6]:
# Prepend a sequential integer id column (0..N-1) unless one already exists.
if id_col not in data:
    row_ids = np.arange(len(data))
    data.insert(loc=0, column=id_col, value=row_ids)
    print(data.head())


   id     V1      V2   V3   V4   V5   V6   V7    V8   V9  ...  V33  V34  V35  \
0   0  3.919  2.6909  0.0  0.0  0.0  0.0  0.0  31.4  2.0  ...  0.0  0.0  0.0   
1   1  4.170  2.1144  0.0  0.0  0.0  0.0  0.0  30.8  1.0  ...  0.0  0.0  0.0   
2   2  3.932  3.2512  0.0  0.0  0.0  0.0  0.0  26.7  2.0  ...  0.0  0.0  1.0   
3   3  3.000  2.7098  0.0  0.0  0.0  0.0  0.0  20.0  0.0  ...  0.0  0.0  1.0   
4   4  4.236  3.3944  0.0  0.0  0.0  0.0  0.0  29.4  2.0  ...  0.0  0.0  0.0   

     V36    V37  V38    V39  V40  V41  Class  
0  2.949  1.591  0.0  7.253  0.0  0.0   b'2'  
1  3.315  1.967  0.0  7.257  0.0  0.0   b'2'  
2  3.076  2.417  0.0  7.601  0.0  0.0   b'2'  
3  3.046  5.000  0.0  6.690  0.0  0.0   b'2'  
4  3.351  2.405  0.0  8.003  0.0  0.0   b'2'  

[5 rows x 43 columns]


## Convert byte strings to strings

In [7]:
# Object-dtype columns are the candidates that may hold byte-string representations.
byte_string_columns = data.select_dtypes(include=['O']).columns
list(byte_string_columns)

['Class']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Decode a textual bytes literal such as "b'2'" into plain text ("2").

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values that look like a Python
        bytes literal (``b'...'`` or ``b"..."``) are converted.

    Returns
    -------
    object
        The UTF-8 decoded text when ``entry`` parses to a ``bytes`` object;
        otherwise ``entry`` itself, unchanged.
    """
    # Anything that is not a string shaped like b'...' / b"..." passes through untouched.
    looks_like_bytes_literal = (
        isinstance(entry, str)
        and len(entry) >= 3
        and entry[0] == "b"
        and entry[1] in ("'", '"')
        and entry[-1] == entry[1]
    )
    if not looks_like_bytes_literal:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
        # The literal may parse to something other than bytes (e.g. "b'a', 'b'"
        # is a tuple); only real bytes objects can be decoded.
        if isinstance(byte_value, bytes):
            return byte_value.decode('utf-8')
    except (ValueError, SyntaxError):
        # Malformed literal or undecodable bytes (UnicodeDecodeError is a
        # ValueError): fall through and keep the original value.
        pass
    return entry  # Return the original entry if conversion fails

# Decode byte-string representations one column at a time. Series.map is used
# because DataFrame.applymap is deprecated in recent pandas releases.
# NOTE(review): astype(str) stringifies every value, so a missing value would
# become the text 'nan' - confirm none are expected in these columns.
for byte_col in byte_string_columns:
    data[byte_col] = data[byte_col].map(convert_byte_string_repr).astype(str)
data.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V33,V34,V35,V36,V37,V38,V39,V40,V41,Class
0,0,3.919,2.6909,0.0,0.0,0.0,0.0,0.0,31.4,2.0,...,0.0,0.0,0.0,2.949,1.591,0.0,7.253,0.0,0.0,2
1,1,4.17,2.1144,0.0,0.0,0.0,0.0,0.0,30.8,1.0,...,0.0,0.0,0.0,3.315,1.967,0.0,7.257,0.0,0.0,2
2,2,3.932,3.2512,0.0,0.0,0.0,0.0,0.0,26.7,2.0,...,0.0,0.0,1.0,3.076,2.417,0.0,7.601,0.0,0.0,2
3,3,3.0,2.7098,0.0,0.0,0.0,0.0,0.0,20.0,0.0,...,0.0,0.0,1.0,3.046,5.0,0.0,6.69,0.0,0.0,2
4,4,4.236,3.3944,0.0,0.0,0.0,0.0,0.0,29.4,2.0,...,0.0,0.0,0.0,3.351,2.405,0.0,8.003,0.0,0.0,2


## Drop columns with a single unique value

In [9]:
# Columns holding exactly one distinct (non-missing) value carry no information.
distinct_counts = data.nunique()
unique_columns = data.columns[(distinct_counts == 1).to_numpy()].tolist()
print(unique_columns)

# Remove those constant columns from the frame in place.
data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [10]:
# Treat the '?' placeholder as a missing value everywhere in the frame.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# to_csv does not create missing folders, so make sure the output directory
# exists first (a no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)
data.to_csv(outp_fname, index=False)