In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
dataset_name = 'pie_chart'

In [3]:
input_dir = './raw/'
inp_fname = 'pie_chart.csv'
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [4]:
data = pd.read_csv(os.path.join(input_dir, inp_fname))
data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,ag,ah,ai,aj,ak,al,am,an,ao,def
0,16.0,13.0,1.0,6.0,11.0,20.0,7.0,0.24,8.0,2.5,...,17.0,0.12,68.0,81.0,31.0,21.0,57.0,42.5,29.0,b'N'
1,2.0,7.0,0.0,0.0,7.0,10.0,4.0,0.29,4.0,2.5,...,9.0,0.17,9.0,22.0,5.0,8.0,24.0,33.33,14.0,b'Y'
2,1.0,13.0,5.0,0.0,0.0,22.0,7.0,0.37,10.0,2.2,...,23.0,0.33,38.0,53.0,19.0,19.0,21.0,0.0,19.0,b'N'
3,8.0,3.0,1.0,0.0,1.0,4.0,2.0,0.14,2.0,2.0,...,6.0,0.08,32.0,38.0,19.0,13.0,24.0,6.67,14.0,b'N'
4,1.0,5.0,2.0,1.0,1.0,6.0,3.0,0.15,2.0,3.0,...,9.0,0.13,27.0,33.0,19.0,14.0,23.0,9.52,20.0,b'N'


In [5]:
id_col = "id"
target_col = "def"

## Insert Id Column

In [6]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())


   id     a     b    c    d     e     f    g     h     i  ...    ag    ah  \
0   0  16.0  13.0  1.0  6.0  11.0  20.0  7.0  0.24   8.0  ...  17.0  0.12   
1   1   2.0   7.0  0.0  0.0   7.0  10.0  4.0  0.29   4.0  ...   9.0  0.17   
2   2   1.0  13.0  5.0  0.0   0.0  22.0  7.0  0.37  10.0  ...  23.0  0.33   
3   3   8.0   3.0  1.0  0.0   1.0   4.0  2.0  0.14   2.0  ...   6.0  0.08   
4   4   1.0   5.0  2.0  1.0   1.0   6.0  3.0  0.15   2.0  ...   9.0  0.13   

     ai    aj    ak    al    am     an    ao   def  
0  68.0  81.0  31.0  21.0  57.0  42.50  29.0  b'N'  
1   9.0  22.0   5.0   8.0  24.0  33.33  14.0  b'Y'  
2  38.0  53.0  19.0  19.0  21.0   0.00  19.0  b'N'  
3  32.0  38.0  19.0  13.0  24.0   6.67  14.0  b'N'  
4  27.0  33.0  19.0  14.0  23.0   9.52  20.0  b'N'  

[5 rows x 39 columns]


## Convert byte strings to strings

In [7]:
byte_string_columns = data.select_dtypes(include=['O']).columns
byte_string_columns.tolist()

['def']

In [8]:
import ast
def convert_byte_string_repr(entry):
    try:
        # Check if the entry looks like a byte string representation
        if isinstance(entry, str) and entry.startswith("b'") and entry.endswith("'"):
            byte_value = ast.literal_eval(entry)
            return byte_value.decode('utf-8')
    except (ValueError, SyntaxError):
        pass
    return entry  # Return the original entry if conversion fails

data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,a,b,c,d,e,f,g,h,i,...,ag,ah,ai,aj,ak,al,am,an,ao,def
0,0,16.0,13.0,1.0,6.0,11.0,20.0,7.0,0.24,8.0,...,17.0,0.12,68.0,81.0,31.0,21.0,57.0,42.5,29.0,N
1,1,2.0,7.0,0.0,0.0,7.0,10.0,4.0,0.29,4.0,...,9.0,0.17,9.0,22.0,5.0,8.0,24.0,33.33,14.0,Y
2,2,1.0,13.0,5.0,0.0,0.0,22.0,7.0,0.37,10.0,...,23.0,0.33,38.0,53.0,19.0,19.0,21.0,0.0,19.0,N
3,3,8.0,3.0,1.0,0.0,1.0,4.0,2.0,0.14,2.0,...,6.0,0.08,32.0,38.0,19.0,13.0,24.0,6.67,14.0,N
4,4,1.0,5.0,2.0,1.0,1.0,6.0,3.0,0.15,2.0,...,9.0,0.13,27.0,33.0,19.0,14.0,23.0,9.52,20.0,N


## Drop unuseful columns

In [9]:
unique_columns = [col for col in data.columns if data[col].nunique() == 1]
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [10]:
data.replace('?', np.nan, inplace=True)

## Save Main Data File

In [11]:
data.to_csv(outp_fname, index=False)