In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Short dataset identifier; it is reused to build the output folder and file name.
dataset_name: str = 'vowel'

In [3]:
# Where the raw CSV is read from (folder and file name).
input_dir, inp_fname = './raw/', 'vowel.csv'

# Where the processed CSV is written: a per-dataset folder, with the file
# named after the dataset.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [4]:
# Load the raw CSV from the input folder, then preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,TT,SpeakerNumber,Sex,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,Class
0,0,0,0,-3.639,0.418,-0.67,1.779,-0.168,1.627,-0.388,0.529,-0.874,-0.814,positive
1,0,0,0,-3.327,0.496,-0.694,1.365,-0.265,1.933,-0.363,0.51,-0.621,-0.488,negative
2,0,0,0,-2.12,0.894,-1.576,0.147,-0.707,1.559,-0.579,0.676,-0.809,-0.049,negative
3,0,0,0,-2.287,1.809,-1.498,1.012,-1.053,1.06,-0.567,0.235,-0.091,-0.795,negative
4,0,0,0,-2.598,1.938,-0.846,1.062,-1.633,0.764,0.394,-0.15,0.277,-0.396,negative


In [5]:
# Names of the row-identifier column and of the target column.
id_col, target_col = "id", "Class"

## Insert Id Column

In [6]:
# Prepend a sequential integer id (0 .. N-1) as the first column, but only
# when the frame does not already have a column with that name, so re-running
# this cell is harmless.
already_has_id = id_col in data.columns
if not already_has_id:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id  TT  SpeakerNumber  Sex     F0     F1     F2     F3     F4     F5  \
0   0   0              0    0 -3.639  0.418 -0.670  1.779 -0.168  1.627   
1   1   0              0    0 -3.327  0.496 -0.694  1.365 -0.265  1.933   
2   2   0              0    0 -2.120  0.894 -1.576  0.147 -0.707  1.559   
3   3   0              0    0 -2.287  1.809 -1.498  1.012 -1.053  1.060   
4   4   0              0    0 -2.598  1.938 -0.846  1.062 -1.633  0.764   

      F6     F7     F8     F9      Class  
0 -0.388  0.529 -0.874 -0.814   positive  
1 -0.363  0.510 -0.621 -0.488   negative  
2 -0.579  0.676 -0.809 -0.049   negative  
3 -0.567  0.235 -0.091 -0.795   negative  
4  0.394 -0.150  0.277 -0.396   negative  


## Convert byte strings to strings

In [7]:
# Collect the object-dtype columns; these are the ones checked for
# byte-string representations in the next step.
object_dtype_frame = data.select_dtypes(include=['O'])
byte_string_columns = object_dtype_frame.columns
byte_string_columns.tolist()

['Class']

In [8]:
import ast


def convert_byte_string_repr(entry):
    """Turn the textual repr of a bytes literal (e.g. ``"b'abc'"``) into ``"abc"``.

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values that look like a bytes
        literal are converted.

    Returns
    -------
    object
        The UTF-8 decoded text when ``entry`` is a string holding exactly one
        bytes literal; otherwise ``entry`` itself, unchanged. Nothing is
        raised for malformed or undecodable input -- the original value is
        handed back instead.
    """
    if not isinstance(entry, str):
        return entry

    # repr(bytes) uses b'...' by default, but switches to b"..." when the
    # payload contains a single quote, so both quote styles are accepted.
    # The closing quote must match the opening one.
    looks_like_bytes_literal = (
        len(entry) >= 3
        and entry[0] == 'b'
        and entry[1] in ("'", '"')
        and entry[-1] == entry[1]
    )
    if not looks_like_bytes_literal:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid Python literal after all.
        return entry

    # Text such as "b'a', b'c'" passes the prefix/suffix check but evaluates
    # to a tuple; calling .decode on it would raise AttributeError.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        # Bytes that are not valid UTF-8: keep the original text.
        return entry

# Decode byte-string reprs column by column with Series.map, then cast to str.
# DataFrame.applymap is deprecated in recent pandas (it emits a FutureWarning);
# Series.map gives the same element-wise result on old and new versions alike.
for column_name in byte_string_columns:
    data[column_name] = data[column_name].map(convert_byte_string_repr).astype(str)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,TT,SpeakerNumber,Sex,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,Class
0,0,0,0,0,-3.639,0.418,-0.67,1.779,-0.168,1.627,-0.388,0.529,-0.874,-0.814,positive
1,1,0,0,0,-3.327,0.496,-0.694,1.365,-0.265,1.933,-0.363,0.51,-0.621,-0.488,negative
2,2,0,0,0,-2.12,0.894,-1.576,0.147,-0.707,1.559,-0.579,0.676,-0.809,-0.049,negative
3,3,0,0,0,-2.287,1.809,-1.498,1.012,-1.053,1.06,-0.567,0.235,-0.091,-0.795,negative
4,4,0,0,0,-2.598,1.938,-0.846,1.062,-1.633,0.764,0.394,-0.15,0.277,-0.396,negative


## Drop uninformative (constant) columns

In [9]:
# Find every column that holds exactly one distinct value (nunique() does not
# count missing values) and remove those columns from the frame in place.
unique_columns = []
for col in data.columns:
    if data[col].nunique() == 1:
        unique_columns.append(col)
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [10]:
# Cells whose value is exactly '?' become NaN (modifies the frame in place).
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# Create the destination folder first: to_csv does not create missing
# directories and fails if the folder is absent. exist_ok makes this a no-op
# when the folder is already there.
os.makedirs(output_dir, exist_ok=True)
# Write the processed frame without the pandas index (the id column already
# identifies rows).
data.to_csv(outp_fname, index=False)