In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Short identifier for this dataset; the output folder and output file name are derived from it.
dataset_name = "letter_img"

In [3]:
# Folder and file name of the raw input CSV.
input_dir, inp_fname = './raw/', 'letter_img.csv'

# Destination folder and full CSV path for the processed dataset,
# both derived from dataset_name.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview its first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,target
0,2.0,8.0,3.0,5.0,1.0,8.0,13.0,0.0,6.0,6.0,10.0,8.0,0.0,8.0,0.0,8.0,-1
1,5.0,12.0,3.0,7.0,2.0,10.0,5.0,5.0,4.0,13.0,3.0,9.0,2.0,8.0,4.0,10.0,-1
2,4.0,11.0,6.0,8.0,6.0,10.0,6.0,2.0,6.0,10.0,3.0,7.0,3.0,7.0,3.0,9.0,-1
3,7.0,11.0,6.0,6.0,3.0,5.0,9.0,4.0,6.0,4.0,4.0,10.0,6.0,10.0,2.0,8.0,-1
4,2.0,1.0,3.0,1.0,1.0,8.0,6.0,6.0,6.0,6.0,5.0,9.0,1.0,7.0,5.0,10.0,-1


In [5]:
# Rename columns by position: every column except the last becomes f0, f1, ...,
# and the last column is named 'target'.
# NOTE(review): assumes the label is the final column of the raw CSV — confirm.
n_features = len(data.columns) - 1
feature_names = [f'f{idx}' for idx in range(n_features)]
data.columns = [*feature_names, 'target']

In [6]:
# Column names used for the row identifier and for the label.
id_col, target_col = "id", "target"

## Insert Id Column

In [7]:
# Prepend a sequential integer id (0 .. number of rows - 1) as the first column,
# unless a column with that name is already present.
if id_col not in data.columns:
    data.insert(loc=0, column=id_col, value=np.arange(len(data)))
    print(data.head())


   id   f0    f1   f2   f3   f4    f5    f6   f7   f8    f9   f10   f11  f12  \
0   0  2.0   8.0  3.0  5.0  1.0   8.0  13.0  0.0  6.0   6.0  10.0   8.0  0.0   
1   1  5.0  12.0  3.0  7.0  2.0  10.0   5.0  5.0  4.0  13.0   3.0   9.0  2.0   
2   2  4.0  11.0  6.0  8.0  6.0  10.0   6.0  2.0  6.0  10.0   3.0   7.0  3.0   
3   3  7.0  11.0  6.0  6.0  3.0   5.0   9.0  4.0  6.0   4.0   4.0  10.0  6.0   
4   4  2.0   1.0  3.0  1.0  1.0   8.0   6.0  6.0  6.0   6.0   5.0   9.0  1.0   

    f13  f14   f15  target  
0   8.0  0.0   8.0      -1  
1   8.0  4.0  10.0      -1  
2   7.0  3.0   9.0      -1  
3  10.0  2.0   8.0      -1  
4   7.0  5.0  10.0      -1  


## Convert byte strings to strings

In [8]:
# Collect the labels of all object-dtype columns; these are the ones the
# byte-string conversion step is applied to.
object_frame = data.select_dtypes(include=['O'])
byte_string_columns = object_frame.columns
byte_string_columns.tolist()

[]

In [9]:
import ast
def convert_byte_string_repr(entry):
    """Turn the textual repr of a bytes literal into the decoded text.

    A value such as the string ``"b'abc'"`` (or ``'b"it\\'s"'``, which is how
    Python renders bytes that contain a single quote) is parsed with
    ``ast.literal_eval`` and decoded as UTF-8. Anything else is returned
    unchanged.

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values that start with ``b`` followed
        by a quote character and end with the same quote character are
        considered for conversion.

    Returns
    -------
    object
        The decoded ``str`` when ``entry`` is a valid bytes-literal repr whose
        content is valid UTF-8; otherwise ``entry`` itself.
    """
    if not isinstance(entry, str):
        return entry

    # Accept both quote styles: repr() switches to double quotes when the
    # bytes contain a single quote.
    looks_like_bytes_repr = (
        len(entry) >= 3
        and entry[0] == 'b'
        and entry[1] in ('"', "'")
        and entry[-1] == entry[1]
    )
    if not looks_like_bytes_repr:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        # Not a well-formed literal after all; leave the cell untouched.
        return entry

    # literal_eval can yield a non-bytes object (e.g. "b'a', b'b'" is a tuple);
    # such values have no .decode, so keep the original text instead of raising.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        # Bytes that are not valid UTF-8 are left in their original repr form.
        return entry

# Decode byte-string reprs in every object-dtype column, then store the column as str.
# Series.map is used per column instead of DataFrame.applymap, which is deprecated
# in recent pandas; the loop does nothing when byte_string_columns is empty.
# NOTE(review): astype(str) also turns missing values into the text 'nan' — confirm
# that this is intended for datasets that have NaN in object columns.
for column in byte_string_columns:
    data[column] = data[column].map(convert_byte_string_repr).astype(str)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,target
0,0,2.0,8.0,3.0,5.0,1.0,8.0,13.0,0.0,6.0,6.0,10.0,8.0,0.0,8.0,0.0,8.0,-1
1,1,5.0,12.0,3.0,7.0,2.0,10.0,5.0,5.0,4.0,13.0,3.0,9.0,2.0,8.0,4.0,10.0,-1
2,2,4.0,11.0,6.0,8.0,6.0,10.0,6.0,2.0,6.0,10.0,3.0,7.0,3.0,7.0,3.0,9.0,-1
3,3,7.0,11.0,6.0,6.0,3.0,5.0,9.0,4.0,6.0,4.0,4.0,10.0,6.0,10.0,2.0,8.0,-1
4,4,2.0,1.0,3.0,1.0,1.0,8.0,6.0,6.0,6.0,6.0,5.0,9.0,1.0,7.0,5.0,10.0,-1


## Drop constant (single-valued) columns

In [10]:
# Find the columns that hold exactly one distinct value (nunique does not count NaN)
# and remove them from the frame.
distinct_counts = data.nunique()
unique_columns = distinct_counts[distinct_counts == 1].index.tolist()
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [11]:
# Treat cells whose value is exactly '?' as missing, across the whole frame.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [12]:
# to_csv does not create missing folders, so make sure the destination exists first.
os.makedirs(output_dir, exist_ok=True)
# Write the processed frame without the pandas index; the explicit id column identifies rows.
data.to_csv(outp_fname, index=False)