In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Short identifier of the dataset; reused to build the output folder and file name.
dataset_name = "pen_digits"

In [3]:
# Raw input location: directory and file name of the source CSV.
input_dir, inp_fname = './raw/', 'pen_digits.csv'

# Processed output location: a per-dataset folder holding <dataset_name>.csv.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(
    output_dir,
    f'{dataset_name}.csv',
)


## Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview its first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,target
0,47.0,100.0,27.0,81.0,57.0,37.0,26.0,0.0,0.0,23.0,56.0,53.0,100.0,90.0,40.0,98.0,-1
1,0.0,89.0,27.0,100.0,42.0,75.0,29.0,45.0,15.0,15.0,37.0,0.0,69.0,2.0,100.0,6.0,-1
2,0.0,57.0,31.0,68.0,72.0,90.0,100.0,100.0,76.0,75.0,50.0,51.0,28.0,25.0,16.0,0.0,-1
3,0.0,100.0,7.0,92.0,5.0,68.0,19.0,45.0,86.0,34.0,100.0,45.0,74.0,23.0,67.0,0.0,-1
4,0.0,67.0,49.0,83.0,100.0,100.0,81.0,80.0,60.0,60.0,40.0,40.0,33.0,20.0,47.0,0.0,-1


In [5]:
# Rename every column except the last to f0, f1, ...; the last column is named 'target'.
n_features = len(data.columns) - 1
feature_names = [f'f{i}' for i in range(n_features)]
data.columns = feature_names + ['target']

In [7]:
# Names of the row-identifier column and of the label column.
id_col, target_col = "id", "target"

## Insert Id Column

In [8]:
# Add a sequential row identifier (0..N-1) as the first column, unless one already exists.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id    f0     f1    f2     f3     f4     f5     f6     f7    f8    f9  \
0   0  47.0  100.0  27.0   81.0   57.0   37.0   26.0    0.0   0.0  23.0   
1   1   0.0   89.0  27.0  100.0   42.0   75.0   29.0   45.0  15.0  15.0   
2   2   0.0   57.0  31.0   68.0   72.0   90.0  100.0  100.0  76.0  75.0   
3   3   0.0  100.0   7.0   92.0    5.0   68.0   19.0   45.0  86.0  34.0   
4   4   0.0   67.0  49.0   83.0  100.0  100.0   81.0   80.0  60.0  60.0   

     f10   f11    f12   f13    f14   f15  target  
0   56.0  53.0  100.0  90.0   40.0  98.0      -1  
1   37.0   0.0   69.0   2.0  100.0   6.0      -1  
2   50.0  51.0   28.0  25.0   16.0   0.0      -1  
3  100.0  45.0   74.0  23.0   67.0   0.0      -1  
4   40.0  40.0   33.0  20.0   47.0   0.0      -1  


## Convert byte strings to strings

In [9]:
# Collect the names of all object-dtype columns and show them as a plain list.
object_dtype_frame = data.select_dtypes(include=['O'])
byte_string_columns = object_dtype_frame.columns
byte_string_columns.tolist()

[]

In [10]:
import ast


def convert_byte_string_repr(entry):
    """Turn the text form of a bytes literal (e.g. "b'abc'") into its decoded string.

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values that start with ``b'`` and
        end with ``'`` are considered for conversion.

    Returns
    -------
    object
        The UTF-8 decoded text when ``entry`` parses to a single ``bytes``
        literal that decodes cleanly; otherwise ``entry`` itself, unchanged.
        The function never raises for a value it cannot convert.
    """
    # Anything that does not look like b'...' is passed through untouched.
    if not (isinstance(entry, str) and entry.startswith("b'") and entry.endswith("'")):
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid Python literal (e.g. "b'" or an unhashable dict key inside).
        return entry

    # The prefix/suffix check also lets through text such as "b'a', 'b'",
    # which parses to a tuple; only genuine bytes objects can be decoded.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        # Bytes that are not valid UTF-8: keep the original representation.
        return entry

# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1).
# Look the method up on the class so the cell runs warning-free on new pandas
# and still works on older versions that only provide applymap.
_elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap

# Decode b'...' representations cell by cell, then store the columns as strings.
data[byte_string_columns] = _elementwise(
    data[byte_string_columns], convert_byte_string_repr
).astype(str)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,target
0,0,47.0,100.0,27.0,81.0,57.0,37.0,26.0,0.0,0.0,23.0,56.0,53.0,100.0,90.0,40.0,98.0,-1
1,1,0.0,89.0,27.0,100.0,42.0,75.0,29.0,45.0,15.0,15.0,37.0,0.0,69.0,2.0,100.0,6.0,-1
2,2,0.0,57.0,31.0,68.0,72.0,90.0,100.0,100.0,76.0,75.0,50.0,51.0,28.0,25.0,16.0,0.0,-1
3,3,0.0,100.0,7.0,92.0,5.0,68.0,19.0,45.0,86.0,34.0,100.0,45.0,74.0,23.0,67.0,0.0,-1
4,4,0.0,67.0,49.0,83.0,100.0,100.0,81.0,80.0,60.0,60.0,40.0,40.0,33.0,20.0,47.0,0.0,-1


## Drop constant (single-valued) columns

In [11]:
# Gather the columns that contain exactly one distinct value (as counted by nunique()).
unique_columns = []
for col in data.columns:
    if data[col].nunique() == 1:
        unique_columns.append(col)
print(unique_columns)

# Remove those columns from the frame in place.
data.drop(labels=unique_columns, axis='columns', inplace=True)

[]


## Convert ? to NaN

In [12]:
# Cells equal to the string '?' are turned into NaN, modifying the frame in place.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [13]:
# to_csv does not create missing folders, so make sure the destination exists first.
os.makedirs(output_dir, exist_ok=True)
# Write the processed frame; index=False leaves the pandas row index out of the file.
data.to_csv(outp_fname, index=False)