In [5]:
import numpy as np
import pandas as pd
import os

In [6]:
# Short dataset identifier; reused below to build the output folder and file name.
dataset_name = "satimage"

In [7]:
# Raw input: folder and file name of the source CSV (relative paths).
input_dir, inp_fname = './raw/', 'satimage.csv'

# Processed output: a per-dataset folder containing a CSV named after the dataset.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [8]:
# Load the raw CSV into a DataFrame, then preview its first rows.
data = pd.read_csv(filepath_or_buffer=os.path.join(input_dir, inp_fname))
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,target
0,92.0,115.0,120.0,94.0,84.0,102.0,106.0,79.0,84.0,102.0,...,104.0,88.0,121.0,128.0,100.0,84.0,107.0,113.0,87.0,-1
1,84.0,102.0,106.0,79.0,84.0,102.0,102.0,83.0,80.0,102.0,...,100.0,84.0,107.0,113.0,87.0,84.0,99.0,104.0,79.0,-1
2,84.0,102.0,102.0,83.0,80.0,102.0,102.0,79.0,84.0,94.0,...,87.0,84.0,99.0,104.0,79.0,84.0,99.0,104.0,79.0,-1
3,80.0,102.0,102.0,79.0,84.0,94.0,102.0,79.0,80.0,94.0,...,79.0,84.0,99.0,104.0,79.0,84.0,103.0,104.0,79.0,-1
4,84.0,94.0,102.0,79.0,80.0,94.0,98.0,76.0,80.0,102.0,...,79.0,84.0,103.0,104.0,79.0,79.0,107.0,109.0,87.0,-1


In [9]:
# All columns except the last one are renamed f0, f1, ...; the last becomes 'target'.
n_features = len(data.columns) - 1
data.columns = [*(f'f{i}' for i in range(n_features)), 'target']

In [10]:
# Names used for the row-identifier column and the label column.
id_col, target_col = "id", "target"

## Insert Id Column

In [11]:
# Prepend a sequential integer id column (0 .. N-1) unless one is already present.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id    f0     f1     f2    f3    f4     f5     f6    f7    f8  ...    f27  \
0   0  92.0  115.0  120.0  94.0  84.0  102.0  106.0  79.0  84.0  ...  104.0   
1   1  84.0  102.0  106.0  79.0  84.0  102.0  102.0  83.0  80.0  ...  100.0   
2   2  84.0  102.0  102.0  83.0  80.0  102.0  102.0  79.0  84.0  ...   87.0   
3   3  80.0  102.0  102.0  79.0  84.0   94.0  102.0  79.0  80.0  ...   79.0   
4   4  84.0   94.0  102.0  79.0  80.0   94.0   98.0  76.0  80.0  ...   79.0   

    f28    f29    f30    f31   f32    f33    f34   f35  target  
0  88.0  121.0  128.0  100.0  84.0  107.0  113.0  87.0      -1  
1  84.0  107.0  113.0   87.0  84.0   99.0  104.0  79.0      -1  
2  84.0   99.0  104.0   79.0  84.0   99.0  104.0  79.0      -1  
3  84.0   99.0  104.0   79.0  84.0  103.0  104.0  79.0      -1  
4  84.0  103.0  104.0   79.0  79.0  107.0  109.0  87.0      -1  

[5 rows x 38 columns]


## Convert byte strings to strings

In [12]:
# Object-dtype columns are the candidates for the byte-string cleanup below.
byte_string_columns = data.select_dtypes(include=['O']).columns
list(byte_string_columns)

[]

In [13]:
import ast
def convert_byte_string_repr(entry):
    """Decode a string that holds the repr of a bytes literal.

    Strings shaped like a bytes literal -- a ``b`` prefix followed by a
    single- or double-quoted payload (Python's ``repr`` switches to double
    quotes when the payload contains a single quote) -- are parsed back into
    bytes and decoded as UTF-8. Every other value is returned unchanged.

    Parameters
    ----------
    entry : object
        A single cell value; only ``str`` values are inspected.

    Returns
    -------
    object
        The decoded text when ``entry`` is a valid bytes-literal string with
        a UTF-8 payload, otherwise ``entry`` itself.
    """
    if not isinstance(entry, str) or len(entry) < 3:
        return entry
    # A bytes repr opens with b' or b" and closes with the same quote character.
    if not entry.startswith(("b'", 'b"')) or entry[-1] != entry[1]:
        return entry
    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        return entry  # looked like a bytes literal but does not parse as one
    # literal_eval may yield something else (e.g. a tuple for "b'x', 'y'");
    # only genuine bytes objects can be decoded.
    if not isinstance(byte_value, bytes):
        return entry
    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        return entry  # payload is not valid UTF-8; keep the original text

# Decode byte-string reprs in every object column and store the result as str.
# Series.map is applied column by column instead of DataFrame.applymap, which
# is deprecated in recent pandas releases and emits a FutureWarning.
# Nothing is done when there are no object columns.
if len(byte_string_columns) > 0:
    data[byte_string_columns] = (
        data[byte_string_columns]
        .apply(lambda column: column.map(convert_byte_string_repr))
        .astype(str)
    )
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f27,f28,f29,f30,f31,f32,f33,f34,f35,target
0,0,92.0,115.0,120.0,94.0,84.0,102.0,106.0,79.0,84.0,...,104.0,88.0,121.0,128.0,100.0,84.0,107.0,113.0,87.0,-1
1,1,84.0,102.0,106.0,79.0,84.0,102.0,102.0,83.0,80.0,...,100.0,84.0,107.0,113.0,87.0,84.0,99.0,104.0,79.0,-1
2,2,84.0,102.0,102.0,83.0,80.0,102.0,102.0,79.0,84.0,...,87.0,84.0,99.0,104.0,79.0,84.0,99.0,104.0,79.0,-1
3,3,80.0,102.0,102.0,79.0,84.0,94.0,102.0,79.0,80.0,...,79.0,84.0,99.0,104.0,79.0,84.0,103.0,104.0,79.0,-1
4,4,84.0,94.0,102.0,79.0,80.0,94.0,98.0,76.0,80.0,...,79.0,84.0,103.0,104.0,79.0,79.0,107.0,109.0,87.0,-1


## Drop constant (single-valued) columns

In [14]:
# Columns with exactly one distinct value are listed, printed, and then dropped.
unique_columns = data.columns[(data.nunique() == 1).to_numpy()].tolist()
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [15]:
# Treat a literal '?' cell as a missing value.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [16]:
# Create the destination folder first: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
# index=False keeps the DataFrame's index out of the written file.
data.to_csv(outp_fname, index=False)