In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Human-readable dataset name; used to build the output folder and file name.
dataset_name = 'Steel Plates Faults'

In [3]:
# Folder and file name of the raw CSV that this notebook reads.
input_dir = './raw/'
inp_fname = 'Steel_Plates_Faults.csv'
# The processed copy is written to a per-dataset folder under ../../processed/,
# in a CSV named after the dataset.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V25,V26,V27,V28,V29,V30,V31,V32,V33,Class
0,42.0,50.0,270900.0,270944.0,267.0,17.0,44.0,24220.0,76.0,108.0,...,0.8182,-0.2913,0.5822,1.0,0.0,0.0,0.0,0.0,0.0,b'1'
1,645.0,651.0,2538079.0,2538108.0,108.0,10.0,30.0,11397.0,84.0,123.0,...,0.7931,-0.1756,0.2984,1.0,0.0,0.0,0.0,0.0,0.0,b'1'
2,829.0,835.0,1553913.0,1553931.0,71.0,8.0,19.0,7972.0,99.0,125.0,...,0.6667,-0.1228,0.215,1.0,0.0,0.0,0.0,0.0,0.0,b'1'
3,853.0,860.0,369370.0,369415.0,176.0,13.0,45.0,18996.0,99.0,126.0,...,0.8444,-0.1568,0.5212,1.0,0.0,0.0,0.0,0.0,0.0,b'1'
4,1289.0,1306.0,498078.0,498335.0,2409.0,60.0,260.0,246930.0,37.0,126.0,...,0.9338,-0.1992,1.0,1.0,0.0,0.0,0.0,0.0,0.0,b'1'


In [5]:
# Name of the row-identifier column.
id_col = "id"
# Name of the label column; presumably the prediction target (it is not
# referenced again in the cells visible here).
target_col = "Class"

## Insert Id Column

In [6]:
# Prepend a running 0..N-1 integer id as the first column, unless the data
# already has a column with that name.
if id_col not in data.columns:
    data.insert(loc=0, column=id_col, value=np.arange(data.shape[0]))
    print(data.head())


   id      V1      V2         V3         V4      V5    V6     V7        V8  \
0   0    42.0    50.0   270900.0   270944.0   267.0  17.0   44.0   24220.0   
1   1   645.0   651.0  2538079.0  2538108.0   108.0  10.0   30.0   11397.0   
2   2   829.0   835.0  1553913.0  1553931.0    71.0   8.0   19.0    7972.0   
3   3   853.0   860.0   369370.0   369415.0   176.0  13.0   45.0   18996.0   
4   4  1289.0  1306.0   498078.0   498335.0  2409.0  60.0  260.0  246930.0   

     V9  ...     V25     V26     V27  V28  V29  V30  V31  V32  V33  Class  
0  76.0  ...  0.8182 -0.2913  0.5822  1.0  0.0  0.0  0.0  0.0  0.0   b'1'  
1  84.0  ...  0.7931 -0.1756  0.2984  1.0  0.0  0.0  0.0  0.0  0.0   b'1'  
2  99.0  ...  0.6667 -0.1228  0.2150  1.0  0.0  0.0  0.0  0.0  0.0   b'1'  
3  99.0  ...  0.8444 -0.1568  0.5212  1.0  0.0  0.0  0.0  0.0  0.0   b'1'  
4  37.0  ...  0.9338 -0.1992  1.0000  1.0  0.0  0.0  0.0  0.0  0.0   b'1'  

[5 rows x 35 columns]


## Convert byte strings to strings

In [7]:
# Object-dtype columns are the candidates for holding stringified bytes literals.
object_typed = data.select_dtypes(include=['O'])
byte_string_columns = object_typed.columns
byte_string_columns.tolist()

['Class']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Decode the text form of a bytes literal back into a plain string.

    A value such as ``"b'1'"`` (the ``repr`` of ``b'1'`` stored as text) is
    parsed and decoded as UTF-8, giving ``"1"``.  Both quote styles that
    ``repr(bytes)`` can emit -- ``b'...'`` and ``b"..."`` -- are recognised.

    Parameters
    ----------
    entry : object
        A single cell value of any type.

    Returns
    -------
    object
        The decoded ``str`` when ``entry`` is a well-formed bytes-literal
        string containing valid UTF-8; otherwise ``entry`` itself, unchanged.
    """
    # Only strings shaped like b'...' or b"..." are candidates for conversion.
    if not isinstance(entry, str) or len(entry) < 3:
        return entry
    quote = entry[1]
    if entry[0] != "b" or quote not in ("'", '"') or entry[-1] != quote:
        return entry

    try:
        # literal_eval parses literals only; it never executes code.
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        return entry  # Malformed literal: leave the value as it was

    # Text such as "b'a', 'b'" passes the shape check but parses to a tuple,
    # which has no .decode(); treat anything that is not bytes as "no match".
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        return entry  # Not valid UTF-8: leave the value as it was

# Decode column by column with Series.map: DataFrame.applymap is deprecated in
# recent pandas releases, whereas Series.map works on every version. The loop
# is simply a no-op when there are no object columns to convert.
# Each converted column is then cast to str so it ends up uniformly string-typed.
for byte_col in byte_string_columns:
    data[byte_col] = data[byte_col].map(convert_byte_string_repr).astype(str)
data.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V25,V26,V27,V28,V29,V30,V31,V32,V33,Class
0,0,42.0,50.0,270900.0,270944.0,267.0,17.0,44.0,24220.0,76.0,...,0.8182,-0.2913,0.5822,1.0,0.0,0.0,0.0,0.0,0.0,1
1,1,645.0,651.0,2538079.0,2538108.0,108.0,10.0,30.0,11397.0,84.0,...,0.7931,-0.1756,0.2984,1.0,0.0,0.0,0.0,0.0,0.0,1
2,2,829.0,835.0,1553913.0,1553931.0,71.0,8.0,19.0,7972.0,99.0,...,0.6667,-0.1228,0.215,1.0,0.0,0.0,0.0,0.0,0.0,1
3,3,853.0,860.0,369370.0,369415.0,176.0,13.0,45.0,18996.0,99.0,...,0.8444,-0.1568,0.5212,1.0,0.0,0.0,0.0,0.0,0.0,1
4,4,1289.0,1306.0,498078.0,498335.0,2409.0,60.0,260.0,246930.0,37.0,...,0.9338,-0.1992,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1


## Drop constant (single-valued) columns

In [9]:
# Find every column that holds exactly one distinct value (nunique ignores
# missing values by default), report the names, then drop those columns.
is_single_valued = (data.nunique() == 1).to_numpy()
unique_columns = data.columns[is_single_valued].tolist()
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Save Main Data File

In [10]:
# to_csv does not create missing parent folders, so make sure the per-dataset
# output directory exists before writing (exist_ok keeps re-runs harmless).
os.makedirs(output_dir, exist_ok=True)
# Write the processed data without the DataFrame index.
data.to_csv(outp_fname, index=False)