In [117]:
import numpy as np
import pandas as pd
import os, sys

In [118]:
# Dataset identifier; reused below to build the output directory and the CSV file name.
dataset_name = 'steel_plate_faults'
# File name of the raw input; it is looked up inside `input_dir` when the data is read.
inp_fname = 'Faults.NNA'

In [119]:
# Directory holding the raw input file (relative to the notebook's working directory).
input_dir = './data'
# Directory that receives the processed output for this dataset.
output_dir = f'./../../processed/{dataset_name}/'
# Full path of the processed CSV written by the last cell.
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')

# Read Data

In [120]:
# Column names for the raw file, in file order (they are passed to `read_csv`
# as `names=` because the file is read with `header=None`).
cols = [
    'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
    'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter',
    'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity',
    'Length_of_Conveyer',
    # one-hot steel-type indicators
    'TypeOfSteel_A300', 'TypeOfSteel_A400',
    'Steel_Plate_Thickness',
    'Edges_Index', 'Empty_Index', 'Square_Index',
    'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index',
    'Outside_Global_Index',
    'LogOfAreas', 'Log_X_Index', 'Log_Y_Index',
    'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas',
    # one-hot fault indicators
    'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
    'Dirtiness', 'Bumps', 'Other_Faults',
]

In [121]:
# The raw file is read as tab-separated with no header row; names come from `cols`.
raw_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_path, sep="\t", header=None, names=cols)
data.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,42,50,270900,270944,267,17,44,24220,76,108,...,0.8182,-0.2913,0.5822,1,0,0,0,0,0,0
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0.7931,-0.1756,0.2984,1,0,0,0,0,0,0
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,0.6667,-0.1228,0.215,1,0,0,0,0,0,0
3,853,860,369370,369415,176,13,45,18996,99,126,...,0.8444,-0.1568,0.5212,1,0,0,0,0,0,0
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0.9338,-0.1992,1.0,1,0,0,0,0,0,0


In [122]:
# Name of the row-identifier column that is inserted below when not already present.
id_col = "Id"
# Name of the single categorical column that replaces the one-hot fault columns.
target_col = "Fault_Type"

# Convert one-hot-encoded (OHE) columns into categorical

In [123]:
def convert_ohe_cols_to_categorical(df, ohe_cols, categorized_col_name, drop_ohe_cols=True):
    """Collapse a group of one-hot-encoded columns into one categorical column.

    This is the inverse of one-hot encoding: for every row, the name of the
    column in ``ohe_cols`` that holds the row-wise maximum becomes the value
    of ``categorized_col_name``.

    The input frame is NOT modified; a new DataFrame is returned, so calling
    the function again on the same input gives the same result.

    Args:
        df: DataFrame containing the one-hot columns.
        ohe_cols: List of column names that together form one one-hot group.
        categorized_col_name: Name of the categorical column to create
            (appended as the last column, or overwritten if it already exists).
        drop_ohe_cols: When True (default), the columns in ``ohe_cols`` are
            left out of the returned frame.

    Returns:
        A new DataFrame with ``categorized_col_name`` added.

    Raises:
        KeyError: If any name in ``ohe_cols`` is not a column of ``df``.

    Note:
        ``idxmax`` reports the first column holding the row maximum, so a row
        that is not strictly one-hot (for example all zeros) is labelled with
        the first tied column rather than flagged.
    """
    # Compute the labels from the untouched input before any column is removed.
    labels = df[ohe_cols].idxmax(axis=1)
    # Both branches yield a new object, so the caller's frame is never mutated.
    result = df.drop(columns=ohe_cols) if drop_ohe_cols else df.copy()
    result[categorized_col_name] = labels
    return result

In [124]:
# TypeOfSteel_A300 and TypeOfSteel_A400 form a one-hot pair; fold them back
# into a single categorical column.
categorized_col_name = 'TypeOfSteel'
ohe_cols = ['TypeOfSteel_A300', 'TypeOfSteel_A400']
data = convert_ohe_cols_to_categorical(data, ohe_cols, categorized_col_name)

# Replace the original column names with the short grade labels.
steel_labels = {"TypeOfSteel_A300": "A300", "TypeOfSteel_A400": "A400"}
data[categorized_col_name] = data[categorized_col_name].map(steel_labels)

data.shape

(1941, 33)

In [125]:
# The target is spread over seven one-hot fault columns; collapse them into
# the single categorical column named by `target_col`.
ohe_cols = [
    'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
    'Dirtiness', 'Bumps', 'Other_Faults',
]
categorized_col_name = target_col
data = convert_ohe_cols_to_categorical(
    df=data,
    ohe_cols=ohe_cols,
    categorized_col_name=categorized_col_name,
)
data.shape

(1941, 27)

In [126]:
# Show the column names that remain after both one-hot groups were collapsed.
data.columns

Index(['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas',
       'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index',
       'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index',
       'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index',
       'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas',
       'TypeOfSteel', 'Fault_Type'],
      dtype='object')

In [127]:
# Re-check the frame's (rows, columns) shape before the Id column is added.
data.shape

(1941, 27)

# Insert Id Column

In [128]:
# Add a sequential Id (0 .. N-1) as the first column unless one already exists.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Whether the Id was just created or already present, store it as strings.
data[id_col] = data[id_col].astype(str)

   Id  X_Minimum  X_Maximum  Y_Minimum  Y_Maximum  Pixels_Areas  X_Perimeter  \
0   0         42         50     270900     270944           267           17   
1   1        645        651    2538079    2538108           108           10   
2   2        829        835    1553913    1553931            71            8   
3   3        853        860     369370     369415           176           13   
4   4       1289       1306     498078     498335          2409           60   

   Y_Perimeter  Sum_of_Luminosity  Minimum_of_Luminosity  ...  Edges_Y_Index  \
0           44              24220                     76  ...         1.0000   
1           30              11397                     84  ...         0.9667   
2           19               7972                     99  ...         0.9474   
3           45              18996                     99  ...         1.0000   
4          260             246930                     37  ...         0.9885   

   Outside_Global_Index  LogOfAreas  L

# Save Main Data File

In [129]:
# `to_csv` does not create missing parent directories, so make sure the output
# directory exists first; otherwise a fresh checkout fails on this cell.
os.makedirs(output_dir, exist_ok=True)
# Write the processed frame without the pandas index (the Id column identifies rows).
data.to_csv(outp_fname, index=False)