In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Short dataset identifier; reused below to build the output directory and file name.
dataset_name = 'satellite'

In [3]:
# Raw input: directory and file name of the source CSV.
input_dir = './raw/'
inp_fname = 'Satellite.csv'
# Processed output: a per-dataset directory and the path of the CSV written at the end.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview its first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,V29,V30,V31,V32,V33,V34,V35,V36,Target
0,46.0,40.0,119.0,139.0,42.0,30.0,135.0,157.0,42.0,30.0,...,113.0,50.0,46.0,111.0,116.0,44.0,31.0,131.0,142.0,b'Anomaly'
1,47.0,37.0,119.0,133.0,44.0,34.0,124.0,143.0,44.0,34.0,...,85.0,50.0,39.0,118.0,132.0,43.0,29.0,133.0,143.0,b'Anomaly'
2,80.0,95.0,100.0,74.0,64.0,64.0,104.0,96.0,46.0,36.0,...,81.0,82.0,91.0,92.0,78.0,78.0,83.0,96.0,74.0,b'Anomaly'
3,56.0,51.0,72.0,60.0,59.0,54.0,72.0,60.0,59.0,51.0,...,50.0,57.0,55.0,74.0,61.0,57.0,55.0,78.0,65.0,b'Anomaly'
4,44.0,34.0,129.0,140.0,44.0,34.0,124.0,136.0,44.0,34.0,...,139.0,43.0,31.0,128.0,135.0,43.0,29.0,128.0,132.0,b'Anomaly'


In [5]:
# Name of the row-identifier column (inserted below when the data does not have it).
id_col = "id"
# Name of the target column as it appears in the CSV header.
target_col = "Target"

## Insert Id Column

In [6]:
# Add a sequential row id (0..n-1) as the first column, unless the data already has one.
has_id_column = id_col in data.columns
if not has_id_column:
    data.insert(loc=0, column=id_col, value=np.arange(len(data)))
    print(data.head())


   id    V1    V2     V3     V4    V5    V6     V7     V8    V9  ...    V28  \
0   0  46.0  40.0  119.0  139.0  42.0  30.0  135.0  157.0  42.0  ...  113.0   
1   1  47.0  37.0  119.0  133.0  44.0  34.0  124.0  143.0  44.0  ...   85.0   
2   2  80.0  95.0  100.0   74.0  64.0  64.0  104.0   96.0  46.0  ...   81.0   
3   3  56.0  51.0   72.0   60.0  59.0  54.0   72.0   60.0  59.0  ...   50.0   
4   4  44.0  34.0  129.0  140.0  44.0  34.0  124.0  136.0  44.0  ...  139.0   

    V29   V30    V31    V32   V33   V34    V35    V36      Target  
0  50.0  46.0  111.0  116.0  44.0  31.0  131.0  142.0  b'Anomaly'  
1  50.0  39.0  118.0  132.0  43.0  29.0  133.0  143.0  b'Anomaly'  
2  82.0  91.0   92.0   78.0  78.0  83.0   96.0   74.0  b'Anomaly'  
3  57.0  55.0   74.0   61.0  57.0  55.0   78.0   65.0  b'Anomaly'  
4  43.0  31.0  128.0  135.0  43.0  29.0  128.0  132.0  b'Anomaly'  

[5 rows x 38 columns]


## Convert byte strings to strings

In [7]:
# Columns with object dtype are the candidates for holding b'...' byte-string text.
byte_string_columns = data.select_dtypes(include=['O']).columns
list(byte_string_columns)

['Target']

In [8]:
import ast


def convert_byte_string_repr(entry):
    """Turn the text form of a bytes literal (e.g. "b'Anomaly'") into a plain string.

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values that look like a bytes
        literal, single- or double-quoted (``b'...'`` or ``b"..."``), are
        converted.

    Returns
    -------
    object
        The UTF-8 decoded text when ``entry`` is a valid bytes-literal
        representation; otherwise ``entry`` itself, unchanged. The function
        never raises for malformed input.
    """
    if not isinstance(entry, str):
        return entry

    # repr(bytes) uses double quotes when the payload contains a single quote
    # (b"it's"), so both quote styles must be accepted. The closing quote has
    # to match the opening one.
    looks_like_bytes = (
        len(entry) >= 3
        and entry[:2] in ("b'", 'b"')
        and entry[-1] == entry[1]
    )
    if not looks_like_bytes:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        return entry  # Not a parsable literal: keep the original text.

    # Text such as "b'a', 'c'" passes the prefix/suffix check but evaluates to
    # a tuple; without this guard .decode would raise AttributeError.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        return entry  # Bytes are not valid UTF-8: keep the original text.

# Decode each object-dtype column element by element. Series.map is used
# instead of DataFrame.applymap because applymap is deprecated since pandas 2.1;
# the per-column loop is also a no-op when no object columns were found.
# NOTE(review): astype(str) can turn missing values into the literal string
# 'nan' (pandas-version dependent) - confirm that is acceptable for these columns.
for col in byte_string_columns:
    data[col] = data[col].map(convert_byte_string_repr).astype(str)
data.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V28,V29,V30,V31,V32,V33,V34,V35,V36,Target
0,0,46.0,40.0,119.0,139.0,42.0,30.0,135.0,157.0,42.0,...,113.0,50.0,46.0,111.0,116.0,44.0,31.0,131.0,142.0,Anomaly
1,1,47.0,37.0,119.0,133.0,44.0,34.0,124.0,143.0,44.0,...,85.0,50.0,39.0,118.0,132.0,43.0,29.0,133.0,143.0,Anomaly
2,2,80.0,95.0,100.0,74.0,64.0,64.0,104.0,96.0,46.0,...,81.0,82.0,91.0,92.0,78.0,78.0,83.0,96.0,74.0,Anomaly
3,3,56.0,51.0,72.0,60.0,59.0,54.0,72.0,60.0,59.0,...,50.0,57.0,55.0,74.0,61.0,57.0,55.0,78.0,65.0,Anomaly
4,4,44.0,34.0,129.0,140.0,44.0,34.0,124.0,136.0,44.0,...,139.0,43.0,31.0,128.0,135.0,43.0,29.0,128.0,132.0,Anomaly


## Drop uninformative (single-valued) columns

In [9]:
# A column with exactly one distinct value carries no information: list such
# columns, then remove them from the frame in place.
# NOTE(review): nunique() ignores NaN by default, so a column holding one value
# plus missing entries is also dropped; id/target columns are not exempt - confirm.
distinct_counts = data.nunique()
unique_columns = distinct_counts[distinct_counts == 1].index.tolist()
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [10]:
# Cells whose value is exactly '?' are treated as missing and become NaN (in place).
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# to_csv does not create missing parent directories, so make sure the
# per-dataset output directory exists before writing (no-op if it already does).
os.makedirs(output_dir, exist_ok=True)
# Write the processed frame without the pandas index; the id column is the row key.
data.to_csv(outp_fname, index=False)