In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
dataset_name = 'segment0'

In [3]:
input_dir = './raw/'
inp_fname = 'segment0.csv'
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [4]:
data = pd.read_csv(os.path.join(input_dir, inp_fname))
data.head()

Unnamed: 0,Region-centroid-col,Region-centroid-row,Region-pixel-count,Short-line-density-5,Short-line-density-2,Vedge-mean,Vegde-sd,Hedge-mean,Hedge-sd,Intensity-mean,Rawred-mean,Rawblue-mean,Rawgreen-mean,Exred-mean,Exblue-mean,Exgreen-mean,Value-mean,Saturatoin-mean,Hue-mean,Class
0,218.0,178.0,9,0.111111,0.0,0.833333,0.547722,1.111109,0.544331,59.62963,52.444443,75.22222,51.22222,-21.555555,46.77778,-25.222221,75.22222,0.318996,-2.040554,negative
1,113.0,130.0,9,0.0,0.0,0.277778,0.250924,0.333333,0.365148,0.888889,0.0,2.555556,0.111111,-2.666667,5.0,-2.333333,2.555556,1.0,-2.123254,negative
2,202.0,41.0,9,0.0,0.0,0.944448,0.772202,1.111112,1.025597,123.03704,111.888885,139.77779,117.44444,-33.444443,50.22222,-16.777779,139.77779,0.199347,-2.299918,negative
3,32.0,173.0,9,0.0,0.0,1.722222,1.781593,9.0,6.749488,43.592594,39.555557,52.88889,38.333336,-12.111111,27.88889,-15.777778,52.88889,0.266914,-1.998857,negative
4,61.0,197.0,9,0.0,0.0,1.444444,1.515353,2.611111,1.925463,49.592594,44.22222,61.555557,43.0,-16.11111,35.88889,-19.777779,61.555557,0.302925,-2.022274,negative


In [5]:
id_col = "id"
target_col = "Class"

## Insert Id Column

In [6]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())


   id  Region-centroid-col  Region-centroid-row  Region-pixel-count  \
0   0                218.0                178.0                   9   
1   1                113.0                130.0                   9   
2   2                202.0                 41.0                   9   
3   3                 32.0                173.0                   9   
4   4                 61.0                197.0                   9   

   Short-line-density-5  Short-line-density-2  Vedge-mean  Vegde-sd  \
0              0.111111                   0.0    0.833333  0.547722   
1              0.000000                   0.0    0.277778  0.250924   
2              0.000000                   0.0    0.944448  0.772202   
3              0.000000                   0.0    1.722222  1.781593   
4              0.000000                   0.0    1.444444  1.515353   

   Hedge-mean  Hedge-sd  ...  Rawred-mean  Rawblue-mean  Rawgreen-mean  \
0    1.111109  0.544331  ...    52.444443     75.222220      51.222220  

## Convert byte strings to strings

In [7]:
byte_string_columns = data.select_dtypes(include=['O']).columns
byte_string_columns.tolist()

['Class']

In [8]:
import ast
def convert_byte_string_repr(entry):
    try:
        # Check if the entry looks like a byte string representation
        if isinstance(entry, str) and entry.startswith("b'") and entry.endswith("'"):
            byte_value = ast.literal_eval(entry)
            return byte_value.decode('utf-8')
    except (ValueError, SyntaxError):
        pass
    return entry  # Return the original entry if conversion fails

data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,Region-centroid-col,Region-centroid-row,Region-pixel-count,Short-line-density-5,Short-line-density-2,Vedge-mean,Vegde-sd,Hedge-mean,Hedge-sd,...,Rawred-mean,Rawblue-mean,Rawgreen-mean,Exred-mean,Exblue-mean,Exgreen-mean,Value-mean,Saturatoin-mean,Hue-mean,Class
0,0,218.0,178.0,9,0.111111,0.0,0.833333,0.547722,1.111109,0.544331,...,52.444443,75.22222,51.22222,-21.555555,46.77778,-25.222221,75.22222,0.318996,-2.040554,negative
1,1,113.0,130.0,9,0.0,0.0,0.277778,0.250924,0.333333,0.365148,...,0.0,2.555556,0.111111,-2.666667,5.0,-2.333333,2.555556,1.0,-2.123254,negative
2,2,202.0,41.0,9,0.0,0.0,0.944448,0.772202,1.111112,1.025597,...,111.888885,139.77779,117.44444,-33.444443,50.22222,-16.777779,139.77779,0.199347,-2.299918,negative
3,3,32.0,173.0,9,0.0,0.0,1.722222,1.781593,9.0,6.749488,...,39.555557,52.88889,38.333336,-12.111111,27.88889,-15.777778,52.88889,0.266914,-1.998857,negative
4,4,61.0,197.0,9,0.0,0.0,1.444444,1.515353,2.611111,1.925463,...,44.22222,61.555557,43.0,-16.11111,35.88889,-19.777779,61.555557,0.302925,-2.022274,negative


## Drop unuseful columns

In [9]:
unique_columns = [col for col in data.columns if data[col].nunique() == 1]
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

['Region-pixel-count']


## Convert ? to NaN

In [10]:
data.replace('?', np.nan, inplace=True)

## Save Main Data File

In [11]:
data.to_csv(outp_fname, index=False)