In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Short dataset identifier; the output CSV file name is derived from it.
dataset_name = "ozone_level"

In [3]:
# Where the raw file is read from and where the processed CSV is written.
input_dir = "./raw/"
inp_fname = "ozone_level_detection.csv"
output_dir = "./"
outp_fname = os.path.join(output_dir, dataset_name + ".csv")


## Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview its first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V64,V65,V66,V67,V68,V69,V70,V71,V72,Class
0,0.8,1.8,2.4,2.1,2.0,2.1,1.5,1.7,1.9,2.3,...,0.15,10.67,-1.56,5795.0,-12.1,17.9,10330.0,-55.0,0.0,b'1'
1,2.8,3.2,3.3,2.7,3.3,3.2,2.9,2.8,3.1,3.4,...,0.48,8.39,3.84,5805.0,14.05,29.0,10275.0,-55.0,0.0,b'1'
2,2.9,2.8,2.6,2.1,2.2,2.5,2.5,2.7,2.2,2.5,...,0.6,6.94,9.8,5790.0,17.9,41.3,10235.0,-40.0,0.0,b'1'
3,4.7,3.8,3.7,3.8,2.9,3.1,2.8,2.5,2.4,3.1,...,0.49,8.73,10.54,5775.0,31.15,51.7,10195.0,-40.0,2.08,b'1'
4,2.6,2.1,1.6,1.4,0.9,1.5,1.2,1.4,1.3,1.4,...,0.304716,9.872418,0.830116,5818.821222,10.511051,37.388335,10164.198442,-0.119949,0.58,b'1'


In [5]:
# Names of the row-identifier column and of the label column.
id_col, target_col = "id", "Class"

## Insert Id Column

In [6]:
# Prepend a sequential integer id (0..N-1) as the first column, but only when
# the frame does not already contain a column with that name.
if id_col not in data.columns:
    row_ids = np.arange(len(data))
    data.insert(loc=0, column=id_col, value=row_ids)
    print(data.head())


   id   V1   V2   V3   V4   V5   V6   V7   V8   V9  ...       V64        V65  \
0   0  0.8  1.8  2.4  2.1  2.0  2.1  1.5  1.7  1.9  ...  0.150000  10.670000   
1   1  2.8  3.2  3.3  2.7  3.3  3.2  2.9  2.8  3.1  ...  0.480000   8.390000   
2   2  2.9  2.8  2.6  2.1  2.2  2.5  2.5  2.7  2.2  ...  0.600000   6.940000   
3   3  4.7  3.8  3.7  3.8  2.9  3.1  2.8  2.5  2.4  ...  0.490000   8.730000   
4   4  2.6  2.1  1.6  1.4  0.9  1.5  1.2  1.4  1.3  ...  0.304716   9.872418   

         V66          V67        V68        V69           V70        V71  \
0  -1.560000  5795.000000 -12.100000  17.900000  10330.000000 -55.000000   
1   3.840000  5805.000000  14.050000  29.000000  10275.000000 -55.000000   
2   9.800000  5790.000000  17.900000  41.300000  10235.000000 -40.000000   
3  10.540000  5775.000000  31.150000  51.700000  10195.000000 -40.000000   
4   0.830116  5818.821222  10.511051  37.388335  10164.198442  -0.119949   

    V72  Class  
0  0.00   b'1'  
1  0.00   b'1'  
2  0.00   b

## Convert byte strings to strings

In [7]:
# Object-dtype columns are the candidates that may hold byte-string reprs.
object_typed = data.select_dtypes(include=["O"])
byte_string_columns = object_typed.columns
byte_string_columns.tolist()

['Class']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Turn the textual repr of a bytes literal into the text it encodes.

    A value such as ``"b'1'"`` (the string form of ``b'1'``) is parsed with
    ``ast.literal_eval`` and decoded as UTF-8, giving ``"1"``. Both quoting
    styles Python uses for bytes reprs are recognised (``b'...'`` and
    ``b"..."``).

    Args:
        entry: Any cell value. Only ``str`` values that look like a bytes
            repr are converted.

    Returns:
        The decoded string when ``entry`` is a valid bytes repr holding
        UTF-8 data; otherwise ``entry`` itself, unchanged. Non-string values
        (e.g. NaN, numbers) are always returned as-is.
    """
    if not isinstance(entry, str):
        return entry

    # Require at least b + opening quote + closing quote, with matching quotes.
    if len(entry) < 3 or entry[0] != "b" or entry[1] not in ("'", '"'):
        return entry
    if entry[-1] != entry[1]:
        return entry

    try:
        parsed = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        # Not a well-formed Python literal: leave the text untouched.
        return entry

    # Text like "b'a', 'b'" passes the prefix/suffix check but evaluates to a
    # tuple; only genuine bytes objects can be decoded.
    if not isinstance(parsed, bytes):
        return entry

    try:
        return parsed.decode('utf-8')
    except UnicodeDecodeError:
        # Bytes that are not valid UTF-8: keep the original repr.
        return entry

# Decode byte-string reprs in every object-typed column.
# The values are deliberately not cast with .astype(str): on pandas versions
# where that cast stringifies missing values, NaN would become the literal
# text 'nan' and would no longer be treated as missing downstream.
for col in byte_string_columns:
    data[col] = data[col].map(convert_byte_string_repr)
data.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V64,V65,V66,V67,V68,V69,V70,V71,V72,Class
0,0,0.8,1.8,2.4,2.1,2.0,2.1,1.5,1.7,1.9,...,0.15,10.67,-1.56,5795.0,-12.1,17.9,10330.0,-55.0,0.0,1
1,1,2.8,3.2,3.3,2.7,3.3,3.2,2.9,2.8,3.1,...,0.48,8.39,3.84,5805.0,14.05,29.0,10275.0,-55.0,0.0,1
2,2,2.9,2.8,2.6,2.1,2.2,2.5,2.5,2.7,2.2,...,0.6,6.94,9.8,5790.0,17.9,41.3,10235.0,-40.0,0.0,1
3,3,4.7,3.8,3.7,3.8,2.9,3.1,2.8,2.5,2.4,...,0.49,8.73,10.54,5775.0,31.15,51.7,10195.0,-40.0,2.08,1
4,4,2.6,2.1,1.6,1.4,0.9,1.5,1.2,1.4,1.3,...,0.304716,9.872418,0.830116,5818.821222,10.511051,37.388335,10164.198442,-0.119949,0.58,1


## Drop constant (single-valued) columns

In [9]:
# Collect every column that holds exactly one distinct value, report the
# list, then remove those columns from the frame in place.
unique_columns = []
for name in data.columns:
    if data[name].nunique() == 1:
        unique_columns.append(name)
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [10]:
# Treat the '?' placeholder as a missing value throughout the frame.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# Write the processed frame to the output CSV without the DataFrame index.
data.to_csv(path_or_buf=outp_fname, index=False)