In [5]:
import numpy as np
import pandas as pd
import os

In [6]:
# Dataset identifier; the processed output directory and file name are built from it.
dataset_name = "us_crime"

In [7]:
# Raw input location, relative to the notebook's working directory.
input_dir = "./raw/"
inp_fname = "us_crime.csv"

# Processed output location, derived from the dataset name.
output_dir = f"./../../processed/{dataset_name}/"
outp_fname = os.path.join(output_dir, dataset_name + ".csv")


## Read Data

In [8]:
# Load the raw CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=os.path.join(input_dir, inp_fname))
data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,target
0,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.42,0.5,0.51,0.64,0.12,0.26,0.2,0.32,-1
1,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.21,0.5,0.34,0.6,0.52,0.02,0.12,0.45,0.0,1
2,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.0,-1
3,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,0.21,...,0.19,0.3,0.73,0.64,0.65,0.02,0.39,0.28,0.0,-1
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.0,-1


In [9]:
# Every column except the last is treated as a feature: rename them
# positionally to f0..f{n_features-1} and label the final column 'target'.
n_features = len(data.columns) - 1
data.columns = [*(f'f{i}' for i in range(n_features)), 'target']

In [10]:
# Names of the row-identifier column and the label column.
id_col, target_col = "id", "target"

## Insert Id Column

In [11]:
# Add a sequential integer id (0..N-1) as the first column, unless the
# frame already has a column with that name.
if id_col not in data:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id    f0    f1    f2    f3    f4    f5    f6    f7    f8  ...   f91   f92  \
0   0  0.19  0.33  0.02  0.90  0.12  0.17  0.34  0.47  0.29  ...  0.12  0.42   
1   1  0.00  0.16  0.12  0.74  0.45  0.07  0.26  0.59  0.35  ...  0.21  0.50   
2   2  0.00  0.42  0.49  0.56  0.17  0.04  0.39  0.47  0.28  ...  0.14  0.49   
3   3  0.04  0.77  1.00  0.08  0.12  0.10  0.51  0.50  0.34  ...  0.19  0.30   
4   4  0.01  0.55  0.02  0.95  0.09  0.05  0.38  0.38  0.23  ...  0.11  0.72   

    f93   f94   f95   f96   f97   f98   f99  target  
0  0.50  0.51  0.64  0.12  0.26  0.20  0.32      -1  
1  0.34  0.60  0.52  0.02  0.12  0.45  0.00       1  
2  0.54  0.67  0.56  0.01  0.21  0.02  0.00      -1  
3  0.73  0.64  0.65  0.02  0.39  0.28  0.00      -1  
4  0.64  0.61  0.53  0.04  0.09  0.02  0.00      -1  

[5 rows x 102 columns]


## Convert byte strings to strings

In [12]:
# Object-dtype columns are the only candidates for the byte-string conversion below.
byte_string_columns = data.select_dtypes(include="object").columns
list(byte_string_columns)

[]

In [13]:
import ast
def convert_byte_string_repr(entry):
    """Decode the textual repr of a bytes literal into a plain string.

    Values such as ``"b'abc'"`` (or ``'b"it\\'s"'``, the form Python uses when
    the bytes contain a single quote) are parsed as a bytes literal and decoded
    as UTF-8.

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values that look like a bytes
        literal are converted.

    Returns
    -------
    object
        The decoded string on success; otherwise ``entry`` unchanged. This
        covers non-string values, strings that are not a bytes repr, text that
        does not parse to a single ``bytes`` object, and bytes that are not
        valid UTF-8.
    """
    if not isinstance(entry, str):
        return entry

    # Accept both quoting styles that repr(bytes) can produce.
    single_quoted = entry.startswith("b'") and entry.endswith("'")
    double_quoted = entry.startswith('b"') and entry.endswith('"')
    if not (single_quoted or double_quoted):
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        return entry  # Not a well-formed Python literal

    # Text like "b'a', b'b'" passes the prefix/suffix check but evaluates to a
    # tuple; leave anything that is not a single bytes object untouched.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        return entry  # Bytes are not valid UTF-8; keep the original text

# Decode byte-string reprs one column at a time with Series.map. This avoids
# the deprecated DataFrame.applymap and is a no-op when there are no
# object-dtype columns to convert.
# NOTE(review): .astype(str) is kept from the original; where it maps to
# object dtype, missing values become the literal string 'nan' - confirm
# that this is intended.
for col in byte_string_columns:
    data[col] = data[col].map(convert_byte_string_repr).astype(str)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,target
0,0,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,...,0.12,0.42,0.5,0.51,0.64,0.12,0.26,0.2,0.32,-1
1,1,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,...,0.21,0.5,0.34,0.6,0.52,0.02,0.12,0.45,0.0,1
2,2,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,...,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.0,-1
3,3,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,...,0.19,0.3,0.73,0.64,0.65,0.02,0.39,0.28,0.0,-1
4,4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,...,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.0,-1


## Drop constant (single-valued) columns

In [14]:
# Collect columns that hold exactly one distinct value (nunique ignores NaN
# by default), report them, and drop them from the frame in place.
unique_columns = [name for name, n_distinct in data.nunique().items() if n_distinct == 1]
print(unique_columns)

data.drop(unique_columns, axis="columns", inplace=True)

[]


## Convert ? to NaN

In [15]:
# Treat the literal '?' placeholder as a missing value everywhere in the frame.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [16]:
# to_csv does not create missing parent directories, so make sure the
# destination exists before writing (no error if it is already there).
os.makedirs(output_dir, exist_ok=True)
# Write the processed frame without the pandas index.
data.to_csv(outp_fname, index=False)