In [6]:
import numpy as np
import pandas as pd
import os

In [7]:
# Short dataset identifier; reused below to build the output folder and file name.
dataset_name = "jm1"

In [8]:
# Where the raw CSV is read from (paths are relative to the working directory).
input_dir, inp_fname = './raw/', 'jm1.csv'

# Where the processed CSV is written: one folder per dataset, file named after it.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [9]:
# Assemble the full input path, load the raw CSV, and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,...,2.0,2.0,2.0,2.0,1.2,1.2,1.2,1.2,1.4,b'false'
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,b'true'
2,72.0,7.0,1.0,6.0,198.0,1134.13,0.05,20.31,55.85,23029.1,...,51.0,10.0,8.0,1.0,17.0,36.0,112.0,86.0,13.0,b'true'
3,190.0,3.0,1.0,3.0,600.0,4348.76,0.06,17.06,254.87,74202.67,...,129.0,29.0,28.0,2.0,17.0,135.0,329.0,271.0,5.0,b'true'
4,37.0,4.0,1.0,4.0,126.0,599.12,0.06,17.19,34.86,10297.3,...,28.0,1.0,6.0,0.0,11.0,16.0,76.0,50.0,7.0,b'true'


In [10]:
# Column names used below: the row identifier and the prediction target.
id_col, target_col = "id", "defects"

## Insert Id Column

In [11]:
# Prepend a sequential integer id (0 .. n_rows-1) as the first column.
# The membership check keeps the cell safe to re-run: an existing id column is left alone.
if id_col not in data.columns:
    data.insert(0, id_col, np.arange(len(data)))
    print(data.head())


   id    loc  v(g)  ev(g)  iv(g)      n        v     l      d       i  ...  \
0   0    1.1   1.4    1.4    1.4    1.3     1.30  1.30   1.30    1.30  ...   
1   1    1.0   1.0    1.0    1.0    1.0     1.00  1.00   1.00    1.00  ...   
2   2   72.0   7.0    1.0    6.0  198.0  1134.13  0.05  20.31   55.85  ...   
3   3  190.0   3.0    1.0    3.0  600.0  4348.76  0.06  17.06  254.87  ...   
4   4   37.0   4.0    1.0    4.0  126.0   599.12  0.06  17.19   34.86  ...   

   lOCode  lOComment  lOBlank  locCodeAndComment  uniq_Op  uniq_Opnd  \
0     2.0        2.0      2.0                2.0      1.2        1.2   
1     1.0        1.0      1.0                1.0      1.0        1.0   
2    51.0       10.0      8.0                1.0     17.0       36.0   
3   129.0       29.0     28.0                2.0     17.0      135.0   
4    28.0        1.0      6.0                0.0     11.0       16.0   

   total_Op  total_Opnd  branchCount   defects  
0       1.2         1.2          1.4  b'false'  


## Convert byte strings to strings

In [12]:
# Collect the object-dtype columns; these are the ones passed to the
# byte-string conversion further down.
byte_string_columns = data.select_dtypes(include=['O']).columns
list(byte_string_columns)

['defects']

In [13]:
import ast


def convert_byte_string_repr(entry):
    """Turn the text form of a bytes literal (e.g. "b'true'") into a plain string.

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values that look like a Python bytes
        literal (``b'...'`` or ``b"..."``) are converted.

    Returns
    -------
    object
        The UTF-8 decoded text when ``entry`` is a well-formed bytes literal;
        otherwise ``entry`` itself, unchanged. This function never raises for
        malformed or non-string input.
    """
    # Non-strings (numbers, NaN, None, ...) are passed through untouched.
    if not isinstance(entry, str):
        return entry

    # Accept both quoting styles: repr() of bytes switches to double quotes
    # when the payload itself contains a single quote (e.g. b"it's").
    looks_like_bytes_literal = (
        len(entry) >= 3
        and entry[0] == "b"
        and entry[1] in ("'", '"')
        and entry[-1] == entry[1]
    )
    if not looks_like_bytes_literal:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        # Not a parsable literal after all; keep the original text.
        return entry

    # Text such as "b'a', 'b'" passes the prefix/suffix check but evaluates to
    # a tuple; only genuine bytes objects can be decoded.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        # Bytes that are not valid UTF-8: keep the original text.
        return entry

# Decode bytes-literal text in every object column, then store the result as str.
# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
# check on the class so the lookup cannot be shadowed by a column named "map",
# and fall back to applymap on older pandas versions.
byte_string_frame = data[byte_string_columns]
if hasattr(pd.DataFrame, "map"):
    decoded_frame = byte_string_frame.map(convert_byte_string_repr)
else:
    decoded_frame = byte_string_frame.applymap(convert_byte_string_repr)
data[byte_string_columns] = decoded_frame.astype(str)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,...,2.0,2.0,2.0,2.0,1.2,1.2,1.2,1.2,1.4,False
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,True
2,2,72.0,7.0,1.0,6.0,198.0,1134.13,0.05,20.31,55.85,...,51.0,10.0,8.0,1.0,17.0,36.0,112.0,86.0,13.0,True
3,3,190.0,3.0,1.0,3.0,600.0,4348.76,0.06,17.06,254.87,...,129.0,29.0,28.0,2.0,17.0,135.0,329.0,271.0,5.0,True
4,4,37.0,4.0,1.0,4.0,126.0,599.12,0.06,17.19,34.86,...,28.0,1.0,6.0,0.0,11.0,16.0,76.0,50.0,7.0,True


## Drop uninformative (constant) columns

In [14]:
# Count distinct values per column in one pass, then keep the names of the
# columns that hold exactly one distinct value.
distinct_counts = data.nunique()
unique_columns = distinct_counts[distinct_counts == 1].index.tolist()
print(unique_columns)

# Remove those constant columns from the working frame.
data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [15]:
# Cells that are exactly '?' become NaN.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [16]:
# to_csv does not create missing directories, so make sure the destination
# folder exists first (a no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)
data.to_csv(outp_fname, index=False)