In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Short identifier of the dataset; the output folder and file name are built from it.
dataset_name = "pc1"

In [3]:
# Raw input: folder and file name of the source CSV (relative paths).
input_dir, inp_fname = './raw/', 'pc1.csv'

# Processed output: a per-dataset folder containing a CSV named after the dataset.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(
    output_dir,
    f'{dataset_name}.csv',
)


## Read Data

In [4]:
# Load the raw CSV into a DataFrame, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer=os.path.join(input_dir, inp_fname))
data.head(n=5)

Unnamed: 0,loc,v(g),ev(g),iv(G),N,V,L,D,I,E,...,lOCode,lOComment,locCodeAndComment,lOBlank,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,...,2.0,2.0,2.0,2.0,1.2,1.2,1.2,1.2,1.4,b'false'
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,b'true'
2,91.0,9.0,3.0,2.0,318.0,2089.21,0.04,27.68,75.47,57833.24,...,80.0,44.0,11.0,31.0,29.0,66.0,192.0,126.0,17.0,b'true'
3,109.0,21.0,5.0,18.0,381.0,2547.56,0.04,28.37,89.79,72282.68,...,97.0,41.0,12.0,24.0,28.0,75.0,229.0,152.0,38.0,b'true'
4,505.0,106.0,41.0,82.0,2339.0,20696.93,0.01,75.93,272.58,1571506.88,...,457.0,71.0,48.0,49.0,64.0,397.0,1397.0,942.0,178.0,b'true'


In [5]:
# Names of the identifier column and the target column.
id_col, target_col = "id", "defects"

## Insert Id Column

In [6]:
# Add a sequential integer id (0 .. N-1) as the first column when none is present.
if id_col not in data.columns:
    N = len(data)  # number of rows
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id    loc   v(g)  ev(g)  iv(G)       N         V     L      D       I  ...  \
0   0    1.1    1.4    1.4    1.4     1.3      1.30  1.30   1.30    1.30  ...   
1   1    1.0    1.0    1.0    1.0     1.0      1.00  1.00   1.00    1.00  ...   
2   2   91.0    9.0    3.0    2.0   318.0   2089.21  0.04  27.68   75.47  ...   
3   3  109.0   21.0    5.0   18.0   381.0   2547.56  0.04  28.37   89.79  ...   
4   4  505.0  106.0   41.0   82.0  2339.0  20696.93  0.01  75.93  272.58  ...   

   lOCode  lOComment  locCodeAndComment  lOBlank  uniq_Op  uniq_Opnd  \
0     2.0        2.0                2.0      2.0      1.2        1.2   
1     1.0        1.0                1.0      1.0      1.0        1.0   
2    80.0       44.0               11.0     31.0     29.0       66.0   
3    97.0       41.0               12.0     24.0     28.0       75.0   
4   457.0       71.0               48.0     49.0     64.0      397.0   

   total_Op  total_Opnd  branchCount   defects  
0       1.2         1.2        

## Convert byte strings to strings

In [7]:
# Object-dtype columns are the ones that may hold "b'...'" byte-string text.
byte_string_columns = data.select_dtypes(include=['object']).columns
list(byte_string_columns)

['defects']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Convert the text form of a bytes literal (e.g. "b'true'") to plain text.

    Parameters
    ----------
    entry : Any
        A single cell value. Only ``str`` values that start with ``b'`` and
        end with ``'`` are considered for conversion.

    Returns
    -------
    Any
        The UTF-8 decoded text when ``entry`` parses as one bytes literal;
        otherwise ``entry`` itself, unchanged. This includes non-strings,
        text that is not a valid literal, literals that are not bytes, and
        bytes that are not valid UTF-8.
    """
    looks_like_bytes_repr = (
        isinstance(entry, str)
        and entry.startswith("b'")
        and entry.endswith("'")
    )
    if not looks_like_bytes_repr:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
        # Text such as "b'a', 'b'" passes the prefix/suffix check but parses
        # as a tuple; only genuine bytes objects can be decoded.
        if isinstance(byte_value, bytes):
            # UnicodeDecodeError is a ValueError subclass, so it is caught below.
            return byte_value.decode('utf-8')
    except (ValueError, SyntaxError):
        pass
    return entry  # Conversion was not possible: hand back the original value

# Decode each object column cell by cell, then store the result as str.
# Series.map is applied per column because DataFrame.applymap is deprecated
# in recent pandas releases and emits a FutureWarning.
data[byte_string_columns] = (
    data[byte_string_columns]
    .apply(lambda column: column.map(convert_byte_string_repr))
    .astype(str)
)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,loc,v(g),ev(g),iv(G),N,V,L,D,I,...,lOCode,lOComment,locCodeAndComment,lOBlank,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,...,2.0,2.0,2.0,2.0,1.2,1.2,1.2,1.2,1.4,False
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,True
2,2,91.0,9.0,3.0,2.0,318.0,2089.21,0.04,27.68,75.47,...,80.0,44.0,11.0,31.0,29.0,66.0,192.0,126.0,17.0,True
3,3,109.0,21.0,5.0,18.0,381.0,2547.56,0.04,28.37,89.79,...,97.0,41.0,12.0,24.0,28.0,75.0,229.0,152.0,38.0,True
4,4,505.0,106.0,41.0,82.0,2339.0,20696.93,0.01,75.93,272.58,...,457.0,71.0,48.0,49.0,64.0,397.0,1397.0,942.0,178.0,True


## Drop constant (single-valued) columns

In [9]:
# List the columns that hold exactly one distinct value (nunique ignores NaN
# by default), show them, and remove them from the frame.
unique_columns = [
    name for name, n_distinct in data.nunique().items() if n_distinct == 1
]
print(unique_columns)

data.drop(labels=unique_columns, axis=1, inplace=True)

[]


## Convert ? to NaN

In [10]:
# Cells equal to the placeholder '?' become NaN (modifies the frame in place).
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# to_csv does not create missing folders, so make sure the destination exists
# first (no error if it is already there).
os.makedirs(output_dir, exist_ok=True)
data.to_csv(outp_fname, index=False)