In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
dataset_name = 'kc1'

In [3]:
input_dir = './raw/'
inp_fname = 'kc1.csv'
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [4]:
data = pd.read_csv(os.path.join(input_dir, inp_fname))
data.head()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,...,2.0,2.0,2.0,2.0,1.2,1.2,1.2,1.2,1.4,b'false'
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,b'true'
2,83.0,11.0,1.0,11.0,171.0,927.89,0.04,23.04,40.27,21378.61,...,65.0,10.0,6.0,0.0,18.0,25.0,107.0,64.0,21.0,b'true'
3,46.0,8.0,6.0,8.0,141.0,769.78,0.07,14.86,51.81,11436.73,...,37.0,2.0,5.0,0.0,16.0,28.0,89.0,52.0,15.0,b'true'
4,25.0,3.0,1.0,3.0,58.0,254.75,0.11,9.35,27.25,2381.95,...,21.0,0.0,2.0,0.0,11.0,10.0,41.0,17.0,5.0,b'true'


In [5]:
id_col = "id"
target_col = "defects"

## Insert Id Column

In [6]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())


   id   loc  v(g)  ev(g)  iv(g)      n       v     l      d      i  ...  \
0   0   1.1   1.4    1.4    1.4    1.3    1.30  1.30   1.30   1.30  ...   
1   1   1.0   1.0    1.0    1.0    1.0    1.00  1.00   1.00   1.00  ...   
2   2  83.0  11.0    1.0   11.0  171.0  927.89  0.04  23.04  40.27  ...   
3   3  46.0   8.0    6.0    8.0  141.0  769.78  0.07  14.86  51.81  ...   
4   4  25.0   3.0    1.0    3.0   58.0  254.75  0.11   9.35  27.25  ...   

   lOCode  lOComment  lOBlank  locCodeAndComment  uniq_Op  uniq_Opnd  \
0     2.0        2.0      2.0                2.0      1.2        1.2   
1     1.0        1.0      1.0                1.0      1.0        1.0   
2    65.0       10.0      6.0                0.0     18.0       25.0   
3    37.0        2.0      5.0                0.0     16.0       28.0   
4    21.0        0.0      2.0                0.0     11.0       10.0   

   total_Op  total_Opnd  branchCount   defects  
0       1.2         1.2          1.4  b'false'  
1       1.0       

## Convert byte strings to strings

In [7]:
byte_string_columns = data.select_dtypes(include=['O']).columns
byte_string_columns.tolist()

['defects']

In [8]:
import ast
def convert_byte_string_repr(entry):
    try:
        # Check if the entry looks like a byte string representation
        if isinstance(entry, str) and entry.startswith("b'") and entry.endswith("'"):
            byte_value = ast.literal_eval(entry)
            return byte_value.decode('utf-8')
    except (ValueError, SyntaxError):
        pass
    return entry  # Return the original entry if conversion fails

data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,...,2.0,2.0,2.0,2.0,1.2,1.2,1.2,1.2,1.4,False
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,True
2,2,83.0,11.0,1.0,11.0,171.0,927.89,0.04,23.04,40.27,...,65.0,10.0,6.0,0.0,18.0,25.0,107.0,64.0,21.0,True
3,3,46.0,8.0,6.0,8.0,141.0,769.78,0.07,14.86,51.81,...,37.0,2.0,5.0,0.0,16.0,28.0,89.0,52.0,15.0,True
4,4,25.0,3.0,1.0,3.0,58.0,254.75,0.11,9.35,27.25,...,21.0,0.0,2.0,0.0,11.0,10.0,41.0,17.0,5.0,True


## Drop unuseful columns

In [9]:
unique_columns = [col for col in data.columns if data[col].nunique() == 1]
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [10]:
data.replace('?', np.nan, inplace=True)

## Save Main Data File

In [11]:
data.to_csv(outp_fname, index=False)