In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Short dataset identifier; the output directory and file name are built from it.
dataset_name = "pc2"

In [3]:
# Where the raw CSV is read from.
input_dir = './raw/'
inp_fname = 'PC2.csv'

# Where the processed CSV is written to; both parts are derived from dataset_name.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, dataset_name + '.csv')


## Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer=os.path.join(input_dir, inp_fname))
data.head()

Unnamed: 0,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,DESIGN_COMPLEXITY,...,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,c
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,2.0,0.5,1.0,3.0,1.0,3.0,2.0,0.0,0.0,b'FALSE'
1,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,3.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,b'FALSE'
2,1.0,4.0,7.0,24.0,0.0,1.0,0.13,0.0,0.0,1.0,...,7.0,0.03,13.0,21.0,7.0,8.0,34.0,96.88,8.0,b'FALSE'
3,1.0,1.0,11.0,3.0,0.0,1.0,0.08,0.0,0.0,1.0,...,3.0,0.06,29.0,48.0,13.0,7.0,17.0,93.33,12.0,b'FALSE'
4,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,3.0,0.33,5.0,4.0,4.0,1.0,3.0,0.0,1.0,b'FALSE'


In [5]:
# Column names used by this notebook: the row-identifier column (inserted below
# when absent) and the column named as the target.
id_col, target_col = "id", "c"

## Insert Id Column

In [6]:
# Add a sequential 0..N-1 identifier as the first column, but only when the
# frame does not already carry a column with that name.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id  BRANCH_COUNT  CALL_PAIRS  LOC_CODE_AND_COMMENT  LOC_COMMENTS  \
0   0           1.0         0.0                   0.0           0.0   
1   1           1.0         1.0                   0.0           0.0   
2   2           1.0         4.0                   7.0          24.0   
3   3           1.0         1.0                  11.0           3.0   
4   4           1.0         1.0                   0.0           0.0   

   CONDITION_COUNT  CYCLOMATIC_COMPLEXITY  CYCLOMATIC_DENSITY  DECISION_COUNT  \
0              0.0                    1.0                1.00             0.0   
1              0.0                    1.0                1.00             0.0   
2              0.0                    1.0                0.13             0.0   
3              0.0                    1.0                0.08             0.0   
4              0.0                    1.0                1.00             0.0   

   DECISION_DENSITY  ...  NODE_COUNT  NORMALIZED_CYLOMATIC_COMPLEXITY  \
0            

## Convert byte-string representations (e.g. b'FALSE') to plain strings

In [7]:
# Object-dtype columns are the ones that may hold text such as "b'FALSE'".
byte_string_columns = data.select_dtypes(include=['object']).columns
list(byte_string_columns)

['c']

In [8]:
import ast


def convert_byte_string_repr(entry):
    """Turn the textual repr of a bytes literal (e.g. "b'FALSE'") into plain text.

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values that start with ``b'`` and
        end with ``'`` are considered for conversion.

    Returns
    -------
    object
        The UTF-8 decoded text when ``entry`` parses to a ``bytes`` literal.
        In every other case (not a string, not shaped like a bytes repr,
        unparsable, parses to something other than ``bytes``, or not valid
        UTF-8) ``entry`` is returned unchanged.
    """
    looks_like_bytes_repr = (
        isinstance(entry, str) and entry.startswith("b'") and entry.endswith("'")
    )
    if not looks_like_bytes_repr:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
        # Text such as "b'a', 'b'" passes the prefix/suffix check but parses to
        # a tuple; without this guard .decode would raise AttributeError.
        if isinstance(byte_value, bytes):
            return byte_value.decode('utf-8')
    except (ValueError, SyntaxError):
        # Unparsable literal or undecodable bytes (UnicodeDecodeError is a
        # ValueError): deliberately fall through and keep the original value.
        pass
    return entry

# Decode column by column with Series.map: DataFrame.applymap is deprecated
# since pandas 2.1, whereas Series.map is available in every pandas version.
decoded = data[byte_string_columns].apply(
    lambda column: column.map(convert_byte_string_repr)
)
# Cast to str only where a value is present, so missing entries stay NaN
# instead of being turned into the literal text 'nan'.
data[byte_string_columns] = decoded.astype(str).where(decoded.notna(), decoded)
data.head()

Unnamed: 0,id,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,...,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,c
0,0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,2.0,0.5,1.0,3.0,1.0,3.0,2.0,0.0,0.0,False
1,1,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,3.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,False
2,2,1.0,4.0,7.0,24.0,0.0,1.0,0.13,0.0,0.0,...,7.0,0.03,13.0,21.0,7.0,8.0,34.0,96.88,8.0,False
3,3,1.0,1.0,11.0,3.0,0.0,1.0,0.08,0.0,0.0,...,3.0,0.06,29.0,48.0,13.0,7.0,17.0,93.33,12.0,False
4,4,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,3.0,0.33,5.0,4.0,4.0,1.0,3.0,0.0,1.0,False


## Drop uninformative columns (columns with a single unique value)

In [9]:
# Count distinct values per column in one pass (missing values are not
# counted) and collect the columns that hold exactly one distinct value.
unique_columns = [
    name for name, n_distinct in data.nunique().items() if n_distinct == 1
]
print(unique_columns)

# Remove those columns from the frame in place.
data.drop(columns=unique_columns, inplace=True)

[]


## Convert ? to NaN

In [10]:
# Cells whose whole value is '?' are replaced with NaN, in place.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# Create the destination folder first: DataFrame.to_csv does not create
# missing parent directories and raises OSError when the folder is absent.
os.makedirs(output_dir, exist_ok=True)
# index=False keeps the DataFrame's row index out of the written file.
data.to_csv(outp_fname, index=False)