In [1]:
import os
import json
import pandas as pd
import numpy as np

from os.path import isdir, isfile, join

In [2]:
train_data_filepath = "data/big-vul_dataset/train.csv"
test_data_filepath = "data/big-vul_dataset/test.csv"
val_data_filepath = "data/big-vul_dataset/val.csv"

In [3]:
train_df = pd.read_csv(train_data_filepath)

In [4]:
train_df.columns

Index(['index', 'Access Gained', 'Attack Origin', 'Authentication Required',
       'Availability', 'CVE ID', 'CVE Page', 'CWE ID', 'Complexity',
       'Confidentiality', 'Integrity', 'Known Exploits', 'Publish Date',
       'Score', 'Summary', 'Update Date', 'Vulnerability Classification',
       'add_lines', 'codeLink', 'commit_id', 'commit_message', 'del_lines',
       'file_name', 'files_changed', 'func_after', 'func_before', 'lang',
       'lines_after', 'lines_before', 'parentID', 'patch', 'project',
       'project_after', 'project_before', 'target', 'vul_func_with_fix',
       'processed_func', 'flaw_line', 'flaw_line_index'],
      dtype='object')

In [5]:
train_df.loc[train_df["CWE ID"].isnull(),'cwe_is_NaN'] = 1
train_df.loc[train_df["CWE ID"].notnull(), 'cwe_is_NaN'] = 0

In [7]:
nan_cnt = train_df["cwe_is_NaN"].tolist()

nan_cnt.count(1)

29072

In [8]:
train_df_nonan = train_df.query("cwe_is_NaN != 1 or target != 1")

len(train_df_nonan), len(train_df)

(149216, 150908)

In [11]:
nan_cnt = train_df_nonan["cwe_is_NaN"].tolist()

nan_cnt.count(1)

for idx, row in train_df_nonan.iterrows():
    if row["cwe_is_NaN"] == 1:
        assert(row["target"] == 0)

In [12]:
train_df_nonan.drop("cwe_is_NaN", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_nonan.drop("cwe_is_NaN", axis=1, inplace=True)


In [13]:
train_df_nonan.columns

Index(['index', 'Access Gained', 'Attack Origin', 'Authentication Required',
       'Availability', 'CVE ID', 'CVE Page', 'CWE ID', 'Complexity',
       'Confidentiality', 'Integrity', 'Known Exploits', 'Publish Date',
       'Score', 'Summary', 'Update Date', 'Vulnerability Classification',
       'add_lines', 'codeLink', 'commit_id', 'commit_message', 'del_lines',
       'file_name', 'files_changed', 'func_after', 'func_before', 'lang',
       'lines_after', 'lines_before', 'parentID', 'patch', 'project',
       'project_after', 'project_before', 'target', 'vul_func_with_fix',
       'processed_func', 'flaw_line', 'flaw_line_index'],
      dtype='object')

In [20]:
cwes = train_df_nonan["CWE ID"].tolist()

unq_cwes = list(set(cwes))

unq_cwes = list(filter(lambda x: isinstance(x, str), unq_cwes))

vul_sample_counts = dict()

for cwe in unq_cwes:
    vul_sample_counts[cwe] = len(train_df_nonan.query("`CWE ID` == @cwe and target == 1").index)

unq_cwes.insert(0, "safe")



for key in vul_sample_counts:
    if vul_sample_counts[key] == 0:
        unq_cwes.remove(key)

categories = np.array(unq_cwes)

len(unq_cwes)

87

In [21]:
targets = train_df_nonan["target"].tolist()

cat_targets = []

for idx, target in enumerate(targets):
    if target == 0:
        cat_targets.append("safe")
    else:
        cat_targets.append(cwes[idx])

cat_series = pd.Series(cat_targets)

In [None]:
train_df_nonan["target"] = pd.Categorical(cat_series, categories=categories)

In [None]:
cat_train_data_filepath = "data/big-vul_dataset/cat_train.csv"

if not isfile(cat_train_data_filepath):
    with open(cat_train_data_filepath, "w"):
        pass

train_df_nonan.to_csv(cat_train_data_filepath, index=False)

In [None]:
cwes = train_df["CWE ID"].tolist()

unq_cwes = set(cwes)

unq_cwes

In [None]:
nan_cnt = train_df["CWE ID"].isnull().sum()

print(nan_cnt)

In [None]:
train_df_nonan = train_df[~pd.isnull(train_df["CWE ID"])]

assert(len(train_df.index) - len(train_df_nonan.index) == nan_cnt)

In [None]:
cwes = train_df_nonan["CWE ID"].tolist()

unq_cwes = list(set(cwes))

unq_cwes.append("safe")

categories = np.array(unq_cwes)

In [None]:
len(unq_cwes)

In [None]:
unq_cwes

In [None]:
cwe_ids_num = [int(x.split("-")[-1]) for x in unq_cwes]

In [None]:
cwe_ids_num.sort()

len(set(cwe_ids_num))

In [None]:
cwe_cnt_dict = dict()

for idx, cwe_id in enumerate(unq_cwes):
    cwe_cnt_dict[cwe_id] = idx + 1

In [None]:
cwe_cnt_dict

In [None]:
train_df_nonan.columns

In [None]:
original_targets = train_df_nonan["target"].tolist()
categories = train_df_nonan["CWE ID"].tolist()

In [None]:
train_df_nonan_cat = train_df_nonan.astype({"target": "category"})

train_df_nonan_cat.dtypes

In [None]:
train_df_nonan_cat["target"].tolist()

In [None]:
for i in range(len(original_targets)):
    if original_targets[i] == 1:
        train_df_nonan_cat.at[i, "target"] = categories[i]
    else:
        train_df_nonan_cat.at[i, "target"] = "safe"

In [None]:
train_df_nonan.dtypes

In [None]:
train_df_nonan_cat["target"].tolist()

In [None]:
categorization_train_data_filepath = "data/big-vul_dataset/cat_train.csv"

if not isfile(categorization_train_data_filepath):
    with open(categorization_train_data_filepath, "w"):
        pass

In [None]:
train_df_nonan_cat.to_csv(categorization_train_data_filepath, index=False)

In [None]:
train_df = pd.read_csv(categorization_train_data_filepath)

In [None]:
train_df["target"].tolist()

In [None]:
original_targets

In [None]:
list(range(100))

In [None]:
os.chdir("linevul")

train_data_file = "../data/cat/cat_train.csv"
eval_data_file = "../data/cat/cat_val.csv"
test_data_file = "../data/cat/cat_test.csv"

train_df = pd.read_csv(train_data_file)
eval_df = pd.read_csv(eval_data_file)
test_df = pd.read_csv(test_data_file)

train_targets = list(set(train_df["target"].tolist()))
eval_targets = list(set(eval_df["target"].tolist()))
test_targets = list(set(test_df["target"].tolist()))

len(train_targets), len(eval_targets), len(test_targets)

In [None]:
train_targets