In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

from os.path import isdir, isfile, join
from tqdm import tqdm

from data_visualizer import TextDataset
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader, Dataset, SequentialSampler

from ripser import ripser
from persim import plot_diagrams

In [2]:
# Input locations, all relative to the notebook's working directory.
# JSON file holding the "zd_sim_batches" list that is read below.
zd_sim_json_filepath = "data/big-vul_dataset/zd_sim_batches.json"
# Training-set CSV (loaded into train_df).
train_data_csv_filepath = "data/big-vul_dataset/train.csv"
# Zero-day samples CSV (loaded into zd_df).
zd_csv_filepath = "data/zero_day/zero_day.csv"

In [3]:
# Load the zero-day similarity batches. JSON is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's default text encoding.
with open(zd_sim_json_filepath, "r", encoding="utf-8") as f:
    zd_sim = json.load(f)

In [6]:
# Number of batch ids listed under the "zd_sim_batches" key.
len(zd_sim["zd_sim_batches"])

402

In [5]:
# Load the training set; its "target" column is used later to pick rows equal to 1.
train_df = pd.read_csv(train_data_csv_filepath)

In [7]:
# Load the zero-day samples; the row count of this frame is used as the batch width.
zd_df = pd.read_csv(zd_csv_filepath)

In [8]:
# Row counts of both frames; zd_cnt doubles as the batch width when slicing train_df.
zd_cnt = len(zd_df)
train_cnt = len(train_df)

In [9]:
# Collect the rows with target == 1 from every training batch listed in
# zd_sim["zd_sim_batches"], and report their share of all rows in those batches.
# Batch ids are treated as 1-based, each batch spanning `zd_cnt` consecutive
# rows of `train_df` -- this mirrors the index arithmetic below; TODO confirm
# against whatever produced zd_sim_batches.json.
vul_batch_frames = []
total_labels = 0
total_pos = 0

for batch in zd_sim["zd_sim_batches"]:
    start = (batch - 1) * zd_cnt
    end = start + zd_cnt

    # iloc slicing clamps at the end of the frame, so the final (possibly
    # short) batch needs no special-casing.
    batch_df = train_df.iloc[start:end]
    vul_batch_df = batch_df[batch_df["target"] == 1]

    vul_batch_frames.append(vul_batch_df)
    total_labels += len(batch_df)
    total_pos += len(vul_batch_df)

# Concatenate once; growing a DataFrame with pd.concat inside the loop copies
# all previously collected rows on every iteration.
if vul_batch_frames:
    vul_tain_zd_sim_df = pd.concat(vul_batch_frames)
else:
    vul_tain_zd_sim_df = pd.DataFrame()

print(total_pos, total_labels)

# Guard against an empty batch list so the cell does not fail on 0 / 0.
pct = (total_pos / total_labels) * 100 if total_labels else 0.0

print(f"{pct:.2f}%")

631 10430
6.05%


In [17]:
# NOTE(review): this rebinds `train_data_csv_filepath`, which earlier pointed at
# the full training CSV; re-running the cell that loads `train_df` after this
# one would read the filtered file instead. Name kept for compatibility.
train_data_csv_filepath = "data/zero_day/zero_day_vul_train.csv"

# to_csv creates/truncates the target file itself, so no separate
# open(..., "w") pass is needed. The frame's index is written as well
# (to_csv default), which is why an unnamed first column appears on reload.
vul_tain_zd_sim_df.to_csv(train_data_csv_filepath)

In [12]:
# Build a dataset over the selected rows using the CodeBERT tokenizer.
tokenizer_name = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)

# TextDataset comes from the project-local data_visualizer module; presumably it
# tokenises every row of the frame up front (one progress-bar tick per row) --
# TODO confirm against its implementation.
test_dataset = TextDataset(tokenizer, vul_tain_zd_sim_df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 631/631 [00:01<00:00, 401.30it/s]


In [13]:
BATCH_SIZE = 256

# Iterate the dataset in its stored order, BATCH_SIZE items at a time.
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    sampler=test_sampler,
    num_workers=1,
)

In [14]:
BLOCK_SIZE = 512

# Not needed for the NumPy conversion below; still defined so that any cell
# referencing `device` keeps working.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Gather the token-id batches in a list and stack them once at the end.
# Seeding the accumulator with np.empty((BATCH_SIZE, BLOCK_SIZE)) would prepend
# BATCH_SIZE rows of uninitialised memory to the point cloud given to ripser.
input_id_batches = []
for batch in tqdm(test_dataloader):
    # The tensors are converted to NumPy right away, so there is no reason to
    # move them to `device` first.
    inputs_ids, labels = batch
    input_id_batches.append(inputs_ids.cpu().numpy())

if input_id_batches:
    # float64 matches the dtype the accumulated array had previously.
    test_data_np = np.concatenate(input_id_batches, axis=0).astype(np.float64)
else:
    # Assumes each sample is BLOCK_SIZE ids wide -- TODO confirm in TextDataset.
    test_data_np = np.empty((0, BLOCK_SIZE), dtype=np.float64)

# Look for NaNs *before* sanitising; once nan_to_num has run the check can
# never fire, which made the "Has NaN" branch unreachable.
had_nan = bool(np.isnan(test_data_np).any())
test_data_np = np.nan_to_num(test_data_np)

if had_nan:
    print("Has NaN")
else:
    print("Clean")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 19.11it/s]

Clean





In [15]:
# Compute persistence diagrams for the token-id point cloud and save the plot.
diagram_dir = "data/big-vul_dataset/diagrams/vul_train_zd_sim"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(diagram_dir, exist_ok=True)

diagrams = ripser(test_data_np)["dgms"]
plot_diagrams(diagrams, show=False)
plt.savefig(join(diagram_dir, "zdsim_trainvul.png"))
# Clear the current figure so later plots start from a blank canvas.
plt.clf()

<Figure size 432x288 with 0 Axes>

In [13]:
# Labels of the zero-day samples, and how many of them equal 1.
zd_labels = zd_df["target"].tolist()
print(zd_labels.count(1))

7


In [18]:
# Share of zero-day samples whose label is 1, printed as counts and a percentage.
zd_pos_cnt = zd_labels.count(1)
zd_label_cnt = len(zd_labels)

print(zd_pos_cnt, zd_label_cnt)

zd_pct = zd_pos_cnt / zd_label_cnt * 100
print(f"{zd_pct:.2f}%")

7 26
26.92%


In [22]:
# Read back the CSV of target == 1 training rows that this notebook wrote earlier
# (same path as the to_csv call above).
train_data_csv_filepath = "data/zero_day/zero_day_vul_train.csv"

zdsim_trainvul_df = pd.read_csv(train_data_csv_filepath)

In [24]:
# Row count of the reloaded frame.
len(zdsim_trainvul_df.index)

631

In [25]:
# Root directory that receives one sub-directory per exported sample.
zdsim_trainvul_dirname = "data/zero_day/zdsim_trainvul"

# makedirs(exist_ok=True) also creates missing parent directories and is safe
# to re-run, unlike an isdir() check followed by os.mkdir().
os.makedirs(zdsim_trainvul_dirname, exist_ok=True)

In [26]:
# Inspect the available columns; "processed_func" and "vul_func_with_fix" are exported below.
zdsim_trainvul_df.columns

Index(['Unnamed: 0', 'index', 'Access Gained', 'Attack Origin',
       'Authentication Required', 'Availability', 'CVE ID', 'CVE Page',
       'CWE ID', 'Complexity', 'Confidentiality', 'Integrity',
       'Known Exploits', 'Publish Date', 'Score', 'Summary', 'Update Date',
       'Vulnerability Classification', 'add_lines', 'codeLink', 'commit_id',
       'commit_message', 'del_lines', 'file_name', 'files_changed',
       'func_after', 'func_before', 'lang', 'lines_after', 'lines_before',
       'parentID', 'patch', 'project', 'project_after', 'project_before',
       'target', 'vul_func_with_fix', 'processed_func', 'flaw_line',
       'flaw_line_index'],
      dtype='object')

In [28]:
# Export every row as a pair of source files:
#   <zdsim_trainvul_dirname>/<1-based row number>/vul.cpp  <- "processed_func"
#   <zdsim_trainvul_dirname>/<1-based row number>/fix.cpp  <- "vul_func_with_fix"
vuls = zdsim_trainvul_df["processed_func"].tolist()
fixes = zdsim_trainvul_df["vul_func_with_fix"].tolist()

total = len(vuls)

for i, (vul_src, fix_src) in enumerate(zip(vuls, fixes)):
    DST_DIR = join(zdsim_trainvul_dirname, f"{i+1}")
    os.makedirs(DST_DIR, exist_ok=True)

    for filename, src in (("vul.cpp", vul_src), ("fix.cpp", fix_src)):
        # Explicit UTF-8 keeps the export independent of the platform's
        # default text encoding.
        with open(join(DST_DIR, filename), "w", encoding="utf-8") as f:
            # read_csv turns empty cells into NaN (a float), which f.write()
            # would reject with a TypeError; write an empty file instead.
            f.write("" if pd.isna(src) else str(src))