In [1]:
import logging
import random
import numpy as np
import pandas as pd
import os
import json
import torch
import functools
from argparse import ArgumentParser
from functools import partial, wraps
from contextlib import contextmanager

from multiprocessing import cpu_count, Manager, Pool, Queue

from typing import cast, List, Dict, Set
from tqdm.notebook import tqdm
from os.path import isfile, isdir, join, exists, relpath
from src.data_generator import read_csv
import xml.etree.ElementTree as ET
from math import ceil

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer
from torch import nn, einsum
import torch.optim as optim
import torch.nn.functional as F
from src.clip_mlm import *
from src.clip_mlm import CLIP as CLIPMLM
import matplotlib.pyplot as plt

In [None]:
# Scratch cell: two name strings; no other cell visible in this notebook reads them.
name1 = "Tushmit Chowdhury"
name2 = "Satyaki Das"

print("__debug__")

In [None]:
# Sanity check: wrap a plain Python int in a 0-dim tensor and place it on the CPU.
label = 1
torch.tensor(label).to("cpu")

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Minimal DataLoader smoke test on synthetic data.
X = torch.randn(1000, 10)  # 1000 samples with 10 input features each
y = torch.randint(0, 2, (1000,))  # one 0/1 target per sample

# Pair every feature row with its target
dataset = TensorDataset(X, y)

# Serve the pairs in shuffled mini-batches
batch_size = 32
shuffle = True
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Pull a single batch to confirm the shapes coming out of the loader
for batch_X, batch_y in data_loader:
    print("__debug__")
    print("Batch input shape:", batch_X.shape)
    print("Batch target shape:", batch_y.shape)
    break  # the first batch is enough for the demonstration


In [2]:
class ContrastiveLoss(torch.nn.Module):
    """Margin-based contrastive loss over pairs of embeddings.

    For each pair the Euclidean distance ``d`` between the two embeddings is
    computed. Pairs labelled ``1`` contribute ``d ** 2`` (pulling them
    together); pairs labelled ``0`` contribute ``max(margin - d, 0) ** 2``
    (pushing them apart until they are at least ``margin`` away).

    ``args.maskVV`` and ``args.maskVN`` select which term is kept:

    * ``maskVV``: only the label-0 (margin) term is used.
    * ``maskVN``: only the label-1 (distance) term is used.
    * neither: both terms are summed.

    :param args: namespace exposing boolean ``maskVV`` and ``maskVN``.
    :param margin: minimum distance requested for label-0 pairs.
    """

    def __init__(self, args, margin=2.0):
        super().__init__()
        self.margin = margin
        self.args = args

    def forward(self, output1, output2, label):
        """Return the mean loss over the batch.

        :param output1: first embedding of every pair.
        :param output2: second embedding of every pair.
        :param label: per-pair tensor of 1 (pull together) or 0 (push apart).
        :return: scalar tensor with the batch-mean loss.
        :raises AssertionError: if both ``maskVV`` and ``maskVN`` are set.
        """
        # Spelled with boolean operators so the check does not depend on the
        # relative precedence of `&` and `!=`.
        assert not (self.args.maskVV and self.args.maskVN), \
            "maskVV and maskVN are mutually exclusive"

        euclidean_distance = F.pairwise_distance(output1, output2)

        if self.args.maskVV:
            # Label-1 term masked out: keep only the margin (push-apart) part.
            push = torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2)
            return torch.mean((1 - label) * push)

        pull = label * torch.pow(euclidean_distance, 2)
        if self.args.maskVN:
            # Label-0 term masked out: keep only the distance (pull-together) part.
            return torch.mean(pull)

        push = torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2)
        return torch.mean(pull + (1 - label) * push)

def set_seed(seed):
    """Seed every random number generator this notebook draws from.

    Covers Python's ``random`` module, NumPy's global generator and PyTorch
    on the CPU and on all CUDA devices.

    :param seed: integer seed applied to every generator.
    :return: None
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU rather than only the current one: the model in this
    # notebook is replicated over more than one device.
    torch.cuda.manual_seed_all(seed)

# Notebook configuration expressed as command-line style options; every
# option has a default, so the cell also runs without any arguments.
parser = ArgumentParser()
parser.add_argument('--seed', type=int, default=42)

parser.add_argument('--dataset', type=str, default='RE',
                    choices=['RE', 'TD', 'IO'])

# clip stage args
parser.add_argument('--epoch_clip', type=int, default=100)
parser.add_argument('--batch_size_clip', type=int, default=32)
parser.add_argument('--lr_clip', type=float, default=1e-5)
parser.add_argument('--save_epoch', type=int, default=1)
parser.add_argument('--mlmloss', type=float, default=0.1)
# maskVV / maskVN select which term ContrastiveLoss keeps; it asserts that
# they are not both set.
parser.add_argument('--maskVV', action='store_true')
parser.add_argument('--maskVN', action='store_true')

# classifier stage args
parser.add_argument('--epoch_cla', type=int, default=20)
parser.add_argument('--batch_size_cla', type=int, default=32)
parser.add_argument('--lr_2', type=float, default=1e-5)
parser.add_argument('--max_length', type=int, default=1024)
parser.add_argument('--savepath', type=str, default='./Results/mlm')
parser.add_argument('--resume', action='store_true')
parser.add_argument('--resume_file', type=str, default=None)
# store_false: args.train_clip is True by default and becomes False when the flag is given.
parser.add_argument('--train_clip', action='store_false')

# parse_known_args returns (namespace, leftovers); unrecognised arguments are
# left in the second element instead of raising, and are discarded here.
args = parser.parse_known_args()[0]
# Use the GPU when one is visible, otherwise fall back to the CPU.
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

set_seed(args.seed)

print("__debug__")

__debug__


In [3]:
with open("data/SARD/function.json", "r") as rfi:
    dataset = json.load(rfi)

with open("data/SARD/vv_vn_pairs.json", "r") as rfi:
    pairs = json.load(rfi)


def _iter_pair_entries(pairs):
    """Yield ``(anchor_idx, vv_indices, vn_indices)`` for either pair-file layout.

    Two layouts are accepted:

    * a list of ``{"idx": int, "pairs": {"vv": [...], "vn": [...]}}`` records
      (the layout this cell originally read);
    * a dict ``{"<idx>": {"vv": [...], "vn": [...]}}`` (the layout produced by
      the cell in this notebook that builds ``vv_vn_pairs`` and dumps it).
    """
    if isinstance(pairs, dict):
        for anchor_idx, partners in pairs.items():
            # JSON object keys are strings; turn them back into list indices.
            yield int(anchor_idx), partners["vv"], partners["vn"]
    else:
        for pair in pairs:
            yield pair["idx"], pair["pairs"]["vv"], pair["pairs"]["vn"]


# Three parallel lists: anchor function, partner function, pair label.
func1_list = []
func2_list = []
corr_label_list = []

for anchor_idx, vv_indices, vn_indices in tqdm(_iter_pair_entries(pairs), total=len(pairs)):
    func1_batch = dataset[anchor_idx]["raw_func"]
    # "vv" partners are labelled 1, "vn" partners are labelled 0.
    for pair_label, partner_indices in ((1, vv_indices), (0, vn_indices)):
        for idx in partner_indices:
            func2_batch = dataset[idx]["raw_func"]
            func1_list.append(func1_batch)
            func2_list.append(func2_batch)
            corr_label_list.append(pair_label)

# Tabular view of the same pairs.
df = pd.DataFrame({
    "func1": func1_list,
    "func2": func2_list,
    "label": corr_label_list
})

batch_size = 32

# Cut the three parallel lists at identical offsets so that batch k of each
# list still describes the same pairs; the last batch may be shorter.
batch_starts = range(0, len(corr_label_list), batch_size)

func1_batches = [func1_list[start: start + batch_size] for start in batch_starts]
func2_batches = [func2_list[start: start + batch_size] for start in batch_starts]
corr_label_batches = [corr_label_list[start: start + batch_size] for start in batch_starts]

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
clip = CLIPMLM(
            args=args,
            dim_text=512,
            num_text_tokens=50265,  # NOTE(review): presumably the tokenizer's vocabulary size — confirm
            text_seq_len=args.max_length,
            text_heads=8
        )
optimizer = optim.AdamW(clip.parameters(), lr=args.lr_clip)
c_loss = ContrastiveLoss(args=args)

# Replicate over GPUs 0 and 1 when both exist. On a single-GPU machine only
# device 0 is listed, because naming a missing device id makes DataParallel
# raise. With no CUDA device the list is empty and None lets DataParallel
# fall back to its own default.
device_ids = list(range(min(2, torch.cuda.device_count())))
clip = torch.nn.DataParallel(clip, device_ids=device_ids or None)
model = clip.to(args.device)

plot_data = []
plot_label = []

pbar = tqdm(zip(func1_batches, func2_batches, corr_label_batches), total=len(corr_label_batches))

# Forward passes only: no backward pass or optimizer step happens in this
# cell, so autograd bookkeeping is switched off.
# NOTE(review): the model is left in its current train/eval mode — confirm
# whether model.eval() is wanted for this visualisation.
with torch.no_grad():
    for func1_batch, func2_batch, label_batch in pbar:
        ids_func1 = tokenizer(func1_batch, padding=True, truncation=True, return_tensors='pt',
                                     max_length=args.max_length)['input_ids'].to(args.device)
        ids_func2 = tokenizer(func2_batch, padding=True, truncation=True, return_tensors='pt',
                                     max_length=args.max_length)['input_ids'].to(args.device)
        corr_label = torch.tensor(label_batch).to(args.device)

        CLS1, CLS2, ssl_loss = model(text1=ids_func1, text2=ids_func2, training_classifier=False)
        loss = c_loss(CLS1, CLS2, corr_label)

        # Keep CPU copies so device memory does not grow with the number of batches.
        plot_data.append(CLS1.detach().cpu())
        # Store the label tensor, not the raw Python list: torch.cat below
        # only accepts tensors.
        plot_label.append(corr_label.detach().cpu())

plot_data = torch.cat(plot_data, dim=0).numpy()
plot_label = torch.cat(plot_label).numpy()
# Scatter the first two embedding coordinates, coloured by pair label.
plt.scatter(plot_data[:, 0], plot_data[:, 1], c=plot_label)
plt.savefig("data/SARD/corr_plot_fig.png")
print("__debug__")

HBox(children=(FloatProgress(value=0.0, max=13525.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11894.0), HTML(value='')))

: 

In [None]:
with open("data/SARD/function.json", "r") as rfi:
    dataset = json.load(rfi)

# Index the samples by source file once, so that each lookup below is a
# dictionary access instead of a scan over the whole dataset.
samples_by_file = dict()
for i, data in enumerate(dataset):
    samples_by_file.setdefault(data["file_name"], []).append((i, data))

# {"<anchor index>": {"vv": [partner indices with target 1],
#                     "vn": [partner indices with any other target]}}
vv_vn_pairs = dict()

for idx, entry in tqdm(enumerate(dataset), total=len(dataset)):
    # Samples with target 0 are never used as anchors.
    if entry["target"] == 0:
        continue
    file_name = entry["file_name"]
    same_file_samples = samples_by_file[file_name]
    for i, data in same_file_samples:
        if idx == i:
            continue  # a sample is not paired with itself
        # The entry is created lazily, so anchors with no same-file partner get no key.
        partners = vv_vn_pairs.setdefault(str(idx), {"vv": [], "vn": []})
        if data["target"] == 1:
            partners["vv"].append(i)
        else:
            partners["vn"].append(i)

print("__debug__")

In [None]:
# Persist the pair index as indented JSON; an existing file is overwritten.
with open("data/SARD/vv_vn_pairs.json", "w") as wfi:
    json.dump(vv_vn_pairs, wfi, indent=2)

In [None]:
# Exploration: take the first sample with target == 1 and gather every
# sample that shares its file_name.
with open("data/SARD/function.json", "r") as rfi:
    dataset = json.load(rfi)

dataset_pos = [entry for entry in dataset if entry["target"] == 1]

# Raises IndexError if the dataset holds no sample with target == 1.
file_name = dataset_pos[0]["file_name"]
same_file_samples = [entry for entry in dataset if entry["file_name"] == file_name]

print("__debug__")

In [None]:
def read_csv(csv_file_path: str) -> List[Dict[str, str]]:
    """Read a tab-separated file into a list of row dictionaries.

    The first line supplies the column names. Every following line becomes a
    dict mapping each column name to that line's field, with surrounding
    whitespace removed. Missing trailing fields are filled with ``''`` and
    fields beyond the header are ignored.

    :param csv_file_path: path of the tab-separated file.
    :return: one dict per data line, in file order.
    :raises AssertionError: if ``csv_file_path`` does not exist.
    """
    assert exists(csv_file_path), f"no {csv_file_path}"
    data = []
    with open(csv_file_path) as fp:
        header = fp.readline()
        header = header.strip()
        h_parts = [hp.strip() for hp in header.split('\t')]
        for line in fp:
            # Remove only the line terminator. A full strip() would also eat
            # leading tabs, so a row whose first field is empty would have
            # all its values shifted one column to the left.
            lparts = line.rstrip('\r\n').split('\t')
            instance = {}
            for i, hp in enumerate(h_parts):
                instance[hp] = lparts[i].strip() if i < len(lparts) else ''
            data.append(instance)
    return data

In [None]:
# Exploration: load the ground truth and the node table for one source file.
USE_CPU = cpu_count()
ground_truth_set = dict()
prefix = ""
dataset_root = "data/SARD"
source_root_path = join(dataset_root, "source-code")
csv_path = join(dataset_root, "csv")
ground_truth_path = join(dataset_root, "ground_truth.json")

# Mapping from source path to its line numbers (see the lookup below).
with open(ground_truth_path, "r") as rfi:
    ground_truth = json.load(rfi)

cpp_paths = []
cpp_paths_filepath = join(dataset_root, "cpp_paths.json")
with open(cpp_paths_filepath, "r") as rfi:
    cpp_paths = json.load(rfi)

# NOTE(review): hard-coded probe index; presumably derived from a line number
# in cpp_paths.json — confirm before reusing.
cpp_path = cpp_paths[44705-2]
# Files that are absent from the ground truth keep an empty set.
file_vul_lines = set()
if cpp_path in ground_truth:
    file_vul_lines = set(ground_truth[cpp_path])
SRC_PATH = join(source_root_path, cpp_path)
with open(SRC_PATH, "r") as rfi:
    src_lines = rfi.readlines()
# The CSV tree uses the source path as a directory that contains nodes.csv.
nodes_dir = join(csv_path, cpp_path)
joern_nodes = read_csv(join(nodes_dir, "nodes.csv"))

print("__debug__")

In [None]:
USE_CPU = cpu_count()
ground_truth_set = dict()
prefix = ""
dataset_root = "data/SARD"
source_root_path = join(dataset_root, "source-code")
csv_path = join(dataset_root, "csv")
ground_truth_path = join(dataset_root, "ground_truth.json")

# File-name suffixes that mark a C/C++ source or header file.
source_suffixes = (".cpp", ".c", ".h", ".hpp", ".cc", ".hh")

# Collect every matching file below the source root, expressed relative to it.
all_cpp_paths = []

for root, dirs, files in os.walk(source_root_path, topdown=True):
    rel_dir = relpath(root, source_root_path)
    all_cpp_paths.extend(
        join(rel_dir, filename)
        for filename in files
        if filename.endswith(source_suffixes)
    )

print("__debug__")

In [None]:
# Persist the collected source paths as indented JSON; an existing file is overwritten.
filepath = join(dataset_root, "cpp_paths.json")

with open(filepath, "w") as wfi:
    json.dump(all_cpp_paths, wfi, indent=2)

In [None]:
def getCodeIDtoPathDict(testcases: List) -> Dict[str, Dict[str, Set[int]]]:
    """Map each testcase id to its files and their marked line numbers.

    Every element of ``testcases`` must expose an ``id`` attribute and may
    contain ``file`` children. Each ``file`` has a ``path`` attribute and may
    contain ``flaw`` and ``mixed`` children carrying a ``line`` attribute.
    ``fix`` children do not contribute any line.

    :param testcases: ``testcase`` elements taken from the manifest XML.
    :return: ``{testcase id: {file path: set of flaw/mixed line numbers}}``;
        a file without ``flaw``/``mixed`` children maps to an empty set.
    """
    codeIDtoPath: Dict[str, Dict[str, Set[int]]] = {}
    for testcase in testcases:
        lines_by_path: Dict[str, Set[int]] = {}
        for file in testcase.findall("file"):
            marked = file.findall("flaw") + file.findall("mixed")
            lines_by_path[file.attrib["path"]] = {
                int(element.attrib["line"]) for element in marked
            }
        codeIDtoPath[testcase.attrib["id"]] = lines_by_path
    return codeIDtoPath

In [None]:
xml_path = "data/SARD/SARD_testcaseinfo.xml"
tree = ET.ElementTree(file=xml_path)
testcases = tree.findall("testcase")
pathToCodeID = getCodeIDtoPathDict(testcases)

# A set makes the membership test below constant-time; testing against the
# list would rescan every known path for each file of each testcase.
known_cpp_paths = set(all_cpp_paths)

# Merge the line sets of all testcases by source path.
ground_truth_set = dict()
for test_ID, file_vul_lines in pathToCodeID.items():
    for cpp_path, vul_lines in file_vul_lines.items():
        # Only strictly positive line numbers are kept.
        actual_vul_lines = [line for line in vul_lines if line > 0]
        if len(actual_vul_lines) == 0:
            continue
        # Skip manifest entries whose file was not found under the source tree.
        if cpp_path not in known_cpp_paths:
            continue
        ground_truth_set.setdefault(cpp_path, set()).update(actual_vul_lines)

# Sets are not JSON-serialisable; sorted lists also keep the dumped file
# identical from one run to the next.
ground_truth = {cpp_path: sorted(vul_lines) for cpp_path, vul_lines in ground_truth_set.items()}

print("__debug__")

In [None]:
# Persist the per-file line lists as indented JSON; an existing file is overwritten.
with open(ground_truth_path, "w") as wfi:
    json.dump(ground_truth, wfi, indent=2)

In [None]:
# Exploration: inspect the testcase whose id attribute is "4".
testcases_filtered = [testcase for testcase in testcases if testcase.attrib["id"] == "4"]

# Raises IndexError if the manifest has no testcase with that id.
testcase = testcases_filtered[0]

files = testcase.findall("file")
testcaseid = testcase.attrib["id"]

print("__debug__")

In [None]:
# Exploration: pick the testcase at position 100 of the parsed manifest.
testcase = testcases[100]

print("__debug__")

In [None]:
txt_ground_truth_path = "data/SARD/SARD_testcaseinfo.txt"

# Each line holds "<source path> <line number>"; the checkout-specific prefix
# is removed so that paths are relative to the source-code root.
with open(txt_ground_truth_path, "r") as rfi:
    all_lines = [line.strip().replace("../SySeVRCopy/data/SARD/source-code/", "") for line in rfi]

ground_truth_set = dict()

for line in all_lines:
    # Blank lines carry no record; unpacking them would raise ValueError.
    if not line:
        continue
    # Split from the right so that a path containing spaces stays in one piece.
    cpp_path, line_num = line.rsplit(maxsplit=1)
    line_num = int(line_num)
    # Records with line number 0 are skipped.
    if line_num == 0:
        continue
    ground_truth_set.setdefault(cpp_path, set()).add(line_num)

# Convert the sets to lists so the mapping can be dumped as JSON.
ground_truth_set = {cpp_path: list(vul_lines) for cpp_path, vul_lines in ground_truth_set.items()}

print("__debug__")

In [None]:
import os
import json

from os.path import isdir, isfile, join, exists, relpath
from tqdm.notebook import tqdm
from slice_labeling.program_analysis import *

In [None]:
slice_filepath = "data/SARD/API function call.txt"

# Slices in the file are separated by a fixed line of dashes.
with open(slice_filepath, "r") as rfi:
    slicelists = rfi.read().split("------------------------------")

# Drop the empty fragments the split can leave at either end of the file.
if slicelists[0] == '':
    del slicelists[0]
if slicelists[-1] == '' or slicelists[-1] == '\n' or slicelists[-1] == '\r\n':
    del slicelists[-1]

# Work on the first slice only; its header line has four whitespace-separated fields.
curr_slice = slicelists[0]
slice_lines = [line.strip() for line in curr_slice.splitlines()]
idx, src_filepath, func_name, line_num = [part.strip() for part in  slice_lines[0].split()]

test_id, filename = [part.strip() for part in src_filepath.split("/")]

csv_root = "data/SARD/csv"
src_root = "data/SARD/source-code"

# Find the source directory whose relative path, with separators and leading
# zeros removed, equals the slice's test id.
cpp_filepath = ""
for root, dirs, files in os.walk(src_root, topdown=True):
    if len(files) == 0:
        continue
    dir_test_id = relpath(root, src_root).replace("/", "").lstrip("0")
    if dir_test_id != test_id:
        continue
    cpp_filepath = join(relpath(root, src_root), filename)
    break

nodes_dir = join(csv_root, cpp_filepath)

nodes_path = join(nodes_dir, "nodes.csv")
edges_path = join(nodes_dir, "edges.csv")
nodes = read_csv(nodes_path)
edges = read_csv(edges_path)

# Function name = last whitespace-separated token of the slice's second line,
# cut at the opening parenthesis.
slice_funcname = slice_lines[1].split()[-1].split('(')[0]

# (position in the node table, node) for every Function node.
func_nodes = [(idx, entry) for idx, entry in enumerate(nodes) if entry["type"] == "Function"]

# Position within func_nodes of the slice's function; raises IndexError when
# no Function node carries that name.
slice_func_idx = [(idx) for (idx, entry) in enumerate(func_nodes) if entry[1]["code"].strip() == slice_funcname][0]

# The function's nodes run from its own Function node up to the next Function
# node, or to the end of the table when it is the last function in the file.
start_idx = func_nodes[slice_func_idx][0]
if slice_func_idx + 1 < len(func_nodes):
    end_idx = func_nodes[slice_func_idx + 1][0]
else:
    end_idx = len(nodes)

# Match every slice statement (the header and the final line excluded) against
# the function's nodes, ignoring spaces, and print the first component of the
# matching node's location.
for line in slice_lines[1:-1]:
    line_content = line.strip().replace(" ", "")
    for node in nodes[start_idx:end_idx]:
        node_content = node["code"].strip().replace(" ", "")
        if line_content == node_content:
            loc = node["location"].split(":")[0].strip()
            print(f"{line} {loc}")

print("__debug__")

In [None]:
slice_file = "data/SARD/API function call.txt"

# Slices in the file are separated by a fixed line of dashes.
with open(slice_file, "r") as rfi:
    slicelists = rfi.read().split("------------------------------")

# Drop the empty fragments the split can leave at either end of the file.
# The emptiness guards cover a file that holds nothing but one such fragment.
if slicelists and slicelists[0] == '':
    del slicelists[0]
if slicelists and (slicelists[-1] == '' or slicelists[-1] == '\n' or slicelists[-1] == '\r\n'):
    del slicelists[-1]

for slicelist in tqdm(slicelists):
    slice_corpus = []
    focus_index = 0
    flag_focus = 0

    sentences = slicelist.split('\n')

    # Trim an empty or bare-'\r' first line.
    if sentences[0] == '\r' or sentences[0] == '':
        del sentences[0]
    if sentences == []:
        continue
    if sentences[-1] == '':
        del sentences[-1]
    # The delete above may have removed the only remaining element, so check
    # for emptiness again before looking at the last entry.
    if sentences and sentences[-1] == '\r':
        del sentences[-1]

print("__debug__")

In [None]:
# Exploration: resolve a "<test id>/<file name>" reference to its path under the source tree.
file_path = "65156/CWE121_Stack_Based_Buffer_Overflow__CWE805_wchar_t_alloca_ncat_05.c"
csv_root = "data/SARD/csv"
src_root = "data/SARD/source-code"

test_id = file_path.split("/")[0]
filename = file_path.split("/")[-1]
# Stays "" when no directory matches the test id.
src_filepath = ""
for root, dirs, files in os.walk(src_root, topdown=True):
    if len(files) == 0:
        continue
    # A directory matches when its relative path, with separators and leading
    # zeros removed, equals the test id.
    dir_test_id = relpath(root, src_root).replace("/", "").lstrip("0")
    if dir_test_id != test_id:
        continue
    src_filepath = join(relpath(root, src_root), filename)
    break
print("__debug__")

In [None]:
# Load a sample slice plus the node/edge tables of the file resolved into src_filepath.
with open("slice_labeling/sample.txt", "r") as rfi:
    sample_slice = [line.strip() for line in rfi.readlines()]

# The CSV tree uses the source path as a directory that holds nodes.csv and edges.csv.
nodes_dir = join(csv_root, src_filepath)

nodes_path = join(nodes_dir, "nodes.csv")
edges_path = join(nodes_dir, "edges.csv")
nodes = read_csv(nodes_path)
edges = read_csv(edges_path)

# PDG = build_PDG_no_post_dom(nodes_dir, src_filepath)

print("__debug__")

In [None]:
import os
import pickle
from joern.all import JoernSteps

from os.path import isdir, exists, join

In [None]:
# NOTE(review): pickle.load can run arbitrary code while unpickling; only
# open files produced by a trusted pipeline.
with open("SARD/callee_CFGNode_map.pkl", "rb") as rbfi:
    data = pickle.load(rbfi)

print("__debug__")

In [None]:
# Load dict.pkl from every sub-directory of dirpath; `data` is overwritten on
# each iteration, so only the last loaded object remains afterwards.
# NOTE(review): pickle.load can run arbitrary code while unpickling; only
# open files produced by a trusted pipeline.
dirpath = "SARD/dict_call2cfgNodeID_funcID"

filename = "dict.pkl"

for test_id in os.listdir(dirpath):
    test_id_dirpath = join(dirpath, test_id)
    with open(join(test_id_dirpath, filename), "rb") as rbfi:
        data = pickle.load(rbfi)
    print("__debug__")

print("__debug__")

In [None]:
sudo pip install git+https://github.com/fabsx00/python-joern.git

In [None]:
# Shell scratchpad (not Python): joern-parse / joern-export invocations and
# the Java environment variables noted alongside them. Run in a terminal.
# NOTE(review): the paths are absolute to one user's home directory.
~/joern/joern-parse /home/satyaki/luka/SySeVRCopy/data/SARD/source-code

/home/satyaki/luka/SySeVRCopy/joern/bin/joern

joern/bin/joern/joern-cli/joern-parse /home/satyaki/luka/SySeVRCopy/data/SARD/source-code
joern/bin/joern/joern-cli/joern-parse /home/satyaki/luka/SySeVRCopy/data/SARD/source-code/000/001/002
joern/bin/joern/joern-cli/joern-export --repr=all --format=neo4j

export JAVA_HOME="/usr/lib/jvm/jdk-17"
export PATH=$JAVA_HOME/bin:$PATH

In [None]:
sard_dir = "data/SARD/SARD"

# Count the .c, .cpp and .h files anywhere below sard_dir.
total_files = sum(
    1
    for _root, _dirs, files in os.walk(sard_dir)
    for file in files
    if file.endswith((".c", ".cpp", ".h"))
)

print("__debug__")

In [None]:
# Shell command (not Python): runs the bundled python-joern setup.py with the
# current directory on PYTHONPATH. Run in a terminal.
# NOTE(review): no setup.py sub-command is given — confirm the intended one.
PYTHONPATH="." python SySeVR_docker/docker_build/home/SySeVR/softdir/python-joern-0.3.1/setup.py