
Add Optz & fix bugs
retr0reg committed Apr 8, 2023
1 parent 6823f75 commit a649402
Showing 4 changed files with 113 additions and 29 deletions.
9 changes: 6 additions & 3 deletions generate_code_segments/justify_codes_to_files.py
@@ -1,14 +1,17 @@
import re

def main():
dirs = ["vuln/","nvuln/","eval/vuln/","eval/nvuln/"]
# dirs = ["vuln/","nvuln/","eval/vuln/","eval/nvuln/"]
dirs = ["nvuln/"]
    for dirr in dirs:
        # Read the file contents into a string
        with open(dirr+"outputs.txt", 'r') as f:
            content = f.read()

        # Match each program's code with a regular expression
-        pattern = re.compile(r'^\s*#include\s+<stdlib\.h>.*?\breturn\s+0;\s*}', re.DOTALL | re.MULTILINE)
+        # ^\s*#include.*?\breturn\s+0;\s*}
+        # pattern = re.compile(r'^\s*#include\s+<stdlib\.h>.*?\breturn\s+0;\s*}', re.DOTALL | re.MULTILINE)
+        pattern = re.compile(r'(\s*#include.*?return\s+0;\s*})', re.DOTALL)
        matches = re.finditer(pattern, content)

        # Save each program's code to a separate file
@@ -18,4 +21,4 @@ def main():
                f.write(code)

if __name__ == "__main__":
-    pass
+    main()
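
For context, the relaxed pattern introduced here splits a concatenation of generated C programs on each `#include ... return 0; }` span rather than requiring a leading `#include <stdlib.h>`. A minimal sketch of the matching behavior, using a hypothetical two-program outputs.txt:

import re

# Hypothetical sample: two generated C programs concatenated in one outputs.txt
content = '''#include <stdio.h>
int main() { printf("hi"); return 0; }
#include <stdlib.h>
int main() { return 0; }'''

# Non-greedy DOTALL match: from each #include up to the nearest "return 0; }"
pattern = re.compile(r'(\s*#include.*?return\s+0;\s*})', re.DOTALL)
for i, match in enumerate(re.finditer(pattern, content)):
    with open(f"{i}.c", "w") as f:  # writes 0.c and 1.c
        f.write(match.group(1))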
79 changes: 79 additions & 0 deletions train/evaluate_model.py
@@ -0,0 +1,79 @@
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from tqdm import tqdm

import evaluate  # import your evaluation library


class CodeDataset(Dataset):
    def __init__(self, vuln_dir, nvuln_dir, tokenizer):
        self.tokenizer = tokenizer
        self.vuln_files = os.listdir(vuln_dir)
        self.nvuln_files = os.listdir(nvuln_dir)
        self.vuln_dir = vuln_dir
        self.nvuln_dir = nvuln_dir

    def __len__(self):
        return len(self.vuln_files) + len(self.nvuln_files)

    def __getitem__(self, idx):
        if idx < len(self.vuln_files):
            file_path = os.path.join(self.vuln_dir, self.vuln_files[idx])
            label = 1
        else:
            file_path = os.path.join(self.nvuln_dir, self.nvuln_files[idx - len(self.vuln_files)])
            label = 0

        with open(file_path, "r", encoding="utf-8") as f:
            code = f.read()
        inputs = self.tokenizer(code, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()
        # token_type_ids = inputs["token_type_ids"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            # "token_type_ids": token_type_ids,
            "labels": torch.tensor(label),
        }

def evaluate_pwnbert(vuln_eval_dir, nvuln_eval_dir, output_dir):
    model_name = "microsoft/codebert-base"
    tokenizer = RobertaTokenizer.from_pretrained(model_name)

    # Load the fine-tuned model
    model = RobertaForSequenceClassification.from_pretrained(output_dir)

device = torch.device("mps")
model.to(device)
model.eval()

    eval_dataset = CodeDataset(vuln_eval_dir, nvuln_eval_dir, tokenizer)
    eval_dataloader = DataLoader(eval_dataset, batch_size=2)

    metric = evaluate.load("accuracy")
    total_eval_accuracy = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
        # compute() returns the accuracy of just this batch and resets the metric
        total_eval_accuracy += metric.compute()["accuracy"]

    avg_eval_accuracy = total_eval_accuracy / len(eval_dataloader)
    print(f"Average evaluation accuracy: {avg_eval_accuracy:.2f}")


if __name__ == "__main__":
    vuln_eval_dir = "generate_code_segments/eval/vuln"
    nvuln_eval_dir = "generate_code_segments/eval/nvuln"
    output_dir = "pwnbert_finetuned"

    evaluate_pwnbert(vuln_eval_dir, nvuln_eval_dir, output_dir)
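
One note on the loop above: with the Hugging Face evaluate library, metric.compute() consumes the batches added since the previous call and resets the metric, so the script averages per-batch accuracies, which matches the overall accuracy only when every batch has the same size (the final batch may be smaller). A sketch of the single-pass alternative, reusing model, device, and eval_dataloader from the script above:

# Sketch: accumulate every batch, then compute the accuracy once at the end.
metric = evaluate.load("accuracy")
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        logits = model(**batch).logits
    metric.add_batch(predictions=logits.argmax(dim=-1), references=batch["labels"])
print(f"Evaluation accuracy: {metric.compute()['accuracy']:.2f}")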
48 changes: 25 additions & 23 deletions train/main_v2.py
@@ -143,25 +143,24 @@ def finetune_pwnbert(vuln_dir, nvuln_dir, vuln_eval_dir, nvuln_eval_dir, model_n

            progress_bar.update(1)

-
-        import evaluate
-
-        metric = evaluate.load("accuracy")
-        model.eval()
-        print(batch)
-        for batch in eval_dataloader:
-            batch = {k: v.to(device) for k, v in batch.items()}
-            with torch.no_grad():
-                outputs = model(**batch)
-
-            logits = outputs.logits
-            predictions = torch.argmax(logits, dim=-1)
-            metric.add_batch(predictions=predictions, references=batch["labels"])
-
-        metric.compute()
-
-
-        writer.add_scalar("Accuracy/eval", metric.result(), epoch)
+        # try:
+        #     import evaluate
+
+        #     metric = evaluate.load("accuracy")
+        #     model.eval()
+        #     print(batch)
+        #     for batch in eval_dataloader:
+        #         batch = {k: v.to(device) for k, v in batch.items()}
+        #         with torch.no_grad():
+        #             outputs = model(**batch)
+
+        #         logits = outputs.logits
+        #         predictions = torch.argmax(logits, dim=-1)
+        #         metric.add_batch(predictions=predictions, references=batch["labels"])
+        #         writer.add_scalar("Accuracy/eval", metric.compute(), epoch)
+
+        # except:
+        #     print("No evaluation")

    # config = AutoConfig.from_pretrained("bert-base-cased", dropout=0.1)
    #
@@ -210,13 +209,16 @@ def finetune_pwnbert(vuln_dir, nvuln_dir, vuln_eval_dir, nvuln_eval_dir, model_n

    # trainer.train()
    writer.close()
-    return model, tokenizer
+    try:
+        model.save_pretrained("pwnbert_finetuned")
+        tokenizer.save_pretrained("pwnbert_finetuned")
+        print("Model and tokenizer saved successfully.")
+    except Exception as e:
+        print("Error occurred while saving the model and tokenizer:", e)

if __name__ == "__main__":
    vuln_dir = "generate_code_segments/vuln"
    nvuln_dir = "generate_code_segments/nvuln"
    vuln_eval_dir = "generate_code_segments/eval/vuln"
    nvuln_eval_dir = "generate_code_segments/eval/nvuln"
-    model, tokenizer = finetune_pwnbert(vuln_dir, nvuln_dir, vuln_eval_dir, nvuln_eval_dir)
-    model.save_pretrained("./pwnbert_finetuned")
-    tokenizer.save_pretrained("./pwnbert_finetuned")
+    finetune_pwnbert(vuln_dir, nvuln_dir, vuln_eval_dir, nvuln_eval_dir)
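
Since finetune_pwnbert no longer returns the model, callers are expected to reload the directory written by save_pretrained. A minimal sketch of that round trip (same checkpoint path as the commit):

from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Reload the checkpoint that finetune_pwnbert saved above
model = RobertaForSequenceClassification.from_pretrained("pwnbert_finetuned")
tokenizer = RobertaTokenizer.from_pretrained("pwnbert_finetuned")
model.eval()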
6 changes: 3 additions & 3 deletions vaildity_test.py
@@ -1,12 +1,12 @@
-from transformers import BertForSequenceClassification, BertTokenizer
+from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
import sys
import random

# Load the model and tokenizer
-model = BertForSequenceClassification.from_pretrained("./pwnbert_finetuned")
-tokenizer = BertTokenizer.from_pretrained("./pwnbert_finetuned")

+tokenizer = RobertaTokenizer.from_pretrained("./pwnbert_finetuned")
+model = RobertaForSequenceClassification.from_pretrained("./pwnbert_finetuned")
# def predict_vulnerability(model, tokenizer, code):
# inputs = tokenizer(code, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
# outputs = model(**inputs)
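
The diff is truncated here and predict_vulnerability stays commented out. A hypothetical completion, using the same tokenizer settings as training; the softmax step and return value are assumptions, with label 1 meaning "vulnerable" as in CodeDataset:

def predict_vulnerability(model, tokenizer, code):
    inputs = tokenizer(code, return_tensors="pt", padding="max_length",
                       truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)
    return probs[0, 1].item()  # probability of label 1 ("vulnerable")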
