In [None]:
pip install transformers

In [None]:
pip install seaborn

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import torch
import random
import logging

import matplotlib.pyplot as plt

from os.path import isfile, isdir, join
from datetime import datetime

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

In [None]:
# Colab-specific step: mount the drive helper's storage under /content/drive.
# NOTE(review): only importable where the `google.colab` package exists — skip elsewhere.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Change into the project folder on the mounted drive.
%cd /content/drive/MyDrive/research/LineVulCopy/linevul
# NOTE(review): the two files removed here are the modules imported by a later cell
# (`import cat_linevul_model`, `import cat_linevul`); presumably they are re-uploaded
# through the `files.upload()` cell — confirm before running this anywhere else.
%rm cat_linevul.py
%rm cat_linevul_model.py

In [2]:
# Capture the current working directory and display it as the cell output.
CWD = os.getcwd()

CWD

'/home/ubuntu/research/repos/LineVulCopy/linevul'

In [3]:
# Show the entries of the current working directory.
os.listdir()

['cat_linevul_model.py',
 '__pycache__',
 'dataset_gen_utils.py',
 'run.sh',
 'results',
 'cat_linevul.py',
 'saved_models',
 'multiclass_data_n_model.ipynb',
 '.ipynb_checkpoints',
 'linevul_model.py',
 'linevul_main.py',
 'train_word_level_tokenizer.py',
 'train_logs',
 'ifa_records',
 'bpe_tokenizer',
 'train_bpe_tokenizer.py',
 'word_level_tokenizer']

In [None]:
# Colab-specific step: open the upload helper from `google.colab.files`.
# NOTE(review): presumably used to re-upload the module files removed earlier — confirm.
from google.colab import files

files.upload()

In [4]:
import cat_linevul_model
import cat_linevul

In [None]:
# Re-import the two project modules so edits made to their files take effect
# without restarting the kernel.
import importlib
importlib.reload(cat_linevul_model)
importlib.reload(cat_linevul)

In [5]:
# Location (relative to the working directory) and file names of the three CSV splits.
DATA_DIR = "../data/cat"
train_csv_filename = "cat_boost_train.csv"
val_csv_filename = "cat_boost_val.csv"
test_csv_filename = "cat_boost_test.csv"

In [6]:
# Build the full path of each split file, then read the three splits into DataFrames.
TRAIN_SRC_PATH, VAL_SRC_PATH, TEST_SRC_PATH = (
    join(DATA_DIR, csv_name)
    for csv_name in (train_csv_filename, val_csv_filename, test_csv_filename)
)

train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_SRC_PATH, VAL_SRC_PATH, TEST_SRC_PATH)
)

In [7]:
# Label column of every split as a plain Python list.
train_targets = train_df["target"].tolist()
val_targets = val_df["target"].tolist()
test_targets = test_df["target"].tolist()

train_target_torch = torch.tensor(train_targets)

print(type(train_target_torch))

unq_train_tgs = set(train_targets)

# Count every training label in a single pass and store the counts in ascending
# label order, so the dict's value order is deterministic (set iteration order is
# not guaranteed to be sorted) and lines up with label ids when they are 0..K-1.
train_label_counts = train_df["target"].value_counts().sort_index()
class_distribution_dict = {
    f"label_{tg}": cnt
    for tg, cnt in zip(train_label_counts.index.tolist(), train_label_counts.tolist())
}

<class 'torch.Tensor'>


In [8]:
# Number of distinct labels found in the training split.
len(class_distribution_dict)

89

In [9]:
# Per-label sample counts addressed directly by label id. `bincount` puts the count of
# label k at position k, so the lookup below stays correct even when label ids are not
# contiguous or when a dict of counts would be ordered differently from the ids.
class_count = torch.bincount(train_target_torch)

# Inverse-frequency weight per label; ids that never occur are clamped to a count of 1
# to avoid dividing by zero (their weight is never looked up).
class_weights = 1. / class_count.clamp(min=1).to(torch.float)

# One weight per training example, taken from the weight of that example's label.
class_weights_all = class_weights[train_target_torch]

# The misspelt name is kept on purpose: later cells reference `weighted_smapler`.
weighted_smapler = WeightedRandomSampler(weights=class_weights_all, num_samples=len(class_weights_all), replacement=True)

In [10]:
# Training hyper-parameters used by the cells below.
EPOCHS = 1
BATCH_SIZE = 16
LEARNING_RATE = 5e-5
# NOTE(review): NUM_FEATURES is not referenced by any other cell visible here — confirm it is still needed.
NUM_FEATURES = 1
# One output class per distinct training label.
NUM_CLASSES = len(class_distribution_dict)

In [11]:
# Display the number of output classes.
NUM_CLASSES

89

In [12]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_count = torch.cuda.device_count()

# Run configuration handed to the `cat_linevul` helpers (seeding, datasets, training).
args = {
  "tokenizer_name": "microsoft/codebert-base",
  "model_name_or_path": "microsoft/codebert-base",
  "model_name": "12heads_linevul_model.bin",
  "output_dir": "./saved_models",
  "use_word_level_tokenizer": False,
  "block_size": 512,
  "seed": 42,
  "n_gpu": device_count,
  "num_attention_heads": 12,
  "train_batch_size": BATCH_SIZE,
  # Taken from the EPOCHS constant so the two settings cannot drift apart.
  "epochs": EPOCHS,
  "device": device,
  "weight_decay": 0.0,
  "learning_rate": LEARNING_RATE,
  "adam_epsilon": 1e-8,
  "gradient_accumulation_steps": 1,
  "max_grad_norm": 1
}

In [13]:
# Seed the run through the project helper, using the settings stored in `args`.
cat_linevul.set_seed(args)

In [14]:
# Load the pretrained tokenizer named in `args`.
tokenizer = RobertaTokenizer.from_pretrained(args["tokenizer_name"])

# Build the training dataset from the training frame and its label tensor.
train_dataset = cat_linevul.TextDataset(train_df, train_target_torch, tokenizer, args)

HBox(children=(FloatProgress(value=0.0, max=149303.0), HTML(value='')))




In [15]:
# Label tensors for the validation and test splits.
val_target_torch = torch.tensor(val_targets)
test_target_torch = torch.tensor(test_targets)

# Datasets for the validation and test splits, built the same way as the training one.
val_dataset = cat_linevul.TextDataset(val_df, val_target_torch, tokenizer, args)
test_dataset = cat_linevul.TextDataset(test_df, test_target_torch, tokenizer, args)

HBox(children=(FloatProgress(value=0.0, max=18768.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18768.0), HTML(value='')))




In [21]:
# Training batches are drawn through the weighted sampler; the validation and test
# loaders yield one example per batch in dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=weighted_smapler)
val_loader = DataLoader(dataset=val_dataset, batch_size=1)
test_loader = DataLoader(dataset=test_dataset, batch_size=1)

In [22]:
# Start from the pretrained configuration, then override the number of output labels
# and the attention-head count with the values chosen in this notebook.
config = RobertaConfig.from_pretrained(args["model_name_or_path"])
config.num_labels = NUM_CLASSES
config.num_attention_heads = args["num_attention_heads"]

In [23]:
# Load the pretrained checkpoint into a sequence-classification model using the
# adjusted config; size mismatches between checkpoint and config are tolerated.
model = RobertaForSequenceClassification.from_pretrained(args["model_name_or_path"], config=config, ignore_mismatched_sizes=True)

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be 

In [24]:
# Wrap the loaded model in the project's Model class.
# NOTE: this rebinds `model`, so re-running the cell without re-running the loading
# cell above would wrap the wrapper again.
model = cat_linevul_model.Model(model, config, tokenizer, args)

In [25]:
# Configure logging output (timestamp, level, logger name, message) at INFO level,
# then create this notebook's logger and report the device in use.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)

logger = logging.getLogger(__name__)
logger.warning("device: %s, n_gpu: %s", device, args["n_gpu"])



In [26]:
# Timestamp string (month_day_year_hour_minute_second) later passed to the training call.
now = datetime.now()
curr_timestamp = now.strftime("%m_%d_%Y_%H_%M_%S")

In [27]:
# Move the model to the selected device; as the last expression of the cell the
# returned module is also displayed.
model.to(device)

Model(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps

In [None]:
# `cudaDeviceReset` is a function of the CUDA runtime C API; no Python name with that
# spelling is defined or imported in this notebook, so the original call could only
# raise NameError. Releasing PyTorch's cached GPU blocks is the closest in-process
# step; restart the kernel when the device state must be dropped completely.
torch.cuda.empty_cache()

In [28]:
import gc

# Collect unreachable Python objects first so that tensors they still hold are handed
# back to PyTorch's caching allocator, and only then release the cached GPU blocks.
n_collected = gc.collect()
torch.cuda.empty_cache()

# Keep showing the number of collected objects as the cell output.
n_collected

2274

In [29]:
# Launch training through the project helper.
# NOTE(review): the recorded run of this cell ended with "CUDA out of memory" on a
# ~14.5 GiB GPU (train batch size 16, block size 512). Presumably a smaller
# `train_batch_size` combined with a larger `gradient_accumulation_steps` would fit —
# confirm against how `cat_linevul.train` uses those settings.
cat_linevul.train(args, train_dataset, weighted_smapler, model, tokenizer, val_dataset, curr_timestamp, logger)

11/02/2022 01:11:39 - INFO - __main__ -   ***** Running training *****
11/02/2022 01:11:39 - INFO - __main__ -     Num examples = 149303
11/02/2022 01:11:39 - INFO - __main__ -     Num Epochs = 1
11/02/2022 01:11:39 - INFO - __main__ -     Instantaneous batch size per GPU = 16
11/02/2022 01:11:39 - INFO - __main__ -     Total train batch size = 16
11/02/2022 01:11:39 - INFO - __main__ -     Gradient Accumulation steps = 1
11/02/2022 01:11:39 - INFO - __main__ -     Total optimization steps = 9332


HBox(children=(FloatProgress(value=0.0, max=9332.0), HTML(value='')))




RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 14.56 GiB total capacity; 13.27 GiB already allocated; 22.44 MiB free; 13.48 GiB reserved in total by PyTorch)

In [None]:
# Read the full (unsplit) dataset and preview its first rows.
# NOTE(review): `SRC_PATH` is not assigned in any cell visible in this notebook —
# define it before this cell or the read fails on a fresh kernel.
df = pd.read_csv(SRC_PATH)
df.head()

In [None]:
# The import and the query were fused onto one line, which is a SyntaxError.
from sklearn.model_selection import train_test_split

# Rows whose target is not 0.
vul_df = df.query("target != 0")

In [None]:
# Bar chart of how many rows carry each target value; keep the figure for saving.
target_count_ax = sns.countplot(data=df, x="target")
fig = target_count_ax.get_figure()

In [None]:
# Write the count plot next to the CSV data.
fig.savefig("../data/cat/vul_cnt.png")

In [None]:
# Separate the label column from the remaining columns.
X = df.drop("target", axis=1)
Y = df["target"]

# Display the container type of X.
type(X)

In [None]:
def boost_with_dupes(df: pd.DataFrame, dup_cnt=10):
    """Return ``df`` followed by extra copies of every row whose target is rare.

    A target value counts as rare when it occurs fewer than two times in
    ``df["target"]``. Each row with a rare target is appended ``dup_cnt`` more
    times after the original rows; copies of one row are adjacent and keep that
    row's index label, and rare rows are processed in their order within ``df``.

    Args:
        df: Frame containing at least a ``"target"`` column. It is not modified.
        dup_cnt: Number of extra copies to append per rare row.

    Returns:
        A new DataFrame with the original rows first and the copies after them.
        Column dtypes of ``df`` are preserved.
    """
    # One pass over the column instead of one `list.count` scan per distinct target.
    target_counts = df["target"].value_counts()
    rare_targets = target_counts[target_counts < 2].index

    # Work with integer positions so a non-unique index cannot multiply the selection.
    rare_positions = np.flatnonzero(df["target"].isin(rare_targets).to_numpy())
    duplicated_rows = df.iloc[np.repeat(rare_positions, dup_cnt)]

    # `DataFrame.append` no longer exists in pandas 2.x; `pd.concat` is its replacement.
    return pd.concat([df, duplicated_rows])

In [None]:
# Duplicate rows of rare targets in the full dataset, then display the new row count.
# print(len(df.index))
new_df = boost_with_dupes(df)
# print(len(new_df.index))
len(new_df["target"])

In [None]:
# Stratified 80/20 split into train and a combined validation+test part. A fixed
# `random_state` makes the split (and the CSV files written from it) reproducible.
# NOTE(review): `new_df` already contains duplicated rare-target rows, so copies of
# the same row can land on both sides of this split — confirm that is intended.
X_train_df, X_valtest_df, Y_train, Y_valtest = train_test_split(
    new_df,
    new_df["target"],
    test_size=0.2,
    stratify=new_df["target"],
    random_state=42,
)

In [None]:
# Display the container types returned by the split.
type(X_train_df), type(X_valtest_df), type(Y_train), type(Y_valtest)

In [None]:
# Row count of the validation+test part before boosting.
print(len(X_valtest_df.index))
# Duplicate rows whose target is rare within this part, so the next stratified
# split has more than one row per target.
new_X_valtest_df = boost_with_dupes(X_valtest_df)
# Row count after boosting.
len(new_X_valtest_df["target"])

In [None]:
# Stratified 50/50 split of the boosted validation+test part. A fixed `random_state`
# makes the split reproducible.
X_val_df, X_test_df, Y_val, Y_test = train_test_split(
    new_X_valtest_df,
    new_X_valtest_df["target"],
    test_size=0.5,
    stratify=new_X_valtest_df["target"],
    random_state=42,
)

In [None]:
# Row counts of the frames and of their label series, per split.
X_train_len, Y_train_len = len(X_train_df.index), len(Y_train)
X_val_len, Y_val_len = len(X_val_df.index), len(Y_val)
X_test_len, Y_test_len = len(X_test_df.index), len(Y_test)

# One line per split: frame rows next to label rows.
for frame_len, label_len in (
    (X_train_len, Y_train_len),
    (X_val_len, Y_val_len),
    (X_test_len, Y_test_len),
):
    print(frame_len, label_len)

# Totals over the three splits, followed by the size of the unboosted dataset.
print(sum((X_train_len, X_val_len, X_test_len)), sum((Y_train_len, Y_val_len, Y_test_len)))
print(len(df.index))

In [None]:
DST_DIR = "../data/cat"

# Output file name and source frame for each split.
filename_df_dict = {
    "train": {
        "filename": "cat_boost_train.csv",
        "dataframe": X_train_df
    },
    "val": {
        "filename": "cat_boost_val.csv",
        "dataframe": X_val_df
    },
    "test": {
        "filename": "cat_boost_test.csv",
        "dataframe": X_test_df
    }
}

# Create the destination folder if needed; writing would fail without it.
os.makedirs(DST_DIR, exist_ok=True)

for key in filename_df_dict:
    DST_PATH = join(DST_DIR, filename_df_dict[key]["filename"])

    # `to_csv` creates (or overwrites) the file itself, so no empty file has to be
    # created beforehand.
    filename_df_dict[key]["dataframe"].to_csv(DST_PATH, index=False)

In [None]:
# Fit a min-max scaler on the training features and apply it to all three splits.
# NOTE(review): `X_train`, `X_val` and `X_test` are only assigned in cells further down
# this notebook, so this cell fails on a fresh top-to-bottom run; it also rebinds its
# own inputs, so running it twice rescales already-scaled data. Presumably a leftover
# from an earlier experiment — confirm whether it is still needed.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [None]:
# Collect every target value that appears fewer than two times in Y.
Y_list = Y.tolist()
unq_targets = set(Y_list)

dupe_targets = [tg for tg in unq_targets if Y_list.count(tg) < 2]

In [None]:
print(type(X))

X_dupes = []
Y_dupes = []

dup_cnt = 10

# Walk the rows by position. Iterating a DataFrame directly (as `enumerate(X)` did)
# yields its column labels rather than its rows, and `Y[idx]` looks up by index label
# rather than by position, so the original loop never visited the actual rows.
for pos in range(len(X)):
    label = Y.iloc[pos]
    if label in dupe_targets:
        X_dupes.extend([X.iloc[pos]] * dup_cnt)
        Y_dupes.extend([label] * dup_cnt)

In [None]:
print(len(X), len(Y))

# NOTE: this cell rebinds X and Y, so running it twice appends the copies twice.
if X_dupes:
    # Collected rows become a frame (not a Series), and the copied labels share that
    # frame's index so features and labels stay aligned. `pd.concat` replaces the
    # `.append` method, which no longer exists in pandas 2.x.
    X_dupes_df = pd.DataFrame(X_dupes)
    X = pd.concat([X, X_dupes_df])
    Y = pd.concat([Y, pd.Series(Y_dupes, index=X_dupes_df.index)])

print(len(X), len(Y))

In [None]:
# Stratified 80/20 split of features and labels; the fixed `random_state` makes the
# split reproducible.
X_train, X_valtest, Y_train, Y_valtest = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

In [None]:
# Find the targets that occur fewer than two times in the validation+test labels,
# printing the count and the target for each one found.
Y_valtest_list = Y_valtest.tolist()
valtest_unq_targets = set(Y_valtest_list)

valtest_dupe_targets = []

for tg in valtest_unq_targets:
    cnt = Y_valtest_list.count(tg)

    if cnt >= 2:
        continue

    valtest_dupe_targets.append(tg)
    print(cnt, tg)

In [None]:
dup_cnt = 10

# Positions of the validation+test rows whose target is rare, each repeated dup_cnt
# times. The original loop iterated the frame's column labels, filled `X_dupes` /
# `Y_dupes` instead of the val/test lists, and then extended `X` / `Y`, so
# `X_valtest` / `Y_valtest` were never boosted before the next split.
rare_positions = np.flatnonzero(Y_valtest.isin(valtest_dupe_targets).to_numpy())
repeat_positions = np.repeat(rare_positions, dup_cnt)

X_valtest_dupes = X_valtest.iloc[repeat_positions]
Y_valtest_dupes = Y_valtest.iloc[repeat_positions]

print(len(X_valtest), len(Y_valtest))

# NOTE: this cell rebinds X_valtest and Y_valtest, so running it twice appends twice.
X_valtest = pd.concat([X_valtest, X_valtest_dupes])
Y_valtest = pd.concat([Y_valtest, Y_valtest_dupes])

print(len(X_valtest), len(Y_valtest))

In [None]:
# Stratified 50/50 split of the validation+test part; the fixed `random_state` makes
# the split reproducible.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_valtest, Y_valtest, test_size=0.5, stratify=Y_valtest, random_state=42
)

In [None]:
class InputFeatures:
    """Container for one encoded example.

    Attributes:
        input_tokens: Token sequence stored for the example.
        input_ids: Id sequence stored for the example.
        label: Label stored for the example.
    """

    def __init__(self, input_tokens, input_ids, label):
        # Plain attribute storage; values are kept exactly as passed in.
        self.label = label
        self.input_ids = input_ids
        self.input_tokens = input_tokens
        

class TextDataset(Dataset):
    """Dataset of encoded functions paired with their labels.

    Every entry of ``X["processed_func"]`` is converted once, at construction
    time, with ``convert_examples_to_features``; the results are kept in
    ``self.examples``.
    """

    def __init__(self, X, Y, tokenizer, args):
        """Encode all examples.

        Args:
            X: Table-like object whose ``"processed_func"`` column supports ``.tolist()``.
            Y: Labels in the same order as the rows of ``X`` (tensor, pandas Series,
                array or plain sequence).
            tokenizer: Tokenizer forwarded to ``convert_examples_to_features``.
            args: Settings forwarded to ``convert_examples_to_features``.

        Raises:
            ValueError: If the number of labels differs from the number of functions.
        """
        self.examples = []
        funcs = X["processed_func"].tolist()
        # Flatten the labels to a positional list of plain Python values. Indexing a
        # pandas Series with `labels[i]` is label-based (wrong or failing when the
        # index is not 0..n-1), and elements of a tensor would later be wrapped by
        # `torch.tensor` a second time in `__getitem__`.
        labels = Y.tolist() if hasattr(Y, "tolist") else list(Y)
        if len(labels) != len(funcs):
            raise ValueError(
                "X has %d functions but Y has %d labels" % (len(funcs), len(labels))
            )
        for i in tqdm(range(len(funcs))):
            self.examples.append(convert_examples_to_features(funcs[i], labels[i], tokenizer, args))

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return ``(input_ids, label)`` of example ``i`` as two tensors."""
        return torch.tensor(self.examples[i].input_ids),torch.tensor(self.examples[i].label)


def convert_examples_to_features(func, label, tokenizer, args):
    """Encode one function into a fixed-length id sequence wrapped in InputFeatures.

    Two paths, selected by ``args["use_word_level_tokenizer"]``:

    * Falsy: ``str(func)`` is tokenized, cut to ``args["block_size"] - 2`` tokens,
      framed by the tokenizer's cls/sep tokens, mapped to ids and padded with
      ``tokenizer.pad_token_id`` up to ``args["block_size"]``.
    * Truthy: ``tokenizer.encode(func).ids`` is cut to 510 ids, framed by the ids
      0 and 2 and padded with id 1 up to 512. These sizes are fixed and do not
      depend on ``args["block_size"]``; the stored token list is empty.

    Returns:
        InputFeatures holding the tokens, the padded ids and ``label``.
    """
    if not args["use_word_level_tokenizer"]:
        # Sub-word path: leave room for the two framing tokens.
        pieces = tokenizer.tokenize(str(func))[:args["block_size"]-2]
        source_tokens = [tokenizer.cls_token, *pieces, tokenizer.sep_token]
        source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
        missing = args["block_size"] - len(source_ids)
        source_ids += [tokenizer.pad_token_id] * missing
        return InputFeatures(source_tokens, source_ids, label)

    # Word-level path: operate on the raw id list of the encoding.
    ids = tokenizer.encode(func).ids
    if len(ids) > 510:
        ids = ids[:510]
    ids.insert(0, 0)
    ids.append(2)
    # At most 512 ids exist at this point, so the pad count is never negative.
    ids.extend([1] * (512 - len(ids)))
    return InputFeatures([], ids, label)

def set_seed(args):
    """Seed the random generators of ``random``, NumPy and PyTorch.

    Args:
        args: Mapping with ``"seed"`` (value handed to every generator) and
            ``"n_gpu"`` (CUDA generators are seeded only when it is above 0).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(args["seed"])
    if args["n_gpu"] > 0:
        torch.cuda.manual_seed_all(args["seed"])