# Reproduce LineVul

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import commons

In [3]:
from transformers import AutoTokenizer, AutoModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

In [4]:
# there are some warning from transformer
# due to its verbose, disable

from transformers import logging
logging.set_verbosity(40)

In [5]:
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler

In [6]:
import numpy as np
from datasets import Dataset
from torch.utils.data import DataLoader

In [7]:
import pandas as pd
from tqdm.autonotebook import tqdm

In [8]:
from linevul_model import Model
from linevul_helpers import TextDataset
from linevul_extra import extract_line_attention, linevul_predict

In [9]:
from project_dataset import load_dataset

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [11]:
config = RobertaConfig.from_pretrained('microsoft/codebert-base')
config.num_labels = 1
config.num_attention_heads = 12

In [12]:
# get from LineVul
checkpoint = '/home/hqn650/LineVul/linevul/saved_models/checkpoint-best-f1/12heads_linevul_model.bin'

In [13]:
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

In [14]:
pre_train = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', 
                                                             config=config, 
                                                             ignore_mismatched_sizes=True).to(device)

In [15]:
from dataclasses import dataclass

@dataclass
class Args:
    device = device
    n_gpu = n_gpu
    use_non_pretrained_model = False
    block_size = 512
    test_data_file = '/home/hqn650/LineVul/data/big-vul_dataset/test.csv'
    code_length=256
    do_local_explanation=True
    reasoning_method='attention'
    seed=42
    num_attention_heads=12
    do_sorting_by_line_scores=False
    do_sorting_by_pred_prob=False
    top_k_constant=10
    use_word_level_tokenizer=False
    eval_batch_size=512

    task = "attack_vector"
    
args = Args()

In [16]:
model = Model(pre_train, config, tokenizer, args)

In [17]:
model.load_state_dict(torch.load(checkpoint, map_location=args.device))
model.to(args.device)

Model(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [18]:
from linevul_helpers import TextDataset, convert_examples_to_features

class ExtendTextDataset(TextDataset):
    def __init__(self, tokenizer, args, data_frame):
        self.examples = []
        funcs = data_frame["processed_func"].tolist()
        for i in tqdm(range(len(funcs)), desc='ExtendTextDataset'):
            self.examples.append(convert_examples_to_features(funcs[i], 1, tokenizer, args))

In [19]:
# multi-gpu evaluate
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [8]:
cases = [
"""
static void __put_super(struct super_block *sb)
{
	if (!--sb->s_count) {
		list_del_init(&sb->s_list);
		destroy_super(sb);
	}
}
""",
"""
pcf_read_TOC( FT_Stream  stream,
                PCF_Face   face )
  {
    FT_Error   error;
    PCF_Toc    toc = &face->toc;
    PCF_Table  tables;

    FT_Memory  memory = FT_FACE( face )->memory;
    FT_UInt    n;


    if ( FT_STREAM_SEEK ( 0 )                          ||
         FT_STREAM_READ_FIELDS ( pcf_toc_header, toc ) )
      return FT_THROW( Cannot_Open_Resource );

    if ( toc->version != PCF_FILE_VERSION                 ||
         toc->count   >  FT_ARRAY_MAX( face->toc.tables ) ||
         toc->count   == 0                                )
      return FT_THROW( Invalid_File_Format );

    if ( FT_NEW_ARRAY( face->toc.tables, toc->count ) )
      return FT_THROW( Out_Of_Memory );

    tables = face->toc.tables;
    for ( n = 0; n < toc->count; n++ )
    {
      if ( FT_STREAM_READ_FIELDS( pcf_table_header, tables ) )
        goto Exit;
      tables++;
    }

    /* Sort tables and check for overlaps.  Because they are almost      */
    /* always ordered already, an in-place bubble sort with simultaneous */
    /* boundary checking seems appropriate.                              */
    tables = face->toc.tables;

    for ( n = 0; n < toc->count - 1; n++ )
    {
      FT_UInt  i, have_change;


      have_change = 0;

      for ( i = 0; i < toc->count - 1 - n; i++ )
      {
        PCF_TableRec  tmp;


        if ( tables[i].offset > tables[i + 1].offset )
        {
          tmp           = tables[i];
          tables[i]     = tables[i + 1];
          tables[i + 1] = tmp;

          have_change = 1;
        }

        if ( ( tables[i].size   > tables[i + 1].offset )                  ||
             ( tables[i].offset > tables[i + 1].offset - tables[i].size ) )
        {
          error = FT_THROW( Invalid_Offset );
          goto Exit;
        }
      }

      if ( !have_change )
        break;
    }

    /* we now check whether the `size' and `offset' values are reasonable: */
    /* `offset' + `size' must not exceed the stream size                   */
    tables = face->toc.tables;
    for ( n = 0; n < toc->count; n++ )
    {
      /* we need two checks to avoid overflow */
      if ( ( tables->size   > stream->size                ) ||
           ( tables->offset > stream->size - tables->size ) )
      {
        error = FT_THROW( Invalid_Table );
        goto Exit;
      }
      tables++;
    }

#ifdef FT_DEBUG_LEVEL_TRACE

    {
      FT_UInt      i, j;
      const char*  name = "?";


      FT_TRACE4(( "pcf_read_TOC:\n" ));

      FT_TRACE4(( "  number of tables: %ld\n", face->toc.count ));

      tables = face->toc.tables;
      for ( i = 0; i < toc->count; i++ )
      {
        for ( j = 0; j < sizeof ( tableNames ) / sizeof ( tableNames[0] );
              j++ )
          if ( tables[i].type == (FT_UInt)( 1 << j ) )
            name = tableNames[j];

        FT_TRACE4(( "  %d: type=%s, format=0x%X, "
                    "size=%ld (0x%lX), offset=%ld (0x%lX)\n",
                    i, name,
                    tables[i].format,
                    tables[i].size, tables[i].size,
                    tables[i].offset, tables[i].offset ));
      }
    }

#endif

    return FT_Err_Ok;

  Exit:
    FT_FREE( face->toc.tables );
    return error;
  }
""",
"""static void make_response(struct xen_blkif_ring *ring, u64 id,
                          unsigned short op, int st) {
  struct blkif_response resp;
  unsigned long flags;
  union blkif_back_rings *blk_rings;
  int notify;
  resp.id = id; // unsecure statement
  resp.operation = op; // unsecure statement
  resp.status = st; // unsecure statement
  spin_lock_irqsave(&ring->blk_ring_lock, flags);
  blk_rings = &ring->blk_rings; 
  switch (ring->blkif->blk_protocol) {
    case BLKIF_PROTOCOL_NATIVE:
      memcpy(
          RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
          &resp, sizeof(resp));  // unsecure statement
      break;
    case BLKIF_PROTOCOL_X86_32:
      memcpy(
          RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
          &resp, sizeof(resp)); // unsecure statement
      break;
    case BLKIF_PROTOCOL_X86_64:
      memcpy(
          RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
          &resp, sizeof(resp)); // unsecure statement
      break;
    default:
      BUG();
  }
  blk_rings->common.rsp_prod_pvt++;
  RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
  spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
  if (notify) notify_remote_via_irq(ring->irq);
}""",
"""static void write_version(FILE *fp, const char *fname, const char *dirname,
                          xref_t *xref) {
  long start;
  char *c, *new_fname, data;
  FILE *new_fp;
  start = ftell(fp);
  if ((c = strstr(fname, ".pdf"))) *c = '\0';
  new_fname = malloc(strlen(fname) + strlen(dirname) + 16); // insecure statement
  snprintf(new_fname, strlen(fname) + strlen(dirname) + 16,
           "%s/%s-version-%d.pdf", dirname, fname, xref->version);
  if (!(new_fp = fopen(new_fname, "w"))) {
    ERR("Could not create file '%s'\n", new_fname);
    fseek(fp, start, SEEK_SET);
    free(new_fname);
    return;
  } 
  fseek(fp, 0, SEEK_SET);
  while (fread(&data, 1, 1, fp))
    fwrite(&data, 1, 1,
           new_fp); 
  fprintf(new_fp, "\r\nstartxref\r\n%ld\r\n%%%%EOF", xref->start);
  fclose(new_fp);
  free(new_fname);
  fseek(fp, start, SEEK_SET);
}""",
"""
void *vips_malloc( VipsObject *object, size_t size ) {
  void *buf;
  buf = g_malloc( size ); 
  if( object ) {
  g_signal_connect( object, "postclose",
  G_CALLBACK( vips_malloc_cb ), buf );
      object->local_memory += size;
  }
  return( buf );
}
"""
]

In [21]:
df = pd.DataFrame({'processed_func': cases, "target": [0,0,1,1,1]})

In [22]:
df

Unnamed: 0,processed_func,target
0,\nstatic void __put_super(struct super_block *...,0
1,"\npcf_read_TOC( FT_Stream stream,\n ...",0
2,static void make_response(struct xen_blkif_rin...,1
3,"static void write_version(FILE *fp, const char...",1
4,"\nvoid *vips_malloc( VipsObject *object, size_...",1


In [23]:
# to find TP
def find_tp(model, tokenizer, args, data_frame=None):
    if data_frame is not None:
        dataset = ExtendTextDataset(tokenizer, args, data_frame)
    else:
        dataset = TextDataset(tokenizer, args, file_type='test')
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=0)
    result, y_trues, y_preds = linevul_predict(model, data_loader, args.device)
    tp_indices = np.where((y_trues == y_preds) & (y_trues == 1))
    tp_indices = list(tp_indices[0])
    return result, tp_indices, y_trues, y_preds

In [24]:
result, correct_indices, y, y_hat = find_tp(model, tokenizer, args, data_frame=df)

ExtendTextDataset:   0%|          | 0/5 [00:00<?, ?it/s]



In [25]:
y

array([1, 1, 1, 1, 1])

In [26]:
def explain(model, tokenizer, explain_indices, data_frame=None): 
    """ 
        return (sample_idx, lines, n_lines)
    """
    if data_frame is not None:
        dataset = ExtendTextDataset(tokenizer, args, data_frame)
    else:
        dataset = TextDataset(tokenizer, args, file_type='test')
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset, sampler=sampler, batch_size=1, num_workers=0)
    model.eval()
    index = 0
    progress_bar = tqdm(data_loader, total=len(data_loader))
    extract_list = []
    for mini_batch in progress_bar:
        if index in explain_indices:
            (input_ids, labels) = mini_batch
            ids = input_ids[0].detach().tolist()
            all_tokens = tokenizer.convert_ids_to_tokens(ids)
            all_tokens = [token.replace("Ġ", "") for token in all_tokens]
            all_tokens = [token.replace("ĉ", "Ċ") for token in all_tokens]
            with torch.no_grad():
                prob, attentions = model(input_ids=input_ids, output_attentions=True)
            lines_with_score, n_lines = extract_line_attention(attentions, all_tokens)
            extract_list.append((index, lines_with_score, n_lines))
        index += 1
    return extract_list

In [27]:
extract_list = explain(model, tokenizer, [0,1,2,3,4], df)

ExtendTextDataset:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [28]:
#203
extract_list

[(0,
  [(1, 'staticvoid__put_super(structsuper_block*sb)', 790.4317779541016),
   (7, 'list_del_init(&sb->s_list);', 745.5436553955078),
   (4, 'if(!--sb->s_count){', 630.649486541748),
   (10, 'destroy_super(sb);', 390.7985954284668),
   (13, '}', 112.63846588134766),
   (12, '}', 111.76204681396484),
   (2, '{', 96.75848007202148),
   (0, '', 61.17894744873047),
   (5, '', 36.56650161743164),
   (6, '', 34.87374496459961),
   (9, '', 28.109703063964844),
   (8, '', 27.800952911376953),
   (3, '', 27.059484481811523),
   (11, '', 25.973024368286133)],
  14),
 (1,
  [(15, 'toc->count==0)', 399.1795516014099),
   (14, 'toc->count>FT_ARRAY_MAX(face->toc.tables)||', 391.22833824157715),
   (10, 'if(FT_STREAM_SEEK(0)||', 381.99241733551025),
   (22,
    'if(FT_STREAM_READ_FIELDS(pcf_table_header,tables))',
    377.89522981643677),
   (11, 'FT_STREAM_READ_FIELDS(pcf_toc_header,toc))', 367.79677963256836),
   (17, 'if(FT_NEW_ARRAY(face->toc.tables,toc->count))', 363.3032703399658),
   (13, '

# Helpers

In [1]:
from commons import clean_generated_str

# Attack vector

In [37]:
from dataclasses import dataclass

@dataclass
class AttackVectorArgs:
    model_name = "results/attack_vector/t5p_script_770m/checkpoint-1600/"
    max_des_length = 150
    
attack_vector_args = AttackVectorArgs()

In [38]:
from transformers import AutoTokenizer
attack_vector_tokenizer = AutoTokenizer.from_pretrained(attack_vector_args.model_name)

In [39]:
from transformers import AutoModelForSeq2SeqLM
attack_vector = AutoModelForSeq2SeqLM.from_pretrained(attack_vector_args.model_name)

In [40]:
attack_vector_code = attack_vector_tokenizer(cases, return_tensors="pt", padding="max_length", truncation=True)

In [41]:
def preprocess_function(example):
    input_feature = attack_vector_tokenizer(example, padding="max_length", truncation=True, return_tensors="pt")
    return input_feature
tokenized_cases = [preprocess_function(i) for i in cases]

In [49]:
generated_attack_vec = []
for i in tokenized_cases:
    output = attack_vector.generate(**i, max_length=attack_vector_args.max_des_length)
    explain = attack_vector_tokenizer.decode(output[0], skip_special_tokens=True)
    print(explain)
    generated_attack_vec.append(explain)

via unspecified use of Asynchronous I/O (AIO ) operations.
using a specially-crafted font
via vectors related to the handling of input.
creating a symbolic link from a temporary file to various files on the system
 which triggers a heap-based buffer overflow.


# Root Cause

In [51]:
@dataclass
class RootCauseArgs:
    model_name = "results/root_cause/t5p_script_770m/checkpoint-1100/"
    max_des_length = 153
    
root_cause_args = RootCauseArgs()

In [52]:
root_cause_tokenizer = AutoTokenizer.from_pretrained(root_cause_args.model_name)

In [53]:
def preprocess_function(example):
    input_feature = root_cause_tokenizer(example, padding="max_length", truncation=True, return_tensors="pt")
    return input_feature
tokenized_cases = [preprocess_function(i) for i in cases]

In [54]:
root_cause = AutoModelForSeq2SeqLM.from_pretrained(root_cause_args.model_name)

In [55]:
generated_root_cause = []
for i in tokenized_cases:
    output = root_cause.generate(**i, max_length=attack_vector_args.max_des_length)
    explain = root_cause_tokenizer.decode(output[0], skip_special_tokens=True)
    print(explain)
    generated_root_cause.append(explain)

a soft lockup when performing\r\nAsynchronous I/O operations due to files_lock excessive locking
does not check for the end of the data during certain reading operations 
the failure to properly copy the contents of the ring producer/consumer pointers
doesn't validate a certain size value 
an integer overflow when allocating memory


# Vulnerability Type

In [60]:
@dataclass
class TypeArgs:
    model_name = "results/vulnerability_type/t5p_script_770m/checkpoint-1250/"
    max_des_length = 53
    
vul_type_args = TypeArgs()

In [61]:
vul_type_tokenizer = AutoTokenizer.from_pretrained(vul_type_args.model_name)
vul_type = AutoModelForSeq2SeqLM.from_pretrained(vul_type_args.model_name)

In [62]:
def preprocess_function(example):
    input_feature = vul_type_tokenizer(example, padding="max_length", truncation=True, return_tensors="pt")
    return input_feature
tokenized_cases = [preprocess_function(i) for i in cases]

In [63]:
generated_vul_type = []
for i in tokenized_cases:
    output = vul_type.generate(**i, max_length=attack_vector_args.max_des_length)
    explain = vul_type_tokenizer.decode(output[0], skip_special_tokens=True)
    print(explain)
    generated_vul_type.append(explain)

use-after-free error
pointer dereference
memory corruption
out-of-bounds write
memory corruption


# Impact

In [68]:
@dataclass
class ImpactArgs:
    model_name = "results/impact/t5p_script_770m/checkpoint-1750/"
    max_des_length = 167
    
impact_args = ImpactArgs()

In [69]:
impact_tokenizer = AutoTokenizer.from_pretrained(impact_args.model_name)
impact = AutoModelForSeq2SeqLM.from_pretrained(impact_args.model_name)

In [70]:
def preprocess_function(example):
    input_feature = impact_tokenizer(example, padding="max_length", truncation=True, return_tensors="pt")
    return input_feature
tokenized_cases = [preprocess_function(i) for i in cases]

In [71]:
generated_impact = []
for i in tokenized_cases:
    output = impact.generate(**i, max_length=attack_vector_args.max_des_length)
    explain = impact_tokenizer.decode(output[0], skip_special_tokens=True)
    print(explain)
    generated_impact.append(explain)

cause a denial of service (system crash )
a denial of service
leak kernel memory bytes and obtain sensitive information
execute arbitrary code or cause a denial of service condition
leaking raw process memory contents through the output image.
