In [4]:
from project_dataset import load_dataset

In [5]:
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Args:
    """Configuration values for this notebook.

    Every attribute is type-annotated so that ``@dataclass`` registers it as a
    field. Without annotations the decorator generates no fields at all: the
    values are plain class attributes, ``Args(task=...)`` raises ``TypeError``
    and ``repr(args)`` shows nothing.

    Within the visible cells only ``task`` (dataset selection and output file
    name) is read; the remaining values presumably mirror the training script
    -- confirm before relying on them.
    """
    model_name: str = "Salesforce/codet5p-770m"
    num_proc: int = 4
    batch_size: int = 2
    max_src_length: int = 1200
    max_des_length: int = 146
    # A list default must go through default_factory so that each instance
    # gets its own copy instead of sharing one mutable list.
    data_cols: List[str] = field(
        default_factory=lambda: ["CVE ID", "explain", "func_before", "processed_func"]
    )
    save_dir: str = 'tf_board'
    epochs: int = 11
    grad_acc_steps: int = 4
    lr: float = 5e-5
    log_freq: int = 10
    local_rank: int = -1
    # NOTE(review): the value type is not visible here; Optional[str] assumes
    # a config path -- confirm against the training script.
    deepspeed: Optional[str] = None
    fp16: bool = False
    lr_warmup_steps: int = 200
    weight_decay: float = 0.05
    task: str = "attack_vector"
    prefix: str = "770m"

args = Args()

In [6]:
# Load the dataset for the configured task. `load_dataset` is the helper
# imported from the local `project_dataset` module; the result is indexed by
# split name further down (`ds['test']`).
ds = load_dataset(args.task)

In [7]:
import pandas as pd

In [8]:
import evaluate

# Metric object used below to compute rouge1 / rouge2 / rougeL per sample.
rouge = evaluate.load("rouge")

In [9]:
# Convert the 'test' split to a pandas DataFrame for the row-level analysis.
df_test = ds['test'].to_pandas()

In [10]:
# Read the generated predictions, assuming one prediction per line and no
# header (the original call passed header=None).
#
# The file is free text, not CSV. pd.read_csv would skip blank lines (an empty
# prediction), treat '"' as a quote character and turn tokens such as "NA" or
# "null" into NaN. Any of these shifts or corrupts rows, so predictions would
# no longer line up with the references by position. Reading line by line
# keeps exactly one row per line.
#
# The path is built from `args` so it follows the same task/prefix as the
# dataset loaded above; with the current settings it resolves to
# 'results/attack_vector/t5p_script_770m/generated_predictions.txt'.
pred_path = f"results/{args.task}/t5p_script_{args.prefix}/generated_predictions.txt"
with open(pred_path, encoding="utf-8") as pred_file:
    # Column label 0 is kept so downstream `df_1[0]` keeps working.
    df_1 = pd.DataFrame({0: [line.rstrip("\n") for line in pred_file]})

In [11]:
# Predictions as a plain Python list (first and only column of df_1).
preds = df_1[0].tolist()

In [12]:
# Reference texts: the 'explain' column of the test split, as a Python list.
references = df_test['explain'].tolist()

In [13]:
# Per-sample ROUGE: one (row index, rouge1, rouge2, rougeL) tuple for each
# prediction/reference pair, stored in the list `df`.
#
# Predictions and references are paired purely by position, so a length
# mismatch means the scores would be attached to the wrong rows. zip() would
# hide that by silently truncating, so fail loudly instead.
if len(preds) != len(references):
    raise ValueError(
        f"Got {len(preds)} predictions but {len(references)} references; "
        "they must align one-to-one."
    )

if preds:
    # use_aggregator=False returns one score per sample for each ROUGE type,
    # so a single call replaces one rouge.compute() call per row.
    per_sample = rouge.compute(
        predictions=preds, references=references, use_aggregator=False
    )
    df = list(zip(
        range(len(preds)),
        per_sample['rouge1'],
        per_sample['rouge2'],
        per_sample['rougeL'],
    ))
else:
    # Nothing to score; keep the same (empty) result the per-row loop produced.
    df = []

In [14]:
# Turn the list of score tuples into a DataFrame with named columns.
score_columns = ['id', 'rouge1', 'rouge2', 'rougeL']
df_ = pd.DataFrame.from_records(df, columns=score_columns)

In [12]:
# Persist the per-sample scores as a TSV. The model-size tag comes from
# args.prefix instead of a hard-coded "770m", so the file name follows the
# configuration (currently it still resolves to "attack_vector_t5p_770m.tsv").
df_.to_csv(f"{args.task}_t5p_{args.prefix}.tsv", sep='\t')

## Analytics: ROUGE scores by CWE ID

In [15]:
# Show the test split (columns: CVE ID, explain, func_before, processed_func).
df_test

Unnamed: 0,CVE ID,explain,func_before,processed_func
0,CVE-2017-3731,sending specially crafted truncated packets,"static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int...","static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int..."
1,CVE-2013-2867,via a crafted web site .,void BluetoothDeviceChromeOS::OnUnregisterAgen...,void BluetoothDeviceChromeOS::OnUnregisterAgen...
2,CVE-2012-5131,via unknown vectors .,static void unregisterBlobURLTask(void* conte...,static void unregisterBlobURLTask(void* contex...
3,CVE-2016-5842,persuading a victim to open a specially crafte...,}static inline void WriteResourceLong(unsigned...,}\nstatic inline void WriteResourceLong(unsign...
4,CVE-2012-2875,via a crafted document .,void PromoResourceService::PromoResourceState...,void PromoResourceService::PromoResourceStateC...
...,...,...,...,...
1345,CVE-2012-6657,by leveraging the ability to create a raw sock...,"int sock_setsockopt(struct socket *sock, int l...","int sock_setsockopt(struct socket *sock, int l..."
1346,CVE-2016-6787,using a specially-crafted application,static void perf_event_for_each_child(struct p...,static void perf_event_for_each_child(struct p...
1347,CVE-2016-3951,inserting a USB device with an invalid USB des...,"static int cdc_ncm_bind(struct usbnet *dev, s...","static int cdc_ncm_bind(struct usbnet *dev, st..."
1348,CVE-2016-1621,using a specially crafted media file,static unsigned int subpel_variance_ref(const ...,static unsigned int subpel_variance_ref(const ...


In [16]:
import pyarrow.parquet as pq
# Load the parquet file and keep a single row per CVE ID (the first one
# encountered), so that 'CVE ID' can serve as a unique lookup key.
table = pq.read_table('data/MSR_data_cleaned.parquet')
df_bigvul = table.to_pandas().drop_duplicates(subset='CVE ID', keep='first')

In [17]:
# Preview the first rows of the de-duplicated df_bigvul frame.
df_bigvul.head()

Unnamed: 0,Access Gained,Attack Origin,Authentication Required,Availability,CVE ID,CVE Page,CWE ID,Complexity,Confidentiality,Integrity,...,lang,lines_after,lines_before,parentID,patch,project,project_after,project_before,vul,vul_func_with_fix
0,,Remote,Single system,Partial,CVE-2015-8467,https://www.cvedetails.com/cve/CVE-2015-8467/,CWE-264,Medium,Partial,Partial,...,C,,,a819d2b440aafa3138d95ff6e8b824da885a70e9,"@@ -1558,12 +1558,15 @@ static int samldb_chec...",samba,https://git.samba.org/?p=samba.git;a=blob;f=so...,https://git.samba.org/?p=samba.git;a=blob;f=so...,0,static bool check_rodc_critical_attribute(stru...
24,,Local,Not required,Partial,CVE-2009-4411,https://www.cvedetails.com/cve/CVE-2009-4411/,CWE-264,High,Partial,Partial,...,C,,,943f82dfa6ac250be30e4efe147831e9765cda93,"@@ -1,3 +1,6 @@+* Make sure that getfacl -R on...",savannah,https://git.savannah.gnu.org/cgit/acl.git/tree...,https://git.savannah.gnu.org/cgit/acl.git/tree...,0,acl_get_file_mode(const char *path_p)\n{\n\tst...
37,,Remote,Not required,Partial,CVE-2015-8382,https://www.cvedetails.com/cve/CVE-2015-8382/,CWE-119,Low,Partial,,...,C,,,1a2ec3fc60e428c47fd59c9dd7966c71ca44024d,"@@ -640,7 +640,7 @@ PHPAPI void php_pcre_match...",php,https://git.php.net/?p=php-src.git;a=blob;f=ex...,https://git.php.net/?p=php-src.git;a=blob;f=ex...,0,static PHP_FUNCTION(preg_match)\n{\n\tphp_do_p...
66,,Remote,Not required,Partial,CVE-2013-6712,https://www.cvedetails.com/cve/CVE-2013-6712/,CWE-119,Low,,,...,C,,,63f3ff7b5f89f50eb9df76c3d0860c04cc6e0f66,"@@ -1,4 +1,4 @@\n-/* Generated by re2c 0.13.5 ...",php,https://git.php.net/?p=php-src.git;a=blob;f=ex...,https://git.php.net/?p=php-src.git;a=blob;f=ex...,0,"static void add_error(Scanner *s, char *error)..."
76,,Remote,Not required,Partial,CVE-2013-6449,https://www.cvedetails.com/cve/CVE-2013-6449/,CWE-310,Medium,,,...,C,,,2ec4181ba92fc6b828687d2dc47c13dcd35a5d93,"@@ -4286,7 +4286,7 @@ need to go to SSL_ST_ACC...",openssl,https://git.openssl.org/gitweb/?p=openssl.git;...,https://git.openssl.org/gitweb/?p=openssl.git;...,0,static char * MS_CALLBACK srp_password_from_in...


In [18]:
# Summary statistics for the numeric columns of df_bigvul.
df_bigvul.describe()

Unnamed: 0,Known Exploits,Score,add_lines,del_lines,vul
count,0.0,3530.0,3540.0,3540.0,3540.0
mean,,5.848187,0.590678,0.252542,0.050282
std,,1.94161,6.846599,3.668683,0.218558
min,,0.0,0.0,0.0,0.0
25%,,4.3,0.0,0.0,0.0
50%,,5.8,0.0,0.0,0.0
75%,,7.5,0.0,0.0,0.0
max,,10.0,280.0,167.0,1.0


In [19]:
# Attach the CWE ID to every test row by looking it up via 'CVE ID'.
#
# - Only the two columns needed from df_bigvul are merged, which avoids
#   carrying the other columns along and rules out name clashes with df_test.
# - validate='many_to_one' makes pandas raise if 'CVE ID' is not unique on the
#   right-hand side. A duplicated key would add rows to the left merge and
#   break the positional pairing with the ROUGE scores later on.
# - how='left' keeps every test row; rows without a match get a missing CWE ID.
df_foo = df_test.merge(
    df_bigvul[['CVE ID', 'CWE ID']],
    on='CVE ID',
    how='left',
    validate='many_to_one',
)[['CVE ID', 'CWE ID', 'processed_func']]

In [20]:
# Row/column count of the test split, for comparison with the merged frame.
df_test.shape

(1350, 4)

In [21]:
# Sanity check: the left merge should leave the row count equal to df_test's.
df_foo.shape

(1350, 3)

In [22]:
# Size of df_bigvul after drop_duplicates (one row per CVE ID).
df_bigvul.shape

(3540, 35)

In [23]:
# Preview the merged frame; 'CWE ID' is missing for rows that have no CWE
# value after the left merge.
df_foo.head()

Unnamed: 0,CVE ID,CWE ID,processed_func
0,CVE-2017-3731,CWE-125,"static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int..."
1,CVE-2013-2867,,void BluetoothDeviceChromeOS::OnUnregisterAgen...
2,CVE-2012-5131,,static void unregisterBlobURLTask(void* contex...
3,CVE-2016-5842,CWE-125,}\nstatic inline void WriteResourceLong(unsign...
4,CVE-2012-2875,,void PromoResourceService::PromoResourceStateC...


In [24]:
# Distinct CWE IDs among the test rows. NOTE: Series.unique() keeps the
# missing value (None) as an entry of its own, so it is part of this array.
CWE_IDs = df_foo['CWE ID'].unique()

In [25]:
# Number of unique entries -- this count includes the None (missing) entry.
len(CWE_IDs)

55

In [26]:
# List the unique CWE IDs (None marks rows without a CWE value).
CWE_IDs

array(['CWE-125', None, 'CWE-119', 'CWE-20', 'CWE-362', 'CWE-415',
       'CWE-399', 'CWE-284', 'CWE-787', 'CWE-200', 'CWE-19', 'CWE-264',
       'CWE-416', 'CWE-59', 'CWE-190', 'CWE-189', 'CWE-269', 'CWE-476',
       'CWE-400', 'CWE-310', 'CWE-79', 'CWE-835', 'CWE-254', 'CWE-772',
       'CWE-134', 'CWE-17', 'CWE-120', 'CWE-732', 'CWE-78', 'CWE-22',
       'CWE-704', 'CWE-346', 'CWE-77', 'CWE-824', 'CWE-74', 'CWE-285',
       'CWE-611', 'CWE-682', 'CWE-754', 'CWE-404', 'CWE-352', 'CWE-347',
       'CWE-369', 'CWE-665', 'CWE-295', 'CWE-172', 'CWE-862', 'CWE-287',
       'CWE-311', 'CWE-320', 'CWE-494', 'CWE-252', 'CWE-834', 'CWE-617',
       'CWE-763'], dtype=object)

In [27]:
# Per-sample ROUGE scores; 'id' is the row position of the sample.
df_

Unnamed: 0,id,rouge1,rouge2,rougeL
0,0,1.000000,1.000,1.000000
1,1,1.000000,1.000,1.000000
2,2,1.000000,1.000,1.000000
3,3,1.000000,1.000,1.000000
4,4,1.000000,1.000,1.000000
...,...,...,...,...
1345,1345,0.315789,0.000,0.315789
1346,1346,1.000000,1.000,1.000000
1347,1347,0.777778,0.625,0.777778
1348,1348,1.000000,1.000,1.000000


In [28]:
# Put the ROUGE scores and the CVE/CWE metadata side by side. concat(axis=1)
# aligns rows on the index, so this relies on df_ and df_foo sharing the same
# index and row order -- i.e. the merge that built df_foo must not have added
# or reordered rows.
df_foo_final = pd.concat([df_, df_foo], axis=1)

In [29]:
# Summary statistics of the numeric columns (id and the three ROUGE scores).
df_foo_final.describe()

Unnamed: 0,id,rouge1,rouge2,rougeL
count,1350.0,1350.0,1350.0,1350.0
mean,674.5,0.856558,0.776288,0.855428
std,389.855743,0.244799,0.35131,0.247099
min,0.0,0.0,0.0,0.0
25%,337.25,0.8,0.666667,0.8
50%,674.5,1.0,1.0,1.0
75%,1011.75,1.0,1.0,1.0
max,1349.0,1.0,1.0,1.0


In [30]:
# Show the combined frame: scores plus CVE ID, CWE ID and processed_func.
df_foo_final

Unnamed: 0,id,rouge1,rouge2,rougeL,CVE ID,CWE ID,processed_func
0,0,1.000000,1.000,1.000000,CVE-2017-3731,CWE-125,"static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int..."
1,1,1.000000,1.000,1.000000,CVE-2013-2867,,void BluetoothDeviceChromeOS::OnUnregisterAgen...
2,2,1.000000,1.000,1.000000,CVE-2012-5131,,static void unregisterBlobURLTask(void* contex...
3,3,1.000000,1.000,1.000000,CVE-2016-5842,CWE-125,}\nstatic inline void WriteResourceLong(unsign...
4,4,1.000000,1.000,1.000000,CVE-2012-2875,,void PromoResourceService::PromoResourceStateC...
...,...,...,...,...,...,...,...
1345,1345,0.315789,0.000,0.315789,CVE-2012-6657,CWE-264,"int sock_setsockopt(struct socket *sock, int l..."
1346,1346,1.000000,1.000,1.000000,CVE-2016-6787,CWE-264,static void perf_event_for_each_child(struct p...
1347,1347,0.777778,0.625,0.777778,CVE-2016-3951,,"static int cdc_ncm_bind(struct usbnet *dev, st..."
1348,1348,1.000000,1.000,1.000000,CVE-2016-1621,CWE-119,static unsigned int subpel_variance_ref(const ...


In [31]:
# Mean rougeL and sample count per CWE ID. groupby() drops missing keys by
# default, so rows without a CWE ID do not contribute to any group.
average_rougeL = df_foo_final.groupby('CWE ID')['rougeL'].agg(['mean', 'count'])

In [32]:
# Keep only CWE IDs with at least 10 samples and list them from the lowest
# mean rougeL upwards (worst-scoring categories first).
average_rougeL[average_rougeL['count'] >= 10].sort_values('mean', ascending=True).reset_index()

Unnamed: 0,CWE ID,mean,count
0,CWE-200,0.750629,58
1,CWE-254,0.77402,17
2,CWE-415,0.776335,19
3,CWE-189,0.7932,44
4,CWE-362,0.808739,28
5,CWE-264,0.81809,57
6,CWE-476,0.842785,26
7,CWE-399,0.843496,102
8,CWE-416,0.853391,34
9,CWE-20,0.8549,134


In [33]:
# Number of unique CWE entries (the None entry is counted as well).
len(CWE_IDs)

55

In [37]:
# Select every scored sample whose CWE ID is CWE-787.
is_cwe_787 = df_foo_final['CWE ID'] == 'CWE-787'
df_foo_final.loc[is_cwe_787]

Unnamed: 0,id,rouge1,rouge2,rougeL,CVE ID,CWE ID,processed_func
17,17,1.0,1.0,1.0,CVE-2019-14934,CWE-787,"int pdf_load_xrefs(FILE *fp, pdf_t *pdf) {\n ..."
154,154,0.8,0.75,0.8,CVE-2017-7863,CWE-787,static int decode_trns_chunk(AVCodecContext *a...
169,169,1.0,1.0,1.0,CVE-2019-14934,CWE-787,"static char *get_object(FILE *fp, int obj_id, ..."
400,400,1.0,1.0,1.0,CVE-2018-5388,CWE-787,static bool on_accept(private_stroke_socket_t ...
459,459,1.0,1.0,1.0,CVE-2017-5509,CWE-787,static ssize_t WritePSDChannels(const PSDInfo ...
513,513,0.8,0.666667,0.8,CVE-2017-9203,CWE-787,static int bmpr_read_rle(struct iwbmprcontext ...
597,597,1.0,1.0,1.0,CVE-2018-10540,CWE-787,"int ParseRiffHeaderConfig(FILE *infile, char *..."
621,621,1.0,1.0,1.0,CVE-2018-9496,CWE-787,VOID ixheaacd_esbr_postradixcompute2(WORD32 *p...
661,661,1.0,1.0,1.0,CVE-2017-14040,CWE-787,"opj_image_t *tgatoimage(const char *filename, ..."
702,702,1.0,1.0,1.0,CVE-2017-5509,CWE-787,static size_t WritePSDChannels(const PSDInfo *...


In [59]:
# Fetch the processed function text of the sample with id 955, using one
# .loc lookup (row mask + column label) rather than two chained selections.
row_mask = df_foo_final['id'] == 955
sample = df_foo_final.loc[row_mask, 'processed_func'].values[0]

In [60]:
# print() renders the embedded newlines so the function is readable as code.
print(sample)

static void write_version(FILE *fp, const char *fname, const char *dirname,
                          xref_t *xref) {
  long start;
  char *c, *new_fname, data;
  FILE *new_fp;
  start = ftell(fp); /* Create file */
  if ((c = strstr(fname, ".pdf"))) *c = '\0';
  new_fname = malloc(strlen(fname) + strlen(dirname) + 16);
  snprintf(new_fname, strlen(fname) + strlen(dirname) + 16,
           "%s/%s-version-%d.pdf", dirname, fname, xref->version);
  if (!(new_fp = fopen(new_fname, "w"))) {
    ERR("Could not create file '%s'\n", new_fname);
    fseek(fp, start, SEEK_SET);
    free(new_fname);
    return;
  } /* Copy original PDF */
  fseek(fp, 0, SEEK_SET);
  while (fread(&data, 1, 1, fp))
    fwrite(&data, 1, 1,
           new_fp); /* Emit an older startxref, refering to an older version. */
  fprintf(new_fp, "\r\nstartxref\r\n%ld\r\n%%%%EOF", xref->start); /* Clean */
  fclose(new_fp);
  free(new_fname);
  fseek(fp, start, SEEK_SET);
}


In [40]:
# Inspect the model prediction at a hard-coded row index (726).
preds[726]

'via behavior1 ) external newline zero-'

In [41]:
# Reference text for the same row index (726), to compare with the prediction.
references[726]

'the (internal or trailing ) padding field'

In [53]:
# Print the 'vul_func_with_fix' text stored for CVE-2018-5388.
# NOTE: df_bigvul was reduced to the first row per CVE ID, so this is that
# first row's function and not necessarily the function of the test sample
# with the same CVE ID.
print(df_bigvul[df_bigvul['CVE ID'] == 'CVE-2018-5388']['vul_func_with_fix'].values[0])

static void pop_end(stroke_msg_t *msg, const char* label, stroke_end_t *end)
{
	pop_string(msg, &end->address);
	pop_string(msg, &end->subnets);
	pop_string(msg, &end->sourceip);
	pop_string(msg, &end->dns);
	pop_string(msg, &end->auth);
	pop_string(msg, &end->auth2);
	pop_string(msg, &end->id);
	pop_string(msg, &end->id2);
	pop_string(msg, &end->rsakey);
	pop_string(msg, &end->cert);
	pop_string(msg, &end->cert2);
	pop_string(msg, &end->ca);
	pop_string(msg, &end->ca2);
	pop_string(msg, &end->groups);
	pop_string(msg, &end->groups2);
	pop_string(msg, &end->cert_policy);
	pop_string(msg, &end->updown);

	DBG_OPT("  %s=%s", label, end->address);
	DBG_OPT("  %ssubnet=%s", label, end->subnets);
	DBG_OPT("  %ssourceip=%s", label, end->sourceip);
	DBG_OPT("  %sdns=%s", label, end->dns);
	DBG_OPT("  %sauth=%s", label, end->auth);
	DBG_OPT("  %sauth2=%s", label, end->auth2);
	DBG_OPT("  %sid=%s", label, end->id);
	DBG_OPT("  %sid2=%s", label, end->id2);
	DBG_OPT("  %srsakey=%s", label, end->r