In [68]:
import numpy as np
import pandas as pd

In [69]:
def concat_to_df(df):
    df_temp = []
    for chunk in df:
        df_temp.append(chunk)

    df = pd.concat(df_temp,ignore_index = True)
    return df

In [153]:
msr_df = pd.read_csv('preprocessed datasets/MSR.csv', chunksize=10000)

In [154]:
mvd_df = pd.read_csv('preprocessed datasets/MVD.csv', chunksize=10000)

In [155]:
msr_df = concat_to_df(msr_df)
msr_df.size

1320452

In [156]:
mvd_df = concat_to_df(mvd_df)
mvd_df.size

908205

### Check overlapped dataset

In [157]:
msr_df['CWE ID'] = msr_df['CWE ID'].replace(np.nan, 'non-vulnerable')
msr_df['CWE ID'].unique()

array(['CWE-264', 'CWE-119', 'CWE-310', 'CWE-189', 'CWE-200',
       'non-vulnerable', 'CWE-20', 'CWE-416', 'CWE-399', 'CWE-835',
       'CWE-125', 'CWE-347', 'CWE-190', 'CWE-94', 'CWE-17', 'CWE-362',
       'CWE-704', 'CWE-354', 'CWE-78', 'CWE-59', 'CWE-269', 'CWE-388',
       'CWE-617', 'CWE-476', 'CWE-769', 'CWE-787', 'CWE-754', 'CWE-502',
       'CWE-22', 'CWE-665', 'CWE-254', 'CWE-772', 'CWE-290', 'CWE-400',
       'CWE-369', 'CWE-284', 'CWE-93', 'CWE-415', 'CWE-674', 'CWE-311',
       'CWE-330', 'CWE-327', 'CWE-320', 'CWE-77', 'CWE-134', 'CWE-732',
       'CWE-287', 'CWE-16', 'CWE-89', 'CWE-255', 'CWE-18', 'CWE-19',
       'CWE-285', 'CWE-74', 'CWE-79', 'CWE-404', 'CWE-862', 'CWE-834',
       'CWE-494', 'CWE-755', 'CWE-129', 'CWE-191', 'CWE-682', 'CWE-522',
       'CWE-918', 'CWE-358', 'CWE-295', 'CWE-770', 'CWE-346', 'CWE-532',
       'CWE-426', 'CWE-824', 'CWE-693', 'CWE-352', 'CWE-436', 'CWE-90',
       'CWE-763', 'CWE-120', 'CWE-611', 'CWE-601', 'CWE-706', 'CWE-361',
       '

In [158]:
msr_cwe_list = msr_df['CWE ID'].unique().tolist()

In [159]:
mvd_cwe_list = mvd_df['CWE ID'].unique().tolist()
mvd_df['CWE ID'].unique()

array(['non-vulnerable', 'CWE-311', 'CWE-706',
       'CWE-400, CWE-665, CWE-020', 'CWE-704', 'CWE-666', 'CWE-119',
       'CWE-404', 'CWE-400', 'CWE-369', 'CWE-074', 'CWE-506', 'CWE-362',
       'CWE-662', 'CWE-476', 'CWE-020', 'CWE-758', 'CWE-327', 'CWE-668',
       'CWE-400, CWE-404', 'CWE-119, CWE-666, CWE-573', 'CWE-190',
       'CWE-467', 'CWE-170', 'CWE-676', 'CWE-187', 'CWE-670', 'CWE-573',
       'CWE-191', 'CWE-754', 'CWE-666, CWE-573', 'CWE-404, CWE-668',
       'CWE-400, CWE-665', 'CWE-221', 'CWE-610', 'CWE-469',
       'CWE-662, CWE-573', 'CWE-665', 'CWE-138', 'CWE-834', 'CWE-673'],
      dtype=object)

In [160]:
# Split all multi-labeled cwe_ids elemtents and make unique cwe_id list

mvd_final_cwe_list = []
for elem in mvd_cwe_list:
    temp = elem.split(", ")
    if len(temp) > 1:
        print(elem)
    mvd_final_cwe_list +=temp

print("Before removing overlapped cwe_ids in MVD: Total ",len(mvd_final_cwe_list))
mvd_final_cwe_list = list(set(mvd_final_cwe_list))
print("After removing overlapped cwe_ids in MVD: Total ", len(mvd_final_cwe_list))


CWE-400, CWE-665, CWE-020
CWE-400, CWE-404
CWE-119, CWE-666, CWE-573
CWE-666, CWE-573
CWE-404, CWE-668
CWE-400, CWE-665
CWE-662, CWE-573
Before removing overlapped cwe_ids in MVD: Total  50
After removing overlapped cwe_ids in MVD: Total  34


In [161]:
cwe_list = msr_cwe_list+mvd_final_cwe_list
total_cwe_list = list(set(cwe_list))
print(f"# of overlapped CWE IDs: {len(cwe_list)-len(set(cwe_list))} \n# of total CWE IDs: {len(total_cwe_list)}")

# of overlapped CWE IDs: 17 
# of total CWE IDs: 109


In [162]:
'non-vulnerable' in total_cwe_list

True

In [163]:
# Check NaN
'nan' in total_cwe_list

False

### Define integrated cwe_ids labels

In [164]:
# sorted by cwe_ids and non-vulnerable as index: 0

import re

def sort_by_number(my_list):
    def extract_number(string):
        match = re.search(r'\d+', string)
        if match:
            return int(match.group())
        else:
            return float('inf')
    return sorted(my_list, key=extract_number)

total_cwe_list = sort_by_number(total_cwe_list)
total_cwe_list.remove('non-vulnerable')
total_cwe_list.insert(0, 'non-vulnerable')
print(total_cwe_list)  # prints ['item1', 'item2', 'item3', 'item10', 'item25']


['non-vulnerable', 'CWE-16', 'CWE-17', 'CWE-18', 'CWE-19', 'CWE-20', 'CWE-020', 'CWE-22', 'CWE-59', 'CWE-74', 'CWE-074', 'CWE-77', 'CWE-78', 'CWE-79', 'CWE-89', 'CWE-90', 'CWE-93', 'CWE-94', 'CWE-119', 'CWE-120', 'CWE-125', 'CWE-129', 'CWE-134', 'CWE-138', 'CWE-170', 'CWE-172', 'CWE-187', 'CWE-189', 'CWE-190', 'CWE-191', 'CWE-200', 'CWE-209', 'CWE-221', 'CWE-252', 'CWE-254', 'CWE-255', 'CWE-264', 'CWE-269', 'CWE-281', 'CWE-284', 'CWE-285', 'CWE-287', 'CWE-290', 'CWE-295', 'CWE-310', 'CWE-311', 'CWE-320', 'CWE-327', 'CWE-330', 'CWE-331', 'CWE-345', 'CWE-346', 'CWE-347', 'CWE-352', 'CWE-354', 'CWE-358', 'CWE-361', 'CWE-362', 'CWE-369', 'CWE-388', 'CWE-399', 'CWE-400', 'CWE-404', 'CWE-415', 'CWE-416', 'CWE-426', 'CWE-436', 'CWE-467', 'CWE-469', 'CWE-476', 'CWE-494', 'CWE-502', 'CWE-506', 'CWE-522', 'CWE-532', 'CWE-573', 'CWE-601', 'CWE-610', 'CWE-611', 'CWE-617', 'CWE-662', 'CWE-664', 'CWE-665', 'CWE-666', 'CWE-668', 'CWE-670', 'CWE-673', 'CWE-674', 'CWE-676', 'CWE-682', 'CWE-693', 'CWE-7

In [165]:
total_cwe_dict = { cwe_id_key: idx for idx, cwe_id_key in enumerate(total_cwe_list)}
print(total_cwe_dict)

{'non-vulnerable': 0, 'CWE-16': 1, 'CWE-17': 2, 'CWE-18': 3, 'CWE-19': 4, 'CWE-20': 5, 'CWE-020': 6, 'CWE-22': 7, 'CWE-59': 8, 'CWE-74': 9, 'CWE-074': 10, 'CWE-77': 11, 'CWE-78': 12, 'CWE-79': 13, 'CWE-89': 14, 'CWE-90': 15, 'CWE-93': 16, 'CWE-94': 17, 'CWE-119': 18, 'CWE-120': 19, 'CWE-125': 20, 'CWE-129': 21, 'CWE-134': 22, 'CWE-138': 23, 'CWE-170': 24, 'CWE-172': 25, 'CWE-187': 26, 'CWE-189': 27, 'CWE-190': 28, 'CWE-191': 29, 'CWE-200': 30, 'CWE-209': 31, 'CWE-221': 32, 'CWE-252': 33, 'CWE-254': 34, 'CWE-255': 35, 'CWE-264': 36, 'CWE-269': 37, 'CWE-281': 38, 'CWE-284': 39, 'CWE-285': 40, 'CWE-287': 41, 'CWE-290': 42, 'CWE-295': 43, 'CWE-310': 44, 'CWE-311': 45, 'CWE-320': 46, 'CWE-327': 47, 'CWE-330': 48, 'CWE-331': 49, 'CWE-345': 50, 'CWE-346': 51, 'CWE-347': 52, 'CWE-352': 53, 'CWE-354': 54, 'CWE-358': 55, 'CWE-361': 56, 'CWE-362': 57, 'CWE-369': 58, 'CWE-388': 59, 'CWE-399': 60, 'CWE-400': 61, 'CWE-404': 62, 'CWE-415': 63, 'CWE-416': 64, 'CWE-426': 65, 'CWE-436': 66, 'CWE-467': 6

### Merging two datasets with integrated CWE IDs labels

In [166]:
def check_match_vul_and_CWE_ID(df):
    count = 0
    for index, row in df.iterrows():
        if row['vul'] == 0 and row['CWE ID'] != 'non-vulnerable':
            print("vul and CWE ID are not matched!!!!!",row['vul'], row['CWE ID'])
            break

    if count==0:
        print("All vul and CWE ID are matched correctly")
        
def check_match_vul_and_label(df):
    count = 0
   
    for index, row in df.iterrows():
        if row['vul'] == 0 and row['label'] != 0:
            print("vul and label are not matched!!!!!",row['vul'], row['label'])
            break
    if count==0:
        print("All vul and label are matched correctly")


            

In [167]:
check_match_vul_and_CWE_ID(msr_df)

vul and CWE ID are not matched!!!!! 0 CWE-264
All vul and CWE ID are matched correctly


In [168]:
# add new column based on dictionary values
msr_df['label'] = msr_df['CWE ID'].apply(lambda x: total_cwe_dict.get(x))
msr_df

Unnamed: 0.1,Unnamed: 0,CWE ID,vul,lang,func_after,func_before,vul_func_with_fix,label
0,0,CWE-264,0,C,static bool check_rodc_critical_attribute(stru...,static bool check_rodc_critical_attribute(stru...,static bool check_rodc_critical_attribute(stru...,36
1,1,CWE-264,0,C,static int samldb_add_entry(struct samldb_ctx ...,static int samldb_add_entry(struct samldb_ctx ...,static int samldb_add_entry(struct samldb_ctx ...,36
2,2,CWE-264,0,C,static int samldb_add_entry_callback(struct ld...,static int samldb_add_entry_callback(struct ld...,static int samldb_add_entry_callback(struct ld...,36
3,3,CWE-264,0,C,static int samldb_add_handle_msDS_IntId(struct...,static int samldb_add_handle_msDS_IntId(struct...,static int samldb_add_handle_msDS_IntId(struct...,36
4,4,CWE-264,0,C,static int samldb_add_step(struct samldb_ctx *...,static int samldb_add_step(struct samldb_ctx *...,static int samldb_add_step(struct samldb_ctx *...,36
...,...,...,...,...,...,...,...,...
188631,188631,CWE-119,1,C,void impeg2d_dec_p_mb_params(dec_state_t *ps_d...,void impeg2d_dec_p_mb_params(dec_state_t *ps_d...,void impeg2d_dec_p_mb_params(dec_state_t *ps_d...,18
188632,188632,CWE-119,1,C,void impeg2d_dec_pnb_mb_params(dec_state_t *ps...,void impeg2d_dec_pnb_mb_params(dec_state_t *ps...,void impeg2d_dec_pnb_mb_params(dec_state_t *ps...,18
188633,188633,CWE-200,1,C,int equalizer_get_parameter(effect_context_t *...,int equalizer_get_parameter(effect_context_t *...,int equalizer_get_parameter(effect_context_t *...,30
188634,188634,CWE-125,1,C,"uint8_t rfc_parse_data(tRFC_MCB* p_mcb, MX_FRA...","uint8_t rfc_parse_data(tRFC_MCB* p_mcb, MX_FRA...","uint8_t rfc_parse_data(tRFC_MCB* p_mcb, MX_FRA...",20


In [169]:
# add new column based on dictionary values
def multi_labeling(x):
    cwe_list = x.split(', ')
    if len(cwe_list) == 1:
        return total_cwe_dict.get(x)
    else: #for multi-labeled dataset
        multi_label_list = []
        for cwe in cwe_list:
            multi_label_list.append(str(total_cwe_dict.get(cwe)))
        result = ', '.join(multi_label_list)
        return result

mvd_df['label'] = mvd_df['CWE ID'].apply(multi_labeling)
mvd_df

Unnamed: 0.1,Unnamed: 0,info,CWE ID,code,vul,label
0,0,150200/dirent_uri.c memset 405,non-vulnerable,const char *svn_relpath__internal_style(const ...,0,0
1,1,79848/CWE134_Uncontrolled_Format_String__char...,non-vulnerable,static void goodG2B() 64\nchar * data ; 66\nch...,0,0
2,2,94203/CWE319_Cleartext_Tx_Sensitive_Info__w32...,non-vulnerable,static void goodB2G2() 289\nwchar_t * password...,0,0
3,3,151133/utils.c strcmp 737,non-vulnerable,"int avformat_open_input(AVFormatContext **ps,c...",0,0
4,4,94266/CWE319_Cleartext_Tx_Sensitive_Info__w32...,CWE-311,void CWE319_Cleartext_Tx_Sensitive_Info__w32_w...,1,45
...,...,...,...,...,...,...
181636,181636,81637 150417/utils.c memcpy 1644,non-vulnerable,int avcodec_encode_audio(AVCodecContext *avctx...,0,0
181637,181637,81638 1293/nxt-bad.c memcpy 405,non-vulnerable,"int main() 548\nint msglen , ret ; 550\nu_char...",0,0
181638,181638,81639 85610/CWE191_Integer_Underflow__int_list...,non-vulnerable,void bad() 48\nWSADATA wsaData ; 55\nint recvR...,0,0
181639,181639,81640 68361/CWE122_Heap_Based_Buffer_Overflow_...,CWE-119,void bad() 34\nchar * data ; 36\ndata = NULL; ...,1,18


In [170]:
mvd_pre_df = mvd_df[['code', 'vul', 'CWE ID','label']]

mvd_pre_df

Unnamed: 0,code,vul,CWE ID,label
0,const char *svn_relpath__internal_style(const ...,0,non-vulnerable,0
1,static void goodG2B() 64\nchar * data ; 66\nch...,0,non-vulnerable,0
2,static void goodB2G2() 289\nwchar_t * password...,0,non-vulnerable,0
3,"int avformat_open_input(AVFormatContext **ps,c...",0,non-vulnerable,0
4,void CWE319_Cleartext_Tx_Sensitive_Info__w32_w...,1,CWE-311,45
...,...,...,...,...
181636,int avcodec_encode_audio(AVCodecContext *avctx...,0,non-vulnerable,0
181637,"int main() 548\nint msglen , ret ; 550\nu_char...",0,non-vulnerable,0
181638,void bad() 48\nWSADATA wsaData ; 55\nint recvR...,0,non-vulnerable,0
181639,void bad() 34\nchar * data ; 36\ndata = NULL; ...,1,CWE-119,18


In [171]:
msr_pre_df = msr_df[['func_before', 'vul', 'CWE ID','label']]
msr_pre_df = msr_pre_df.rename(columns={'func_before':'code'})
msr_pre_df

Unnamed: 0,code,vul,CWE ID,label
0,static bool check_rodc_critical_attribute(stru...,0,CWE-264,36
1,static int samldb_add_entry(struct samldb_ctx ...,0,CWE-264,36
2,static int samldb_add_entry_callback(struct ld...,0,CWE-264,36
3,static int samldb_add_handle_msDS_IntId(struct...,0,CWE-264,36
4,static int samldb_add_step(struct samldb_ctx *...,0,CWE-264,36
...,...,...,...,...
188631,void impeg2d_dec_p_mb_params(dec_state_t *ps_d...,1,CWE-119,18
188632,void impeg2d_dec_pnb_mb_params(dec_state_t *ps...,1,CWE-119,18
188633,int equalizer_get_parameter(effect_context_t *...,1,CWE-200,30
188634,"uint8_t rfc_parse_data(tRFC_MCB* p_mcb, MX_FRA...",1,CWE-125,20


In [172]:
check_match_vul_and_label(mvd_pre_df)

All vul and label are matched correctly


In [173]:
check_match_vul_and_label(msr_pre_df)

vul and label are not matched!!!!! 0 36
All vul and label are matched correctly


In [174]:
# merge the DataFrames on the 'key' column
concatenated_df = pd.concat([msr_pre_df, mvd_pre_df])

# print the concatenated DataFrame
concatenated_df

Unnamed: 0,code,vul,CWE ID,label
0,static bool check_rodc_critical_attribute(stru...,0,CWE-264,36
1,static int samldb_add_entry(struct samldb_ctx ...,0,CWE-264,36
2,static int samldb_add_entry_callback(struct ld...,0,CWE-264,36
3,static int samldb_add_handle_msDS_IntId(struct...,0,CWE-264,36
4,static int samldb_add_step(struct samldb_ctx *...,0,CWE-264,36
...,...,...,...,...
181636,int avcodec_encode_audio(AVCodecContext *avctx...,0,non-vulnerable,0
181637,"int main() 548\nint msglen , ret ; 550\nu_char...",0,non-vulnerable,0
181638,void bad() 48\nWSADATA wsaData ; 55\nint recvR...,0,non-vulnerable,0
181639,void bad() 34\nchar * data ; 36\ndata = NULL; ...,1,CWE-119,18
