# Get labels from test suite manifest

In [7]:
import pandas as pd
import xml.etree.ElementTree as et
import re
from IPython.display import clear_output

### Load and parse manifest.xml

In [6]:
# Parse the test-suite manifest and collect every <testcase> element that is
# a direct child of the XML root; the later cells iterate over `_testcases`.
tree = et.parse('/mnt/md0/user/scheuererra68323/testset_jtt/C/manifest.xml')
root = tree.getroot()

# NOTE(review): `df` is not read by any cell visible in this notebook;
# it looks like a leftover -- confirm before removing.
df = pd.DataFrame()
_testcases = root.findall('testcase')
# Quick sanity check on how many testcases the manifest declares.
print(len(_testcases), 'testcases found')

64123 testcases found


In [8]:
# Extract one label row per <flaw> element of the manifest.
#
# Every row holds the (manifest-relative) file path, the flawed line number and
# a single "CWE-<id>" column set to 1.0; all other CWE columns are NaN.
show_progress = True

n = len(_testcases)

# Flaw names look like "CWE-<digits>: <description>"; capture the digits.
_cwe_pattern = re.compile('CWE-([0-9]+):')

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Growing a DataFrame row by row copies it on every step (quadratic), and
# DataFrame.append no longer exists in pandas >= 2.0.
_label_records = []

for i, _testcase in enumerate(_testcases, start=1):
    for _file in _testcase.findall('file'):
        _path = _file.attrib.get('path')

        # get relevant line numbers for this testcase
        for _flaw in _file.findall('flaw'):
            _flaw_line = _flaw.attrib.get('line')
            _flaw_name = _flaw.attrib.get('name')
            _cwe_match = _cwe_pattern.search(_flaw_name or '')

            # Fail with a message that names the offending file instead of an
            # opaque AttributeError/TypeError on a missing attribute.
            if _flaw_line is None or _cwe_match is None:
                raise ValueError(
                    'Malformed <flaw> in file {!r}: line={!r}, name={!r}'.format(
                        _path, _flaw_line, _flaw_name))

            _label_records.append({
                'path': _path,
                'line': int(_flaw_line),
                'CWE-' + _cwe_match.group(1): True,
            })

    # print progress
    if show_progress:
        clear_output(wait=True)
        print('Processed testcase {:5} of {:5} ({:3.1f}%)'.format(i, n, i/n * 100))

# this holds the extracted labels; columns appear in order of first occurrence
testcase_labels = pd.DataFrame(_label_records)

# Store the line number and the CWE flags as float64 (flag set -> 1.0,
# flag absent -> NaN), i.e. the same value representation the row-wise
# construction produced, so consumers of the saved labels see the same schema.
_float_columns = [
    _col for _col in testcase_labels.columns
    if _col == 'line' or _col.startswith('CWE-')
]
testcase_labels = testcase_labels.astype({_col: 'float64' for _col in _float_columns})

testcase_labels['dataset'] = 'SARD-102'
testcase_labels['label_source'] = 'test_suite'

# Preview up to 10 random rows (capped so small manifests do not raise).
testcase_labels.sample(n=min(10, len(testcase_labels)))

Processed testcase 64123 of 64123 (100.0%)


Unnamed: 0,CWE-114,line,path,CWE-121,CWE-135,CWE-126,CWE-122,CWE-123,CWE-124,CWE-127,...,CWE-780,CWE-785,CWE-789,CWE-078,CWE-832,CWE-835,CWE-843,CWE-090,dataset,label_source
47878,,37.0,CWE588_Attempt_to_Access_Child_of_Non_Structur...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
34929,,31.0,CWE272_Least_Privilege_Violation__w32_char_Reg...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
10825,,40.0,CWE122_Heap_Based_Buffer_Overflow__c_CWE806_wc...,,,,1.0,,,,...,,,,,,,,,SARD-102,test_suite
40139,,30.0,CWE400_Resource_Exhaustion__fscanf_for_loop_81...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
14041,,31.0,CWE124_Buffer_Underwrite__wchar_t_declare_memc...,,,,,,1.0,,...,,,,,,,,,SARD-102,test_suite
10050,,57.0,CWE122_Heap_Based_Buffer_Overflow__c_CWE805_st...,,,,1.0,,,,...,,,,,,,,,SARD-102,test_suite
51682,,38.0,CWE665_Improper_Initialization__wchar_t_cat_72...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
37323,,65.0,CWE36_Absolute_Path_Traversal__char_environmen...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
23833,,31.0,CWE190_Integer_Overflow__unsigned_int_fscanf_p...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
12472,,45.0,CWE124_Buffer_Underwrite__char_declare_cpy_07.c,,,,,,1.0,,...,,,,,,,,,SARD-102,test_suite


In [12]:
# Turn the manifest-relative paths into absolute paths below the test-set root.
# Only paths that do not already start with the root are prefixed, so running
# this cell more than once does not prepend the directory repeatedly.
_testset_root = "/mnt/md0/user/scheuererra68323/testset_jtt/C/"
_paths = testcase_labels['path']
testcase_labels['path'] = _paths.where(
    _paths.str.startswith(_testset_root, na=False),  # keep already-absolute paths
    _testset_root + _paths,
)

In [13]:
# Sanity check: print the (rows, columns) shape of the label table, then
# display its first rows.
print( testcase_labels.shape )
testcase_labels.head()

(65263, 123)


Unnamed: 0,CWE-114,line,path,CWE-121,CWE-135,CWE-126,CWE-122,CWE-123,CWE-124,CWE-127,...,CWE-780,CWE-785,CWE-789,CWE-078,CWE-832,CWE-835,CWE-843,CWE-090,dataset,label_source
0,1.0,121.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
1,1.0,124.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
2,1.0,124.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
3,1.0,131.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
4,1.0,131.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite


In [14]:
# Display 10 randomly chosen rows as a spot check of the label table.
testcase_labels.sample(n=10)

Unnamed: 0,CWE-114,line,path,CWE-121,CWE-135,CWE-126,CWE-122,CWE-123,CWE-124,CWE-127,...,CWE-780,CWE-785,CWE-789,CWE-078,CWE-832,CWE-835,CWE-843,CWE-090,dataset,label_source
8824,,42.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,1.0,,,,...,,,,,,,,,SARD-102,test_suite
16842,,51.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,1.0,...,,,,,,,,,SARD-102,test_suite
57255,,43.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
32228,,144.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
37804,,146.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
13373,,34.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,1.0,,...,,,,,,,,,SARD-102,test_suite
20390,,54.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
47390,,31.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
20343,,65.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite
45963,,38.0,/mnt/md0/user/scheuererra68323/testset_jtt/C/C...,,,,,,,,...,,,,,,,,,SARD-102,test_suite


In [15]:
# Persist the label table to HDF5 under the key 'JTT_Testcases_Labels'.
# `key` is passed by keyword: newer pandas releases deprecate passing it
# positionally.
testcase_labels.to_hdf('/mnt/md0/user/scheuererra68323/testset_jtt/JTT_Testcases_Labels.h5', key='JTT_Testcases_Labels')