In [1]:
import os
from torch_geometric.data import InMemoryDataset
import requests, zipfile, io
import pickle
import pandas as pd
import numpy as np
# Here, I have disabled a false alarm that would otherwise trip later in the project.
pd.options.mode.chained_assignment = None

# The datetime library will let me filter the data by reporting date.
from datetime import datetime, timedelta
# Since the NVD data is housed in JavaScript Object Notation (JSON) format, I will need the json_normalize function to access and manipulate the information.
from pandas.io.json import json_normalize
from torch_geometric.data import Data
import sys
import torch
import re

In [2]:
import torch# If there's a GPU available...
import random
import multiprocessing
import time

NUM_GPUS=0

try:
    if torch.cuda.is_available():  
        device = torch.device("cuda")
        NUM_GPUS=torch.cuda.device_count()
        print('There are %d GPU(s) available.' % NUM_GPUS)
        print('We will use the GPU:', torch.cuda.get_device_name())# If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")  
except:
    print('Cuda error using CPU instead.')
    device = torch.device("cpu")  
    
print(device)

# device = torch.device("cpu")  
# print(device)

NUM_PROCESSORS=multiprocessing.cpu_count()
print("Cpu count: ",NUM_PROCESSORS)

No GPU available, using the CPU instead.
cpu
Cpu count:  4


In [3]:
import pprint

In [4]:
# # Expanding view area to facilitate data manipulation.
# pd.set_option('display.max_rows', 20)
# pd.set_option('display.max_columns', 100)

In [5]:
class DataPath():
    def __init__(self,dataset_dir='',results_dir=''):
        self.PATH_TO_DATASETS_DIRECTORY=dataset_dir
        self.PATH_TO_RESULTS_DIRECTORY=results_dir
        
        self.NVD_CWE_FILE=self.PATH_TO_DATASETS_DIRECTORY+'NVD_CWE.csv'        
        self.MITRE_CWE_FILE=self.PATH_TO_DATASETS_DIRECTORY+'MITRE_CWE_1000.csv'
        self.MITRE_CAPEC_FILE=self.PATH_TO_DATASETS_DIRECTORY+'MITRE_CAPEC_1000.csv'
        
    
        if not os.path.exists(self.PATH_TO_DATASETS_DIRECTORY):
            print("Creating directory: ",self.PATH_TO_DATASETS_DIRECTORY)
            os.makedirs(self.PATH_TO_DATASETS_DIRECTORY)
        if not os.path.exists(self.PATH_TO_RESULTS_DIRECTORY):
            print("Creating directory: ",self.PATH_TO_RESULTS_DIRECTORY)
            os.makedirs(self.PATH_TO_RESULTS_DIRECTORY)

dataPath = DataPath('NVD/','NVD/')

In [6]:
def NVD_CWE(dataPath):
    df_CWE=pd.read_csv(dataPath.NVD_CWE_FILE,low_memory=False)    
    return df_CWE

df_CWE=NVD_CWE(dataPath)
df_CWE

Unnamed: 0,CVE Description,CWE Code,Name
0,Access of Resource Using Incompatible Type ('T...,['CWE-843'],CWE-843
1,Access of Uninitialized Pointer The program ac...,['CWE-824'],CWE-824
2,Allocation of Resources Without Limits or Thro...,['CWE-770'],CWE-770
3,Always-Incorrect Control Flow Implementation T...,['CWE-670'],CWE-670
4,Authentication Bypass by Capture-replay A capt...,['CWE-294'],CWE-294
...,...,...,...
119,Use of Password Hash With Insufficient Computa...,['CWE-916'],CWE-916
120,Use of Uninitialized Resource The software use...,['CWE-908'],CWE-908
121,Weak Password Recovery Mechanism for Forgotten...,['CWE-640'],CWE-640
122,Weak Password Requirements The product does no...,['CWE-521'],CWE-521


In [7]:
df_CAPEC_org=pd.read_csv(dataPath.MITRE_CAPEC_FILE,low_memory=False, index_col=False)    
df_CAPEC_org

Unnamed: 0,ID,Name,Abstraction,Status,Description,Alternate Terms,Likelihood Of Attack,Typical Severity,Related Attack Patterns,Execution Flow,Prerequisites,Skills Required,Resources Required,Indicators,Consequences,Mitigations,Example Instances,Related Weaknesses,Taxonomy Mappings,Notes
0,1,Accessing Functionality Not Properly Constrain...,Standard,Draft,"In applications, particularly web applications...",,High,High,::NATURE:ChildOf:CAPEC ID:122::NATURE:CanPrece...,::STEP:1:PHASE:Explore:DESCRIPTION:[Survey] Th...,::The application must be navigable in a manne...,::SKILL:In order to discover unrestricted reso...,::None: No specialized resources are required ...,,::SCOPE:Confidentiality:SCOPE:Access Control:S...,"::In a J2EE setting, administrators can associ...",::Implementing the Model-View-Controller (MVC)...,::276::285::434::693::732::1193::1220::1297::1...,TAXONOMY NAME:ATTACK:ENTRY ID:1574.010:ENTRY N...,
1,10,Buffer Overflow via Environment Variables,Detailed,Draft,This attack pattern involves causing a buffer ...,,High,High,::NATURE:ChildOf:CAPEC ID:100::,::STEP:1:PHASE:Explore:DESCRIPTION:[Identify t...,::The application uses environment variables.:...,::SKILL:An attacker can simply overflow a buff...,,"::If the application does bound checking, it s...",::SCOPE:AvailabilityTECHNICAL IMPACT:Unreliabl...,::Do not expose environment variable to the us...,::Attack Example: Buffer Overflow in $HOME A b...,::120::302::118::119::74::99::20::680::733::697::,TAXONOMY NAME:OWASP Attacks:ENTRY NAME:Buffer ...,
2,100,Overflow Buffers,Standard,Draft,Buffer Overflow attacks target improper or mis...,,High,Very High,::NATURE:ChildOf:CAPEC ID:123::,::STEP:1:PHASE:Explore:DESCRIPTION:[Identify t...,::Targeted software performs buffer operations...,"::SKILL:In most cases, overflowing a buffer do...",::None: No specialized resources are required ...,::An attack designed to leverage a buffer over...,::SCOPE:AvailabilityTECHNICAL IMPACT:Unreliabl...,::Use a language or compiler that performs aut...,::The most straightforward example is an appli...,::120::119::131::129::805::680::,TAXONOMY NAME:WASC:ENTRY ID:07:ENTRY NAME:Buff...,
3,101,Server Side Include (SSI) Injection,Detailed,Draft,An attacker can use Server Side Include (SSI) ...,,High,High,::NATURE:ChildOf:CAPEC ID:253::NATURE:CanPrece...,::STEP:1:PHASE:Explore:DESCRIPTION:[Determine ...,::A web server that supports server side inclu...,::SKILL:The attacker needs to be aware of SSI ...,::None: No specialized resources are required ...,,::SCOPE:ConfidentialityTECHNICAL IMPACT:Read D...,::Set the OPTIONS IncludesNOEXEC in the global...,::Consider a website hosted on a server that p...,::97::74::20::,TAXONOMY NAME:WASC:ENTRY ID:36:ENTRY NAME:SSI ...,
4,102,Session Sidejacking,Detailed,Draft,Session sidejacking takes advantage of an unen...,,High,High,::NATURE:ChildOf:CAPEC ID:593::,::STEP:1:PHASE:Explore:DESCRIPTION:[Detect Unp...,::An attacker and the victim are both using th...,::SKILL:Easy to use tools exist to automate th...,"::A packet sniffing tool, such as wireshark, c...",,::SCOPE:Confidentiality:SCOPE:Access Control:S...,::Make sure that HTTPS is used to communicate ...,::The attacker and the victim are using the sa...,::294::522::523::319::614::,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,94,Adversary in the Middle (AiTM),Meta,Stable,An adversary targets the communication between...,::TERM:Man-in-the-Middle / MITM:DESCRIPTION:::...,High,Very High,::NATURE:CanPrecede:CAPEC ID:151::NATURE:CanPr...,::STEP:1:PHASE:Experiment:DESCRIPTION:The atta...,::There are two components communicating with ...,::SKILL:This attack can get sophisticated sinc...,,,::SCOPE:IntegrityTECHNICAL IMPACT:Modify Data:...,::Ensure Public Keys are signed by a Certifica...,"::In 2017, security researcher Jerry Decime di...",::300::290::593::287::294::,TAXONOMY NAME:ATTACK:ENTRY ID:1557:ENTRY NAME:...,
542,95,WSDL Scanning,Detailed,Draft,This attack targets the WSDL interface made av...,,High,High,::NATURE:ChildOf:CAPEC ID:54::,::STEP:1:PHASE:Explore:DESCRIPTION:[Scan for W...,::A client program connecting to a web service...,::SKILL:This attack can be as simple as readin...,,,::SCOPE:ConfidentialityTECHNICAL IMPACT:Read D...,::It is important to protect WSDL file or prov...,::A WSDL interface may expose a function vulne...,::538::,,
543,96,Block Access to Libraries,Detailed,Draft,An application typically makes calls to functi...,,Medium,Medium,::NATURE:ChildOf:CAPEC ID:603::,::STEP:1:PHASE:Explore:DESCRIPTION:Determine w...,::An application requires access to external l...,::SKILL:Knowledge of how to block access to li...,,,::SCOPE:AvailabilityTECHNICAL IMPACT:Alter Exe...,::Ensure that application handles situations w...,::A web-based system uses a third party crypto...,::589::,,
544,97,Cryptanalysis,Standard,Draft,Cryptanalysis is a process of finding weakness...,,Low,Very High,::NATURE:ChildOf:CAPEC ID:192::NATURE:CanPrece...,::STEP:1:PHASE:Explore:DESCRIPTION:An attacker...,::The target software utilizes some sort of cr...,::SKILL:Cryptanalysis generally requires a ver...,::Computing resource requirements will vary ba...,,::SCOPE:ConfidentialityTECHNICAL IMPACT:Read D...,::Use proven cryptographic algorithms with rec...,::A very easy to understand example is a crypt...,::327::1204::1240::1241::1279::,TAXONOMY NAME:OWASP Attacks:ENTRY NAME:Cryptan...,


In [8]:
## capec to cwe

In [9]:
CAPECS_ID=df_CAPEC_org['ID']
CAPECS_ID
CAPECS_CWE=df_CAPEC_org['Related Weaknesses']
CAPECS_CWE

0      ::276::285::434::693::732::1193::1220::1297::1...
1      ::120::302::118::119::74::99::20::680::733::697::
2                       ::120::119::131::129::805::680::
3                                         ::97::74::20::
4                            ::294::522::523::319::614::
                             ...                        
541                          ::300::290::593::287::294::
542                                              ::538::
543                                              ::589::
544                      ::327::1204::1240::1241::1279::
545                                                  NaN
Name: Related Weaknesses, Length: 546, dtype: object

In [10]:
CAPEC_CWEs=[]

for row in CAPECS_CWE:        
        #print(row)        
        cwes_num=[]
        
        if pd.isna(row):            
            None
        else:
            for cwe in row.split("::"):
                if len(cwe)==0:
                    continue
                cwes_num.append(int(cwe))
        
        #print(cwes_num)
        
        CAPEC_CWEs.append(cwes_num)
        
print(len(CAPEC_CWEs))

546


In [11]:
df_CWE_org=pd.read_csv(dataPath.MITRE_CWE_FILE,low_memory=False, index_col=False)    
df_CWE_org

Unnamed: 0,CWE-ID,Name,Weakness Abstraction,Status,Description,Extended Description,Related Weaknesses,Weakness Ordinalities,Applicable Platforms,Background Details,...,Likelihood of Exploit,Common Consequences,Detection Methods,Potential Mitigations,Observed Examples,Functional Areas,Affected Resources,Taxonomy Mappings,Related Attack Patterns,Notes
0,5,J2EE Misconfiguration: Data Transmission Witho...,Variant,Draft,Information sent over a network can be comprom...,,::NATURE:ChildOf:CWE ID:319:VIEW ID:1000:ORDIN...,,::LANGUAGE NAME:Java:LANGUAGE PREVALENCE:Undet...,,...,,::SCOPE:Confidentiality:IMPACT:Read Applicatio...,,::PHASE:System Configuration:DESCRIPTION:The a...,,,,::TAXONOMY NAME:7 Pernicious Kingdoms:ENTRY NA...,,::TYPE:Other:NOTE:If an application uses SSL t...
1,6,J2EE Misconfiguration: Insufficient Session-ID...,Variant,Incomplete,The J2EE application is configured to use an i...,If an attacker can guess or steal a session ID...,::NATURE:ChildOf:CWE ID:334:VIEW ID:1000:ORDIN...,,::LANGUAGE NAME:Java:LANGUAGE PREVALENCE:Undet...,::Session ID's can be used to identify communi...,...,,::SCOPE:Access Control:IMPACT:Gain Privileges ...,,::PHASE:Implementation:DESCRIPTION:Session ide...,,,,::TAXONOMY NAME:7 Pernicious Kingdoms:ENTRY NA...,::21::59::,
2,7,J2EE Misconfiguration: Missing Custom Error Page,Variant,Incomplete,The default error page of a web application sh...,A Web application must define a default error ...,::NATURE:ChildOf:CWE ID:756:VIEW ID:1000:ORDIN...,,::LANGUAGE NAME:Java:LANGUAGE PREVALENCE:Undet...,,...,,::SCOPE:Confidentiality:IMPACT:Read Applicatio...,,::PHASE:Implementation:DESCRIPTION:Handle exce...,,,,::TAXONOMY NAME:7 Pernicious Kingdoms:ENTRY NA...,,
3,8,J2EE Misconfiguration: Entity Bean Declared Re...,Variant,Incomplete,When an application exposes a remote interface...,,::NATURE:ChildOf:CWE ID:668:VIEW ID:1000:ORDIN...,,,,...,,::SCOPE:Confidentiality:SCOPE:Integrity:IMPACT...,,::PHASE:Implementation:DESCRIPTION:Declare Jav...,,,,::TAXONOMY NAME:7 Pernicious Kingdoms:ENTRY NA...,,::TYPE:Other:NOTE:Entity beans that expose a r...
4,9,J2EE Misconfiguration: Weak Access Permissions...,Variant,Draft,If elevated access rights are assigned to EJB ...,If the EJB deployment descriptor contains one ...,::NATURE:ChildOf:CWE ID:266:VIEW ID:1000:ORDIN...,,,,...,,::SCOPE:Other:IMPACT:Other::,,::PHASE:Architecture and Design System Configu...,,,,::TAXONOMY NAME:7 Pernicious Kingdoms:ENTRY NA...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,1338,Improper Protections Against Hardware Overheating,Base,Draft,A hardware device is missing or has inadequate...,"Hardware, electrical circuits, and semiconduct...",::NATURE:ChildOf:CWE ID:693:VIEW ID:1000::,,::LANGUAGE CLASS:Language-Independent:LANGUAGE...,,...,,::SCOPE:Availability:IMPACT:DoS: Resource Cons...,::METHOD:Dynamic Analysis with Manual Results ...,::PHASE:Architecture and Design:DESCRIPTION:Te...,,,,,,
920,1339,Insufficient Precision or Accuracy of a Real N...,Base,Draft,The program processes a real number with an im...,When a security decision or calculation requir...,::NATURE:ChildOf:CWE ID:682:VIEW ID:1000:ORDIN...,,::LANGUAGE CLASS:Language-Independent:LANGUAGE...,::There are three major ways to store real num...,...,,"::SCOPE:Availability:IMPACT:DoS: Crash, Exit, ...",,::PHASE:Implementation Patching and Maintenanc...,::REFERENCE:CVE-2018-16069:DESCRIPTION:Chain: ...,,,,,
921,1341,Multiple Releases of Same Resource or Handle,Base,Incomplete,The product attempts to close or release a res...,Code typically requires opening handles or ref...,::NATURE:ChildOf:CWE ID:675:VIEW ID:1000:ORDIN...,,::LANGUAGE NAME:Java:LANGUAGE PREVALENCE:Undet...,,...,,::SCOPE:Availability:SCOPE:Integrity:IMPACT:Do...,::METHOD:Automated Static Analysis:DESCRIPTION...,::PHASE:Implementation:DESCRIPTION:When closin...,::REFERENCE:CVE-2019-13351:DESCRIPTION:file de...,,,,,::TYPE:Terminology:NOTE:The terms related to r...
922,1342,Information Exposure through Microarchitectura...,Base,Incomplete,The processor does not properly clear microarc...,"In many processor architectures an exception, ...",::NATURE:ChildOf:CWE ID:226:VIEW ID:1000:ORDIN...,,::LANGUAGE CLASS:Language-Independent:LANGUAGE...,,...,,::SCOPE:Confidentiality:SCOPE:Integrity:IMPACT...,,::PHASE:Architecture and Design Requirements:D...,::REFERENCE:CVE-2020-0551:DESCRIPTION:Load val...,,,,,::TYPE:Relationship:NOTE:CWE-1342 differs from...


In [12]:
NVD_CWEs=[]

for row in df_CWE['Name']:
    cwe=int(row.split('-')[1])    
    #print(cwe)
    NVD_CWEs.append(cwe)
    
print(NVD_CWEs, len(NVD_CWEs))

[843, 824, 770, 670, 294, 290, 639, 120, 312, 319, 362, 352, 502, 425, 369, 415, 494, 834, 668, 200, 610, 552, 209, 287, 295, 273, 754, 913, 94, 116, 924, 178, 755, 665, 20, 22, 59, 667, 88, 79, 74, 78, 89, 281, 269, 212, 404, 307, 119, 920, 776, 1021, 611, 662, 129, 354, 347, 915, 326, 838, 829, 459, 444, 863, 682, 131, 697, 681, 276, 732, 669, 704, 335, 1188, 922, 532, 331, 613, 345, 522, 190, 191, 436, 835, 306, 862, 311, 909, 401, 772, 476, 203, 193, 672, 346, 125, 787, 617, 763, 565, 918, 384, 367, 252, 674, 400, 427, 428, 434, 426, 601, 416, 327, 338, 134, 470, 798, 706, 330, 916, 908, 640, 521, 91] 124


In [13]:
nvd2CAPECs=[]

for cwe_nvd in NVD_CWEs:
#     print(cwe_nvd)
    found=False
    
    for i, row in enumerate(zip(df_CWE_org['CWE-ID'],df_CWE_org['Related Attack Patterns'])):
#         print(row)
        cwe_id=int(row[0])
        cwe_capecs=row[1]
        
        if cwe_nvd==cwe_id:
            found=True
            
#             print("found")
            
            cwes_num=[]
            if pd.isna(cwe_capecs):            
                None
            else:
                for cwe in cwe_capecs.split("::"):
                    if len(cwe)==0:
                        continue
                    cwes_num.append(int(cwe))
            nvd2CAPECs.append(cwes_num)            
            break        
    if found==False:
        nvd2CAPECs.append([])
    
#     break
    
pprint.pprint(nvd2CAPECs)
print(len(nvd2CAPECs))
                            

[[],
 [],
 [125,
  130,
  147,
  197,
  229,
  230,
  231,
  469,
  482,
  486,
  487,
  488,
  489,
  490,
  491,
  493,
  494,
  495,
  496,
  528],
 [],
 [102, 509, 555, 561, 60, 644, 645, 652, 94],
 [21, 22, 459, 461, 473, 476, 59, 60, 667, 94],
 [],
 [10, 100, 14, 24, 42, 44, 45, 46, 47, 67, 8, 9, 92],
 [37],
 [102, 117, 383, 477, 65],
 [26, 29],
 [111, 462, 467, 62],
 [586],
 [127, 668, 87],
 [],
 [],
 [184, 185, 186, 187, 533, 657, 662],
 [],
 [],
 [116,
  13,
  169,
  22,
  224,
  285,
  287,
  290,
  291,
  292,
  293,
  294,
  295,
  296,
  297,
  298,
  299,
  300,
  301,
  302,
  303,
  304,
  305,
  306,
  307,
  308,
  309,
  310,
  312,
  313,
  317,
  318,
  319,
  320,
  321,
  322,
  323,
  324,
  325,
  326,
  327,
  328,
  329,
  330,
  472,
  497,
  508,
  573,
  574,
  575,
  576,
  577,
  59,
  60,
  616,
  643,
  646,
  651,
  79],
 [219],
 [150, 639],
 [215, 463, 54, 7],
 [114, 115, 151, 194, 22, 57, 593, 633, 650, 94],
 [459, 475],
 [],
 [],
 [],
 [242, 35, 77

In [14]:
from ipynb.fs.full.NVD_CWE_hierarchy import get_nvd_hierarchy

In [15]:
child_parent, parent_child, depth=get_nvd_hierarchy()

levels={}
for key,val in depth.items():
    if val in levels:
        levels[val].append(key)
    else:
        levels[val]=[key]

print(len(levels[0]))
print(len(levels[1]))

36
88


In [16]:
count=0
no_siblings=0
for i,cwe in enumerate(NVD_CWEs):
    if cwe in levels[1]:
        print(cwe,nvd2CAPECs[i])
        if(len(nvd2CAPECs[i])==0):
            count+=1    
        
print(count)

843 []
824 []
770 [125, 130, 147, 197, 229, 230, 231, 469, 482, 486, 487, 488, 489, 490, 491, 493, 494, 495, 496, 528]
294 [102, 509, 555, 561, 60, 644, 645, 652, 94]
290 [21, 22, 459, 461, 473, 476, 59, 60, 667, 94]
639 []
120 [10, 100, 14, 24, 42, 44, 45, 46, 47, 67, 8, 9, 92]
312 [37]
319 [102, 117, 383, 477, 65]
352 [111, 462, 467, 62]
502 [586]
425 [127, 668, 87]
369 []
415 []
494 [184, 185, 186, 187, 533, 657, 662]
552 [150, 639]
209 [215, 463, 54, 7]
295 [459, 475]
273 []
94 [242, 35, 77]
924 []
178 []
22 [126, 64, 76, 78, 79]
59 [132, 17, 35, 76]
667 [25, 26, 27]
88 [137, 174, 41, 460, 88]
79 [209, 588, 591, 592, 63, 85]
78 [108, 15, 43, 6, 88]
89 [108, 109, 110, 470, 66, 7]
281 []
212 [168]
307 [16, 49, 560, 565, 600, 652, 653]
920 []
776 []
1021 [103, 181, 222, 504, 506, 654]
611 [221]
129 [100]
354 [145, 463, 75]
347 [463, 475]
915 []
838 [468]
829 [175, 201, 228, 251, 252, 253, 263, 549, 660]
459 []
444 [105, 33]
131 [100, 47]
681 []
276 [1, 127, 81]
335 []
1188 [665]
532 [

### Prepare avaiable links

#### Given Links

In [17]:
cwe2capec_dict={}
for i,cwe in enumerate(NVD_CWEs):
    cwe2capec_dict[cwe]=nvd2CAPECs[i]

In [18]:
cwe2capec_dict_original=cwe2capec_dict.copy()

### for testing purposes
cwes_for_test=[120, 79, 354, 611, 918]

##resetting those mappings
for i in cwes_for_test:
    cwe2capec_dict[i]=[]

In [19]:
U=[]
V=[]

for i,cwe in enumerate(NVD_CWEs):
        print(cwe,cwe2capec_dict[cwe])        
        if(len(cwe2capec_dict[cwe])>0):
            for capec in cwe2capec_dict[cwe]:
                U.append(cwe)
                V.append(capec)
                
print(U,len(U))
print(V,len(V))

843 []
824 []
770 [125, 130, 147, 197, 229, 230, 231, 469, 482, 486, 487, 488, 489, 490, 491, 493, 494, 495, 496, 528]
670 []
294 [102, 509, 555, 561, 60, 644, 645, 652, 94]
290 [21, 22, 459, 461, 473, 476, 59, 60, 667, 94]
639 []
120 []
312 [37]
319 [102, 117, 383, 477, 65]
362 [26, 29]
352 [111, 462, 467, 62]
502 [586]
425 [127, 668, 87]
369 []
415 []
494 [184, 185, 186, 187, 533, 657, 662]
834 []
668 []
200 [116, 13, 169, 22, 224, 285, 287, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 312, 313, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 472, 497, 508, 573, 574, 575, 576, 577, 59, 60, 616, 643, 646, 651, 79]
610 [219]
552 [150, 639]
209 [215, 463, 54, 7]
287 [114, 115, 151, 194, 22, 57, 593, 633, 650, 94]
295 [459, 475]
273 []
754 []
913 []
94 [242, 35, 77]
116 [104, 73, 81, 85]
924 []
178 []
755 []
665 [26, 29]
20 [10, 101, 104, 108, 109, 110, 120, 13, 135, 136, 14, 153, 182, 209, 22, 23, 230, 231

#### Parent links

In [20]:
pU=[]
pV=[]

for i,cwe in enumerate(NVD_CWEs):
    if (len(cwe2capec_dict[cwe])==0) and (cwe in levels[0]) and (cwe in parent_child):
        print(cwe, parent_child[cwe])
        capecs=[]
        for ch_cwe in parent_child[cwe]:
            capecs.extend(cwe2capec_dict[ch_cwe])
            
        print(cwe,capecs)
        print("-"*50)
        
        for capec in capecs:
            pU.append(cwe)
            pV.append(capec)

670 [617]
670 []
--------------------------------------------------
834 [835]
834 []
--------------------------------------------------
668 [134, 426, 427, 428, 552]
668 [135, 67, 38, 38, 471, 150, 639]
--------------------------------------------------
754 [252, 273, 476]
754 []
--------------------------------------------------
913 [470, 502, 915]
913 [586]
--------------------------------------------------
863 [639]
863 []
--------------------------------------------------
669 [212, 434, 494, 565, 829]
669 [168, 1, 184, 185, 186, 187, 533, 657, 662, 226, 31, 39, 175, 201, 228, 251, 252, 253, 263, 549, 660]
--------------------------------------------------
704 [681, 843]
704 []
--------------------------------------------------
672 [415, 416, 613]
672 []
--------------------------------------------------


In [21]:
print(len(U))
print(len(V))
print(len(pU))
print(len(pV))

489
489
29
29


In [22]:
nU=[]
nV=[]

for i, value in enumerate(cwe2capec_dict.items()):
    cwe, capec = value
    if len(capec)==0:
        continue
        
    for j, nvalue in enumerate(cwe2capec_dict.items()):        
        ncwe, ncapec=nvalue
        
        if cwe==ncwe or i>=j or len(ncapec)==0:
            continue
            
        if (cwe in levels[0]) and (ncwe in levels[1]) or (cwe in levels[1]) and (ncwe in levels[0]):
            continue
        
        ##if capec common between two sets
        lc1 = set(capec)
        incapec = lc1.intersection(ncapec)
        if(len(incapec)>0):
            continue            
        ###
        if len(ncapec)>0:
            for nc in ncapec:
                nU.append(cwe)
                nV.append(nc)
    

print(len(nU))
print(len(nV))

6039
6039


In [23]:
print(nU[:5])
print(nV[:5])

# for i, value in enumerate(cwe2capec_dict.items()):
#     print(i,value)

[770, 770, 770, 770, 770]
[102, 509, 555, 561, 60]


In [24]:
# ### prepare links
#positive links
POS_U=[]
POS_V=[]
POS_U.extend(U)
POS_U.extend(pU)
POS_V.extend(V)
POS_V.extend(pV)

#### CWE dataset

In [25]:
df_CWE_org['CWE-ID']
df_CWE_org['Name']
df_CWE_org['Description']
df_CWE_org['Extended Description']

CWE_index_map={}
CWE_description=[]
index=0

for i, row in enumerate(zip(df_CWE_org['CWE-ID'], df_CWE_org['Name'], df_CWE_org['Description'],df_CWE_org['Extended Description'])):
    cwe=row[0]
    if cwe in cwe2capec_dict:
        cwe_des=str(row[1])+'. '+str(row[2])+' '+str(row[3])
        CWE_description.append(cwe_des)
        CWE_index_map[cwe]=index
        index+=1

print(CWE_description[1])
print(CWE_index_map)

Improper Limitation of a Pathname to a Restricted Directory ('Path Traversal'). The software uses external input to construct a pathname that is intended to identify a file or directory that is located underneath a restricted parent directory, but the software does not properly neutralize special elements within the pathname that can cause the pathname to resolve to a location that is outside of the restricted directory. Many file operations are intended to take place within a restricted directory. By using special elements such as .. and / separators, attackers can escape outside of the restricted location to access files or directories that are elsewhere on the system. One of the most common special elements is the ../ sequence, which in most modern operating systems is interpreted as the parent directory of the current location. This is referred to as relative path traversal. Path traversal also covers the use of absolute pathnames such as /usr/local/bin, which may also be useful in

In [26]:
index_CWE_map={}

for key,value in CWE_index_map.items():
    index_CWE_map[value]=key

#### CAPEC descriptions

In [27]:
df_CAPEC_org['ID']
df_CAPEC_org['Name']
df_CAPEC_org['Description']

CAPEC_index_map={}
CAPEC_description=[]
index=0

for i, row in enumerate(zip(df_CAPEC_org['ID'], df_CAPEC_org['Name'], df_CAPEC_org['Description'])):
    capec=row[0]
    
    capec_des=str(row[1])+'. '+str(row[2])
    CAPEC_description.append(capec_des)
    CAPEC_index_map[capec]=index
    index+=1

print(CAPEC_description[0])
print(CAPEC_index_map)

Accessing Functionality Not Properly Constrained by ACLs. In applications, particularly web applications, access to functionality is mitigated by an authorization framework. This framework maps Access Control Lists (ACLs) to elements of the application's functionality; particularly URL's for web apps. In the case that the administrator failed to specify an ACL for a particular element, an attacker may be able to access it with impunity. An attacker with the ability to access functionality not properly constrained by ACLs can obtain sensitive information and possibly compromise the entire application. Such an attacker can access resources that must be available only to users at a higher privilege level, can access management sections of the application, or can run queries for data that they otherwise not supposed to.
{1: 0, 10: 1, 100: 2, 101: 3, 102: 4, 103: 5, 104: 6, 105: 7, 107: 8, 108: 9, 109: 10, 11: 11, 110: 12, 111: 13, 112: 14, 113: 15, 114: 16, 115: 17, 116: 18, 117: 19, 12: 2

In [28]:
index_CAPEC_map={}

for key,value in CAPEC_index_map.items():
    index_CAPEC_map[value]=key

### Learning the pairings

In [29]:
sentences=[]
sentences.extend(CWE_description)
sentences.extend(CAPEC_description)
sentences=np.array(sentences)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

#tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

tfidf = TfidfVectorizer(max_features=5000, sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

tfidf.fit(sentences)
all_inputs = tfidf.transform(sentences).toarray()
print(all_inputs.shape)

(670, 1740)


In [31]:
#CWE_inputs=all_inputs[]
len(CWE_index_map)

124

In [32]:
#negative links
iPOS_U=[];iPOS_V=[];iNEG_U=[];iNEG_V=[];POS=[];NEG=[];

for i in range(len(POS_U)):
    iPOS_U.append(CWE_index_map[POS_U[i]])
    iPOS_V.append(CAPEC_index_map[POS_V[i]])
    POS.append(1)
    
for i in range(len(nU)):
    iNEG_U.append(CWE_index_map[nU[i]])
    iNEG_V.append(CAPEC_index_map[nV[i]])
    NEG.append(0)
    
print(len(iPOS_U))
print(len(iPOS_V))

print(len(iNEG_U))
print(len(iNEG_V))

518
518
6039
6039


In [33]:
l = round(len(iNEG_U)/len(iPOS_U))
print(l)
iPOS_U=iPOS_U*l
iPOS_V=iPOS_V*l
POS=POS*l

12


In [34]:
print(len(iPOS_U))
print(len(iPOS_V))
print(len(POS))

print(len(iNEG_U))
print(len(iNEG_V))
print(len(NEG))

6216
6216
6216
6039
6039
6039


In [35]:
all_LINK_U=[]
all_LINK_V=[]
all_y=[]

all_LINK_U.extend(iPOS_U)
all_LINK_U.extend(iNEG_U)

all_LINK_V.extend(iPOS_V)
all_LINK_V.extend(iNEG_V)

all_y.extend(POS)
all_y.extend(NEG)

In [36]:
CWE_inputs=all_inputs[:len(CWE_index_map),:]
CAPEC_inputs=all_inputs[len(CWE_index_map):,:]
print(CWE_inputs.shape)
print(CAPEC_inputs.shape)

(124, 1740)
(546, 1740)


In [37]:
import numpy as np
from sklearn.model_selection import train_test_split

In [38]:
train_u, test_u, train_v, test_v, y_train, y_test = train_test_split(all_LINK_U, all_LINK_V, all_y, test_size=0.10)

In [39]:
print(len(train_u))
print(len(test_u))
print(len(train_v))
print(len(test_v))
print(len(y_train))
print(len(y_test))

11029
1226
11029
1226
11029
1226


In [40]:
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [41]:
if type(CWE_inputs)!=torch.Tensor:
    CWE_inputs=(torch.tensor(CWE_inputs)).to(torch.float)
    
if type(CAPEC_inputs)!=torch.Tensor:
    CAPEC_inputs=(torch.tensor(CAPEC_inputs)).to(torch.float)

if type(train_u)!=torch.Tensor:
    train_u=torch.tensor(train_u)
    train_v=torch.tensor(train_v)
    test_u=torch.tensor(test_u)
    test_v=torch.tensor(test_v)
    y_train=torch.tensor(y_train)
    y_test=torch.tensor(y_test)

In [42]:
CWE_inputs[train_u]

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [43]:
batch_size=32

train_data = TensorDataset(CWE_inputs[train_u],CAPEC_inputs[train_v],y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, num_workers=0)

test_data = TensorDataset(CWE_inputs[test_u],CAPEC_inputs[test_v],y_test)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size, num_workers=0)

### Model

In [44]:
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss, CosineEmbeddingLoss
import torch.nn.functional as F

In [45]:
class TFIDFLINK(nn.Module):
    def __init__(self,input_feature,output_feature):
        super(TFIDFLINK, self).__init__()
        #self.fc1 = nn.Linear(input_feature, 768)
        self.fc1 = nn.Linear(input_feature*2, 768)
        self.relu= nn.ReLU()
        self.fc2 = nn.Linear(768, output_feature)
        
        
        self.sigmoid=nn.Sigmoid()
        self.softmax=nn.Softmax(dim=1)
        
    def forward(self, x,y):
        #x = self.fc1(x*y) ##different operator to try
        #x = self.fc1(torch.abs(x-y))
        #x = self.fc1(torch.cat((x, y), 1))
        x = self.fc1(torch.cat((torch.abs(x-y),x*y), 1))        
        #x = self.fc1(torch.cat((x,y,x*y), 1))
        #x = self.fc1(torch.cat((x,y,torch.abs(x-y)), 1))
        #x = self.fc1(torch.cat((x,y,torch.abs(x-y),x*y), 1))
        x = self.relu(x)
        x = self.fc2(x)
        
        return x
    
# test=TFIDFLINK(4,2)
# test(torch.rand((10,4)),torch.rand((10,4)))

In [46]:
link_model=TFIDFLINK(CWE_inputs[0].shape[0], 2)

if(device.type=='cpu'):
    print("Using CPU")
else:
    if NUM_GPUS > 1:
        print("Using {} GPUS".format(NUM_GPUS))
        link_model = nn.DataParallel(link_model)

    print("Depolying model to ",device)
    link_model.cuda(device)

Using CPU


In [47]:
link_model

TFIDFLINK(
  (fc1): Linear(in_features=3480, out_features=768, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=768, out_features=2, bias=True)
  (sigmoid): Sigmoid()
  (softmax): Softmax(dim=1)
)

#### Train

In [90]:
from torch.utils.tensorboard import SummaryWriter

# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('./log/capec/')



In [91]:
writer.add_graph(link_model, (CWE_inputs[:2,:], CAPEC_inputs[:2,:]))
writer.close()

In [97]:
import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

#https://github.com/pytorch/pytorch/issues/30966

In [100]:
class_labels=[index_CWE_map[i] for i in range(len(CWE_inputs))]

writer.add_embedding(CWE_inputs,
                     metadata=class_labels,                     
                    #label_img=images.unsqueeze(1),
                     tag='CWE embedding'
                    )
writer.close()

In [101]:
capec_class_labels=[index_CAPEC_map[i] for i in range(len(CAPEC_inputs))]

writer.add_embedding(CAPEC_inputs,
                     metadata=capec_class_labels,                     
                    #label_img=images.unsqueeze(1),
                     tag='CAPEC embedding'
                    )
writer.close()

In [94]:
CWE_inputs.shape

torch.Size([124, 1740])

In [103]:
# helper function
def add_pr_curve_tensorboard(y_true, y_pred, global_step=0):
    '''
    Takes in a "class_index" from 0 to 9 and plots the corresponding
    precision-recall curve
    '''
    tensorboard_truth = y_true
    tensorboard_probs = y_pred

    writer.add_pr_curve('Link',
                        tensorboard_truth,
                        tensorboard_probs,
                        global_step=global_step)
    writer.close()


    #add_pr_curve_tensorboard(i, test_probs, test_label)


In [48]:
from sklearn.metrics import f1_score
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [106]:
def LINK_train_model(dataloader, epochs=1):
    
    #parameters=link_model.parameters()
    parameters=filter(lambda p: p.requires_grad, link_model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=1e-3)
    
    # Note: AdamW is a class from the huggingface library (as opposed to pytorch) I believe the 'W' stands for 'Weight Decay fix"
#     optimizer = AdamW(parameters,
#                       lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
#                       eps = 1e-8 # args.adam_epsilon - default is 1e-8.
#                      )    

    # Number of training epochs (authors recommend between 2 and 4. Total number of training steps is number of batches * number of epochs.
    total_steps = len(dataloader) * epochs
    num_warmup_steps=len(dataloader)*0

    # Create the learning rate scheduler.
    #scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps, num_training_steps = total_steps)
    
    CELoss = CrossEntropyLoss()
    
    train_losses = []# For each epoch...
    train_accs = []
    val_losses = []
    val_accs =[]
    best_val=None
    
    with torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/capec'),
        record_shapes=True,
        profile_memory=True,
        with_stack=True
) as prof:
        for epoch_i in range(0, epochs):

            y_true_all=[]
            y_pred_all=[]

            train_accuracy = 0
            nb_train_steps, nb_train_examples = 0, 0 # Evaluate data for one epoch


            print('\n======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            print('Training...')

            t0 = time.time() 
            total_loss = 0

            link_model.train() 
            for step, batch in enumerate(dataloader):
                if step % 40 == 0 and not step == 0:
                    elapsed = format_time(time.time() - t0)
                    print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(dataloader), elapsed)) 

                link_model.zero_grad()

                CWE_vec=batch[0].to(device)
                CAPEC_vec=batch[1].to(device)
                y_true=batch[2].to(device)

                logits=link_model(CWE_vec,CAPEC_vec)                        
                loss= CELoss(logits,y_true)

                y_pred=torch.argmax(logits, dim=1)

    #             print("True:", y_true)
    #             print("Pred:", y_pred)

                y_true_all.extend(y_true.detach().cpu().numpy())
                y_pred_all.extend(y_pred.detach().cpu().numpy())


                if NUM_GPUS > 1:
                        loss = loss.mean() 

                loss_value = loss.item()
                loss.backward() 

                # Clip the norm of the gradients to 1.0.
                # This is to help prevent the "exploding gradients" problem.
                torch.nn.utils.clip_grad_norm_(link_model.parameters(), 1.0) 
                optimizer.step() 
                #scheduler.step()

                total_loss+= loss_value
                nb_train_steps += 1 

                torch.cuda.empty_cache()

                prof.step()
                
                if step % 40 == 0 and not step == 0:
                    # ...log the running loss
                    writer.add_scalar('training loss',total_loss / nb_train_steps, epoch_i * len(dataloader) + step)                
                    

            avg_train_loss = total_loss / nb_train_steps
            train_losses.append(avg_train_loss) 

            train_acc = f1_score(y_true_all, y_pred_all, average='macro')
            train_accs.append(train_acc)

            print(" Average training loss: {0:.4f}".format(avg_train_loss))
            print(" Train F1-Score: {0:.4f}".format(train_acc))
            
            writer.add_scalar('Loss/train', avg_train_loss, epoch_i)
            writer.add_scalar('Accuracy/train', train_acc, epoch_i)
        
        
    print("Link Prediction Training complete!")
    print(train_accs)
#     print("Saving Last model")
#     torch.save(link_model.state_dict(), MODEL_SAVE_DIR+'_LINK_LAST2c')

#     print("Evaluate test model")
#     LINK_evaluate_model(test_dataloader)


LINK_train_model(train_dataloader, epochs=5)


Training...
 Batch    40 of   345. Elapsed: 0:00:06.
 Batch    80 of   345. Elapsed: 0:00:07.
 Batch   120 of   345. Elapsed: 0:00:08.
 Batch   160 of   345. Elapsed: 0:00:09.
 Batch   200 of   345. Elapsed: 0:00:10.
 Batch   240 of   345. Elapsed: 0:00:11.
 Batch   280 of   345. Elapsed: 0:00:12.
 Batch   320 of   345. Elapsed: 0:00:13.
 Average training loss: 0.0000
 Train F1-Score: 1.0000

Training...
 Batch    40 of   345. Elapsed: 0:00:01.
 Batch    80 of   345. Elapsed: 0:00:02.
 Batch   120 of   345. Elapsed: 0:00:03.
 Batch   160 of   345. Elapsed: 0:00:04.
 Batch   200 of   345. Elapsed: 0:00:04.
 Batch   240 of   345. Elapsed: 0:00:05.
 Batch   280 of   345. Elapsed: 0:00:06.
 Batch   320 of   345. Elapsed: 0:00:07.
 Average training loss: 0.0020
 Train F1-Score: 0.9995

Training...
 Batch    40 of   345. Elapsed: 0:00:01.
 Batch    80 of   345. Elapsed: 0:00:02.
 Batch   120 of   345. Elapsed: 0:00:03.
 Batch   160 of   345. Elapsed: 0:00:04.
 Batch   200 of   345. Elapsed:

In [105]:
def LINK_val_model(dataloader):
    
    y_true_all=[]
    y_pred_all=[]

    train_accuracy = 0
    nb_val_steps, nb_val_examples = 0, 0 # Evaluate data for one epoch

    print('Test...\n',"*"*50)

    t0 = time.time() 
    total_loss = 0
    
    CELoss = CrossEntropyLoss()

    link_model.eval() 
    
    for step, batch in enumerate(dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(dataloader), elapsed)) 

        CWE_vec=batch[0].to(device)
        CAPEC_vec=batch[1].to(device)
        y_true=batch[2].to(device)

        logits=link_model(CWE_vec,CAPEC_vec)                        
        loss= CELoss(logits,y_true)

        y_pred=torch.argmax(logits, dim=1)

#             print("True:", y_true)
#             print("Pred:", y_pred)

        y_true_all.extend(y_true.detach().cpu().numpy())
        y_pred_all.extend(y_pred.detach().cpu().numpy())


        if NUM_GPUS > 1:
                loss = loss.mean() 

        loss_value = loss.item()

        total_loss+= loss_value
        nb_val_steps += 1 
            

    avg_loss = total_loss / nb_val_steps
    val_acc = f1_score(y_true_all, y_pred_all, average='macro')

    print(" Average loss: {0:.4f}".format(avg_loss))
    print(" F1-Score: {0:.4f}".format(val_acc))

    add_pr_curve_tensorboard(np.array(y_true_all), np.array(y_pred_all))
    
#     print("Saving Last model")
#     torch.save(link_model.state_dict(), MODEL_SAVE_DIR+'_LINK_LAST2c')

#     print("Evaluate test model")
#     LINK_evaluate_model(test_dataloader)
    
LINK_val_model(test_dataloader)

Test...
 **************************************************
 Average loss: 0.0105
 F1-Score: 0.9992


### Predicting CAPECS

In [51]:
def getCAPECs(link_model, pCWE_ID, threshold=0.90, top_k=15):
    ipCWE_ID=CWE_index_map[pCWE_ID]
    
    CWE_vec= CWE_inputs[ipCWE_ID].repeat(CAPEC_inputs.shape[0],1).to(device)
    CAPEC_vec=CAPEC_inputs.to(device)
    
#     print(CWE_vec.shape)
#     print(CAPEC_vec.shape)
    
    link_model.eval()
    logits=link_model(CWE_vec,CAPEC_vec)
    
    logits=(F.softmax(logits,dim=1))[:,1]
    #print(logits)
    
    values, logits_index_k=torch.topk(logits, top_k)
    logits_index_k=logits_index_k.cpu().numpy()
    #print(logits_index_k)

    logits_index=(logits>=threshold).nonzero(as_tuple=True)
    logits_index=logits_index[0].cpu().numpy()
    
    #print(logits_index)
    
    common_indexs=np.intersect1d(logits_index, logits_index_k)
    
    #print(common_indexs)
    
    CAPEC_ids=[index_CAPEC_map[i] for i in common_indexs] 
    
    print(np.sort(CAPEC_ids))
    
    return CAPEC_ids


pCWE_ID=120

print("Predicted CAPECs for CWE_ID: ",pCWE_ID)
getCAPECs(link_model, pCWE_ID)

print(".....")
print("Available for CWE_ID: ", pCWE_ID)
print(list(np.sort(cwe2capec_dict[pCWE_ID])))

print("True CAPECs for CWE_ID: ", pCWE_ID)
print(list(np.sort(cwe2capec_dict_original[pCWE_ID])))

Predicted CAPECs for CWE_ID:  120
[  8  10  14  24  47 100 123 312 319 325 328 482 487 573 638]
.....
Available for CWE_ID:  120
[]
True CAPECs for CWE_ID:  120
[8, 9, 10, 14, 24, 42, 44, 45, 46, 47, 67, 92, 100]


In [52]:
for pCWE_ID in cwes_for_test:
    print("Predicted CAPECs for CWE_ID: ",pCWE_ID)
    getCAPECs(link_model, pCWE_ID)

    print("Available for CWE_ID: ", pCWE_ID)
    print(list(np.sort(cwe2capec_dict[pCWE_ID])))

    print("True CAPECs for CWE_ID: ", pCWE_ID)
    print(list(np.sort(cwe2capec_dict_original[pCWE_ID])))
    print("*"*100)

Predicted CAPECs for CWE_ID:  120
[  8  10  14  24  47 100 123 312 319 325 328 482 487 573 638]
Available for CWE_ID:  120
[]
True CAPECs for CWE_ID:  120
[8, 9, 10, 14, 24, 42, 44, 45, 46, 47, 67, 92, 100]
****************************************************************************************************
Predicted CAPECs for CWE_ID:  79
[169 224 285 312 319 320 321 325 328 482 537 573 574 580 634]
Available for CWE_ID:  79
[]
True CAPECs for CWE_ID:  79
[63, 85, 209, 588, 591, 592]
****************************************************************************************************
Predicted CAPECs for CWE_ID:  354
[ 22  94 224 285 290 312 320 325 328 472 482 537 573 634 638]
Available for CWE_ID:  354
[]
True CAPECs for CWE_ID:  354
[75, 145, 463]
****************************************************************************************************
Predicted CAPECs for CWE_ID:  611
[ 85 224 227 318 319 320 325 328 472 528 537 573 574 577 580]
Available for CWE_ID:  611
[]
True CAPECs f

In [53]:
cwe2capec_dict_original[610]

[219]

In [54]:
pprint.pprint(cwe2capec_dict_original)

{20: [10,
      101,
      104,
      108,
      109,
      110,
      120,
      13,
      135,
      136,
      14,
      153,
      182,
      209,
      22,
      23,
      230,
      231,
      24,
      250,
      261,
      267,
      28,
      3,
      31,
      42,
      43,
      45,
      46,
      47,
      473,
      52,
      53,
      588,
      63,
      64,
      664,
      67,
      7,
      71,
      72,
      73,
      78,
      79,
      8,
      80,
      81,
      83,
      85,
      88,
      9],
 22: [126, 64, 76, 78, 79],
 59: [132, 17, 35, 76],
 74: [10,
      101,
      108,
      120,
      13,
      135,
      14,
      24,
      250,
      267,
      273,
      28,
      3,
      34,
      42,
      43,
      45,
      46,
      47,
      51,
      52,
      53,
      6,
      64,
      67,
      7,
      71,
      72,
      76,
      78,
      79,
      8,
      80,
      83,
      84,
      9],
 78: [108, 15, 43, 6, 88],
 79: [209, 588, 591, 592, 63, 85

In [55]:
cwes_for_test

[120, 79, 354, 611, 918]