In [83]:
import xml.etree.ElementTree as et
import pandas as pd

# file 1000.xml can be downloaded from https://cwe.mitre.org/data/definitions/1000.html - "Downloads - XML"
tree = et.parse('1000.xml')
root = tree.getroot()

# shorthand for XML namespace.
# ElementTree spells a namespaced tag as '{uri}LocalName', so read the URI off
# the root element rather than pinning a single schema version; a download
# published under a different namespace URI then still resolves.  The original
# cwe-6 URI is kept as the fallback when the root tag carries no namespace.
# NOTE(review): assumes <Weaknesses> lives in the same namespace as the root
# element, which holds when the file declares one default xmlns -- confirm.
_root_ns = root.tag[1:].partition('}')[0] if root.tag.startswith('{') else ''
ns = {'d': _root_ns or 'http://cwe.mitre.org/cwe-6'}

# Container element holding every <Weakness>; fail here with a readable message
# instead of a later "'NoneType' object has no attribute 'findall'".
weaknesses = root.find('d:Weaknesses', ns)
if weaknesses is None:
    raise ValueError(
        f"no <Weaknesses> element found under root {root.tag!r} "
        f"using namespace {ns['d']!r}; is 1000.xml the CWE-1000 XML download?"
    )
# Column layouts of the two frames built below:
#   weak -- one row per <Weakness>: its ID and Name attributes
#   rel  -- one row per weakness that has a primary parent in view 1000:
#           child_ID is the weakness itself, ID is the parent's CWE_ID
columns = ['ID', 'Name']
relcolumns = ['child_ID', 'ID']

# Rows are collected in plain lists and turned into DataFrames once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, which made the original loop quadratic.
weak_records = []
rel_records = []

for w in weaknesses.findall('d:Weakness', ns):
    _id = w.attrib.get('ID')
    _name = w.attrib.get('Name')
    weak_records.append([_id, _name])

    # A weakness may have no <Related_Weaknesses> element, or none that is a
    # primary ChildOf link in view 1000; such entries simply contribute no
    # relation row.  Explicit None checks replace the former bare `except:`,
    # which also hid unrelated errors.
    related = w.find('d:Related_Weaknesses', ns)
    r = None if related is None else related.find(
        'd:Related_Weakness[@Nature="ChildOf"][@View_ID="1000"][@Ordinal="Primary"]', ns)
    if r is None:
        continue

    _child_ID = _id
    _parent_ID = r.attrib.get('CWE_ID')
    rel_records.append([_child_ID, _parent_ID])

weak = pd.DataFrame(weak_records, columns=columns)
rel = pd.DataFrame(rel_records, columns=relcolumns)
    

# Inner-join on the shared "ID" column: in `rel` that column holds the parent's
# CWE ID, so every relation row picks up the parent's record from `weak`.
df = pd.merge(weak, rel, how="inner", on="ID")

# Both ID columns were read from XML attributes; convert them to numbers.
# Both conversions are evaluated before either column is overwritten.
_parent_ids = pd.to_numeric(df["ID"])
_child_ids = pd.to_numeric(df["child_ID"])
df["ID"] = _parent_ids
df["child_ID"] = _child_ids




### Look up multilayer relationships

# Root Research Concepts, given by CWE ID; each one serves as a cluster, and
# walking up the parent chain stops as soon as one of these IDs is reached.
researchconcepts = [
    284, 435, 664, 682, 691,
    693, 697, 703, 707, 710,
]

def get_parents(row):
    """Return the research-concept cluster that the weakness in ``row`` rolls up to.

    Starting at ``row.ID``, the primary ``ChildOf`` link in view 1000 is
    followed from weakness to weakness until an ID listed in the module-level
    ``researchconcepts`` is reached.  Lookups go through the module-level
    ``weaknesses`` element and ``ns`` namespace map.

    Parameters
    ----------
    row : object exposing an ``ID`` attribute
        Typically one row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    ``row.ID`` unchanged when it already is a research concept, otherwise the
    ``int`` CWE ID of the research concept found by walking upwards.

    Raises
    ------
    ValueError
        If an ID on the chain has no ``<Weakness>`` entry, has no primary
        ``ChildOf`` parent in view 1000, or the chain loops back on itself.
    """
    primary_parent_path = (
        'd:Related_Weakness[@Nature="ChildOf"][@View_ID="1000"][@Ordinal="Primary"]'
    )

    parent = row.ID
    visited = set()  # IDs already walked through; guards against cyclic data
    while parent not in researchconcepts:
        if parent in visited:
            raise ValueError(
                f"cyclic ChildOf chain at CWE-{parent} while resolving CWE-{row.ID}")
        visited.add(parent)

        w = weaknesses.find(f'd:Weakness[@ID="{parent}"]', ns)
        if w is None:
            raise ValueError(
                f"CWE-{parent} (reached from CWE-{row.ID}) has no <Weakness> entry")

        related = w.find('d:Related_Weaknesses', ns)
        link = None if related is None else related.find(primary_parent_path, ns)
        if link is None:
            raise ValueError(
                f"CWE-{parent} (reached from CWE-{row.ID}) has no primary ChildOf "
                f"parent in view 1000 and is not a research concept")

        parent = int(link.attrib.get('CWE_ID'))

    return parent


# Overwrite each row's ID with the research concept that get_parents resolves
# for it (the function is called once per row).
df['ID'] = df.apply(get_parents, axis=1)

# Keep only the (research concept, member weakness) pair on a fresh 0..n-1 index.
df = df[['ID', 'child_ID']].reset_index(drop=True)
df

Unnamed: 0,ID,child_ID
0,697,187
1,697,478
2,697,839
3,697,486
4,697,595
...,...,...
859,707,90
860,707,97
861,707,641
862,707,694


In [84]:
# Persist the research-concept / member table to HDF5.  `key` is passed by
# keyword: pandas 2.x deprecates positional arguments after the path for
# to_hdf, and the keyword form is accepted by older versions as well.
df.to_hdf('cwe_by_research_concepts_relations.h5', key='cwe_by_research_concepts_relations')