In [34]:
from lxml import etree
import xml.etree.ElementTree as ET
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim.utils import simple_preprocess
import gensim.similarities 
from gensim.models.word2vec import Word2Vec
import igraph as ig
import networkx as nx
import pandas as pd
import numpy as np
from numpy.linalg import norm
from scipy import spatial
from scipy import sparse
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
import csv
from numpy import asarray
from numpy import savetxt
import matplotlib.pyplot as plt

In [76]:
def read_files(files):
    """Parse a sequence of yearly XML files into one combined entry dictionary.

    Each file is parsed with ``parse_data`` and converted with
    ``store_entries``. Files are numbered 1, 2, ... by their position in
    ``files``, and entry indices continue from one file to the next.

    Parameters
    ----------
    files : sequence of paths, in the order the years should be numbered.

    Returns
    -------
    list
        ``[all_entries, untokenized_text]``: the merged entry dictionary and
        the raw description strings of every file, in reading order. With no
        files the dictionary holds only the blank placeholder under key 0.
    """
    merged = {0: {'CVE': "", 'Date': "",'Description': [], 'Description-Tokenized' : [],
                  'Month-String':"",'Month-Int': 0,"Year":"","Year-Int":0,"Cluster": 0,
                  "Time Start": 0, "Time End": 0}}
    descriptions = []
    offset = 0

    for year_num, path in enumerate(files, start=1):
        vulns = parse_data(path)
        descriptions.extend(vulns[1])
        year_entries = store_entries(vulns, offset, year_num, len(files))
        # The next file's entries start right after the ones just produced.
        offset += len(year_entries)
        merged.update(year_entries)

    return [merged, descriptions]
        

def is_member(value, iterable):
    """Return True when ``value`` is identical or equal to any item of ``iterable``."""
    return any(value is candidate or value == candidate for candidate in iterable)

def parse_data(file):
    """Extract CVE ids, descriptions and publication dates from one XML file.

    Children of the document root from index 5 onward are treated as
    vulnerability records (presumably the first five children are
    document-level header elements -- TODO confirm against the file layout).
    For each record the description is read from ``record[1][0]``, the
    publication date from ``record[1][1]`` and the CVE id from ``record[2]``.

    A record is skipped when its tokenized description contains the token
    "REJECT", or when one of the three fields is missing. All three fields
    are read before anything is appended, so the returned lists always stay
    aligned index-for-index.

    Parameters
    ----------
    file : path or file object accepted by ``ET.parse``.

    Returns
    -------
    list
        ``[cve, notes, pub]`` -- three parallel lists of strings.
    """
    root = ET.parse(file).getroot()
    cve = []
    notes = []
    pub = []
    for i in range(5, len(root)):
        record = root[i]
        try:
            description = record[1][0].text
            published = record[1][1].text
            cve_id = record[2].text
            tokens = word_tokenize(description)
        except (IndexError, TypeError):
            # IndexError: the record lacks an expected child element.
            # TypeError: the description element has no text to tokenize.
            # Anything else (e.g. missing tokenizer data) is a real failure
            # and is allowed to propagate instead of yielding empty results.
            continue
        if "REJECT" in tokens:
            continue
        pub.append(published)
        cve.append(cve_id)
        notes.append(description)

    return [cve, notes, pub]
            
def tokenize(text):
    """Tokenize every document in ``text`` and drop stop words.

    ``text`` is indexed positionally from 0 to ``len(text) - 1``. Tokens keep
    their original case. Returns one filtered token list per document.
    """
    return [remove_stopwords(word_tokenize(text[position]))
            for position in range(len(text))]

def tokenize_single(sentence):
    """Tokenize one sentence, drop stop words, then lower-case what is left.

    Stop words are removed before lower-casing, so capitalised stop words
    survive this step.
    """
    return [token.lower() for token in remove_stopwords(word_tokenize(sentence))]

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not stop words.

    The stop-word set is the NLTK English list plus punctuation and
    domain-specific words listed below. Matching is case-sensitive (tokens
    are compared as given, with no lower-casing), so callers that want
    case-insensitive filtering must lower-case the tokens first.

    Parameters
    ----------
    text : iterable of str tokens.

    Returns
    -------
    list
        The surviving tokens, in their original order.
    """
    # A set gives constant-time membership tests; the list used previously
    # was scanned once per token.
    stop_words = set(stopwords.words('english'))
    stop_words.update((
        ')', '(', '.', '', ',', '``', ':', '1', '2',
        'via', 'attackers', 'attacker', 'vulnerability', 'arbitrary', 'versions',
    ))

    return [w for w in text if w not in stop_words]

def read_corpus(text, tokens_only=False):
    """Lazily preprocess each document of ``text`` with gensim.

    Yields the plain token list when ``tokens_only`` is true; otherwise yields
    a ``TaggedDocument`` whose single tag is the document's 1-based position.
    """
    for tag, doc in enumerate(text, start=1):
        tokens = gensim.utils.simple_preprocess(doc)
        yield tokens if tokens_only else gensim.models.doc2vec.TaggedDocument(tokens, [tag])

            
# build and train a doc2vec model on all vulnerability text          
            
def build_doc2vec(untokenized_text):
    """Build and train a Doc2Vec model on the raw vulnerability descriptions.

    The documents are tagged by ``read_corpus`` and used to train a model
    with 100-dimensional vectors, a minimum word count of 2 and 20 epochs.

    Parameters
    ----------
    untokenized_text : iterable of str, one raw description per document.

    Returns
    -------
    The trained ``gensim.models.doc2vec.Doc2Vec`` model.
    """
    # The corpus is materialised because it is traversed twice
    # (vocabulary building, then training).
    train_corpus = list(read_corpus(untokenized_text))
    model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model


# converts the vulnerability text to vectors. text is a list of tokenized sentences for the entire data set
# model is a doc2vec model 
 
def compute_vectors(text,model):
    """Infer one vector per tokenized sentence using ``model.infer_vector``.

    Returns the vectors as a list in the same order as ``text``.
    """
    return [model.infer_vector(tokens) for tokens in text]
        
    
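# Store the information of each vulnerability entry in a dictionary keyed by entry index. Each entry includes the
# CVE ID, publication date, vulnerability description, tokenized description, month as a string, month as an
# int (1-12), year, year number within the period, and the start and end of its time interval.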
    
def store_entries(vulns,start_index,year_num,total_years):
    """Build the entry dictionary for one file's vulnerabilities.

    Parameters
    ----------
    vulns : list
        ``[cve_ids, descriptions, dates]`` -- three parallel lists, the
        format returned by ``parse_data``.
    start_index : int
        Key given to the first entry; later entries get consecutive keys.
    year_num : int
        1-based number of this file's year within the period being read.
    total_years : int
        Number of years in the period.

    Returns
    -------
    dict
        ``{index: entry}`` where each entry holds 'CVE', 'Date',
        'Description', 'Description-Tokenized', 'Month-String', 'Month-Int',
        'Year', 'Year-Int', 'Time Start' and 'Time End'. An empty ``vulns``
        yields an empty dict, so no blank placeholder entry is produced and
        the caller's running index does not drift.
    """
    month_names = ("January", "February", "March", "April", "May", "June",
                   "July", "August", "September", "October", "November", "December")
    # Two-digit month codes "01".."11"; every other code falls through to
    # December, matching the long-standing fallback of this function.
    # NOTE(review): a malformed month is therefore labelled December -- confirm
    # that this is intended.
    month_by_code = {"%02d" % number: number for number in range(1, 12)}

    entries = {}
    for j, cve_id in enumerate(vulns[0]):
        date = vulns[2][j]
        description = vulns[1][j]
        # tokenize_single filters stop words before lower-casing; filtering
        # again here catches stop words that were capitalised in the source.
        desc_tokenized = remove_stopwords(tokenize_single(description))
        year = date[0:4]

        # Index rather than slice, so a truncated date raises IndexError
        # instead of silently becoming December.
        date_text = str(date)
        month_int = month_by_code.get(date_text[5] + date_text[6], 12)

        entries[start_index + j] = {
            'CVE': cve_id,
            'Date': date,
            'Description': description,
            'Description-Tokenized': desc_tokenized,
            'Month-String': month_names[month_int - 1],
            'Month-Int': month_int,
            "Year": year,
            "Year-Int": year_num,
            # NOTE(review): this makes later months start earlier within a
            # year; kept as is because downstream results were produced
            # with this formula -- confirm the intended direction.
            "Time Start": (year_num * 12) - month_int,
            "Time End": total_years * 12,
        }

    return entries

# 1999-2004

In [36]:
# Yearly CVE XML files for 1999-2004. read_files numbers the years by list
# position, so the files must stay in chronological order.
files_1999_2004 = ['/CVEDatasSets/allitems-cvrf-year-1999.xml',
        '/CVEDatasSets/allitems-cvrf-year-2000.xml',
        '/CVEDatasSets/allitems-cvrf-year-2001.xml',
        '/CVEDatasSets/allitems-cvrf-year-2002.xml',
        '/CVEDatasSets/allitems-cvrf-year-2003.xml',
        '/CVEDatasSets/allitems-cvrf-year-2004.xml']
# Per-node community table for this period (file name suggests Louvain output -- TODO confirm).
node_data = pd.read_csv('/Community Results/1999-2004Louvain.csv')

In [37]:
# Parse every file of the period; element 0 of the result is the entry dictionary.
file_info = read_files(files_1999_2004)
entries = file_info[0]

In [38]:
# Show the community table for 1999-2004 (pandas renders a truncated head/tail view).
node_data

Unnamed: 0,Id,Label,v__nx_name,v_start_time,v_end_time,modularity_class,Color
0,n0,n0,0,10,72,86,#000000
1,n1,n1,1,3,72,1,#000000
2,n2,n2,2,3,72,80,#000000
3,n3,n3,3,10,72,1,#000000
4,n4,n4,4,3,72,26,#000000
...,...,...,...,...,...,...,...
10811,n10811,n10811,10811,60,72,99,#000000
10812,n10812,n10812,10812,60,72,86,#000000
10813,n10813,n10813,10813,64,72,82,#000000
10814,n10814,n10814,10814,66,72,28,#000000


In [39]:
# Order the nodes by their community id.
class_sorted = node_data.sort_values(by='modularity_class')

In [40]:
# Distinct community ids, in the order they appear in the sorted table.
classes = class_sorted['modularity_class'].unique()

In [41]:
from nltk.probability import FreqDist

In [42]:
# Print the ten most frequent description tokens of selected communities.
# Each value is a position in the `classes` array, not necessarily the
# community id itself.
top_1999=[101,59,61,69,14]
for value in top_1999:
    cur_corpus = []
    print("_____________________________________________________________________________________________________")
    cur_class = classes[value]
    print(cur_class)
    cur_nodes = class_sorted.loc[class_sorted['modularity_class'] == cur_class]

    # Iterate the id column directly; `node_id` avoids rebinding the builtin
    # `id` for the rest of the kernel session.
    for node_id in cur_nodes['v__nx_name']:
        sent = entries[node_id]['Description-Tokenized']
        cur_corpus.extend(sent)
    fdist = FreqDist(cur_corpus)
    top_ten = fdist.most_common(10)
    print(top_ten)
    print("______________________________________________________________________________________________________")

_____________________________________________________________________________________________________
101
[('sql', 252), ('remote', 180), ('injection', 160), ('parameter', 155), ('allows', 150), ('execute', 114), ('commands', 78), ('earlier', 37), ('gain', 29), ('code', 28)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
59
[('allows', 135), ('function', 126), ('buffer', 122), ('remote', 120), ('execute', 116), ('overflow', 107), ('code', 105), ('file', 64), ('long', 51), ('service', 48)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
61
[('allows', 118), ('used', 118), ('remote', 92), ('execute', 51), ('users', 48), ('local', 47), ('code', 46), ('cause', 37), ('earlier', 36)

# 2005-2007

In [43]:
# Yearly CVE XML files for 2005-2007. read_files numbers the years by list
# position, so the files must stay in chronological order.
files_2005 = [ '/CVEDatasSets/allitems-cvrf-year-2005.xml',
        '/CVEDatasSets/allitems-cvrf-year-2006.xml',
        '/CVEDatasSets/allitems-cvrf-year-2007.xml']
# Per-node community table for this period (file name suggests Louvain output -- TODO confirm).
node_data = pd.read_csv('/Community Results/2005-2007Louvain.csv')

# Parse every file of the period; element 0 of the result is the entry dictionary.
file_info = read_files(files_2005)
entries = file_info[0]

In [44]:
# Show the community table for 2005-2007 (pandas renders a truncated head/tail view).
node_data

Unnamed: 0,Id,Label,v__nx_name,v_start_time,v_end_time,modularity_class,Color
0,n0,n0,0,11,36,18,#000000
1,n1,n1,1,11,36,70,#000000
2,n2,n2,2,11,36,133,#000000
3,n3,n3,3,11,36,5,#000000
4,n4,n4,4,11,36,17,#000000
...,...,...,...,...,...,...,...
18067,n18067,n18067,18067,32,36,31,#000000
18068,n18068,n18068,18068,32,36,31,#000000
18069,n18069,n18069,18069,32,36,50,#000000
18070,n18070,n18070,18070,29,36,138,#000000


In [45]:
# Order the nodes by community id, then collect the distinct ids in that order.
class_sorted = node_data.sort_values(by='modularity_class')
classes = class_sorted['modularity_class'].unique()

In [46]:
# Print the ten most frequent description tokens of selected communities.
# Each value is a position in the `classes` array, not necessarily the
# community id itself.
top_2005=[11,109,132,2,121]
for value in top_2005:
    cur_corpus = []
    print("_____________________________________________________________________________________________________")
    cur_class = classes[value]
    print(cur_class)
    cur_nodes = class_sorted.loc[class_sorted['modularity_class'] == cur_class]

    # Iterate the id column directly; `node_id` avoids rebinding the builtin
    # `id` for the rest of the kernel session.
    for node_id in cur_nodes['v__nx_name']:
        sent = entries[node_id]['Description-Tokenized']
        cur_corpus.extend(sent)
    fdist = FreqDist(cur_corpus)
    top_ten = fdist.most_common(10)
    print(top_ten)
    print("______________________________________________________________________________________________________")

_____________________________________________________________________________________________________
11
[('properly', 205), ('allows', 183), ('remote', 120), ('users', 115), ('service', 82), ('local', 82), ('cause', 65), ('certain', 61), ('denial', 60), ('privileges', 53)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
109
[('remote', 403), ('php', 393), ('parameter', 213), ('code', 202), ('file', 199), ('execute', 198), ('inclusion', 194), ('url', 178), ('earlier', 165), ('allows', 162)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
132
[('information', 300), ('sql', 231), ('remote', 228), ('note', 199), (':', 198), ('third', 185), ('details', 184), ('party', 182), ('obtai

# 2008-2010

In [47]:
# Yearly CVE XML files for 2008-2010. read_files numbers the years by list
# position, so the files must stay in chronological order.
files_2008 = [ '/CVEDatasSets/allitems-cvrf-year-2008.xml',
        '/CVEDatasSets/allitems-cvrf-year-2009.xml',
        '/CVEDatasSets/allitems-cvrf-year-2010.xml']
# Per-node community table for this period (file name suggests Louvain output -- TODO confirm).
node_data = pd.read_csv('/Community Results/2008-2010Louvain.csv')

# Parse every file of the period; element 0 of the result is the entry dictionary.
file_info = read_files(files_2008)
entries = file_info[0]

In [48]:
# Show the community table for 2008-2010 (pandas renders a truncated head/tail view).
node_data

Unnamed: 0,Id,Label,v__nx_name,v_start_time,v_end_time,modularity_class,Color
0,n0,n0,0,11,36,138,#000000
1,n1,n1,1,10,36,82,#000000
2,n2,n2,2,11,36,5,#000000
3,n3,n3,3,11,36,41,#000000
4,n4,n4,4,11,36,64,#000000
...,...,...,...,...,...,...,...
16945,n16945,n16945,16945,26,36,75,#000000
16946,n16946,n16946,16946,26,36,75,#000000
16947,n16947,n16947,16947,26,36,75,#000000
16948,n16948,n16948,16948,26,36,75,#000000


In [49]:
# Order the nodes by community id, then collect the distinct ids in that order.
class_sorted = node_data.sort_values(by='modularity_class')
classes = class_sorted['modularity_class'].unique()

In [50]:
# Print the ten most frequent description tokens of selected communities.
# Each value is a position in the `classes` array, not necessarily the
# community id itself.
top_2008=[131,97,93,92,25]
for value in top_2008:
    cur_corpus = []
    print("_____________________________________________________________________________________________________")
    cur_class = classes[value]
    print(cur_class)
    cur_nodes = class_sorted.loc[class_sorted['modularity_class'] == cur_class]

    # Iterate the id column directly; `node_id` avoids rebinding the builtin
    # `id` for the rest of the kernel session.
    for node_id in cur_nodes['v__nx_name']:
        sent = entries[node_id]['Description-Tokenized']
        cur_corpus.extend(sent)
    fdist = FreqDist(cur_corpus)
    top_ten = fdist.most_common(10)
    print(top_ten)
    print("______________________________________________________________________________________________________")

_____________________________________________________________________________________________________
131
[('file', 258), ('remote', 146), ('allows', 144), ('code', 104), ('execute', 101), ('buffer', 77), ('service', 67), ('denial', 65), ('crafted', 65), ('cause', 63)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
97
[('dot', 207), ('directory', 190), ('allows', 186), ('traversal', 180), ('remote', 176), ('files', 148), ('parameter', 120), ('..', 105), ('read', 96), ('local', 91)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
93
[('function', 214), ('allows', 143), ('remote', 127), ('execute', 92), ('code', 86), ('cause', 77), ('service', 76), ('denial', 75), ('related', 56

# 2011-2014

In [51]:
# Yearly CVE XML files for 2011-2014. read_files numbers the years by list
# position, so the files must stay in chronological order.
files_2011 = [ '/CVEDatasSets/allitems-cvrf-year-2011.xml',
        '/CVEDatasSets/allitems-cvrf-year-2012.xml',
        '/CVEDatasSets/allitems-cvrf-year-2013.xml',
            '/CVEDatasSets/allitems-cvrf-year-2014.xml' ]
# Per-node community table for this period (file name suggests Louvain output -- TODO confirm).
node_data = pd.read_csv('/Community Results/2011-2014Louvain.csv')

# Parse every file of the period; element 0 of the result is the entry dictionary.
file_info = read_files(files_2011)
entries = file_info[0]

In [52]:
# Show the community table for 2011-2014. This file has a numeric `Id` column
# and no `v__nx_name` column, unlike the earlier periods.
node_data

Unnamed: 0,Id,Label,d0,modularity_class,Color
0,0,0,1,86,#000000
1,1,1,2,116,#000000
2,2,2,2,13,#000000
3,3,3,3,109,#000000
4,4,4,2,81,#000000
...,...,...,...,...,...
24608,24608,24608,3,24,#000000
24609,24609,24609,2,125,#000000
24610,24610,24610,3,24,#000000
24611,24611,24611,1,36,#000000


In [53]:
# Order the nodes by community id, then collect the distinct ids in that order.
class_sorted = node_data.sort_values(by='modularity_class')
classes = class_sorted['modularity_class'].unique()

In [56]:
# Print the ten most frequent description tokens of selected communities.
# Each value is a position in the `classes` array, not necessarily the
# community id itself.
top_2011=[21,166,5,124,0]
for value in top_2011:
    cur_corpus = []
    print("_____________________________________________________________________________________________________")
    cur_class = classes[value]
    print(cur_class)
    cur_nodes = class_sorted.loc[class_sorted['modularity_class'] == cur_class]

    # Iterate the id column directly; `node_id` avoids rebinding the builtin
    # `id` for the rest of the kernel session.
    for node_id in cur_nodes['Id']:
        try:
            sent = entries[node_id]['Description-Tokenized']
        except KeyError:
            # Node without a parsed CVE entry: skip it. Only the missing-key
            # case is ignored, so any other error still surfaces.
            continue
        cur_corpus.extend(sent)
    fdist = FreqDist(cur_corpus)
    top_ten = fdist.most_common(10)
    print(top_ten)
    print("______________________________________________________________________________________________________")

_____________________________________________________________________________________________________
21
[('allows', 231), ('remote', 195), ('oracle', 156), ('unspecified', 149), ('vectors', 135), ('related', 123), ('affect', 114), ('users', 111), ('unknown', 92), ('component', 71)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
166
[('servers', 432), ('allows', 251), ('crafted', 233), ('obtain', 221), ('information', 221), ('sensitive', 220), ('aka', 219), ('application', 218), ('man-in-the-middle', 218), ('certificate', 217)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
5
[('allows', 211), ('remote', 166), ('vectors', 150), ('unspecified', 148), ('adobe', 104), ('execute'

# 2015-2016

In [57]:
# Yearly CVE XML files for 2015-2016. read_files numbers the years by list
# position, so the files must stay in chronological order.
files_2015 = [ '/CVEDatasSets/allitems-cvrf-year-2015.xml',
        '/CVEDatasSets/allitems-cvrf-year-2016.xml']
# Per-node community table for this period (file name suggests Louvain output -- TODO confirm).
node_data = pd.read_csv('/Community Results/2015-2016Louvain.csv')

# Parse every file of the period; element 0 of the result is the entry dictionary.
file_info = read_files(files_2015)
entries = file_info[0]

In [58]:
# Show the community table for 2015-2016 (pandas renders a truncated head/tail view).
node_data

Unnamed: 0,Id,Label,v__nx_name,v_start_time,v_end_time,modularity_class,Color
0,n0,n0,0,11,24,113,#000000
1,n1,n1,1,11,24,68,#000000
2,n2,n2,2,10,24,36,#000000
3,n3,n3,3,11,24,52,#000000
4,n4,n4,4,9,24,133,#000000
...,...,...,...,...,...,...,...
17252,n17252,n17252,17252,21,24,56,#000000
17253,n17253,n17253,17253,21,24,56,#000000
17254,n17254,n17254,17254,21,24,56,#000000
17255,n17255,n17255,17255,12,24,49,#000000


In [59]:
# Order the nodes by community id, then collect the distinct ids in that order.
class_sorted = node_data.sort_values(by='modularity_class')
classes = class_sorted['modularity_class'].unique()

In [61]:
# Print the ten most frequent description tokens of selected communities.
# Each value is a position in the `classes` array, not necessarily the
# community id itself.
top_2015=[52,132,94,99,111]
for value in top_2015:
    cur_corpus = []
    print("_____________________________________________________________________________________________________")
    cur_class = classes[value]
    print(cur_class)
    cur_nodes = class_sorted.loc[class_sorted['modularity_class'] == cur_class]

    # Iterate the id column directly; `node_id` avoids rebinding the builtin
    # `id` for the rest of the kernel session.
    for node_id in cur_nodes['v__nx_name']:
        try:
            sent = entries[node_id]['Description-Tokenized']
        except KeyError:
            # Node without a parsed CVE entry: skip it. Only the missing-key
            # case is ignored, so any other error still surfaces.
            continue
        cur_corpus.extend(sent)
    fdist = FreqDist(cur_corpus)
    top_ten = fdist.most_common(10)
    print(top_ten)
    print("______________________________________________________________________________________________________")

_____________________________________________________________________________________________________
52
[('aka', 241), ('allows', 208), ('windows', 201), ('remote', 195), ('crafted', 109), ('cisco', 108), ('bug', 99), ('id', 94), ('service', 82), ('users', 81)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
132
[('affected', 292), ('issue', 255), ('discovered', 142), ('allows', 141), ('crafted', 111), ('cause', 106), ("''", 103), ('certain', 102), ('service', 102), ('component', 100)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
94
[('aka', 198), ('cisco', 185), ('devices', 182), ('bug', 173), ('remote', 172), ('allows', 163), ('id', 155), ('service', 102), ('crafted', 94)

# 2017

In [62]:
# CVE XML file for 2017 (a single-year period).
files_2017 = [ '/CVEDatasSets/allitems-cvrf-year-2017.xml']
# Per-node community table for this period (file name suggests Louvain output -- TODO confirm).
node_data = pd.read_csv('/Community Results/2017Louvain.csv')

# Parse the file; element 0 of the result is the entry dictionary.
file_info = read_files(files_2017)
entries = file_info[0]

In [63]:
# Show the community table for 2017 (pandas renders a truncated head/tail view).
node_data

Unnamed: 0,Id,Label,v__nx_name,v_start_time,v_end_time,modularity_class,Color
0,n0,n0,0,9,12,13,#000000
1,n1,n1,1,11,12,61,#000000
2,n2,n2,2,11,12,106,#000000
3,n3,n3,3,11,12,100,#000000
4,n4,n4,4,9,12,13,#000000
...,...,...,...,...,...,...,...
14652,n14652,n14652,14652,6,12,26,#000000
14653,n14653,n14653,14653,6,12,130,#000000
14654,n14654,n14654,14654,6,12,25,#000000
14655,n14655,n14655,14655,6,12,130,#000000


In [64]:
# Order the nodes by community id, then collect the distinct ids in that order.
class_sorted = node_data.sort_values(by='modularity_class')
classes = class_sorted['modularity_class'].unique()

In [65]:
# Print the ten most frequent description tokens of selected communities.
# Each value is a position in the `classes` array, not necessarily the
# community id itself.
top_2017=[24,10,28,102,49]
for value in top_2017:
    cur_corpus = []
    print("_____________________________________________________________________________________________________")
    cur_class = classes[value]
    print(cur_class)
    cur_nodes = class_sorted.loc[class_sorted['modularity_class'] == cur_class]

    # Iterate the id column directly; `node_id` avoids rebinding the builtin
    # `id` for the rest of the kernel session.
    for node_id in cur_nodes['v__nx_name']:
        try:
            sent = entries[node_id]['Description-Tokenized']
        except KeyError:
            # Node without a parsed CVE entry: skip it. Only the missing-key
            # case is ignored, so any other error still surfaces.
            continue
        cur_corpus.extend(sent)
    fdist = FreqDist(cur_corpus)
    top_ten = fdist.most_common(10)
    print(top_ten)
    print("______________________________________________________________________________________________________")

_____________________________________________________________________________________________________
24
[(':', 80), ('allows', 62), ('ibm', 57), ('code', 53), ('remote', 52), ('attacker', 47), ('file', 41), ('could', 39), ('cause', 38), ('service', 38)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
10
[('android', 342), ('msm', 211), ('kernel', 142), ('using', 130), ('linux', 125), ('releases', 123), ('caf', 123), ('os', 107), ('firefox', 105), ('qrd', 105)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
28
[('allows', 79), ('remote', 68), ('attacker', 66), ('could', 65), (':', 58), ('crafted', 53), ('service', 48), ('user', 46), ('denial', 45), ('nexus', 45)]
_____________

# 2018

In [66]:
# CVE XML file for 2018 (a single-year period).
files_2018 = [ '/CVEDatasSets/allitems-cvrf-year-2018.xml']
# Per-node community table for this period (file name suggests Louvain output -- TODO confirm).
node_data = pd.read_csv('/Community Results/2018Louvain.csv')

# Parse the file; element 0 of the result is the entry dictionary.
file_info = read_files(files_2018)
entries = file_info[0]

In [67]:
# Order the nodes by community id, then collect the distinct ids in that order.
class_sorted = node_data.sort_values(by='modularity_class')
classes = class_sorted['modularity_class'].unique()

In [69]:
# Print the ten most frequent description tokens of selected communities.
# Each value is a position in the `classes` array, not necessarily the
# community id itself.
top_2018=[16,47,38,105,31]
for value in top_2018:
    cur_corpus = []
    print("_____________________________________________________________________________________________________")
    cur_class = classes[value]
    print(cur_class)
    cur_nodes = class_sorted.loc[class_sorted['modularity_class'] == cur_class]

    # Iterate the id column directly; `node_id` avoids rebinding the builtin
    # `id` for the rest of the kernel session.
    for node_id in cur_nodes['v__nx_name']:
        try:
            sent = entries[node_id]['Description-Tokenized']
        except KeyError:
            # Node without a parsed CVE entry: skip it. Only the missing-key
            # case is ignored, so any other error still surfaces.
            continue
        cur_corpus.extend(sent)
    fdist = FreqDist(cur_corpus)
    top_ten = fdist.most_common(10)
    print(top_ten)
    print("______________________________________________________________________________________________________")

_____________________________________________________________________________________________________
16
[('kernel', 56), ('cause', 38), ('linux', 34), ('allows', 33), ('crafted', 31), ('service', 27), ('denial', 25), ('function', 20), ('user', 19), ('crash', 18)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
47
[(':', 1964), ('oracle', 593), ('access', 420), ('cvss', 416), ('attacks', 324), ('data', 291), ('unauthorized', 270), ('successful', 259), ('attacker', 258), ('score', 227)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
38
[('issue', 98), ('allows', 77), ('version', 65), ('software', 65), ('disk', 51), ('command', 51), ('sd', 51), ('injection', 50), ('quest', 49), 

# 2019

In [70]:
# CVE XML file for 2019 (a single-year period).
files_2019 = [ '/CVEDatasSets/allitems-cvrf-year-2019.xml']
# Per-node community table for this period (file name suggests Louvain output -- TODO confirm).
node_data = pd.read_csv('/Community Results/2019Louvain.csv')

# Parse the file; element 0 of the result is the entry dictionary.
file_info = read_files(files_2019)
entries = file_info[0]

In [71]:
# Order the nodes by community id, then collect the distinct ids in that order.
class_sorted = node_data.sort_values(by='modularity_class')
classes = class_sorted['modularity_class'].unique()

In [72]:
# Print the ten most frequent description tokens of selected communities.
# Each value is a position in the `classes` array, not necessarily the
# community id itself.
top_2019=[124,105,112,85,99]
for value in top_2019:
    cur_corpus = []
    print("_____________________________________________________________________________________________________")
    cur_class = classes[value]
    print(cur_class)
    cur_nodes = class_sorted.loc[class_sorted['modularity_class'] == cur_class]

    # Iterate the id column directly; `node_id` avoids rebinding the builtin
    # `id` for the rest of the kernel session.
    for node_id in cur_nodes['v__nx_name']:
        try:
            sent = entries[node_id]['Description-Tokenized']
        except KeyError:
            # Node without a parsed CVE entry: skip it. Only the missing-key
            # case is ignored, so any other error still surfaces.
            continue
        cur_corpus.extend(sent)
    fdist = FreqDist(cur_corpus)
    top_ten = fdist.most_common(10)
    print(top_ten)
    print("______________________________________________________________________________________________________")

_____________________________________________________________________________________________________
124
[(':', 398), ('needed', 307), ('bounds', 242), ('execution', 201), ('user', 173), ('possible', 168), ('could', 163), ('due', 160), ('lead', 159), ('privileges', 156)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
105
[('issue', 210), ('application', 149), ('may', 131), ('memory', 118), ('fixed', 98), ('addressed', 93), ('improved', 91), ('able', 79), ('macos', 79), ('attacker', 74)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
112
[('issue', 119), ('discovered', 98), ('attacker', 51), ('found', 48), ('file', 47), (':', 46), ('could', 43), ('function', 42), ('cause', 34

# 2020

In [77]:
# CVE XML file for 2020 (a single-year period).
files_2020 = [ '/CVEDatasSets/allitems-cvrf-year-2020.xml']
# Per-node community table for this period (file name suggests Louvain output -- TODO confirm).
node_data = pd.read_csv('/Community Results/2020Louvain.csv')

# Parse the file; element 0 of the result is the entry dictionary.
file_info = read_files(files_2020)
entries = file_info[0]

In [78]:
# Order the nodes by community id, then collect the distinct ids in that order.
class_sorted = node_data.sort_values(by='modularity_class')
classes = class_sorted['modularity_class'].unique()

In [79]:
# Print the ten most frequent description tokens of selected communities.
# Each value is a position in the `classes` array, not necessarily the
# community id itself.
top_2020=[58,36,35,91,143]
for value in top_2020:
    cur_corpus = []
    print("_____________________________________________________________________________________________________")
    cur_class = classes[value]
    print(cur_class)
    cur_nodes = class_sorted.loc[class_sorted['modularity_class'] == cur_class]

    # Iterate the id column directly; `node_id` avoids rebinding the builtin
    # `id` for the rest of the kernel session.
    for node_id in cur_nodes['v__nx_name']:
        try:
            sent = entries[node_id]['Description-Tokenized']
        except KeyError:
            # Node without a parsed CVE entry: skip it. Only the missing-key
            # case is ignored, so any other error still surfaces.
            continue
        cur_corpus.extend(sent)
    fdist = FreqDist(cur_corpus)
    top_ten = fdist.most_common(10)
    print(top_ten)
    print("______________________________________________________________________________________________________")

_____________________________________________________________________________________________________
58
[('oracle', 421), ('access', 284), ('cvss', 248), ('attacks', 209), ('data', 187), ('successful', 173), ('affected', 155), ('server', 153), ('unauthorized', 153), ('allows', 151)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
36
[('allows', 129), ('web', 86), ('xss', 85), ('payload', 70), ('?', 68), ('way', 67), ('reflected', 67), ('script', 67), ('html', 67), ('adding', 67)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
35
[('issue', 72), ('allows', 69), ('code', 64), ('user', 56), ('could', 53), ('remote', 48), ('access', 43), ('discovered', 39), ('affected', 36), ('fi

# 2021

In [80]:
# CVE XML file for 2021 (a single-year period).
files_2021 = [ '/CVEDatasSets/allitems-cvrf-year-2021.xml']
# Per-node community table for this period (file name suggests Louvain output -- TODO confirm).
node_data = pd.read_csv('/Community Results/2021Louvain.csv')

# Parse the file; element 0 of the result is the entry dictionary.
file_info = read_files(files_2021)
entries = file_info[0]

# Order the nodes by community id, then collect the distinct ids in that order.
class_sorted = node_data.sort_values('modularity_class')
classes = class_sorted.modularity_class.unique()

In [82]:
# Show the community table for 2021. This file has a numeric `Id` column
# and no `v__nx_name` column.
node_data

Unnamed: 0,Id,Label,d0,modularity_class,Color
0,0,0,1,126,#000000
1,1,1,1,32,#000000
2,2,2,1,32,#000000
3,3,3,2,46,#000000
4,4,4,2,46,#000000
...,...,...,...,...,...
20345,20345,20345,2,121,#000000
20346,20346,20346,2,7,#000000
20347,20347,20347,2,80,#000000
20348,20348,20348,2,102,#000000


In [83]:
# Print the ten most frequent description tokens of selected communities.
# Each value is a position in the `classes` array, not necessarily the
# community id itself.
top_2021=[51,122,5,0,6]
for value in top_2021:
    cur_corpus = []
    print("_____________________________________________________________________________________________________")
    cur_class = classes[value]
    print(cur_class)
    cur_nodes = class_sorted.loc[class_sorted['modularity_class'] == cur_class]

    # Iterate the id column directly; `node_id` avoids rebinding the builtin
    # `id` for the rest of the kernel session.
    for node_id in cur_nodes['Id']:
        try:
            sent = entries[node_id]['Description-Tokenized']
        except KeyError:
            # Node without a parsed CVE entry: skip it. Only the missing-key
            # case is ignored, so any other error still surfaces.
            continue
        cur_corpus.extend(sent)
    fdist = FreqDist(cur_corpus)
    top_ten = fdist.most_common(10)
    print(top_ten)
    print("______________________________________________________________________________________________________")

_____________________________________________________________________________________________________
51
[('cve', 238), ('id', 238), ('unique', 238), ('remote', 153), ('code', 128), ('execution', 121), ('windows', 91), ('elevation', 72), ('privilege', 72), ('microsoft', 66)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
122
[('memory', 109), ('user', 98), ('snapdragon', 70), ('allows', 65), ('application', 55), ('access', 48), ('file', 45), ('may', 38), ('version', 36), ('information', 36)]
______________________________________________________________________________________________________
_____________________________________________________________________________________________________
5
[('needed', 154), ('id', 152), ('execution', 121), ('user', 108), ('possible', 106), ('could', 88), ('privilege', 82), ('interaction', 82)