In [14]:
import pandas as pd
import numpy as np

In [15]:
# Location of the JSON corpus, relative to the notebook's working directory.
PATH_TO_FILE = './Data/corpus2.json'
# Parse the JSON file with pandas; `data` is the raw frame used by later cells.
data = pd.read_json(PATH_TO_FILE)

In [16]:
# Preview the first five rows of the loaded corpus.
data.head(5)

Unnamed: 0,id,cve,project_name,description,mistakes,tag_list
0,1037,CVE-2014-7948,chromium,Google Chrome caches pages despite having an S...,I think that the vulenrability is interesting ...,"[{'id': 8, 'tag': 'bounty'}, {'id': 95, 'tag':..."
1,1158,CVE-2015-6757,chromium,Trying to access data in ServiceWorker after i...,The mistake made here was a basic C coding mis...,"[{'id': 5, 'tag': 'lifetime-1y2y'}, {'id': 8, ..."
2,1489,CVE-2017-5125,chromium,Embargoed. Heap overflow in Skia.\n,,"[{'id': 14, 'tag': 'cwe-119'}, {'id': 357, 'ta..."
3,184,CVE-2010-4488,chromium,This bug involves an unhandled case in which t...,"This could be a coding or design error, there ...","[{'id': 3, 'tag': 'lifetime-90d180d'}, {'id': ..."
4,1269,CVE-2016-1682,chromium,Checks to verify the content security policy (...,It seems as though the developer just didn't t...,"[{'id': 1, 'tag': 'lifetime-30d'}, {'id': 8, '..."


In [17]:
# Keep only the CVE id and the two free-text columns. `.copy()` gives an
# independent frame, so the column assignments made in later cells do not
# write back into `data`.
CVE_description_mistake = data[['cve','description','mistakes']].copy()

In [18]:
# Display the three-column working frame.
CVE_description_mistake

Unnamed: 0,cve,description,mistakes
0,CVE-2014-7948,Google Chrome caches pages despite having an S...,I think that the vulenrability is interesting ...
1,CVE-2015-6757,Trying to access data in ServiceWorker after i...,The mistake made here was a basic C coding mis...
2,CVE-2017-5125,Embargoed. Heap overflow in Skia.\n,
3,CVE-2010-4488,This bug involves an unhandled case in which t...,"This could be a coding or design error, there ..."
4,CVE-2016-1682,Checks to verify the content security policy (...,It seems as though the developer just didn't t...
...,...,...,...
721,CVE-2014-1715,Path traversal vulnerability that can lead to ...,"Ultimately, this vunerability was the direct c..."
722,CVE-2014-0116,A fix applied in a previous update didn't acco...,The correct sanitization regex pattern was pre...
723,CVE-2017-5055,Use after free in printing. When the cross pro...,PrintPreviewDone() got called multiple times a...
724,CVE-2010-0656,"Webkit, if given a false directory in the URL ...",The origin of the bug is not really a commit o...


In [19]:
from fuzzywuzzy import process,fuzz

In [20]:
# Trim leading/trailing whitespace from each text column in place, then
# report how many distinct values the column holds after trimming.
for col in ['cve', 'description', 'mistakes']:
    CVE_description_mistake[col] = CVE_description_mistake[col].str.strip()
    print("number of unique values in " + str(col) + ": "
          + str(CVE_description_mistake[col].nunique()))

number of unique values in cve: 725
number of unique values in description: 707
number of unique values in mistakes: 569


In [21]:
# Distinct CVE identifiers as a plain Python list, in order of first appearance.
unique_cves = list(CVE_description_mistake['cve'].unique())
# Show the five identifiers that sort first.
sorted(unique_cves)[:5]

['CVE-2002-0392',
 'CVE-2002-0661',
 'CVE-2002-0840',
 'CVE-2002-1156',
 'CVE-2002-1593']

### The process below does not mean anything for CVE IDs, because the fuzzy token sort ratio scorer tokenizes the strings, cleans them by converting them to lower case and removing punctuation, and then sorts the tokens alphabetically. After that, it computes the Levenshtein distance and returns the similarity percentage.

In [22]:
# Query one CVE ID against the full list of unique IDs, scoring each
# candidate with fuzzywuzzy's token_sort_ratio.
process.extract('CVE-2002-0392', unique_cves, scorer=fuzz.token_sort_ratio)

[('CVE-2002-0392', 100),
 ('CVE-2004-0942', 85),
 ('CVE-2012-0031', 77),
 ('CVE-2013-2900', 77),
 ('CVE-2004-0809', 77)]

In [23]:
# Same lookup for a second CVE ID, again scored with token_sort_ratio.
process.extract('CVE-2010-3411', unique_cves, scorer=fuzz.token_sort_ratio)

[('CVE-2010-3411', 100),
 ('CVE-2010-3111', 92),
 ('CVE-2010-3415', 92),
 ('CVE-2010-3413', 92),
 ('CVE-2010-3112', 92)]

In [24]:
# Distinct description texts as a plain Python list, in order of first appearance.
unique_desc = list(CVE_description_mistake['description'].unique())
# Show the ten descriptions that sort first.
sorted(unique_desc)[:10]

['(Linux Only) A malicious user could remotely construct a relatively long string.\nUpon creating the string, the user could then use it with the javascript alert \nfunction, or use it with an input that could be displayed in an alert. As a result, \nthe display server could crash and lock up the open chrome windows on the target device, \nrequiring a reboot (Denial of Service).',
 'A CGI (Common Gateway Interface) is a protocol used by web servers that runs command-line interface scripts in response to client requests. Apache HTTP Server has a module "mod_cgid" which is responsible for running CGI scripts.\nCVE-2014-0231 was a vulnerability in Apache HTTP Server\'s mod_cgid module where users could create a denial of service attack by causing the process which ran these scripts to hang indefinitely. mod_cgid did not have any timeout feature. If an attacker found a request that caused one of a server\'s CGI scripts hosted in mod_cgid to halt, this would deny service to other users of t

In [25]:
# Query one full description text against all unique descriptions, scoring
# each candidate with token_sort_ratio.
process.extract('(Linux Only) A malicious user could remotely construct a relatively long string.\nUpon creating the string, the user could then use it with the javascript alert \nfunction, or use it with an input that could be displayed in an alert. As a result, \nthe display server could crash and lock up the open chrome windows on the target device, \nrequiring a reboot (Denial of Service).', unique_desc, scorer=fuzz.token_sort_ratio)

[('(Linux Only) A malicious user could remotely construct a relatively long string.\nUpon creating the string, the user could then use it with the javascript alert \nfunction, or use it with an input that could be displayed in an alert. As a result, \nthe display server could crash and lock up the open chrome windows on the target device, \nrequiring a reboot (Denial of Service).',
  100),
 ('A remote attacker could write malicious input to a data stream that would cause\nthe server to crash. Once the server is crashed, subsequently valid requests will continue to crash\nthe server.',
  47),
 ("The multipart stream area of the system doesn't check the length of a boundary string. This\nmeans that a user can potentially use an extremely long boundary string to cause a denial of\nservice attack.",
  45),
 ('When displaying a type of graphic known as an SVG, a value of infinity could\nbe used as one of the attributes, causing a write to an invalid memory location.\nThis would result in a 

## Similarity score on description column

In [None]:
# Score every unique description against the full list of unique
# descriptions, producing one (description, matched description, score) row
# per match that process.extract returns.
score_sort = [
    (query, match, score)
    for query in unique_desc
    for match, score in process.extract(query, unique_desc, scorer=fuzz.token_sort_ratio)
]
# Tabulate the rows.
similarity_sort = pd.DataFrame(
    data=score_sort,
    columns=['desc_sort', 'match_sort', 'score_sort'],
)
similarity_sort.head()


In [None]:
# Number of (description, match, score) rows in the similarity table.
print(len(similarity_sort))

In [None]:
# For each row, store whichever of the two strings (description or its match)
# compares greater. The filter in the following cell compares this column
# with `match_sort`, which keeps a pair (A, B) in only one of its two
# orientations instead of listing both (A, B) and (B, A).
similarity_sort['sorted_desc_sort'] = np.maximum(similarity_sort['desc_sort'], similarity_sort['match_sort'])
similarity_sort.head()

In [None]:
# Keep rows that (1) score at least 75, (2) are not a description matched
# with itself, and (3) have a match that is not the greater string of the
# pair. The helper column is dropped once the filter has been applied.
high_score_sort = (
    similarity_sort
    .loc[lambda df: (df['score_sort'] >= 75)
                    & (df['desc_sort'] != df['match_sort'])
                    & (df['sorted_desc_sort'] != df['match_sort'])]
    .drop(columns='sorted_desc_sort')
    .copy()
)

In [None]:
# Display the description pairs that passed the score filter.
high_score_sort

In [None]:
# One row per (description, score): join all matched descriptions that share
# that score into a single comma-separated string, highest scores first.
(
    high_score_sort
    .groupby(['desc_sort', 'score_sort'])
    .agg({'match_sort': ', '.join})
    .sort_values(by='score_sort', ascending=False)
)

## Similarity score on the mistakes column

In [None]:
# Distinct "mistakes" texts as a Python list, with a missing entry (None)
# replaced by the literal string 'None' so every element is a string.
unique_mistakes = [
    mis if mis is not None else 'None'
    for mis in CVE_description_mistake['mistakes'].unique().tolist()
]
# Show the hundred texts that sort first.
sorted(unique_mistakes)[:100]

In [None]:
# Score every unique "mistakes" text against the full list of unique texts,
# producing one (text, matched text, score) row per match that
# process.extract returns.
score_sort = [
    (query, match, score)
    for query in unique_mistakes
    for match, score in process.extract(query, unique_mistakes, scorer=fuzz.token_sort_ratio)
]
# Tabulate the rows.
similarity_sort_mistakes = pd.DataFrame(
    data=score_sort,
    columns=['mistakes_sort', 'match_sort', 'score_sort'],
)
similarity_sort_mistakes.head()

In [None]:
# Number of (text, match, score) rows in the mistakes similarity table.
print(len(similarity_sort_mistakes))

In [None]:
# For each row, store whichever of the two strings (mistakes text or its
# match) compares greater. The filter in the following cell compares this
# column with `match_sort`, which keeps a pair (A, B) in only one of its two
# orientations instead of listing both (A, B) and (B, A).
similarity_sort_mistakes['sorted_mistakes_sort'] = np.maximum(similarity_sort_mistakes['mistakes_sort'], similarity_sort_mistakes['match_sort'])
similarity_sort_mistakes.head()

In [None]:
# Keep rows that (1) score at least 15, (2) are not a text matched with
# itself, and (3) have a match that is not the greater string of the pair.
# The helper column is dropped once the filter has been applied.
# NOTE(review): the threshold here is 15, while the description section
# uses 75 -- confirm this is intentional.
high_score_sort = (
    similarity_sort_mistakes
    .loc[lambda df: (df['score_sort'] >= 15)
                    & (df['mistakes_sort'] != df['match_sort'])
                    & (df['sorted_mistakes_sort'] != df['match_sort'])]
    .drop(columns='sorted_mistakes_sort')
    .copy()
)

In [None]:
# Display the mistakes-text pairs that passed the score filter.
high_score_sort

In [None]:
# One row per (mistakes text, score): join all matched texts that share that
# score into a single comma-separated string, highest scores first.
(
    high_score_sort
    .groupby(['mistakes_sort', 'score_sort'])
    .agg({'match_sort': ', '.join})
    .sort_values(by='score_sort', ascending=False)
)

## Similarity score on the mistakes and description columns combined

In [None]:
# Combine the description and mistakes texts into one column. A single space
# is inserted between them so the last word of the description and the first
# word of the mistakes text remain separate tokens for the token-based
# scorer used in the cells below (both columns were stripped earlier, so
# without a separator the two words would be fused into one).
# NOTE(review): rows where either text is missing presumably still yield a
# missing value here, as they did before -- the following cell handles that.
CVE_description_mistake['desc_mis'] = (
    CVE_description_mistake['description']
    + ' '
    + CVE_description_mistake['mistakes']
)

In [None]:
# Distinct combined texts as a list of strings, with missing entries
# represented by the literal string 'None'.
#
# Missing values are replaced *before* the string conversion. The previous
# version converted first and then tested `desc_mis is 'nan'`, which compares
# object identity against a string literal: that is not a reliable equality
# test (and triggers a SyntaxWarning on Python 3.8+), so the 'nan'
# placeholders could slip through unchanged.
unique_desc_mis = (
    CVE_description_mistake['desc_mis']
    .fillna('None')
    .astype(str)
    .unique()
    .tolist()
)
# Show the text that sorts first.
sorted(unique_desc_mis)[:1]

In [None]:
# Score every unique combined text against the full list of unique combined
# texts, producing one (text, matched text, score) row per match that
# process.extract returns.
score_sort = [
    (query, match, score)
    for query in unique_desc_mis
    for match, score in process.extract(query, unique_desc_mis, scorer=fuzz.token_sort_ratio)
]
# Tabulate the rows.
similarity_sort_desc_mis = pd.DataFrame(
    data=score_sort,
    columns=['desc_mis_sort', 'match_sort', 'score_sort'],
)
similarity_sort_desc_mis.head()

In [None]:
# Number of (text, match, score) rows in the combined-text similarity table.
print(len(similarity_sort_desc_mis))

In [None]:
# For each row, store whichever of the two strings (combined text or its
# match) compares greater. The filter in the following cell compares this
# column with `match_sort`, which keeps a pair (A, B) in only one of its two
# orientations instead of listing both (A, B) and (B, A).
#
# Both operands must come from `similarity_sort_desc_mis`. The previous
# version took `match_sort` from `similarity_sort_mistakes`, a different
# table built from a different list of texts, so the comparison paired
# unrelated rows.
similarity_sort_desc_mis['sorted_desc_mis_sort'] = np.maximum(
    similarity_sort_desc_mis['desc_mis_sort'],
    similarity_sort_desc_mis['match_sort'],
)
similarity_sort_desc_mis.head()

In [None]:
# Keep rows that (1) score at least 15, (2) are not a text matched with
# itself, and (3) have a match that is not the greater string of the pair.
# The helper column is dropped once the filter has been applied.
# NOTE(review): the threshold here is 15, while the description section
# uses 75 -- confirm this is intentional.
high_score_sort = (
    similarity_sort_desc_mis
    .loc[lambda df: (df['score_sort'] >= 15)
                    & (df['desc_mis_sort'] != df['match_sort'])
                    & (df['sorted_desc_mis_sort'] != df['match_sort'])]
    .drop(columns='sorted_desc_mis_sort')
    .copy()
)

In [None]:
# Display the combined-text pairs that passed the score filter.
high_score_sort


In [None]:
# One row per (combined text, score): join all matched texts that share that
# score into a single comma-separated string, highest scores first.
(
    high_score_sort
    .groupby(['desc_mis_sort', 'score_sort'])
    .agg({'match_sort': ', '.join})
    .sort_values(by='score_sort', ascending=False)
)