## NVD CVE JSON Mirror Maintenance

### This is a collection of routines to maintain a mirror of the NVD JSON CVE repository.

To use:
    <ul>
    <li>install mongodb
    <li>create a database called nvd
    <li>create collections: maincol, modscol, metacol
    <li>identify a working directory and a config directory - the working dir is where the NVD JSON files are
        downloaded, the config dir is where you put the .mongo.env file
    </ul>
        

In [98]:
#author: Ray Zupancic - Grumblesoft.com
import requests
from pathlib import Path
import json
from pymongo import MongoClient
from bson import json_util
import pandas as pd
import glob
import sys
import zipfile
import datetime
import pathlib

class MongodbConn():
    '''
        MongodbConn class reads the connection parameters from a config file, connects to the
        mongodb instance and binds the collections used by the NVD mirror
        (maincol, modscol, metacol, intecol)
    '''

    def __init__(self,configpath='', configname=".mongo.env"):
        '''
            Initializer method - read the config, connect and bind the collections
            input: string (config file path OR config directory, '' means the home directory),
                   string (config file name, used when configpath is '' or a directory)
            The interpreter exits with status 1 when the config cannot be read, the database
            does not exist on the instance or a required parameter is missing
        '''
        self.configpath = self._resolve_config_path(configpath, configname)

        try:
            #read the params
            self.par = self._get_params(self.configpath)
            self.cli = self._connect_mongo(self.par)
            #validate that the db exists
            db_names = self.cli.list_database_names()
            db = self.par["database"]
            if db not in db_names:
                raise Exception("db: {} not found on instance".format(db))
            self.mydb = self.cli[db]
            #connect to or create the collections
            self.maincol = self.mydb[self.par["maincollection"]]
            self.modscol = self.mydb[self.par["modscollection"]]
            self.metacol = self.mydb[self.par["metacollection"]]
            self.intecol = self.mydb[self.par["intecollection"]]
        except ValueError as v:
            #NOTE(review): a ValueError is only reported, so the instance can be left
            #without its client/collection attributes - confirm this is intended
            print(repr(v))
        except Exception as e:
            print("Could not connect to the collection")
            print(repr(e))
            #non-zero status so that calling scripts can detect the failure
            sys.exit(1)


    @staticmethod
    def _resolve_config_path(configpath, configname):
        '''
            Private method to work out the full path of the config file
            input: string (file path, directory or ''), string (config file name)
            output: string (path of the config file)
        '''
        if configpath == '':
            #default to the home directory
            return str(Path.home())+"/" + configname
        if Path(configpath).is_dir():
            #a config directory was given - the file lives inside it
            return str(Path(configpath) / configname)
        #anything else is taken as the path of the config file itself
        return configpath


    def _get_params(self,filename):
        ''' Private method to read key=value parameters from the config file.
            Blank lines and lines starting with # are ignored; only the first "=" splits
            a line, so values may contain "=".
            Place your mongodb connect information in this file:
            server=<yourservername or IP>
            username=<username>
            password=<mongopassword>
            authSource=<db you are authenticating against>
            authMechanism=<optional, SCRAM-SHA-1 when omitted>
            database=<your-database>
            maincollection=<collection of CVEs>
            modscollection=<collection of CVE modifications>
            metacollection=<meta collection>
            intecollection=<collection bound to intecol>
            input: string (path of the config file)
            output: dictionary
        '''
        myparams = {}
        with open(filename) as myfile:
            for line in myfile:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                key, _, value = line.partition("=")
                myparams[key.strip()] = value.strip()
        return myparams


    def _connect_mongo(self,params):
        '''
            Private method to connect to a mongo instance using params from input
            input: dictionary with server, username, password, authSource and optionally
                   authMechanism
            output: MongoClient
        '''
        client = MongoClient(params.get("server"),
                             username=params.get("username"),
                             password=params.get("password"),
                             authSource=params.get("authSource"),
                             authMechanism=params.get("authMechanism", 'SCRAM-SHA-1'))
        return client

    
class QueryMongo(MongodbConn):
    '''
        QueryMongo class is used to apply find functions against a mongodb
        inherits: MongodbConn - connection instance
    '''

    def __init__(self,configpath='', configname=".mongo.env"):
        '''
            Initializer method - instance the super class initializer
            input: string (config path), string (config file name)
        '''
        super().__init__(configpath, configname)

    def get_count(self, col):
        '''
            Public method to return the document count for a given collection
            input: collection
            output: integer
        '''
        return col.count_documents({})


    def get_fields_by_name(self,data_l,col):
        '''
            Public method to query all documents of a collection, returning only the named fields.
            This was previously a first definition of get_fields, which the later definition
            silently replaced, so it could never be called.
            input: list of field names [], collection
            output: cursor (mongodb)
        '''
        #the projection has to be a dictionary: hide _id, show every requested field
        proj_d = {"_id":0}
        for item in data_l:
            proj_d[item] = 1
        return col.find({}, proj_d)


    def get_fields(self,filter_d,proj_d, col):
        '''
            Public method to query a collection
            input: filter (dictionary), projection (dictionary), collection
            output: cursor (mongodb)
        '''
        cursor = col.find(filter_d,proj_d)
        return cursor


    def get_cve(self,cve, col):
        '''
            Public method to query a collection for a given CVE
            input: cve (string), collection
            output: dictionary, or None when the CVE is not found or the query fails
                    (the reason is printed)
        '''
        try:
            cve_d = col.find_one({"CVE_ID":cve})
        except Exception as e:
            print(repr(e))
            return None
        if not cve_d:
            #report the miss the same way a raised error would be reported
            print(repr(ValueError("CVE: {} does not exist".format(cve))))
        return cve_d


    def get_cve_last_modified(self,CVE, col):
        '''
            Public method to get last modified date
            input: string (CVE ID, used as a $regex pattern), collection
            output: cursor of {"lastModifiedDate": ...} documents
        '''
        #the value is passed to $regex as-is, so it is a pattern and not an exact match
        filter_d = {"CVE_ID":{ "$regex": CVE }}
        proj_d = {"_id":0,"lastModifiedDate":1}
        cr = col.find(filter_d, proj_d)
        return cr

    def get_true_mods_CVEID(self):
        '''
            Public method to get the mod CVEs that are not new. This uses pandas to do the intersection
            of the CVE_IDs in the mods collection with the CVE_IDs in the main collection
            input: na
            output: DataFrame with a CVE_ID column (empty when either collection is empty)
        '''
        filt = {}
        proj = {'CVE_ID':1,'_id':0}
        #naming the column up front keeps the merge valid when a collection has no documents
        df1 = pd.DataFrame(list(self.maincol.find(filt, proj)), columns=['CVE_ID'])
        df2 = pd.DataFrame(list(self.modscol.find(filt, proj)), columns=['CVE_ID'])

        return df2.merge(df1,how='inner',on='CVE_ID')

    def _parse_json(self, data):
        '''
            Private method to handle BSON
            input: BSON
            output: JSON
        '''
        return json.loads(json_util.dumps(data))
    
class UpdateMongo(QueryMongo):
    '''
        UpdateMongo class downloads the NVD JSON feed files into the working directory and
        inserts, replaces and removes CVE documents in the mirror collections
        inherits: QueryMongo
    '''

    #base URL of the NVD JSON 1.1 feed files
    NVD_FEED_URL = 'https://nvd.nist.gov/feeds/json/cve/1.1/'

    def __init__(self, workingpath = '', configpath='', configname=".mongo.env"):
        '''
            Initializer method - instance the super class initializer
            input: string (working directory), string (config directory), string (ini file)
        '''
        super().__init__(configpath, configname)
        self.workingpath = workingpath


    def _working_file(self, name):
        '''
            Private method to build the path of a file inside the working directory. This works
            whether or not the working directory was given with a trailing separator
            input: string (file name)
            output: string (path)
        '''
        return str(pathlib.Path(self.workingpath) / name)


    def get_NVD_files(self,file_l):
        ''' Public method to download NVD datafiles in zip format and unzip them into the
            working directory, the URL is:
            https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-[year].json.zip
            Errors are printed and the next file is processed.
            input: list of zip file names []
            output: na
        '''
        for file in file_l:
            f_path = self._working_file(file)
            try:
                #need the URL, which for NVD files is an embedding of the year
                url = self.NVD_FEED_URL + file
                with requests.get(url, allow_redirects=True) as resp:
                    #an HTTP error page must not be saved as if it were the zip file
                    resp.raise_for_status()
                    #save the file
                    with open(f_path, 'wb') as zip_out:
                        zip_out.write(resp.content)
            except Exception as e:
                print("error on download - check NVD JSON files availability")
                print(repr(e))
                #nothing fresh was saved, so there is nothing to unzip
                continue
            try:
                #unzip the NVD files
                with zipfile.ZipFile(f_path, 'r') as zip_ref:
                    zip_ref.extractall(self.workingpath)
            except Exception as e:
                print("error on unzipping files")
                print(repr(e))

    def rebuild_yearly_NVD_files(self,download_files=False):
        '''
            Public method to rebuild the yearly data from NVD. This will empty the main cve collection and
            rebuild it with new NVD files. Generally, one would want to use the modified file from NVD to add
            modifications and insertions rather than rebuilding frequently
            input: boolean (download NVD files from NVD?)
            output: list (names of the JSON files passed to the bulk insert)
        '''
        #get the present year - we will get all files from 2002 to present year
        this_year = datetime.date.today().year
        zip_files_l = ['nvdcve-1.1-' + str(year) + '.json.zip'
                       for year in range(2002,this_year+1)]
        #download the files and unzip
        if download_files:
            self.get_NVD_files(zip_files_l)
        #delete all docs in main collection
        self.remove_all_cves(self.maincol)

        #the unzipped files carry the same name without the .zip suffix
        json_files_l = [item[:-4] for item in zip_files_l]
        self.manage_bulk_insert(json_files_l)

        return json_files_l


    def insert_json_file(self, filepath, col=None):
        '''
            Public method to insert any json file into a collection as-is (no flattening)
            input: string (path of the JSON file), collection (the main collection when omitted)
            output: InsertManyResult when the file holds a list, InsertOneResult otherwise
        '''
        if col is None:
            col = self.maincol
        with open(filepath) as jsonf :
            file_data = json.load(jsonf)
        if isinstance(file_data, list):
            return col.insert_many(file_data)
        return col.insert_one(file_data)


    def _insert_flattened_json_file(self, file, col):
        '''
            Private method to flatten and insert a yearly NVD JSON file into a collection of CVEs
            input: file name - NVD yearly JSON file, collection
            output: InsertManyResult object, or None when the file holds no CVE items
        '''
        with open(file) as data_file:
            data = json.load(data_file)
        #normalize out the CVEs
        df = pd.json_normalize(data,'CVE_Items')
        df.rename({'cve.CVE_data_meta.ID':'CVE_ID'}, axis=1,inplace=True)
        #MongoDB doesn't like '.' in key names; the dot is a literal, not a regex
        df.columns = df.columns.str.replace(".", "_", regex=False)
        records = df.to_dict('records')
        if not records:
            #insert_many rejects an empty list
            return None
        #insert records into MongoDB
        return col.insert_many(records)


    def manage_bulk_insert(self,filelist):
        '''
            Public method to allow for loading all NVD JSON files into the main collection.
            A file that is missing or fails to load is reported and the remaining files are
            still processed
            input: list of JSON file names in the working directory []
            output: na
        '''
        for file in filelist:
            file = self._working_file(file)
            if not pathlib.Path(file).exists():
                print(repr(FileNotFoundError("file: {} not found on path".format(file))))
                continue
            print("inserting file: {}".format(file))
            try:
                self._insert_flattened_json_file(file,self.maincol)
            except Exception as e:
                print(repr(e))

    def reload_NVD_mod_file(self,download=True):
        '''
            Public method to download and insert the latest NVD CVE Modifications file
            into the modscol collection. Optionally the file in the working directory
            can be loaded without being downloaded (must exist on working dir). This doesn't apply the mods
            to the main collection (maincol)
            input: boolean (download flag)
            output: integer (records inserted)
        '''
        if download:
            self.get_NVD_files(["nvdcve-1.1-modified.json.zip"])
        #empty the mod collection
        self.remove_all_cves(self.modscol)
        file = self._working_file("nvdcve-1.1-modified.json")
        result = self._insert_flattened_json_file(file,self.modscol)
        if result is None:
            return 0
        return len(result.inserted_ids)

    def _get_mods_list_cursor(self):
        '''
            Private method to query for CVE mods and return a cursor of CVEs that have changed
            (the full documents of the mods collection). A message is printed when the mods
            collection is empty
            input: na
            output: cursor
        '''
        #no projection is passed: PyMongo 3 turns an empty projection into "_id only"
        cr = self.modscol.find({})
        try:
            if self.modscol.count_documents({}) == 0:
                raise ValueError ("no modcves found")
        except Exception as e:
            print(repr(e))
        return cr

    def get_cve_exists(self,cve, col):
        '''
            Public method to query a collection for a given CVE and return exists versus not exists as bool
            input: cve (string), collection
            output: boolean
        '''
        #only the _id is fetched, the document itself is not needed
        return bool(col.find_one({"CVE_ID":cve}, {"_id":1}))

    def update_cves_from_mods(self):
        '''
            Public method to remove CVEs modified and add CVE modified and new CVEs into the main cve collection. The method
            outputs the total number of changes as mods and additions
            input: na
            output: (mods (integer), additions (integer))
        '''
        mod_docs = list(self._get_mods_list_cursor())
        if not mod_docs:
            return 0, 0
        mod_ids = [doc.get("CVE_ID") for doc in mod_docs]
        in_mods = {"CVE_ID":{'$in':mod_ids}}
        #CVEs that already exist in the main collection are modifications, the rest are new
        mods = len(self.maincol.distinct("CVE_ID", in_mods))
        #delete the mods - matched on CVE_ID because the _id of a document differs
        #between the mods collection and the main collection
        self.maincol.delete_many(in_mods)
        #let the main collection assign its own _id values
        for doc in mod_docs:
            doc.pop("_id", None)
        #add all the mods and additions
        r_adds = self.maincol.insert_many(mod_docs)
        return mods, len(r_adds.inserted_ids) - mods

    def remove_cve(self,cve,col):
        '''
            Public method to remove a single record from mongodb and return the deleted count
            input: cve (string), collection
            output: integer (0 when nothing was deleted or the delete failed)
        '''
        try:
            result = col.delete_one({ "CVE_ID":cve })
        except Exception as e:
            print(repr(e))
            return 0
        if result.deleted_count != 1:
            print(repr(ValueError("CVE: {} not deleted".format(cve))))
        return result.deleted_count

    def remove_all_cves(self,col):
        '''
            Public method to remove all records from mongodb collection and return the deleted count
            input: collection
            output: integer (0 when the delete failed)
        '''
        try:
            result = col.delete_many({})
        except Exception as e:
            print(repr(e))
            return 0
        return result.deleted_count



#### instantiate query and update instances

In [99]:
#the only argument is the working directory; the config is read from ~/.mongo.env
um = UpdateMongo(workingpath="/home/rzupancic/CVE/")

#### check the count of CVEs for a given string as a test
Example: {"CVE_ID":{ "$regex": "^CVE-2014"}} will count the total CVEs that begin with CVE-2014
Example: 2014 has 8845 (this could change)

In [100]:
#count the CVEs in the main collection whose ID starts with CVE-2014
filter_d = {"CVE_ID": {"$regex": "^CVE-2014"}}
proj_d = {"_id": 0, "CVE_ID": 1}
cr = um.get_fields(filter_d, proj_d, um.maincol)
sum(1 for _ in cr)

8845

#### There are two types of files we are interested in: yearly and mods

In [None]:
#yearly feed files cover 2002 through 2020; the mods feed is named "modified"
years = [*range(2002, 2021)]
mod = ["modified"]

#### Re-insert (or initially insert) the yearly files after optionally downloading the latest batch from NVD
The boolean indicates whether to download. If false, simply re-insert the files in the <br>
working directory (these must exist if download is False)

In [97]:
#rebuild the main collection from the files already in the working directory
um.rebuild_yearly_NVD_files(download_files=False)

inserting file: /home/rzupancic/CVE/nvdcve-1.1-2002.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2003.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2004.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2005.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2006.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2007.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2008.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2009.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2010.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2011.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2012.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2013.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2014.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2015.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2016.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2017.json
inserting file: /home/rzupancic/CVE/nvdcve-1.1-2018.json
inserting file: /home/rzupancic

['nvdcve-1.1-2002.json',
 'nvdcve-1.1-2003.json',
 'nvdcve-1.1-2004.json',
 'nvdcve-1.1-2005.json',
 'nvdcve-1.1-2006.json',
 'nvdcve-1.1-2007.json',
 'nvdcve-1.1-2008.json',
 'nvdcve-1.1-2009.json',
 'nvdcve-1.1-2010.json',
 'nvdcve-1.1-2011.json',
 'nvdcve-1.1-2012.json',
 'nvdcve-1.1-2013.json',
 'nvdcve-1.1-2014.json',
 'nvdcve-1.1-2015.json',
 'nvdcve-1.1-2016.json',
 'nvdcve-1.1-2017.json',
 'nvdcve-1.1-2018.json',
 'nvdcve-1.1-2019.json',
 'nvdcve-1.1-2020.json']

#### Reload the NVD Modifications file - this empties the mods collection (modscol) and rebuilds it. 
Use reload_NVD_mod_file(False) to use an existing mods file in the working dir.<BR>
reload_NVD_mod_file(True) will download the latest mods file and then insert <br>
 it into the modscol collection (True is the default)

In [101]:
#download the latest modifications feed and reload the mods collection
um.reload_NVD_mod_file(download=True)

#### get counts of the two NVD collections 

In [33]:
#document counts of the main and the mods collection
total_cves = um.get_count(um.maincol)
total_mods = um.get_count(um.modscol)
print(f"Total CVEs: {total_cves}")
print(f"Total CVE Mods: {total_mods}")

Total CVEs: 153671
Total CVE Mods: 682


#### Get the last modified date of a CVE by CVEID from a given collection

In [84]:
#last modified date as recorded in the mods collection
d1 = um.get_cve_last_modified("CVE-1999-0199", um.modscol)
print([*d1])

[{'lastModifiedDate': '2020-12-03T16:52Z'}]


In [80]:
#last modified date as recorded in the main collection
d1 = um.get_cve_last_modified("CVE-1999-0199", um.maincol)
print([*d1])

[{'lastModifiedDate': '2020-11-23T19:49Z'}]


#### Get all info from a CVE and prettify the output

In [66]:
#fetch one CVE document and pretty-print it as JSON
cve = um.get_cve("CVE-2015-9550", um.maincol)
cve_json = um._parse_json(cve)
print(json.dumps(cve_json, indent=4))

{
    "_id": {
        "$oid": "5fc5890346cf49375398e001"
    },
    "publishedDate": "2020-11-24T21:15Z",
    "lastModifiedDate": "2020-11-24T21:23Z",
    "cve_data_type": "CVE",
    "cve_data_format": "MITRE",
    "cve_data_version": "4.0",
    "CVE_ID": "CVE-2015-9550",
    "cve_CVE_data_meta_ASSIGNER": "cve@mitre.org",
    "cve_problemtype_problemtype_data": [
        {
            "description": []
        }
    ],
    "cve_references_reference_data": [
        {
            "url": "https://pierrekim.github.io/blog/2015-07-16-backdoor-and-RCE-found-in-8-TOTOLINK-products.html",
            "name": "https://pierrekim.github.io/blog/2015-07-16-backdoor-and-RCE-found-in-8-TOTOLINK-products.html",
            "refsource": "MISC",
            "tags": []
        }
    ],
    "cve_description_description_data": [
        {
            "lang": "en",
            "value": "An issue was discovered on TOTOLINK A850R-V1 through 1.0.1-B20150707.1612 and F1-V2 through 1.1-B20150708.1646 devices.

#### Remove a CVE by CVEID from a given collection

In [None]:
#remove one CVE from the main collection (the instance has no cvecol attribute;
#the collections are maincol, modscol, metacol and intecol)
um.remove_cve("CVE-2020-15999",um.maincol)

#### Get a dataframe of true mods - CVEs that actually existed and were modified

In [83]:
#CVE_IDs that appear in both the mods collection and the main collection
df = um.get_true_mods_CVEID()
#bare expression so the notebook displays the frame
df

Unnamed: 0,CVE_ID
0,CVE-1999-0199
1,CVE-2004-0725
2,CVE-2004-1424
3,CVE-2004-1425
4,CVE-2004-1711
...,...
1059,CVE-2020-9883
1060,CVE-2020-9889
1061,CVE-2020-9951
1062,CVE-2020-9961


In [None]:
#shape of the array of distinct CVE_ID values in the frame
df["CVE_ID"].unique().shape