In [None]:
import gzip
import json
import os
import time
from io import BytesIO

import numpy as np
import pandas as pd
import requests

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Select seaborn's "whitegrid" style as the plotting theme.
sns.set_theme(style="whitegrid")

In [None]:
from preprocessing_utils import preprocess_NVD_data

In [None]:
# Folder that holds every downloaded / generated file of this notebook.
data_path = 'data'
# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(data_path, exist_ok=True)

# EPSS data

Download the EPSS data from https://www.first.org/epss/data_stats into the `data` folder

In [None]:
# Download the daily EPSS score snapshot (gzip-compressed CSV) into `data_path`.
# NOTE: despite their `nvd_` prefix, `nvd_url` / `nvd_filename` refer to the EPSS
# file here; the names are kept because the next cell reads `nvd_filename`.
base_url = "https://epss.empiricalsecurity.com/epss_scores-"
date_current = "2025-10-01"  # Or automatically get current day
ext = ".csv.gz"
nvd_url = base_url + date_current + ext
nvd_filename = "epss_scores-" + date_current + ext
# A timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(nvd_url, timeout=60)
# Fail loudly on HTTP errors instead of saving an error page as a ".csv.gz" file.
response.raise_for_status()
with open(os.path.join(data_path, nvd_filename), "wb") as binary_file:
    binary_file.write(response.content)

In [None]:
# Load the downloaded EPSS snapshot; header=1 takes the column names from the
# second line of the file (the first line is skipped).
epss_current = pd.read_csv(
    os.path.join(data_path, nvd_filename),
    header=1,
    compression='gzip',
)
epss_current  # a bare trailing expression is rendered by the notebook below the cell


# NVD data

In [None]:
# Page through the NVD CVE API 2.0 and collect every CVE published in the
# requested window into `candidate_cves` (a list of raw JSON records).
base_url = "https://services.nvd.nist.gov/rest/json/cves/2.0"
date_start_NVD = '2025-09-01T00:00:00.000Z'  # Do NOT change these dates
date_end_NVD = '2025-10-01T00:00:00.000Z'  # Do NOT change these dates
start_index = 0
results_per_page = 1000
total_results = 1  # placeholder so the loop runs once; replaced by the API's totalResults

request_timeout = 30  # seconds; a page of 1000 CVEs can take a while to arrive
request_pause = 6     # seconds between calls; the public API is rate limited
max_attempts = 5      # tries per page before giving up

candidate_cves = []

while start_index < total_results:
    params = {
        "pubStartDate": date_start_NVD,
        "pubEndDate": date_end_NVD,
        "resultsPerPage": results_per_page,
        "startIndex": start_index,
        "noRejected": ""
    }

    # Retry transient failures (rate limiting, timeouts) with a growing pause.
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(base_url, params=params, timeout=request_timeout)
        except requests.RequestException as exc:
            print("Error:", exc)
        else:
            if response.status_code == 200:
                break
            print("Error:", response.status_code)
        if attempt < max_attempts:
            time.sleep(request_pause * attempt)
    else:
        # Stopping quietly here would leave a truncated dataset that the
        # following cells would process and save as if it were complete.
        raise RuntimeError(
            f"NVD request failed {max_attempts} times at startIndex={start_index}"
        )

    data = response.json()
    total_results = data.get("totalResults", 0)

    candidate_cves.extend(data.get("vulnerabilities", []))

    start_index += results_per_page
    print(start_index)

    # Be polite to the API between consecutive pages.
    if start_index < total_results:
        time.sleep(request_pause)

In [None]:
# normalize and preprocess data
df = pd.json_normalize(candidate_cves, record_path=None, sep='.', max_level=None)
df = preprocess_NVD_data(df)

# remove vulnerabilities marked as rejected or reserved.
# NVD reports the status as "Rejected"; the previously used literal 'Reject' never
# matches that value, so it is kept only for backward compatibility.
# NOTE(review): assumes preprocess_NVD_data leaves 'cve.vulnStatus' untouched - confirm.
excluded_statuses = ['Reserved', 'Reject', 'Rejected']
df = df[~df['cve.vulnStatus'].isin(excluded_statuses)]

# merge NVD and EPSS data (left join: CVEs without an EPSS entry are kept with NaN scores)
df = df.merge(epss_current, left_on="cve.id", right_on="cve", how="left")

In [None]:
# Persist the raw NVD records as indented JSON ...
nvd_json_path = os.path.join(data_path, "nvd_cves.json")
with open(nvd_json_path, "w", encoding="utf-8") as nvd_out:
    json.dump(candidate_cves, nvd_out, indent=2)

# ... and the merged NVD + EPSS frame as CSV.
csv_path = os.path.join(data_path, "vuln_2025_09.csv")
df.to_csv(csv_path)

# Exploratory Data Analysis

- display some examples (e.g., the first two CVE records)

In [None]:
# First two rows, transposed so that each record becomes a column and every field is listed.
df.head(2).T

- show a bar plot with the daily volume of published CVEs

In [None]:
# Number of CVEs per publication day, in chronological order.
published_counts = (
    df["cve.published"]
    .dt.date
    .value_counts()
    .sort_index()
)

# One bar per day, drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(x=published_counts.index, y=published_counts.values, color="k", ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("Number of CVEs Published")
ax.set_title("CVE Publications per Day")
plt.xticks(rotation=90)  # vertical labels so the dates do not overlap
fig.tight_layout()
plt.show()

- print the description of the last ten published vulnerabilities

In [None]:
# The ten rows with the most recent publication timestamp, newest first.
latest_ten = df.sort_values('cve.published', ascending=False)[:10]
for _, record in latest_ten.iterrows():
    print('-' * 100)
    print(record['cve.id'], record['cve.published'])
    print(record.description)


### <font color='blue'><b><i>TODO</i></b>: produce plots or tables to address the following points</font>
- what is the percentage of CVEs which received a CVSS score?
- report descriptive statistics of the CVSS base score and/or show its distribution
- report descriptive statistics of EPSS and/or show its distribution
- produce a scatter plot showing CVSS vs EPSS
- <b>be creative</b>!
    - How many vulnerabilities are published on CISA KEV? 
    - What are the 20 most frequent vendors? (vendor name can be extracted from the `vulnerable_cpes` field).
    - What are the 20 most frequent CWEs?
    - Anything else you see fit!

<font color='blue'>Use text cells to discuss the outcome after each point</font>

What is the percentage of CVEs which received a CVSS score?

In [None]:
# Share of CVEs that have a CVSS base score, as a percentage.
# The former f-string nested double quotes (a SyntaxError before Python 3.12) and
# printed the whole column instead of a single number.
# NOTE(review): assumes unscored CVEs have NaN in 'cvss_baseScore' - confirm against preprocess_NVD_data.
cvss_scored_pct = df["cvss_baseScore"].notna().mean() * 100
print(f"{cvss_scored_pct:.2f}%")

# CVE selection


### <font color='blue'><b><i>TODO</i></b></font>
- Filter the CVEs with low EPSS (<1%)
- Select candidate CVEs
    - From the resulting subset, select 10 CVEs that you think will reach high EPSS by the end of the course.
    - Clearly describe the criteria you used for selection (e.g., high CVSS, popular software, CWE, popular vendor, number of references, keyword in description, manual inspection, random sampling, security blogs).
- Share the selected CVE ids with the instructor (within two weeks). Use the code cell below to produce the csv file to submit.
- Track the EPSS of your CVEs over time


As per specification, we start by filtering the CVEs with low EPSS (<1%)

In [None]:
# Keep the CVEs whose EPSS score is below 1 %.
# The merged EPSS data provides both `epss` (the score) and `percentile` (the rank
# among all scored CVEs); the task filters on the score, strictly below 0.01.
# Rows without an EPSS value (NaN after the left merge) compare False and are dropped.
df_candidates = df[df['epss'] < 0.01]
df_candidates

Since these CVEs are quite unknown, let's see if there are some unfilled columns we can remove to speed up analysis

In [None]:
# Preview the first rows of the candidate subset.
df_candidates.head()

We build a ML pipeline to try to predict which one of these CVEs will make it big in the next 3 months. To do so, we first download the complete history of EPSS daily values since first publication for all CVEs.

As our dataset, we take all CVEs from NVD and we add the binary label "made_it" which will be true if the following properties are satisfied, and false otherwise:

 $\text{P1: }\exists t_s\in[\text{2021-04-14}, \text{2025-07-01}] : \text{EPSS(CVE}, t_s\text{)} < 1\%(t_s)$

 $\text{P2: }\exists t_f\in[t_s, t_s+90] : \text{EPSS(CVE}, t_f\text{)} \geq 90\%(t_f)$

We choose a percentile of 90% because we want to make it big! (To be safer, we should pick a much lower threshold, e.g. 50%.)

The metrics used for evaluation will be:

For each vulnerability $i$:
$\frac{\sum_{t=1}^{T}\left(pct_{i,t}-pct_{i,0}\right)}{T}$

## NVD complete database
We start by downloading all the CVEs that have ever been published between 2002 and 2024

In [None]:
# Download the yearly NVD JSON 2.0 feeds, keep a decompressed copy of each on
# disk and accumulate all CVE records in `all_cves`.
base_url = "https://nvd.nist.gov/feeds/json/cve/2.0/nvdcve-2.0-"
years = range(2002, 2025, 1)
ext = ".json.gz"

all_cves = []

for year in years:
    # `nvd_filename` is intentionally not reassigned here: it still names the
    # EPSS file that the EPSS loading cell reads.
    nvd_url = base_url + str(year) + ext
    response = requests.get(nvd_url, timeout=120)
    # A missing year would silently bias the dataset, so stop with an error instead.
    response.raise_for_status()

    # Decompress once and reuse the bytes for both parsing and saving. The former
    # code called json.load() on the already closed gzip handle and passed the
    # resulting object (not bytes) to a binary write.
    raw_json = gzip.decompress(response.content)
    all_cves.extend(json.loads(raw_json).get("vulnerabilities", []))

    with open(os.path.join(data_path, "cves-" + str(year) + ".json"), "wb") as binary_file:
        binary_file.write(raw_json)

In [None]:
# Flatten the yearly feed records into a table and run the shared preprocessing.
# NOTE: this rebinds `df`, replacing the September 2025 frame built earlier.
df = preprocess_NVD_data(
    pd.json_normalize(all_cves, record_path=None, sep='.', max_level=None)
)
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

We drop all the following columns because they have too many missing values. We also drop all CVEs that don't have a CVSS version

In [None]:
# Remove the sparsely populated columns, then drop every row that still has a missing value.
X = df.drop(
    columns=[
        "cve.evaluatorSolution",
        "cve.evaluatorImpact",
        "cve.vendorComments",
        "cve.evaluatorComment",
        "cve.cisaExploitAdd",
        "cve.cisaActionDue",
        "cve.cisaRequiredAction",
        "cve.cisaVulnerabilityName",
    ]
)
X = X.dropna()
X.info()

## Feature construction from historical EPSS data

For each CVE, we download its complete EPSS history and determine whether at any point it satisfied the metrics set by the exercise. Since there are four metrics, we append four binary labels accordingly.

In [None]:
nickname = 'template_submsission'  # TODO: put your nickname here

# TODO: put your selected IDs here
selected = ['CVE-YYYY-XXXXX0',
            'CVE-YYYY-XXXXX1',
            'CVE-YYYY-XXXXX2',
            'CVE-YYYY-XXXXX3',
            'CVE-YYYY-XXXXX4',
            'CVE-YYYY-XXXXX5',
            'CVE-YYYY-XXXXX6',
            'CVE-YYYY-XXXXX7',
            'CVE-YYYY-XXXXX8',
            'CVE-YYYY-XXXXX9',
            ]

# Export from `df_candidates` rather than `df`: by this point `df` has been rebound
# to the 2002-2024 NVD feeds, which contain none of the September 2025 CVEs, so
# filtering `df` would write an empty file.
submission = df_candidates[df_candidates['cve.id'].isin(selected)]

# Report selected IDs that are not among the candidates instead of dropping them silently.
missing_ids = sorted(set(selected) - set(submission['cve.id']))
if missing_ids:
    print("Warning: selected IDs not found among the candidates:", missing_ids)

submission.to_csv(os.path.join(data_path, f'{nickname}.csv'))