In [None]:
%%html
<style>
  /* Remove the left margin of every table so tables sit flush left;
     !important lets this rule win over other margin rules on the page. */
  table {margin-left: 0 !important;}
</style>

# 0. Overview

Common Vulnerabilities and Exposures (CVE) is a list of computer security threats provided by the U.S. Department of Homeland Security and maintained by the MITRE corporation. 

Per MITRE's terminology documentation, CVE distinguishes between vulnerabilities, where

> A "vulnerability" is a weakness in the computational logic (e.g., code) found in software and some hardware components (e.g., firmware) that, when exploited, results in a negative impact to confidentiality, integrity, OR availability. Mitigation of the vulnerabilities in this context typically involves coding changes, but could also include specification changes or even specification deprecations (e.g., removal of affected protocols or functionality in their entirety).

and exposures, where

> An "exposure" is a system configuration issue or a mistake in software that allows access to information or capabilities that can be used by a hacker as a stepping-stone into a system or network.

CVE considers a configuration issue or a mistake an exposure if it does not directly allow compromise but could be an important component of a successful attack, and is a violation of a reasonable security policy.

In [None]:
import numpy as np
import pandas as pd 
import scipy.optimize as opt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import timedelta
import scipy.stats as sps

# Read the three CSV tables of the dataset. Each file has a header row and
# its first column is used as the row index.
cve, products, vendors = [
    pd.read_csv(csv_path, header=0, index_col=0)
    for csv_path in (
        '../input/cve-common-vulnerabilities-and-exposures/cve.csv',
        '../input/cve-common-vulnerabilities-and-exposures/products.csv',
        '../input/cve-common-vulnerabilities-and-exposures/vendors.csv',
    )
]

# Parse publication dates so the `.dt` accessor can be used on them later.
cve['pub_date'] = pd.to_datetime(cve['pub_date'])

| attribute | dtype  | description 
| :-- | :-- | :--
| mod_date | datetime | The date the entry was last modified 
| pub_date | datetime | The date the entry was published
| cvss | float | Common Vulnerability Scoring System (CVSS) score, a measure of the severity of a vulnerability 
| cwe_code | categorical | Common Weakness Enumeration (CWE) code, identifying the type of weakness
| cwe_name | categorical | The name associated with the CWE code
| summary | str | A text summary of the vulnerability
| access_authentication | categorical | {NONE, SINGLE, MULTIPLE}
| access_complexity | categorical | {LOW, MEDIUM, HIGH}
| access_vector | categorical | {LOCAL, NETWORK, ADJACENT_NETWORK}
| impact_availability | categorical | {NONE, PARTIAL, COMPLETE}
| impact_confidentiality | categorical | {NONE, PARTIAL, COMPLETE}
| impact_integrity | categorical | {NONE, PARTIAL, COMPLETE}

# 1. Features
## 1.1 Publication Date

In [None]:
def cumulative_counts(dates, freq):
    """Return the running total of entries per calendar period.

    Parameters
    ----------
    dates : pandas.Series
        Datetime values, one per entry.
    freq : str
        Period alias used to bucket the dates (e.g. 'Q' or 'M').

    Returns
    -------
    pandas.Series
        Cumulative number of entries, in chronological order, indexed by the
        start timestamp of each period that contains at least one entry.
    """
    counts = dates.dt.to_period(freq).value_counts().sort_index()
    counts.index = counts.index.to_timestamp()
    return counts.cumsum()


# Number of trailing quarters drawn in the accent colour (20 quarters = 5 years).
HIGHLIGHT_QUARTERS = 20

# Cumulative number of published entries per quarter.
X = cumulative_counts(cve.pub_date, 'Q')

# Month-over-month percent change of the 12-month rolling sum of the
# cumulative monthly count. A rolling sum and a rolling mean differ only by
# the constant factor 12, so their percent changes are identical.
rolling = cumulative_counts(cve.pub_date, 'M').rolling(12).sum().pct_change()

# Two panels: cumulative count (left) and rolling growth rate (right).
fig = make_subplots(rows=1, cols=2)

# Trace 1: older quarters as grey bars
fig.add_trace(
    go.Bar(
        x=X.index[:-HIGHLIGHT_QUARTERS],
        y=X.values[:-HIGHLIGHT_QUARTERS],
        marker_color="#bbbbbb",
    ),
    row=1,
    col=1
)

# Trace 2: most recent quarters as teal (emphasised) bars
fig.add_trace(
    go.Bar(
        x=X.index[-HIGHLIGHT_QUARTERS:],
        y=X.values[-HIGHLIGHT_QUARTERS:],
        marker_color="#2aa198"
    ),
    row=1,
    col=1
)

# Trace 3: growth rate
fig.add_trace(
    go.Scatter(
        x=rolling.index,
        y=rolling.values,
        marker_color="#bbbbbb",
        mode="lines"
    ),
    row=1,
    col=2
)

fig.update_layout(
    title=dict(
        text="Threat Proliferation",
        xref="paper",
        x=0., y=1.
    ),
    font=dict(
        family="Arial",
        size=14,
        color="#586e75"
    ),
    annotations=[
        # Caption placed above the plotting area (paper coordinates).
        dict(
            xref='paper',
            yref='paper',
            x=0., y=1.2,
            showarrow=False,
            text='The number of known threats (left) continues to grow, but '
                 'growth (right), measured as a percent change <br>on a 12-month rolling average, has leveled.',
            valign='top',
            align='left'
        ),
        # Arrow pointing at the first highlighted quarter of the left panel.
        # NOTE(review): the 60% / 2015 figures and y=40000 are hard-coded,
        # not derived from X -- confirm them against the loaded data.
        dict(
            ax=-80,
            ay=-100,
            x=X.index[-HIGHLIGHT_QUARTERS],
            y=40000,
            text='60% of threats occur after 2015'
        )
    ],
    showlegend=False,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    bargap=0
)

# Hide grid lines on both panels (layout.xaxis / layout.yaxis alone would only
# affect the left subplot).
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.show()

## 1.2 Severity

The Common Vulnerability Scoring System (CVSS) is an open framework, developed and maintained by FIRST,
for describing the characteristics and severity of computer security exploits. These scores consider
exploitability and impact alongside temporal and environmental factors. Scores range from 0 to 10.

![CVSS Subscore Components](https://i.imgur.com/IfyiFzc.png)

In [None]:
def cvss_bucket_label(floor_score):
    """Return the axis label for the bucket of scores whose integer part is `floor_score`.

    Casting a score to int truncates it, so bucket k holds scores in
    [k, k + 1); it is labelled "k.0-k.9". The top bucket is labelled "10.0".
    """
    if floor_score >= 10:
        return "10.0"
    return "{0}.0-{0}.9".format(floor_score)


fig = go.Figure()

# Count entries per integer-truncated score. Missing scores are dropped first
# because NaN cannot be cast to int. The lowest bucket present is then
# discarded by position (presumably the 0.x bucket -- TODO confirm).
X = cve.cvss.dropna().astype('int').value_counts().sort_index().iloc[1:]

# Share of each bucket in percent, relative to the buckets that are shown.
share = X.values / np.sum(X.values) * 100

# Single bar trace.
# NOTE(review): colours are assigned by bar position. If the buckets are 1..10
# the red bars are 4.0-7.9, which is wider than the "Medium (4.0-6.9)" range
# quoted in the caption -- confirm the highlight and the 75 percent figure.
fig.add_trace(
    go.Bar(
        x=[cvss_bucket_label(bucket) for bucket in X.index],
        y=share,
        marker_color=['#bbbbbb', '#bbbbbb', '#bbbbbb', '#dc322f', '#dc322f', '#dc322f', '#dc322f', '#bbbbbb', '#bbbbbb', '#bbbbbb'],
        text=["{}%".format(pct) for pct in np.round(share, 1)],
        textposition='outside'
))

fig.update_layout(
    title=dict(
        text="Threat Severity Distribution",
        xref="paper",
        x=0., y=1.
    ),
    font=dict(
        family="Arial",
        size=14,
        color="#586e75"
    ),
    xaxis=dict(
        showgrid=False,
    ),
    yaxis=dict(
        showgrid=False,
        showticklabels=False
    ),
    annotations=[
        # Caption placed above the plotting area (paper coordinates).
        dict(
            xref='paper',
            yref='paper',
            x=0., y=1.2,
            showarrow=False,
            text="CVSS scores reflect a threat's severity. Over 75 percent of scores fall in FIRST's Medium (4.0-6.9) threat category<br>" +
            "with a thicker tail toward the higher end of the spectrum.",
            valign='top',
            align='left'
        ),
    ],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    bargap=0
)

fig.show()

In [None]:
def yearly_share(column):
    """Return the per-year share of each category of ``cve[column]``.

    Rows with a missing value in `column` are dropped, the remaining values
    are one-hot encoded, and the indicator columns are averaged per
    publication year.

    Parameters
    ----------
    column : str
        Name of a categorical column of the `cve` frame.

    Returns
    -------
    pandas.DataFrame
        One row per publication year (yearly PeriodIndex) and one column per
        category; each cell is the fraction of that year's entries in the
        category.
    """
    publication_year = cve.pub_date.dt.to_period('Y')
    return pd.get_dummies(cve[column].dropna()).groupby(publication_year).mean()


# One aggregation per attribute; the individual series are picked out below.
authentication = yearly_share('access_authentication')
complexity = yearly_share('access_complexity')
vector = yearly_share('access_vector')
availability = yearly_share('impact_availability')
confidentiality = yearly_share('impact_confidentiality')
integrity = yearly_share('impact_integrity')

# Authentication (the MULTIPLE category is not plotted)
single, none = authentication['SINGLE'], authentication['NONE']

# Access complexity
low, med, high = complexity['LOW'], complexity['MEDIUM'], complexity['HIGH']

# Access vector
net, loc, adj = vector['NETWORK'], vector['LOCAL'], vector['ADJACENT_NETWORK']

# Impact on availability / confidentiality / integrity
part, no_ia, comp = availability['PARTIAL'], availability['NONE'], availability['COMPLETE']
part_ic, no_ic, comp_ic = confidentiality['PARTIAL'], confidentiality['NONE'], confidentiality['COMPLETE']
part_ii, no_ii, comp_ii = integrity['PARTIAL'], integrity['NONE'], integrity['COMPLETE']

# One inner list per subplot; `texts` holds the matching end-of-line labels.
traces = [
    [single, none],
    [low, med, high],
    [net, loc, adj],
    [part, no_ia, comp],
    [part_ic, no_ic, comp_ic],
    [part_ii, no_ii, comp_ii],
]

texts = [
    ['Single', 'None'],
    ['Low', 'Medium', 'High'],
    ['Network', 'Local', 'Adj. Network'],
    ['Partial', 'None', 'Complete'],
    ['Partial', 'None', 'Complete'],
    ['Partial', 'None', 'Complete'],
]

colors = ['#2aa198', '#268bd2', '#dc322f']

# Titles are consumed row by row, whereas the loop below fills the grid
# column by column -- hence the interleaved order.
fig = make_subplots(
    rows=3,
    cols=2,
    subplot_titles=[
        'Authentication required',
        'Impact on availability',
        'Access required',
        'Impact on confidentiality',
        'Attack vector',
        'Impact on integrity'
])

for i, (t, txt) in enumerate(zip(traces, texts)):
    # Groups 0-2 go down the left column, groups 3-5 down the right column.
    row, col = i % 3 + 1, i // 3 + 1
    for ndx, trace in enumerate(t):
        stamps = trace.index.to_timestamp()
        # Positional access: the index holds periods, not integers.
        first_share, last_share = trace.iloc[0], trace.iloc[-1]

        # Text label to the right of the last point: share and category name.
        fig.add_trace(
            go.Scatter(
                x=[stamps[-1]],
                y=[last_share],
                name="",
                text=" {}% {}".format(np.round(last_share * 100, 1), txt[ndx]),
                textposition='middle right',
                mode='text',
                cliponaxis=False
            ),
            row=row,
            col=col,
        )

        # The yearly series itself.
        fig.add_trace(
            go.Scatter(
                x=stamps,
                y=trace,
                name="",
                line=dict(color=colors[ndx])
            ),
            row=row,
            col=col
        )

        # Text label to the left of the first point, rounded to one decimal
        # like the end label.
        fig.add_trace(
            go.Scatter(
                x=[stamps[0]],
                y=[first_share],
                name="",
                text="{}% ".format(np.round(first_share * 100, 1)),
                textposition='middle left',
                mode='text',
                cliponaxis=False
            ),
            row=row,
            col=col
        )

fig.update_layout(
    showlegend=False,
    height=1200,
    title=dict(
        text="Access and impact",
    ),
    font=dict(
        family="Arial",
        size=12,
        color="#586e75"
    ),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)'
)

# Apply the axis styling to all six subplots at once.
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False, showticklabels=False)

fig.show()

## 1.3 Exploit Types

In [None]:
fig = go.Figure()

# Daily counts of the ten most frequent weakness names: one-hot encode the
# matching rows, then sum the indicators per publication day. Only the
# indicator columns are aggregated; a datetime column cannot be summed.
top_cwe_names = cve.cwe_name.value_counts().index[:10]
X = pd.get_dummies(
        cve.cwe_name[cve.cwe_name.isin(top_cwe_names)]
    ).groupby(
        cve.pub_date.dt.to_period("D")
).sum()

colors = ['#ababab', '#cb4b16', '#268bd2', '#ebebeb', '#2aa198', '#dc322f', '#bbbbbb', '#9b9b9b', '#cbcbcb', '#dbdbdb']
X.index = X.index.to_timestamp()

# Convert each day's counts to relative frequencies among the ten names.
X = X.divide(X.sum(axis=1), axis=0)

# One trace for each column
for ndx, column in enumerate(X.columns):
    # Rolling mean over 365 rows, i.e. 365 days that have at least one entry.
    data = X[column].rolling(365).mean()
    fig.add_trace(go.Scatter(
        # Skip the warm-up rows at the start of the rolling window.
        x=data.index[365:],
        y=data.values[365:],
        # When the name contains a quoted short form, show only that part.
        name=column if "'" not in column else column.split("'")[1],
        marker_color=colors[ndx]
    ))

fig.update_layout(
    title=dict(
        text="How threats have changed over time",
        xref="paper",
        x=0., y=1.
    ),
    height=1100,
    font=dict(
        family="Arial",
        size=14,
        color="#586e75"
    ),
    xaxis=dict(
        showgrid=False,
    ),
    yaxis=dict(
        showgrid=False,
        showticklabels=False
    ),
    annotations=[
        # Caption placed above the plotting area (paper coordinates).
        dict(
            xref='paper',
            yref='paper',
            x=0., y=1.075,
            showarrow=False,
            text="These threats are the 10 most common, but their relative prominence is shifting: " +
            "injection (both code and SQL)<br> is becoming less common while cross-site scripting and input validation are on the rise. " +
            "Values are shown as <br> a 365-entry rolling average of relative frequencies.",
            valign='top',
            align='left'
        ),
    ],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    legend=dict(x=0., y=1.)
)

fig.show()

## 1.4 Products and Vendors

In [None]:
def product_label(raw_name):
    """Turn an underscore-separated product id into a display label.

    Words longer than two characters are title-cased; shorter ones are
    upper-cased.
    """
    words = raw_name.split("_")
    return " ".join(word.title() if len(word) > 2 else word.upper() for word in words)


# Products by number of occurrences, taken from position 25 down to position 1
# so that the more frequent products are plotted last.
# NOTE(review): this selection covers ranks 2-26 and leaves out the most
# frequent product (position 0), while the caption says "top 25". The colour
# runs below appear to be matched bar-by-bar to exactly this selection, so the
# slice is kept; re-derive the colours if it is changed.
X = products.vulnerable_product.value_counts().iloc[25:0:-1]

# Bar colours as (colour, run length) pairs, in plotting order.
blue, green, grey = "#268bd2", "#859900", "#bbbbbb"
color_runs = [
    (blue, 3), (grey, 1), (blue, 5), (green, 1), (blue, 1), (green, 1),
    (blue, 2), (grey, 2), (blue, 2), (green, 2), (blue, 5),
]
bar_colors = [shade for shade, run_length in color_runs for _ in range(run_length)]

fig = go.Figure()
fig.add_trace(go.Bar(
    y=[product_label(product) for product in X.index],
    x=X.values,
    orientation='h',
    marker_color=bar_colors
))

caption = dict(
    xref='paper',
    yref='paper',
    x=0., y=1.075,
    showarrow=False,
    text="Most of the top 25 affected products are operating systems (blue) or web browsers (green)",
    valign='top',
    align='left'
)

fig.update_layout(
    height=800,
    title=dict(xref='paper', text="Affected Products", x=0, y=.965),
    font=dict(family="Arial", size=14, color="#586e75"),
    xaxis=dict(showgrid=False),
    # Linear tick mode so every bar gets its own label.
    yaxis=dict(showgrid=False, tickmode="linear"),
    annotations=[caption],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    bargap=.2
)

fig.show()

In [None]:
def vendor_label(raw_name):
    """Turn an underscore-separated vendor id into a display label.

    Words longer than three characters are title-cased; shorter ones are
    upper-cased.
    """
    words = raw_name.split("_")
    return " ".join(word.title() if len(word) > 3 else word.upper() for word in words)


# The 25 most frequent vendors, reversed so the most frequent one is plotted
# last. (A slice of [25:0:-1] would skip the top vendor and include the 26th.)
X = vendors.vendor.value_counts().iloc[:25].iloc[::-1]

fig = go.Figure()
fig.add_trace(go.Bar(
    y=[vendor_label(vendor) for vendor in X.index],
    x=X.values,
    orientation='h',
    marker_color="#bbbbbb"
))

fig.update_layout(
    height=800,
    title=dict(
        xref='paper',
        text="Affected Vendors",
        x=0, y=.965
    ),
    font=dict(
        family="Arial",
        size=14,
        color="#586e75"
    ),
    xaxis=dict(
        showgrid=False
    ),
    # Linear tick mode so every bar gets its own label.
    yaxis=dict(
        showgrid=False,
        tickmode="linear"
    ),
    annotations=[
        # Caption placed above the plotting area (paper coordinates).
        # NOTE(review): the 40% figure is hard-coded -- confirm it against
        # the loaded data.
        dict(
            xref='paper',
            yref='paper',
            x=0., y=1.075,
            showarrow=False,
            text="40% of the products affected by any vulnerability are distributed by these top 25 vendors",
            valign='top',
            align='left'
        ),
    ],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    bargap=.2
)

fig.show()