# Ramp-up Analysis

This notebook performs a ramp-up analysis of the dataset.
A ramp-up is the number of login requests per time slot, tracked
over time. We expect to see an increasing number of IPs, as well
as a number of requests that grows proportionally to the number of IPs.

## TODO:

Find literature that supports the ramp-up expectation.

In [None]:
from datetime import datetime, timezone

APP_NAME = "Ramp-up Analysis"

# Resampling window sizes used to group the log data. Every entry is a
# pandas offset alias: the number of minutes followed by "T".
deltas = [f"{minutes}T" for minutes in (5, 10, 15, 20, 30, 60, 120)]

# Protocols to include in the plots (e.g. ['ssh', 'telnet']).
protocols = ["ssh"]

# Network categories to include in the plots
# (e.g. ['cloud', 'campus', 'residential']).
network_types = ["campus"]

# Upper bound of the plotted time range. The bound is exclusive: only
# data recorded strictly before this moment is plotted.
until = datetime(year=2021, month=5, day=18, tzinfo=timezone.utc)


In [None]:
%load configuration.py

In [None]:
%load setup-pd.py

In [None]:
%load data.py

In [None]:
# Generate keys for honeypots
# ================

# Build one (category, origin, origin_id) key per honeypot and category.
# The keys identify the per-honeypot log subsets that are later
# resampled and counted per time slot.

keys = [
    (cat, h[0], h[1])
    for h in honeypots
    for cat in categories
]

    


In [None]:
# Generate Dataframes per honeypot
# ================

# Select the log entries of every (category, origin, origin_id) key once
# and keep only the "timestamp" and "ip" columns, so later cells can
# look the subset up directly instead of filtering the full log again.

dfs = {
    key: df_log[
        (df_log.category == key[0])
        & (df_log.origin == key[1])
        & (df_log.origin_id == key[2])
    ][["timestamp", "ip"]]
    for key in keys
}


In [None]:
# Create time series data of the ramp up.
# ================

# For every resampling window in `deltas`, count the log entries per time
# slot of each honeypot and collect the counts in one DataFrame per
# (delta, category, origin). Every honeypot contributes one column named
# "<origin> <origin_id>".
#
# Result layout: df_timeseries[delta][category][origin] -> DataFrame

df_timeseries = {}

for d in deltas:
    df_timeseries[d] = {}

    for cat in categories:
        df_timeseries[d][cat] = {}

        for h in honeypots:
            org = h[0]
            oid = h[1]

            # count() returns the number of non-null values per time slot,
            # so the 'ip' column holds the number of rows with an IP.
            df = dfs[(cat, org, oid)].resample(d, label='left', on='timestamp').count()

            # Name the column after the honeypot before joining. rename()
            # returns a new frame, so the resample result is not mutated.
            df = df[['ip']].rename(columns={'ip': f"{org} {oid}"})

            stored = df_timeseries[d][cat].get(org)

            if stored is None:
                stored = df
            else:
                # Outer join keeps the time slots of every honeypot. The
                # default left join would drop all slots that are missing
                # from the first honeypot's index. Slots a honeypot has no
                # data for show up as NaN.
                stored = stored.join(df, how='outer')

            df_timeseries[d][cat][org] = stored
            

In [None]:

from datetime import datetime, timezone


# Plot the ramp-up
# ================

# Draw one line chart per resampling window, restricted to the entries
# listed in `protocols` and `network_types`.

for delta, by_category in df_timeseries.items():
    for category, by_origin in by_category.items():

        # Skip categories that are not selected in `protocols`.
        if category not in protocols:
            continue

        for origin, df in by_origin.items():

            # Skip origins that are not selected in `network_types`.
            if origin not in network_types:
                continue

            # Keep only the time slots strictly before the cut-off.
            df = df[df.index < until]

            # The delta alias without its "T" is shown as the slot length
            # in minutes.
            plot_options = {
                "figsize": (15, 12),
                "title": f"{category.upper()} in {origin} networks",
                "xlabel": f"Time Slots ({delta.replace('T', '')} Minutes)",
                "ylabel": "# Requests",
            }

            # All columns share one chart; add subplots=True to get one
            # panel per column instead.
            df.plot.line(**plot_options)