In [1]:
# Application name handed to SparkSession.builder.appName() when the Spark session is created.
APP_NAME = "Dataplane Ramp-up Analysis per Honeypot"

In [2]:
# %load configuration.py
import os

# Connection settings for the PostgreSQL database and the Spark cluster.
#
# Every entry can be overridden through the environment variable named next to
# it, so credentials and host addresses do not have to be stored in the
# notebook. The literal values are kept only as backward-compatible fallbacks.
#
# NOTE(review): the fallback password below has been stored in clear text;
# rotate it and remove the default once HONEYPOT_PG_PASSWORD is set everywhere.
configuration = {
    "pg_url": os.environ.get("HONEYPOT_PG_URL", "jdbc:postgresql://localhost:5432/honeypot"),
    "pg_user": os.environ.get("HONEYPOT_PG_USER", "max"),
    "pg_password": os.environ.get("HONEYPOT_PG_PASSWORD", "kM9ZhBOBFIl"),
    "spark_host": os.environ.get("HONEYPOT_SPARK_HOST", "10.10.10.80"),
    "spark_uri": os.environ.get("HONEYPOT_SPARK_URI", "spark://10.10.10.80:7077"),
}


In [3]:
# %load setup.py

import findspark
findspark.init('/opt/spark');

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn

import pandas as pd

# Create (or reuse) the Spark session for this notebook. The PostgreSQL JDBC
# driver jar is registered so the JDBC reads further down can use it.
spark = (
    SparkSession.builder
    .master(configuration['spark_uri'])
    .appName(APP_NAME)
    .config('spark.driver.host', configuration['spark_host'])
    .config('spark.jars', 'postgresql-42.2.20.jar')
    .getOrCreate()
)


# JDBC options shared by every table read.
options = dict(
    url=configuration['pg_url'],
    user=configuration['pg_user'],
    password=configuration["pg_password"],
    driver="org.postgresql.Driver",
)

# Load the three source tables over JDBC, in the order log, dataplane, ip.
df_log, df_dp, df_ip = (
    spark.read.format("jdbc").options(dbtable=table_name, **options).load()
    for table_name in ("log", "dataplane", "ip")
)



In [4]:
# %load data.py


from dateutil import parser

# Calendar days of the capture, parsed into datetime objects. The number in
# each trailing comment is the day's index inside `dates`.
dates = list(map(parser.parse, (
    "2021-05-17",  #  0: Monday, first day of logging
    "2021-05-18",  #  1: Tuesday
    "2021-05-19",  #  2: Wednesday
    "2021-05-20",  #  3: Thursday
    "2021-05-21",  #  4: Friday
    "2021-05-22",  #  5: Saturday
    "2021-05-23",  #  6: Sunday
    "2021-05-24",  #  7: Monday (Whit Monday)
    "2021-05-25",  #  8: Tuesday
    "2021-05-26",  #  9: Wednesday
    "2021-05-27",  # 10: Thursday
    "2021-05-28",  # 11: Friday
    "2021-05-29",  # 12: Saturday
    "2021-05-30",  # 13: Sunday
    "2021-05-31",  # 14: Monday, first day out of logging
)))

# Get the date filters per week day.
# Consider that each filter consists of a tuple (lower & upper bound)

def get_i_weekday(day: int):
    """Return the two indices of the given weekday: one per 7-day block.

    `day` is reduced modulo 7, so values outside 0..6 wrap around.
    """
    first_week_index = day % 7
    return [first_week_index + week_offset for week_offset in (0, 7)]

def get_name_weekday(day: int) -> str:
    """Map a weekday number (0 = Monday ... 6 = Sunday) to its English name.

    Raises:
        ValueError: if `day` is not one of 0..6.
    """
    weekday_names = {
        0: "Monday",
        1: "Tuesday",
        2: "Wednesday",
        3: "Thursday",
        4: "Friday",
        5: "Saturday",
        6: "Sunday",
    }
    name = weekday_names.get(day)
    if name is None:
        raise ValueError(f"Invalid day of the week {day}")
    return name
    
    
# Honeypots as (network type, honeypot id) pairs, grouped by network type.
honeypots = [
    (network, honeypot_id)
    for network, honeypot_ids in (
        ('campus', (1, 2, 3)),
        ('residential', (6, 7, 8, 9)),
        ('cloud', (12, 13, 14, 15)),
    )
    for honeypot_id in honeypot_ids
]

# Log categories looked at in this notebook.
categories = ['ssh', 'telnet']

# Creation of dataframes

The following section first generates the time filters that are applied to the
log data. Afterwards, the dataframes are created. The dataframes are broken
down per honeypot, by distinct IP addresses and by total number of requests.
In addition, the data is represented both cumulatively (from the start of
logging up to each slot) and partially (per individual slot). Only data based
on SSH and Telnet requests is shown.

In [5]:
# Generate all Filters
# ================

# First define the time deltas (in minutes) which you would like to display
# at the end of this notebook.

#deltas = [3, 5, 10, 15, 30, 60]
deltas = [60, 120]

# The remaining code generates the filters based on the deltas given above.

from datetime import timedelta

# Convert the minute counts into timedelta objects.
deltas = [timedelta(minutes=d) for d in deltas]

# Analysed period: log_start is inclusive, log_end is exclusive.
log_start = dates[1]
log_end = dates[5]

# Partial filters: one (lower, upper) window per slot, keyed by time delta.
filters_par = {}
# Cumulative filters: windows from log_start up to the end of each slot,
# keyed by time delta.
filters_cum = {}

# Timestamp format used for the bounds (they are embedded in SQL filter strings).
fmt = '%Y-%m-%d %H:%M:%S'

for d in deltas:
    if d <= timedelta(0):
        # A non-positive step would never reach log_end.
        raise ValueError(f"Time delta must be positive, got {d}")

    filter_par = []
    filter_cum = []

    lower = log_start
    while lower < log_end:
        # Clamp the last slot so no window reaches past log_end.
        upper = min(lower + d, log_end)
        filter_par.append(
            (format(lower, fmt), format(upper, fmt))
        )
        # One cumulative window per partial slot, including the final one that
        # covers the whole period (log_start, log_end).
        filter_cum.append(
            (format(log_start, fmt), format(upper, fmt))
        )
        lower = upper

    filters_par[d] = filter_par
    filters_cum[d] = filter_cum

In [6]:
# Generate dataframes per honeypot and category
# ================
# This will speedup the selection process of the next steps.

# One key per (category, network type, honeypot id); categories vary fastest.
keys = [
    (category, origin, origin_id)
    for origin, origin_id in honeypots
    for category in categories
]

# Column projection and time-window predicate shared by every per-key dataframe.
log_columns = df_log.select('id', 'category', 'origin', 'origin_id', 'timestamp', 'ip')
in_log_window = f"'{format(log_start, fmt)}' <= timestamp and timestamp < '{format(log_end, fmt)}'"

dfs = {}
for category, origin, origin_id in keys:
    dfs[(category, origin, origin_id)] = (
        log_columns
        .filter(f"category == '{category}'")
        .filter(f"origin == '{origin}'")
        .filter(f"origin_id == '{origin_id}'")
        .filter(in_log_window)
    )
    

In [7]:

# TODO: Rewrite this part...
# Dataframes with cumulative & partial request counts.
# The outermost container of each dict is keyed by the timedelta.
# Each value is one Spark dataframe holding, per time slot, the number of log
# rows for every (category, origin, origin_id) combination.


def _count_requests_per_slot(log_df, slots, label_index):
    """Count log rows per (category, origin, origin_id) for every time slot.

    Args:
        log_df: Spark dataframe with the columns timestamp, category, origin
            and origin_id.
        slots: sequence of (lower, upper) timestamp strings; a row belongs to
            a slot when lower <= timestamp < upper.
        label_index: which bound is written to the `timestamp` column of the
            result: 0 for the lower bound, 1 for the upper bound.

    Returns:
        The union of all per-slot counts with the columns timestamp, category,
        origin, origin_id and count, ordered by the first four columns.

    Raises:
        ValueError: if `slots` is empty.
    """
    combined = None
    for slot in slots:
        slot_counts = (
            log_df
            # Filter first, while the timestamp column is still part of the frame.
            .filter(f"'{slot[0]}' <= timestamp AND timestamp < '{slot[1]}'")
            .groupBy('category', 'origin', 'origin_id')
            .count()
            .withColumn('timestamp', fn.lit(slot[label_index]))
            .select('timestamp', 'category', 'origin', 'origin_id', 'count')
        )
        combined = slot_counts if combined is None else combined.union(slot_counts)

    if combined is None:
        raise ValueError("At least one time slot is required")

    # Sort once after the union instead of once per slot.
    return combined.orderBy('timestamp', 'category', 'origin', 'origin_id')


df_req_par = {}
df_req_cum = {}

for d in deltas:
    # Partial slots are labelled with the start of the slot.
    df_req_par[d] = _count_requests_per_slot(df_log, filters_par[d], label_index=0)
    print(f'Finished partial evaluation on time delta {str(d)}')

    # Every cumulative window starts at the same lower bound, so the rows are
    # labelled with the upper bound; otherwise the slots could not be told
    # apart after the union.
    df_req_cum[d] = _count_requests_per_slot(df_log, filters_cum[d], label_index=1)
    print(f'Finished cumulative evaluation on time delta {str(d)}')


# count() is a Spark action: this is where the queries built above are executed.
for d in deltas:
    print(f"df_req_par[{d}]: # {df_req_par[d].count()}")
    print(f"df_req_cum[{d}]: # {df_req_cum[d].count()}")

Finished partial evaluation on time delta 1:00:00
Finished cumulative evaluation on time delta 1:00:00
Finished partial evaluation on time delta 2:00:00
Finished cumulative evaluation on time delta 2:00:00


KeyboardInterrupt: 

In [None]:
# Collect the per-delta Spark results into pandas dataframes (partial first,
# then cumulative, for each time delta).
pdf_req_par, pdf_req_cum = {}, {}

for delta in deltas:
    for spark_frames, pandas_frames in (
        (df_req_par, pdf_req_par),
        (df_req_cum, pdf_req_cum),
    ):
        pandas_frames[delta] = spark_frames[delta].toPandas()
    

In [None]:
from matplotlib import pyplot as plt