In [None]:
import pandas as pd
import numpy as np
import seaborn as sns # import for plot styles
import matplotlib.pyplot as plt
%matplotlib inline

import authlog

First, we load all the logs we can find, keep only the `sshd` entries, and search them for failed login attempts together with the user name that was tried.
We also look for disconnect messages, which show the IP address the attack came from.

In [None]:
import config

In [None]:
# Load every log found under the configured directory and keep only the sshd
# entries.  The explicit .copy() makes the filtered frame independent of the
# loaded one, so the column assignments below never write into a slice of
# another frame (the situation pandas reports as SettingWithCopyWarning).
df = authlog.load_all_logs(config.dataDirLogs)
df = df[df.program == 'sshd'].copy()

# Two extractors for the user name that was tried; combine_first keeps the
# value from the first one and falls back to the second where it is missing.
probedUsers = authlog.extract_probed_users(df)
failedPasswordUsers = authlog.extract_failed_password_users(df)
df['probed user'] = probedUsers.combine_first(failedPasswordUsers)

# Source IP address as reported by the disconnect messages.
df['disconnect ip'] = authlog.extract_disconnect_ip(df)

In [None]:
# Show the sshd entries, now including the 'probed user' and 'disconnect ip' columns.
df

As you can see in the table (if you have matching attacks in your logs), we now have some attempted user names and the corresponding IP addresses.
They are on different lines; however, they share the same *action* number.
We will now join the table with itself on the action number to get each probed user name together with its originating IP address.
Since the action number repeats after a while, we must discard pairs whose two timestamps are too far apart (a minute or more).

In [None]:
# Action numbers are reused after a while, so a user line and a disconnect
# line are only paired when they are less than this far apart.
MAX_PAIR_GAP = pd.to_timedelta('1 min')

# Drop incomplete rows on BOTH sides before joining.  Lines without a user
# name or without an IP can never survive the join, so merging them first
# would only inflate the intermediate result with rows that get discarded.
user_lines = df[['date', 'action', 'probed user']].dropna()
ip_lines = df[['date', 'action', 'disconnect ip']].dropna()

# Self-join on the action number; the shared 'date' column comes back as
# date_x (user line) and date_y (disconnect line).
merged = pd.merge(user_lines, ip_lines, on='action')
merged['tdiff'] = merged.date_y - merged.date_x

# Keep the close pairs only, name the columns by label rather than by
# position, and index the result chronologically by the user line's date.
merged = (
    merged.loc[merged.tdiff.abs() < MAX_PAIR_GAP,
               ['date_x', 'probed user', 'disconnect ip']]
    .rename(columns={'date_x': 'date', 'disconnect ip': 'ip'})
    .set_index('date')
    .sort_index()
)

In [None]:
# Matched attempts: indexed by date, with the probed user name and the source IP.
merged

In [None]:
def _distinct_count(values):
    """Return how many different values occur in one resampling window."""
    return len(values.unique())


# Per 5-minute window: total number of attempts and number of distinct IPs.
window_stats = {'attempts': 'count', 'unique ips': _distinct_count}
merged.ip.resample('5min').agg(window_stats).plot(figsize=[11,6])

In [None]:
# Number of matched attempts per IP address (value_counts lists the highest count first).
merged.ip.value_counts()

In [None]:
# Attempts per source IP.  With a minimum of 0 every IP passes the filter;
# raise the bound to restrict the later plots to the busiest addresses.
attempts_per_ip = merged.ip.value_counts()
frequentIps = attempts_per_ip[attempts_per_ip >= 0].index

In [None]:
# Attempts from the selected IPs only, with the date index turned back into
# an ordinary column and the IP stored as a categorical.
from_selected_ips = merged.ip.isin(frequentIps)
df2 = (
    merged.loc[from_selected_ips, ['ip']]
    .reset_index()
    .assign(ip=lambda frame: frame.ip.astype('category'), dummy=0)
)
df2


In [None]:
# Timeline of attempts: one row per IP address, one marker per attempt.
fig, ax = plt.subplots(figsize=(8, 14))
sns.stripplot(data=df2, x='date', y='ip', order=frequentIps, size=3, palette='deep', ax=ax)
fig.autofmt_xdate()
# Clamp the time axis to the span actually covered by the data.
ax.set_xlim(df2.date.min(), df2.date.max())