In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import cufflinks as cf
import plotly
import matplotlib.pyplot as plt
import nltk

In [None]:
# Load the honeypot event export into `df`. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, so each column gets a single inferred dtype.
csv_path = "Actual-Data/stingar_full-20190520.csv"
df = pd.read_csv(csv_path, low_memory=False)

In [None]:
# Print column names, dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# Preview the first five rows of the loaded frame.
df.head()

Dictionary
========

The data has the following fields:

1. **app** - Honeypot type. Can be cowrie, amun, glastopf, dionaea, conpot or rdphoney. Different honeypots capture different attacks/traffic. For example, Cowrie taps into SSH/Telnet connections.
2. **command** - Commands run by the attacker if they are able to login. It's a string. Typically a location the attacker is trying to pull their attack tools from. Can be a personal workspace.
3. **data** - Base-64 encoded data sent by attacker. **Field exists only for rdphoney**. 
4. **dest_ip** - IP of the docker/honeypot receiving the incoming connection. Not really useful. Marked for deletion, but note that the column is still present in (and read by) the analysis cells below.
5. **dest_port** - Port for the incoming connection as translated by the docker. Original port value unknown as it depends on the internal mapping mechanism of the docker.
6. **dionaea_action** - Action taken by dionaea honeypot. Can be accept, reject, connect or listen.
7. **direction** - inbound only. **Redundant column**. **DELETED**
8. **ids_type** - **Redundant column.** Only IDS type is network. **DELETED**
9. **protocol** - **Redundant column.** As only protocol used is IP. **DELETED**
10. **request_url** - URL requested by the attacker. **Available only for the glastopf honeypot.**
11. **sensor** - UUID of honeypot. Unique ID of a honeypot.
12. **severity** - **Redundant column**. Only value is high. **DELETED**
13. **signature** - Honeypot's interpretation of the attack based on its configuration, different for different honeypots. Exploration tells me it is the type of attack.
14. **src_ip** - IP of the attacker. 
15. **src_port** - Port used by the attacker.
16. **ssh_password** - passwords tried by attacker to get in. List of tuples.
17. **ssh_username** - usernames used by attacker to gain access.
18. **ssh_version** - version string passed by attacker while trying to gain access.
19. **tags** - arbitrary info/values that an attacker is allowed to send and that will be accepted/stored as tags by the sensor.
20. **timestamp** - UTC timestamp of the request. 
21. **transport** - **Redundant column**. Single value. **DELETED**
22. **type** - type of service used by the attacker.
23. **url** - URL requested by the attacker. Convenience field extracted from the 'command' field.
24. **username** - **Redundant field.** **DELETED**
25. **vendor_product** - same as app. **Redundant field**. **DELETED**
26. **d_time** - pulled from timestamp. Local time.
27. **d_time_window** - 5 minutes margin from d_time. 

	

In [None]:
# Remove the columns that the data dictionary above marks as redundant.
redundant_columns = [
    "direction",
    "ids_type",
    "protocol",
    "severity",
    "vendor_product",
    "username",
]
df.drop(columns=redundant_columns, inplace=True)
                  

In [None]:
# "Unnamed: 0" is presumably a leftover index column from an earlier CSV export
# (TODO confirm); it is removed here and the result is previewed.
df.drop(columns="Unnamed: 0", inplace=True)
df.head()

In [None]:
# Distinct values found in the 'type' column.
df['type'].unique()

In [None]:
# Distinct values found in the 'tags' column (NaN is listed too if any tag is missing).
df['tags'].unique()



In [None]:
# Distinct values found in the 'signature' column.
df['signature'].unique()

In [None]:
# Distinct values found in the 'command' column.
df['command'].unique()

In [None]:
# Number of distinct non-null commands.
df['command'].nunique()

In [None]:
# How often each command occurs, most frequent first.
df['command'].value_counts()

In [None]:
# Summary statistics (DataFrame.describe) computed separately for each signature value.
df.groupby('signature').describe()

In [None]:
# Inspect the 'data' column (described in the data dictionary as base-64 data sent
# by the attacker, populated only for rdphoney).
df['data']

In [None]:
# Distinct values found in the 'data' column.
df['data'].unique()

In [None]:
# Destination IP is not considered useful for the analysis.
# NOTE(review): this cell only previews the frame; 'dest_ip' is not dropped here and
# later cells still read it, so confirm whether the column should really be removed.
df.head()

In [None]:
# Inspect the 'dest_port' column.
df['dest_port']

In [None]:
# How often each dest_port value occurs, most frequent first.
df['dest_port'].value_counts()

In [None]:
# Show the rows whose dest_port equals the string 'cowrie.sessions'.
df[df['dest_port'] == 'cowrie.sessions']

In [None]:
# Rows whose dest_port holds the literal 'cowrie.sessions' are not useful for this
# analysis, so they are dropped.
# The rows are selected with a boolean mask and dropped by index *label*. The earlier
# df.index[<labels>] form used labels as positions, which is only correct while the
# index is still an untouched 0..n-1 range.
df.drop(index=df.index[df['dest_port'] == 'cowrie.sessions'], inplace=True)

In [None]:
# Re-check the dest_port frequencies after the 'cowrie.sessions' rows were dropped.
df['dest_port'].value_counts()

In [None]:
# Index labels of the rows whose dest_port equals the string 'tcp'.
df[df['dest_port'] == 'tcp'].index

In [None]:
# Inspect the 'src_ip' column.
df["src_ip"]

In [None]:
# Drop the rows whose dest_port holds the literal string 'tcp'.
# The index already has gaps from the earlier row drop, so labels no longer equal
# positions. Indexing df.index with the labels (as before) would pick the wrong rows
# or raise IndexError; a boolean mask selects exactly the matching labels.
df.drop(index=df.index[df['dest_port'] == 'tcp'], inplace=True)

In [None]:
# Re-check the dest_port frequencies after the 'tcp' rows were dropped.
df['dest_port'].value_counts()

In [None]:
# Preview the first five rows after the row drops above.
df.head()

In [None]:
# Same five-row preview again (repeats the previous cell).
df.head()

Let's explore the signature column

In [None]:
# Bar chart: number of rows in df for each signature value.
sns.set(style='darkgrid', rc={"grid.linewidth": 0.1})
fig, ax = plt.subplots(figsize=(25, 15))
sns.countplot(x="signature", data=df, ax=ax)
fig.tight_layout()

In [None]:
# How many rows each 'app' value contributes, most frequent first.
df['app'].value_counts()

We need to drop all rows except those whose `app` is one of the following honeypots (listed with their row counts):



cowrie         1238845
dionaea        195462
amun             5801
rdphoney         2646
glastopf          738
conpot             27

In [None]:
# Keep only the rows whose 'app' is one of the six honeypot applications listed above;
# rows with any other (or missing) app value are left out.
# .copy() detaches the result so later column assignments on new_frame never touch df.
HONEYPOT_APPS = ['cowrie', 'dionaea', 'amun', 'rdphoney', 'glastopf', 'conpot']
new_frame = df[df['app'].isin(HONEYPOT_APPS)].copy()

In [None]:
# Preview the first five rows of the app-filtered frame.
new_frame.head()

In [None]:
# Rows per honeypot app in the filtered frame.
new_frame['app'].value_counts()

In [None]:
# Bar chart: number of rows in new_frame for each honeypot app.
sns.set(style='darkgrid', rc={"grid.linewidth": 0.1})
fig, ax = plt.subplots(figsize=(25, 15))
sns.countplot(x="app", data=new_frame, ax=ax)
fig.tight_layout()

In [None]:
# Frequency of each destination port in new_frame.
new_frame['dest_port'].value_counts()

In [None]:
# Frequency of each dionaea_action value (missing values are not counted).
new_frame['dionaea_action'].value_counts()

In [None]:
# Frequency of each command in new_frame.
new_frame['command'].value_counts()

In [None]:
# Frequency of each 'data' payload in new_frame.
new_frame['data'].value_counts()

In [None]:
# Frequency of each attempted SSH password in new_frame.
new_frame['ssh_password'].value_counts()

In [None]:
# Number of rows per source IP in new_frame.
new_frame['src_ip'].value_counts()

In [None]:
# Number of rows per source port in new_frame.
new_frame['src_port'].value_counts()

In [None]:
# Frequency of each SSH version string in new_frame.
new_frame['ssh_version'].value_counts()

In [None]:
# Frequency of each tag in new_frame (missing tags are not counted).
new_frame['tags'].value_counts()

In [None]:
# Same tag frequency table, computed on df rather than on new_frame.
df['tags'].value_counts()

In [None]:
# Read the same CSV again into a separate frame (`new`) that has had none of the
# column/row drops applied, then show its tag frequencies.
new = pd.read_csv("Actual-Data/stingar_full-20190520.csv", low_memory = False )
new['tags'].value_counts()

In [None]:
# Heatmap of the missing-value mask of the freshly loaded frame (one cell per value).
fig, ax = plt.subplots(figsize=(11.7, 8.27))
sns.heatmap(new.isnull(), cmap="viridis", ax=ax)
fig.tight_layout()

In [None]:
# Number of non-null tags in the freshly loaded frame.
new['tags'].count()

In [None]:
# Label every missing tag as "cloud".
# NOTE(review): presumably untagged sensors are the cloud-hosted ones — confirm.
values = dict(tags="cloud")
new_frame.fillna(value=values, inplace=True)

In [None]:
# Tag frequencies after the missing tags were filled in.
new_frame['tags'].value_counts()

In [None]:
# Heatmap of the missing-value mask of new_frame (one cell per value).
fig, ax = plt.subplots(figsize=(11.7, 8.27))
sns.heatmap(new_frame.isnull(), cmap="viridis", ax=ax)
fig.tight_layout()

In [None]:
# Frequency of each requested URL in new_frame.
new_frame["request_url"].value_counts()

In [None]:
# Number of rows that have a non-null request_url.
new_frame['request_url'].count()

In [None]:
# Number of rows that have a non-null url.
new_frame['url'].count()

In [None]:
# Frequency of each url value in new_frame.
new_frame['url'].value_counts()

In [None]:
# Tabulate and plot how much of each new_frame column is missing.
# The null mask is computed once and reused for both the count and the fraction.
null_mask = new_frame.isnull()
total = null_mask.sum().sort_values(ascending=False)
# Mean of a boolean mask is the fraction of missing values (same as sum / row count).
# Despite the 'Percent' label this is a 0-1 fraction, as it was before.
percent = null_mask.mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

f, ax = plt.subplots(figsize=(11.7, 8.27))
sns.barplot(x=missing_data.index, y=missing_data['Percent'], ax=ax)
# Rotate the labels after the bars exist, and with a numeric angle: recent Matplotlib
# releases reject the string '90'.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)
missing_data.head()

In [None]:
# Replace missing commands with the placeholder "exit".
# The result is assigned back to the column: an in-place fillna on a column selected
# from the frame acts on a temporary object and does not update new_frame when pandas
# Copy-on-Write is active.
new_frame['command'] = new_frame['command'].fillna("exit")

In [None]:
# Command frequencies after the missing commands were filled in.
new_frame['command'].value_counts()

In [None]:
# Back-fill missing passwords: each gap takes the next non-null password further down.
# .bfill() replaces the deprecated fillna(method="bfill"), and the result is assigned
# back so the frame is actually updated under pandas Copy-on-Write.
# NOTE(review): this copies a password from a neighbouring, possibly unrelated row —
# confirm that this is the intended imputation.
new_frame['ssh_password'] = new_frame['ssh_password'].bfill()

In [None]:
# Password frequencies after the back-fill.
new_frame['ssh_password'].value_counts()

In [None]:
# Command frequencies in new_frame (column read via attribute access).
new_frame.command.value_counts()

In [None]:
# SSH version string frequencies in new_frame (column read via attribute access).
new_frame.ssh_version.value_counts()

In [None]:
# Fill what is still missing in three columns with fixed placeholders.
# The fill is done on the frame itself (as for 'tags' earlier): an in-place fillna on
# a column reached via attribute access works on a temporary object and does not
# update new_frame when pandas Copy-on-Write is active.
# NOTE(review): "SSH-2.0-OpenSSH_7.3" is presumably the most common version string —
# confirm the choice of default.
new_frame.fillna(
    value={
        "ssh_password": "none",
        "ssh_version": "SSH-2.0-OpenSSH_7.3",
        "dionaea_action": "Not Applicable",
    },
    inplace=True,
)

In [None]:
# dionaea_action frequencies after the missing values were filled in.
new_frame.dionaea_action.value_counts()

In [None]:
# Re-tabulate and re-plot the share of missing values per column after the fills above.
# The null mask is computed once and reused for both the count and the fraction.
null_mask = new_frame.isnull()
total = null_mask.sum().sort_values(ascending=False)
# Mean of a boolean mask is the fraction of missing values (same as sum / row count).
percent = null_mask.mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

f, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=missing_data.index, y=missing_data['Percent'], ax=ax)
# Rotate the labels after the bars exist, and with a numeric angle: recent Matplotlib
# releases reject the string '90'.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)
missing_data.head()

In [None]:
# Frequency of each value in the 'hash' column. Bracket access makes it explicit that
# a column is meant, not an attribute of the DataFrame object.
new_frame['hash'].value_counts()

In [None]:
# Preview the first five rows of new_frame.
new_frame.head()

In [None]:
# Remove the 'hash' column from new_frame.
new_frame.drop(columns=['hash'], inplace=True)

In [None]:
# Re-tabulate and re-plot the share of missing values per column once 'hash' is gone.
# The null mask is computed once and reused for both the count and the fraction.
null_mask = new_frame.isnull()
total = null_mask.sum().sort_values(ascending=False)
# Mean of a boolean mask is the fraction of missing values (same as sum / row count).
percent = null_mask.mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

f, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=missing_data.index, y=missing_data['Percent'], ax=ax)
# Rotate the labels after the bars exist, and with a numeric angle: recent Matplotlib
# releases reject the string '90'.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)
missing_data.head()

In [None]:
# Drop 'transport'; the data dictionary above lists it as a redundant, single-valued column.
new_frame.drop(columns=['transport'], inplace=True)

In [None]:
# Number of rows per destination IP in new_frame.
new_frame['dest_ip'].value_counts()

In [None]:
# Number of rows per destination port in new_frame.
new_frame['dest_port'].value_counts()

In [None]:
# Preview the first five rows after the column drops above.
new_frame.head()

In [None]:
# Number of rows recorded by each sensor (honeypot UUID per the data dictionary).
new_frame['sensor'].value_counts()

In [None]:
# Password frequencies in df, i.e. without the fills that were applied to new_frame.
df['ssh_password'].value_counts()

In [None]:
# Username frequencies in df.
df['ssh_username'].value_counts()

## Choose Categories

Now, let's try to categorize data so that we can make predictions.

In [None]:
# ssh_username: discard rows whose username occurs 1000 times or fewer.
# Rows with a missing username are kept, because NaN never appears in value_counts().
username_counts = new_frame['ssh_username'].value_counts()
rare_usernames = username_counts[username_counts.le(1000)].index
new_frame = new_frame.loc[~new_frame['ssh_username'].isin(rare_usernames)]
new_frame['ssh_username'].value_counts()

In [None]:
# Password frequencies after the username filter.
new_frame['ssh_password'].value_counts()

In [None]:
# ssh_password: discard rows whose password occurs 2000 times or fewer.
password_counts = new_frame['ssh_password'].value_counts()
rare_passwords = password_counts[password_counts.le(2000)].index
new_frame = new_frame.loc[~new_frame['ssh_password'].isin(rare_passwords)]
new_frame['ssh_password'].value_counts()

In [None]:
# sensor: discard rows recorded by sensors that contribute 300 rows or fewer.
sensor_counts = new_frame['sensor'].value_counts()
rare_sensors = sensor_counts[sensor_counts.le(300)].index
new_frame = new_frame.loc[~new_frame['sensor'].isin(rare_sensors)]
new_frame['sensor'].value_counts()

In [None]:
# Tag frequencies after the filters above.
new_frame['tags'].value_counts()

In [None]:
# Username frequencies after the filters above.
new_frame['ssh_username'].value_counts()

In [None]:
# Password frequencies after the filters above.
new_frame['ssh_password'].value_counts()

In [None]:
# SSH version string frequencies after the filters above.
new_frame['ssh_version'].value_counts()

In [None]:
# ssh_version: discard rows whose version string occurs 20 times or fewer.
version_counts = new_frame['ssh_version'].value_counts()
rare_versions = version_counts[version_counts.le(20)].index
new_frame = new_frame.loc[~new_frame['ssh_version'].isin(rare_versions)]
new_frame['ssh_version'].value_counts()

In [None]:
# dest_port: discard rows whose destination port occurs only once.
port_counts = new_frame['dest_port'].value_counts()
singleton_ports = port_counts[port_counts.le(1)].index
new_frame = new_frame.loc[~new_frame['dest_port'].isin(singleton_ports)]
new_frame['dest_port'].value_counts()

In [None]:
# NOTE(review): the src_ip counts on the next line are computed but never shown (only
# a cell's last expression is displayed), and the rest of the cell repeats the
# dest_port filter from the previous cell, which is a no-op when run right after it.
# Presumably a src_ip filter was intended here — confirm before changing.
new_frame['src_ip'].value_counts()
value_counts = new_frame['dest_port'].value_counts()
to_remove = value_counts[value_counts <= 1].index
new_frame = new_frame[~new_frame.dest_port.isin(to_remove)]
new_frame['dest_port'].value_counts()

In [None]:
# Print the value frequencies of every candidate feature, separated by blank lines.
summary_columns = [
    'app',
    'tags',
    'sensor',
    'ssh_username',
    'ssh_password',
    'dest_ip',
    'dest_port',
    'signature',
    'type',
    'd_time_window',
]
for position, column in enumerate(summary_columns):
    if position:
        # separator goes between tables only, not after the last one
        print("\n")
    print(new_frame[column].value_counts())

In [None]:
# Independent copy of the columns used by the plots below.
analysis_columns = [
    "app", "sensor", "tags",
    "src_ip", "src_port",
    "ssh_username", "ssh_version", "ssh_password",
    "dest_ip", "dest_port",
    "signature", "type", "d_time_window",
]
data_analysis = new_frame.loc[:, analysis_columns].copy()

In [None]:
# Preview the first five rows of the analysis subset.
data_analysis.head()

In [None]:
# Line plot of the number of rows (non-null src_ip) for every (tags, type) pair.
sns.set()
fig, ax = plt.subplots(figsize=(35, 10))
rows_per_tag_type = data_analysis.groupby(['tags', 'type'])['src_ip'].count()
rows_per_tag_type.plot(ax=ax)
fig.tight_layout()

In [None]:
# One line per signature: number of rows (non-null src_ip) for each tag.
fig, ax = plt.subplots(figsize=(15, 8))
rows_by_tag = data_analysis.groupby(['tags', 'signature'])['src_ip'].count().unstack()
rows_by_tag.plot(ax=ax)
# Set the label before tightening so the layout pass accounts for it.
ax.set_xlabel("Tags")
fig.tight_layout()




In [None]:
# Split the analysis subset into rows tagged "cloud" and all remaining rows.
is_cloud = data_analysis['tags'] == "cloud"
df_cloud = data_analysis.loc[is_cloud]
df_localnet = data_analysis.loc[~is_cloud]

In [None]:
sns.set()

# Number of rows (non-null src_ip) per signature, restricted to the "cloud" rows.
fig, ax = plt.subplots(figsize=(15, 8))
df_cloud.groupby('signature')['src_ip'].count().plot(ax=ax)
fig.tight_layout()

# NOTE(review): this second figure repeats the tags-by-signature plot of the whole
# analysis subset drawn in an earlier cell, while df_localnet is never plotted —
# presumably a df_localnet plot was intended here; confirm.
fig, ax = plt.subplots(figsize=(15, 8))
data_analysis.groupby(['tags', 'signature'])['src_ip'].count().unstack().plot(ax=ax)
# Set the label before tightening so the layout pass accounts for it.
ax.set_xlabel("Tags")
fig.tight_layout()

In [None]:
# Ten most frequent source ports (value_counts() already returns counts in descending order).
data_analysis["src_port"].value_counts().sort_values(ascending = False).head(10)

In [None]:
# Bar chart of the ten most frequent destination ports.
data_analysis["dest_port"].value_counts().sort_values(ascending = False).head(10).plot.bar(rot = 0, title = "Destination Port")

In [None]:
# Bar chart of the ten most frequent SSH passwords in the analysis subset.
data_analysis["ssh_password"].value_counts().sort_values(ascending = False).head(10).plot.bar(rot=0,figsize=(20,10))


In [None]:
# Bar chart of row counts per honeypot app (at most the ten largest).
new_frame['app'].value_counts().sort_values(ascending = False).head(10).plot.bar(rot = 0, title = "Type of Honeypot")

In [None]:
# Bar chart of row counts per tag (at most the ten largest).
new_frame['tags'].value_counts().sort_values(ascending = False).head(10).plot.bar(rot = 0, figsize =(20,10))

In [None]:
# Bar chart of the five most frequent SSH usernames.
new_frame['ssh_username'].value_counts().sort_values(ascending = False).head(5).plot.bar()
plt.tight_layout()

In [None]:
# Reset seaborn's default theme, then chart the ten most frequent SSH passwords.
sns.set()
new_frame['ssh_password'].value_counts().sort_values(ascending = False).head(10).plot.bar(figsize = (10,5), title = "SSH password attempted")

In [None]:
# Horizontal bar chart of row counts per signature.
# countplot draws on the axes passed in. catplot is a figure-level function: it ignored
# ax= (with a warning), built its own figure, and left the figure sized here empty.
a4_dims = (22.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
sns.countplot(ax=ax, y="signature",
              palette="pastel", edgecolor=".6",
              data=new_frame);


In [None]:
# Display new_frame (pandas truncates the rendering of large frames).
new_frame

In [None]:
# Horizontal bar chart of row counts per SSH username.
# countplot draws on the axes passed in. catplot is a figure-level function: it ignored
# ax= (with a warning), built its own figure, and left the figure sized here empty.
a4_dims = (22.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
sns.countplot(ax=ax, y="ssh_username",
              palette="pastel", edgecolor=".6",
              data=new_frame);