In [None]:
import pandas as pd
import numpy as np
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

def most_common(row):
    """Return the value that occurs most often in ``row``.

    Ties are resolved in favour of the value that appears first in ``row``,
    so the result is deterministic. (The earlier ``max(set(row), key=row.count)``
    form depended on set iteration order and re-scanned ``row`` once per
    distinct value.)

    Parameters
    ----------
    row : iterable of hashable values

    Returns
    -------
    The most frequent element of ``row``.

    Raises
    ------
    ValueError
        If ``row`` is empty.
    """
    from collections import Counter  # local import: keeps the helper self-contained

    tallies = Counter(row)
    if not tallies:
        raise ValueError("most_common() arg is an empty sequence")
    # most_common(1) yields [(value, count)] for the first value with the highest count.
    return tallies.most_common(1)[0][0]

def get_seconds(row):
    """Return the ``second`` attribute of ``row``.

    For a datetime-like object this is the seconds field of the clock time
    (0-59), not an elapsed number of seconds.
    """
    seconds_field = row.second
    return seconds_field

def sort_dt(row):
    """Return a new list holding the items of ``row`` in ascending order.

    ``row`` itself is not modified.
    """
    ordered = list(row)
    ordered.sort()
    return ordered

def get_td_mean(row):
    """Return the mean gap between consecutive entries of ``row``.

    ``row`` is expected to be an already-sorted list of numeric timestamps.

    Parameters
    ----------
    row : list of numbers

    Returns
    -------
    The average of ``row[i+1] - row[i]`` over all adjacent pairs, or ``0``
    when ``row`` has fewer than two entries (no gap exists).

    Notes
    -----
    Two entries already define one gap, so the mean is computed from
    ``len(row) >= 2`` onwards; the earlier ``len(row) > 2`` guard reported 0
    for every source with exactly two events.
    """
    gap_count = len(row) - 1
    if gap_count < 1:
        return 0
    total_gap = sum(later - earlier for earlier, later in zip(row, row[1:]))
    return total_gap / gap_count
        
def get_td_sd(row):
    """Return the sample standard deviation of the gaps between consecutive entries.

    ``row`` is expected to be an already-sorted list of numeric timestamps.

    Parameters
    ----------
    row : list of numbers

    Returns
    -------
    ``statistics.stdev`` of the adjacent differences, or ``0`` when ``row``
    has fewer than three entries (``stdev`` needs at least two gaps).
    """
    gaps = [later - earlier for earlier, later in zip(row, row[1:])]
    if len(gaps) < 2:
        return 0
    return statistics.stdev(gaps)
    
def rem_dups(arr):
    """Return the distinct values of ``arr`` as a list, in first-seen order.

    Going through ``dict.fromkeys`` instead of ``set`` keeps the output order
    stable from run to run; the set of returned values is unchanged.

    Parameters
    ----------
    arr : iterable of hashable values

    Returns
    -------
    list
        Each distinct value of ``arr`` exactly once.
    """
    return list(dict.fromkeys(arr))

In [None]:
# Import data: load the raw event CSV into `data`; the feature cells below all read from it.
# NOTE(review): the relative path is hard-coded, so the notebook has to be run from the
# directory that contains "Actual-Data/".
data = pd.read_csv("Actual-Data/stingar_full-20190523.csv")
data

In [None]:
# Parse d_time and overwrite it with the int64 view of the parsed timestamps, so later
# cells can sort, subtract and average plain integers. Later cells convert these values
# back with unit='ns', i.e. they are treated as nanosecond counts.
data['d_time'] = pd.to_datetime(data['d_time']).values.astype(np.int64)

In [None]:
# Mean gap between consecutive events, per source IP.
# 1) gather every d_time value of an IP into a list, 2) sort it, 3) average the gaps.
data_grouped_td = data[['src_ip','d_time']].groupby('src_ip',as_index = False).agg({'d_time':lambda x: x.tolist()})
data_grouped_td['d_time'] = data_grouped_td['d_time'].apply(sort_dt).apply(get_td_mean)
# The averaged gap is a duration in nanoseconds. Convert it to elapsed seconds as a
# timedelta; reading `.second` off a timestamp built from it would only give the
# seconds field (0-59), so a 61 s gap and a 1 s gap would look the same.
data_grouped_td['d_time'] = pd.to_timedelta(data_grouped_td['d_time'], unit='ns').dt.total_seconds()
data_grouped_td = data_grouped_td.rename({'d_time':'mean_time_difference'},axis = 1)

In [None]:
# Standard deviation of the gaps between consecutive events, per source IP.
data_grouped_sd = data[['src_ip','d_time']].groupby('src_ip',as_index = False).agg({'d_time':lambda x: x.tolist()})
data_grouped_sd['d_time'] = data_grouped_sd['d_time'].apply(sort_dt).apply(get_td_sd)
# The spread is a duration in nanoseconds. Convert it to elapsed seconds as a timedelta;
# reading `.second` off a timestamp built from it would wrap around every 60 seconds.
data_grouped_sd['d_time'] = pd.to_timedelta(data_grouped_sd['d_time'], unit='ns').dt.total_seconds()
data_grouped_sd = data_grouped_sd.rename({'d_time':'sd_time_difference'},axis = 1)

In [None]:
# Mean event time per source IP, converted back from integer nanoseconds to a timestamp.
# The string alias 'mean' selects pandas' own group mean; passing np.mean gives the same
# result today but triggers a FutureWarning on recent pandas releases.
data_grouped = data[['src_ip','d_time']].groupby('src_ip',as_index = False).agg({'d_time':'mean'})
data_grouped['d_time'] = pd.to_datetime(data_grouped['d_time'], unit='ns')

In [None]:
# Spread (sample standard deviation) of the event times per source IP, in seconds.
# 'std' keeps pandas' own group std, which is what np.std resolved to here.
data_grouped1 = data[['src_ip','d_time']].groupby('src_ip',as_index = False).agg({'d_time':'std'})
# The std is a duration in nanoseconds: convert it as a timedelta so the value is the
# elapsed seconds rather than the 0-59 seconds field of a timestamp. Groups with a single
# event have no std and stay NaN.
data_grouped1['d_time'] = pd.to_timedelta(data_grouped1['d_time'], unit='ns').dt.total_seconds()

In [None]:
# Per source IP: collect every `sensor` value seen into a list ...
data_grouped2 = data[['sensor','src_ip']].groupby('src_ip',as_index = False).agg({'sensor':lambda x: x.tolist()})
# ... and record which sensor value occurs most often in that list.
data_grouped2['most_common_sensor'] = data_grouped2['sensor'].apply(most_common)

In [None]:
# Join all per-IP feature tables on src_ip (inner joins, pandas' default).
# The first join brings in a second `d_time` column, so pandas suffixes the pair as
# d_time_x / d_time_y; they are given descriptive names at the end.
data_final = data_grouped
for per_ip_table in (data_grouped1, data_grouped_td, data_grouped_sd, data_grouped2):
    data_final = data_final.merge(per_ip_table, on='src_ip')
data_final = data_final.rename(
    columns={
        'd_time_x': 'mean_time_of_attack',
        'd_time_y': 'sd_time_of_attack',
        'sensor': 'all_sensors',
    }
)

In [None]:

# Check the sensors column and see how the data gets stored.

# Reduce each per-IP sensor list to its distinct values.
data_final['all_sensors'] = data_final['all_sensors'].apply(rem_dups)
# Peek at one row by position. NOTE(review): 1659 is a hard-coded row position and
# raises IndexError if data_final has fewer rows.
data_final['all_sensors'].iloc[1659]

In [None]:
# Number of distinct sensors each source IP was seen on.
data_final["sensor_number"] = data_final["all_sensors"].apply(len)
# Drop the helper columns that are not used as model features. (The earlier extra
# `drop(...)` call without assignment discarded its result and has been removed.)
data_final = data_final.drop(
    columns=["mean_time_of_attack", "sd_time_of_attack", "all_sensors", "most_common_sensor"]
)

In [None]:
# Only the last expression of a cell is rendered, so this value_counts() result is
# computed but not displayed.
data_final['sensor_number'].value_counts()
data_final

In [None]:
# Average ssh username length per source IP (rows without a username or IP are dropped).
new = data[["ssh_username", "src_ip"]].dropna()
new["length_username"] = new["ssh_username"].apply(len)
# Average only the numeric column: `new` still carries the raw username strings, and
# pandas >= 2.0 raises instead of silently skipping string columns in groupby().mean().
user_length = new.groupby("src_ip")[["length_username"]].mean()

In [None]:
# Outer-join the username-length feature (indexed by src_ip) with the timing/sensor
# features, so an IP present in only one of the two frames is kept with NaN on the other side.
current = pd.merge(user_length, data_final, how = "outer", on = "src_ip")
current

In [None]:
# Impute missing username lengths with the column mean. Assign the result back:
# calling fillna(inplace=True) on `current[...]` is chained assignment, which warns on
# pandas 2.x and leaves `current` unchanged under copy-on-write.
current['length_username'] = current['length_username'].fillna(current['length_username'].mean())

In [None]:
# Length of every recorded command. dropna() returns a fresh frame, so the column
# assignment below does not write into a slice of `data` (the earlier in-place dropna
# on the slice triggered SettingWithCopyWarning).
new_command = data[['src_ip', 'command']].dropna()
new_command['length_command'] = new_command['command'].apply(len)

In [None]:
# Average command length per source IP. Select the numeric column explicitly:
# `new_command` still holds the raw command strings, and pandas >= 2.0 raises on
# groupby().mean() over string columns.
feature = new_command.groupby('src_ip')[['length_command']].mean()
current = pd.merge(current, feature, how = 'outer', on = "src_ip")
current

In [None]:
# Impute missing command lengths with the column mean. Assign the result back:
# fillna(inplace=True) on `current[...]` is chained assignment and leaves `current`
# unchanged under pandas copy-on-write.
current['length_command'] = current['length_command'].fillna(current['length_command'].mean())

In [None]:
# Only the last expression of the cell (`honeypot`) is rendered; the bare
# value_counts() calls are evaluated and discarded.
data['app'].value_counts()
counts = data['app'].value_counts()
# Remove rows whose `app` value occurs fewer than 27 times in the whole data set.
res = data[~data['app'].isin(counts[counts < 27].index)]
res['app'].value_counts()
# (app, src_ip) pairs of the remaining rows; note this is a slice of `res`.
honeypot = res[['app', 'src_ip']]
honeypot

In [None]:
# Inspection only: the value_counts() result is discarded; the cell shows the first
# five rows of each src_ip group.
honeypot['app'].value_counts()
honeypot.groupby('src_ip').head()

In [None]:
# Inner-join the features with the (app, src_ip) pairs.
# NOTE(review): `new_current` is not read again in the visible notebook, and the cell
# displays `current`, not the merge result.
new_current = pd.merge(current, honeypot, how = 'inner', on = 'src_ip')
current

In [None]:
# First five (src_ip, app) rows of every source IP, then collapse identical pairs.
dat = data.groupby('src_ip')[['src_ip', 'app']].head()
temp = dat.drop_duplicates()

In [None]:
# One-hot encode the `app` value of each remaining (src_ip, app) pair.
let = pd.get_dummies(temp['app'])
let.head()

In [None]:
# Put the one-hot columns next to the original (src_ip, app) pairs.
det = pd.concat([let,temp], axis = 1)
det['app'].value_counts()
# NOTE(review): this rebinds the notebook-wide name `counts`; a later cell that reads
# `counts` sees these `det`-based counts.
counts = det['app'].value_counts()
# Keep only apps that occur at least 100 times in `det`.
det = det[~det['app'].isin(counts[counts < 100].index)]
det['app'].value_counts()

In [None]:
current.columns

In [None]:
# Keep a single row per source IP in the feature table (modifies `current` in place).
current.drop_duplicates('src_ip', inplace = True)
current
# Per source IP, count the rows that have a non-null `app`; src_ip is turned back into
# a regular column so the result can be merged on it.
head = data[['src_ip', 'app']]
head = head.groupby('src_ip').count()
head.reset_index(inplace = True)
head

In [None]:
# Attach the per-IP row count computed above and expose it as `daily_frequency`.
# NOTE(review): the count covers every row of `data` for that IP; confirm the file
# spans a single day before reading the column as a per-day figure.
current = pd.merge(current, head, how = 'inner', on = 'src_ip')
current.rename(columns={'app':'daily_frequency'}, inplace=True)
current

In [None]:
#current.drop(['app_y'], axis = 1, inplace = True)
#current.drop(['app'], axis = 1, inplace = True)

In [None]:
# Quick look at the password column. NOTE(review): this rebinds `new`, which an
# earlier cell used for the username frame.
new = data[['src_ip', 'ssh_password']]
new.head()

In [None]:
# (src_ip, ssh_password) pairs with missing values removed. dropna() returns a fresh
# frame, so later column edits on `new_data` do not write into a slice of `data`
# (the earlier in-place dropna on the slice triggered SettingWithCopyWarning).
new_data = data[['src_ip', 'ssh_password']].dropna()

In [None]:
# Character length of every recorded ssh password.
new_data['length_password'] = new_data['ssh_password'].apply(len)

In [None]:
# The raw password is no longer needed once its length is stored. The drop is in place,
# so running this cell a second time raises KeyError.
new_data.drop(['ssh_password'], axis = 1, inplace = True)

In [None]:
# Remove repeated (src_ip, length_password) rows. An IP that used passwords of several
# different lengths still has several rows after this step.
new_data.drop_duplicates(inplace= True)
current

In [None]:
# Attach the password-length feature; IPs without any password get the column mean.
current = pd.merge(current, new_data, how = 'outer', on = 'src_ip')
# Assign the result back: fillna(inplace=True) on `current[...]` is chained assignment
# and leaves `current` unchanged under pandas copy-on-write.
current['length_password'] = current['length_password'].fillna(current['length_password'].mean())

In [None]:
# Keep the first row of every source IP. A preceding full-row drop_duplicates() is not
# needed: it only removes later copies, so the first row per IP is the same either way.
# NOTE(review): for an IP with several password lengths, the one that is kept is simply
# whichever row comes first after the merge.
current = current.drop_duplicates('src_ip')
current

In [None]:
# Number of distinct destination ports each source IP touched; reset_index() turns the
# grouped Series into a (src_ip, dest_port) frame that can be merged.
dat_t = data.groupby('src_ip')['dest_port'].nunique()
new_dat_t = dat_t.reset_index()
new_dat_t

In [None]:
# Attach the distinct-port count; the count frame is the left side, so its columns come first.
current = pd.merge(new_dat_t, current, how = 'inner', on = 'src_ip')

In [None]:
current

In [None]:
# The column holds a count of distinct ports, not a port number itself; rename it in place.
current.rename(columns={'dest_port':'dest_port_number'}, inplace=True)

In [None]:
current['dest_port_number'].value_counts()

In [None]:
current['length_command'].value_counts()

In [None]:
current['length_password'].value_counts()

In [None]:
current['length_username'].value_counts()

In [None]:
current.info()

In [None]:
# (src_ip, signature) pairs taken from the raw data, with a look at dtypes/null counts
# and at how often each signature value occurs.
new_frame = data[['src_ip', 'signature']]
new_frame.info()
new_frame['signature'].value_counts()

In [None]:
# Keep the first signature seen for every source IP. Rebinding avoids an in-place edit
# of a slice of `data`, which triggers SettingWithCopyWarning.
new_frame = new_frame.drop_duplicates(subset = "src_ip")

In [None]:
# Attach each IP's signature, then one-hot encode it. drop_first=True omits the column
# of the first signature category, so that category is represented by all-zero dummies.
current_df = pd.merge(current, new_frame, on = 'src_ip', how = 'inner')
current_df.info()
dummy = pd.get_dummies(current_df['signature'], drop_first = True)
cur = pd.concat([current_df,dummy], axis = 1)
cur.info()

In [None]:
# Remove the dummy column for the 'SSH session on cowrie honeypot' signature. The drop
# is in place, so re-running this cell raises KeyError.
cur.drop(['SSH session on cowrie honeypot'], axis = 1, inplace = True)

In [None]:
honeypot['app'].value_counts()
# Keep the first app seen for every source IP. Rebinding avoids an in-place edit of a
# slice of `res`, which triggers SettingWithCopyWarning.
honeypot = honeypot.drop_duplicates(subset = 'src_ip')
honeypot['app'].value_counts()
cur

In [None]:
# Final per-IP table: the features plus the `app` label from `honeypot`.
new_curr = pd.merge(cur, honeypot, on = 'src_ip', how = 'inner')
# Remove the dummy column for the 'command attempted on cowrie honeypot' signature.
new_curr.drop(['command attempted on cowrie honeypot'], axis =1, inplace = True)
new_curr['signature'].value_counts()
new_curr

Clustering Algorithm
=================

K-Means Algorithm
-----------------------

The k-means algorithm belongs to the category of prototype-based clustering.
Prototype-based clustering means that each cluster is represented by a prototype, which can either be the centroid (average) of similar points with continuous features, or the medoid (the most representative or most frequently occurring point) in the case of categorical features.
While k-means is very good at identifying clusters with a spherical shape, one of the drawbacks of this clustering algorithm is that we have to specify the number of clusters, k, a priori. 

In [None]:
# `app` label of each row, with its class balance.
target = new_curr['app']
target.value_counts()

In [None]:
# Working table for the plots and clustering below: everything except the raw signature
# text. `src_ip` and `app` are still present here and are removed right before fitting.
feature = new_curr.drop(['signature'], axis = 1)

In [None]:
feature

In [None]:
# Username length against password length, coloured by `app`.
fig, axes = plt.subplots(1, 1, figsize = (8,5))
# x / y are passed by keyword: current seaborn releases accept only `data` positionally,
# so the positional form raises TypeError there. `ax=axes` ties the plot to this figure.
sns.scatterplot(x = 'length_username', y = 'length_password', hue = 'app', data = feature, ax = axes)
axes.set(xlabel='Length of Username', ylabel='Length of Password')
axes.set_xlim([1,10])

In [None]:
sns.pairplot(feature)

In [None]:
# Correlation heat map of the numeric features.
fig, axes = plt.subplots(1, 1, figsize = (8,5))
# `feature` still contains string columns (src_ip, app); pandas >= 2.0 raises when
# corr() meets them, so restrict the frame to numeric/boolean columns first.
numeric_feature = feature.select_dtypes(include = ['number', 'bool'])
sns.heatmap(numeric_feature.corr(), annot = True, ax = axes)

In [None]:
feature['app'].value_counts()

In [None]:
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler

# Cluster on the numeric features only (identifier and label removed).
# NOTE(review): StandardScaler is imported but the features are clustered unscaled, so
# columns with large ranges dominate the Euclidean distance.
dat = feature.drop(['src_ip','app'], axis = 1)
# Fixed seed: k-means starts from random centroids, so without it the labels and
# centroids change on every run.
kmeans = KMeans(n_clusters = 3, random_state = 42)
kmeans.fit(dat)
y_kmeans = kmeans.predict(dat)
centroids = kmeans.cluster_centers_

In [None]:
centroids

In [None]:
# Clusters projected onto username length against password length.
fig, axes = plt.subplots(1, 1, figsize = (8,4))
# Look the two columns up by name so the plot still matches its axis labels if the
# column order of `dat` changes; the centroid array uses the same column order as `dat`.
x_pos = dat.columns.get_loc('length_username')
y_pos = dat.columns.get_loc('length_password')
axes.scatter(dat.iloc[:, x_pos], dat.iloc[:, y_pos], c = y_kmeans, s = 50, cmap='rainbow')
centers = kmeans.cluster_centers_
axes.scatter(centers[:, x_pos], centers[:, y_pos], c='black', s = 100, alpha = 1)
axes.set_xlim([1,10])
axes.set_xlabel('Length of Username')
axes.set_ylabel('Length of Password')
axes.set_title('K-Means with K = 3')

In [None]:
from scipy.spatial.distance import cdist

# Elbow method: for k = 1..9, record the mean distance from each row to its nearest centroid.
distortions = []
K = range(1,10)
for k in K:
    # One fit per k (the model used to be fitted twice); the seed keeps the curve
    # identical from run to run.
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(dat)
    distortions.append(sum(np.min(cdist(dat, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / dat.shape[0])

# Plot the elbow
fig, axes = plt.subplots(1,1, figsize = (8,5))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Refit with k = 8 on all rows and plot the result.
dat = feature.drop(['app', 'src_ip'], axis = 1)
# Fixed seed so the cluster assignment is reproducible.
kmeans = KMeans(n_clusters = 8, random_state = 42)
kmeans.fit(dat)
y_kmeans = kmeans.predict(dat)
centroids = kmeans.cluster_centers_

fig, axes = plt.subplots(1, 1, figsize = (8,4))
# Columns are looked up by name so the plot always matches its axis labels.
x_pos = dat.columns.get_loc('length_username')
y_pos = dat.columns.get_loc('length_password')
axes.scatter(dat.iloc[:, x_pos], dat.iloc[:, y_pos], c = y_kmeans, s = 50, cmap='rainbow')
centers = kmeans.cluster_centers_
axes.scatter(centers[:, x_pos], centers[:, y_pos], c='black', s=100, alpha=1);
axes.set_xlim([1,10])
axes.set_xlabel('Length of Username')
axes.set_ylabel('Length of Password')
axes.set_title('K-Means with K = 8')

In [None]:
# Repeat the clustering on the rows labelled 'dionaea' only.
dion = feature[feature['app'] == 'dionaea']
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler

dat = dion.drop(['src_ip','app'], axis = 1)
# Fixed seed so the cluster assignment is reproducible.
kmeans = KMeans(n_clusters = 3, random_state = 42)
kmeans.fit(dat)
y_kmeans = kmeans.predict(dat)
centroids = kmeans.cluster_centers_

In [None]:
# Clusters of the current `dat` projected onto username length against password length.
fig, axes = plt.subplots(1, 1, figsize = (8,5))
# Columns are looked up by name instead of by the fixed positions 1 and 7.
x_pos = dat.columns.get_loc('length_username')
y_pos = dat.columns.get_loc('length_password')
axes.scatter(dat.iloc[:, x_pos], dat.iloc[:, y_pos], c = y_kmeans, s = 50, cmap='rainbow')
centers = kmeans.cluster_centers_
axes.scatter(centers[:, x_pos], centers[:, y_pos], c='black', s=200, alpha=0.5)
axes.set_xlabel('Length of Username')
axes.set_ylabel('Length of Password')

In [None]:
from scipy.spatial.distance import cdist

# Elbow method on the current `dat`: mean distance to the nearest centroid for k = 1..9.
distortions = []
K = range(1,10)
for k in K:
    # One fit per k (the model used to be fitted twice); the seed keeps the curve
    # identical from run to run.
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(dat)
    distortions.append(sum(np.min(cdist(dat, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / dat.shape[0])

# Plot the elbow
fig, axes = plt.subplots(1,1, figsize = (8,5))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Refit the current `dat` with k = 5; fixed seed so the assignment is reproducible.
kmeans = KMeans(n_clusters = 5, random_state = 42)
kmeans.fit(dat)
y_kmeans = kmeans.predict(dat)
centroids = kmeans.cluster_centers_

fig, axes = plt.subplots(1, 1, figsize = (8,5))
# Columns are looked up by name instead of by the fixed positions 1 and 7.
x_pos = dat.columns.get_loc('length_username')
y_pos = dat.columns.get_loc('length_password')
axes.scatter(dat.iloc[:, x_pos], dat.iloc[:, y_pos], c = y_kmeans, s = 50, cmap='rainbow')
centers = kmeans.cluster_centers_
axes.scatter(centers[:, x_pos], centers[:, y_pos], c='black', s=200, alpha=0.5);
axes.set_xlabel('Length of Username')
axes.set_ylabel('Length of Password')

K-Nearest Neighbours (KNN)
---------------------


The KNN algorithm assumes that similar things exist in close proximity. The algorithm hinges on this assumption being true enough to be useful. KNN captures the idea of similarity (sometimes called distance, proximity, or closeness) with some mathematics we might have learned in our childhood: calculating the distance between points on a graph.

In [None]:
feature

In [None]:
# Rows for the classifiers: drop the raw signature text and every `app` class with
# fewer than 1000 rows.
dem = new_curr.drop(['signature'], axis = 1)
count_t = dem['app'].value_counts()
# Filter with the counts computed on this very table. The filter used to read `counts`,
# a leftover from an earlier cell that was computed on a different frame.
result = dem[~dem['app'].isin(count_t[count_t < 1000].index)]
result['app'].value_counts()
result

In [None]:
# One-hot encode `app` (drop_first=True omits the first category's column) and remove
# the identifier column, in place, before modelling.
dummy = pd.get_dummies(data = result, columns = ['app'], drop_first = True)
dummy.drop(['src_ip'], axis =1 , inplace = True)
dummy

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Every column except the target is standardised.
knn_inputs = dummy.drop(['app_dionaea'], axis =1)
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the
# test rows contribute to the scaling statistics.
scaled_features = scaler.fit_transform(knn_inputs)
# Take the column names from the frame that was scaled; `dummy.columns[:-1]` is only
# correct while 'app_dionaea' happens to be the last column.
df_feat = pd.DataFrame(scaled_features,columns = knn_inputs.columns)
df_feat.head()

In [None]:
df_feat['length_command'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# 60/40 train/test split. The fixed seed makes the split, and every metric computed
# from it below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(scaled_features,dummy['app_dionaea'],
                                                    test_size=0.40, random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# First attempt with two neighbours; a later cell sweeps other values of k.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,y_train)

In [None]:
pred = knn.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# Confusion matrix of the held-out predictions (rows: true class, columns: predicted).
c = confusion_matrix(y_test,pred)
# fmt='d' prints the cell counts as integers; the default format switches to
# scientific notation for larger counts.
sns.heatmap(c, annot = True, fmt = 'd')

In [None]:
from yellowbrick.classifier import ClassificationReport
# Visual classification report for the current `knn` model: fit on the training split,
# score on the test split, then draw.
visualizer = ClassificationReport(knn)
visualizer.fit(X_train, y_train)  
visualizer.score(X_test, y_test) 
# NOTE(review): newer Yellowbrick releases name this method show(); confirm poof() is
# still available in the installed version.
visualizer.poof()             

In [None]:
# Sweep k = 1..39: refit on the training split each time and record the share of
# test rows that are misclassified.
k_values = range(1,40)
error_rate = []

# Will take some time
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    misclassified_share = np.mean(pred_i != y_test)
    error_rate.append(misclassified_share)

plt.figure(figsize=(10,6))
plt.plot(
    k_values,
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Refit with k = 10 and print precision / recall / F1 on the test split.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,pred))

In [None]:
from yellowbrick.classifier import ClassificationReport
# Visual classification report for the current `knn` model (k = 10 when the notebook
# is run top to bottom).
visualizer = ClassificationReport(knn)
visualizer.fit(X_train, y_train)  
visualizer.score(X_test, y_test) 
# NOTE(review): newer Yellowbrick releases name this method show(); confirm poof() is
# still available in the installed version.
visualizer.poof()   

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# ROC curve of the current `knn` model on the test split.
fig, ax = plt.subplots(figsize = (8, 4))

# Column 1 of predict_proba is the probability of the positive class.
y_scores = knn.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.legend(loc = 'lower right')
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1],'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
# Single title: the earlier 'Receiver Operating Characteristic' title was always
# overwritten by this one.
ax.set_title('ROC Curve of kNN')
plt.show()

Logistic Regression
-----------------------

Logistic Regression is generally used for classification. Unlike Linear Regression, the dependent variable can take only a limited number of values, i.e., the dependent variable is categorical. When there are only two possible outcomes, it is called Binary Logistic Regression.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

In [None]:
# (src_ip, tags) pairs; rows without a tag are given the placeholder 'cloud,'.
# Work on an explicit copy and assign the filled column back: an in-place fillna on a
# column of a slice is chained assignment, which leaves the NaNs untouched under pandas
# copy-on-write.
my_frame = data[['src_ip', 'tags']].copy()
my_frame['tags'] = my_frame['tags'].fillna('cloud,')

result

In [None]:
my_frame['tags'].value_counts()

In [None]:
def func(row):
    """Return the part of ``row`` that precedes its first comma.

    When ``row`` contains no comma the whole string is returned. (Slicing with
    the ``-1`` that ``str.find`` reports for "not found" cut off the last
    character in that case.)

    Parameters
    ----------
    row : str

    Returns
    -------
    str
        The leading comma-separated field of ``row``.
    """
    first_field, _, _ = row.partition(",")
    return first_field

In [None]:
# Reduce every tags string to the part before its first comma.
my_frame['tags'] = my_frame['tags'].apply(func)

In [None]:
my_frame['tags'].value_counts()
# Pass the column as `x`: current seaborn releases treat the first positional argument
# as `data`, not as the variable to count.
sns.countplot(x = my_frame['tags'])

In [None]:
# One tag per source IP (first occurrence), joined onto the classifier rows; the
# identifier and the `app` label are then removed in place.
# NOTE(review): this rebinds `new_frame`, which an earlier cell used for signatures.
new_frame = my_frame.drop_duplicates(subset='src_ip')
curr_doc = pd.merge(result, new_frame, how = 'inner', on = 'src_ip')
curr_doc.drop(['src_ip', 'app'], axis = 1, inplace = True)
curr_doc

In [None]:
fig, axes = plt.subplots(figsize = (8,5))
# One-hot encode the tag; drop_first=True omits the first category's column.
# NOTE(review): the rename below assumes a 'localnet' dummy column exists after encoding.
df_tags = pd.get_dummies(curr_doc['tags'], drop_first = True )
df_new = pd.concat([curr_doc, df_tags], axis=1)
df_new.drop(['tags'], axis = 1, inplace = True)
df_new.rename(columns={'localnet':'tag'}, inplace = True)
# Pass the column as `x` (current seaborn treats the first positional argument as
# `data`) and draw on the axes created above so the title and label land on this plot.
sns.countplot(x = df_new['tag'], ax = axes)
axes.set_title('Count-plot based on type of Honeypot')
axes.set_xlabel("Honeypot Type")

In [None]:
# Predict the binary `tag` column from all remaining columns.
X = df_new.drop('tag', axis=1)
y = df_new['tag']

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 train/test split; the fixed seed keeps the split and the reported metrics
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
# NOTE(review): X is not standardised here, unlike the KNN inputs above.
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Fitted coefficients, one per column of X.
parameters = model.coef_
parameters

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# ROC curve of the logistic-regression model on the test split.
fig, ax = plt.subplots(figsize = (8, 5))

# Column 1 of predict_proba is the probability of the positive class.
y_scores = model.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.legend(loc = 'lower right')
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1],'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
# Single title: the earlier 'Receiver Operating Characteristic' title was always
# overwritten by this one.
ax.set_title('ROC Curve of Logistic Regression')
plt.show()

In [None]:
from yellowbrick.classifier import ClassificationReport
# Visual classification report for the logistic-regression model: fit on the training
# split, score on the test split, then draw.
visualizer = ClassificationReport(model)
visualizer.fit(X_train, y_train)  
visualizer.score(X_test, y_test) 
# NOTE(review): newer Yellowbrick releases name this method show(); confirm poof() is
# still available in the installed version.
visualizer.poof()   