In [6]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from scipy.special import softmax

In [2]:
# import scenario 1 (CTU-42)
dsn = pd.read_csv('../data/scenario_1.csv')

In [36]:
dsn['Label'].unique()

array(['flow=Background-UDP-Established', 'flow=Background',
       'flow=Background-Established-cmpgw-CVUT',
       'flow=Background-TCP-Established',
       'flow=From-Background-CVUT-Proxy',
       'flow=To-Background-UDP-CVUT-DNS-Server',
       'flow=Background-UDP-Attempt',
       'flow=From-Normal-V42-UDP-CVUT-DNS-Server',
       'flow=Background-UDP-NTP-Established-1',
       'flow=To-Background-CVUT-WebServer',
       'flow=Background-google-analytics3',
       'flow=Background-Attempt-cmpgw-CVUT',
       'flow=To-Background-CVUT-Proxy',
       'flow=Background-google-analytics2',
       'flow=Background-google-analytics6', 'flow=From-Normal-V42-Jist',
       'flow=Background-google-webmail', 'flow=Background-TCP-Attempt',
       'flow=Background-google-analytics12',
       'flow=Background-google-analytics1',
       'flow=Background-google-analytics13',
       'flow=Background-google-analytics14', 'flow=To-Background-Jist',
       'flow=Background-ajax.google', 'flow=From-Nor

In [43]:
pd.options.display.max_rows=150
dsn.groupby('Label').count()['SrcAddr']

Label
flow=Background                                                              40216
flow=Background-Attempt-cmpgw-CVUT                                           30983
flow=Background-CS-Host-CVUT                                                    22
flow=Background-Established-cmpgw-CVUT                                      137257
flow=Background-TCP-Attempt                                                 105438
flow=Background-TCP-Established                                             223543
flow=Background-UDP-Attempt                                                  66699
flow=Background-UDP-Established                                            1169677
flow=Background-UDP-NTP-Established-1                                         2233
flow=Background-ajax.google                                                   1197
flow=Background-google-analytics1                                              500
flow=Background-google-analytics10                                             73

In [10]:
# Unique Botnet Labels
for label in dsn['Label'].unique():
    if 'botnet' in label.lower():
        print(label)

flow=From-Botnet-V42-UDP-DNS
flow=From-Botnet-V42-TCP-HTTP-Google-Net-Established-6
flow=From-Botnet-V42-TCP-Established
flow=From-Botnet-V42-TCP-Established-HTTP-Binary-Download-1
flow=From-Botnet-V42-TCP-Established-HTTP-Binary-Download-Custom-Port-7
flow=From-Botnet-V42-TCP-Established-HTTP-Binary-Download-3
flow=From-Botnet-V42-TCP-CC6-Plain-HTTP-Encrypted-Data
flow=From-Botnet-V42-TCP-Established-HTTP-Binary-Download-Custom-Port-4
flow=From-Botnet-V42-TCP-Established-HTTP-Ad-63
flow=From-Botnet-V42-TCP-Established-HTTP-Binary-Download-9
flow=From-Botnet-V42-TCP-WEB-Established
flow=From-Botnet-V42-TCP-Attempt
flow=From-Botnet-V42-TCP-CC54-Custom-Encryption
flow=From-Botnet-V42-TCP-Established-HTTP-Ad-64
flow=From-Botnet-V42-TCP-CC53-HTTP-Not-Encrypted
flow=From-Botnet-V42-TCP-Attempt-SPAM
flow=From-Botnet-V42-TCP-CC1-HTTP-Not-Encrypted
flow=From-Botnet-V42-TCP-Not-Encrypted-SMTP-Private-Proxy-1
flow=From-Botnet-V42-TCP-Established-HTTP-Ad-40
flow=From-Botnet-V42-UDP-Established
fl

In [28]:
print('No. unique SrcAddr:', dsn['SrcAddr'].nunique())
print('No. unique DestAddr:', dsn['DstAddr'].nunique())

No. unique SrcAddr: 542093
No. unique DestAddr: 119296


In [29]:
print('Total No. of flows: ', len(dsn))
bot_count = dsn['Label'].apply(lambda x: 1 if 'botnet' in x.lower() else 0)
print('No. of botnet flows: ', bot_count.sum())

Total No. of flows:  2824636
No. of botnet flows:  40961


In [35]:
unique_nodes = set(dsn['SrcAddr'].unique()).union(set(dsn['DstAddr'].unique()))
print('No. of unique nodes: ', len(unique_nodes))

No. of unique nodes:  607565


In [3]:
edge_dsn = dsn[['SrcAddr', 'DstAddr']].drop_duplicates()

In [4]:
edge_list = [(u,v) for u, v in edge_dsn.values]

In [10]:
edge_list

[('212.50.71.179', '147.32.84.229'),
 ('84.13.246.132', '147.32.84.229'),
 ('217.163.21.35', '147.32.86.194'),
 ('83.3.77.74', '147.32.85.5'),
 ('74.89.223.204', '147.32.84.229'),
 ('66.169.184.207', '147.32.84.229'),
 ('182.239.167.121', '147.32.84.229'),
 ('147.32.3.93', '147.32.84.59'),
 ('24.117.206.20', '147.32.84.229'),
 ('94.208.78.74', '147.32.84.229'),
 ('79.129.201.26', '147.32.84.229'),
 ('147.32.86.194', '217.163.21.35'),
 ('147.32.80.13', '147.32.84.162'),
 ('74.200.246.228', '147.32.84.59'),
 ('77.238.167.32', '147.32.86.194'),
 ('94.124.104.196', '147.32.84.59'),
 ('98.127.111.126', '147.32.84.229'),
 ('123.1.72.4', '147.32.84.229'),
 ('147.32.84.229', '212.217.56.83'),
 ('147.32.84.229', '213.142.200.29'),
 ('147.32.84.229', '93.45.94.195'),
 ('147.32.84.229', '83.78.136.90'),
 ('147.32.80.13', '147.32.85.112'),
 ('217.164.10.229', '147.32.84.229'),
 ('199.59.148.20', '147.32.84.184'),
 ('77.100.246.74', '147.32.84.229'),
 ('58.72.174.152', '147.32.84.229'),
 ('83.137.2

In [5]:
G = nx.Graph()
G.add_edges_from(edge_list)

In [7]:
class DeepWalk(object):
    def __init__(self, walk_length=5, window_size=2, n_walks=1, n_dim=2, learning_rate=0.01, decay_rate=1, verbose=False):
        self.walk_length = walk_length
        self.window_size = window_size
        self.n_walks = n_walks
        self.n_dim = n_dim
        self.learning_rate = learning_rate
        self.decay_rate = decay_rate
        self.verbose = verbose
        
    def fit(self, G):
        
        if not isinstance(G, nx.classes.graph.Graph):
            raise TypeError('G must be networkx Graph object.')
        
        A = nx.convert_matrix.to_numpy_array(G)
        self.__n = A.shape[0]
        
        # make sure A is symmetric
        if (A.sum(axis=1) != A.sum(axis=0)).any():
            raise ValueError('Graph must be undirected.')
        
        # calculate transition matrix P
        D = np.zeros(A.shape)
        d_diag = A.sum(axis=1)
        np.fill_diagonal(D, d_diag)
        P = np.linalg.inv(D) @ A
        
        # Initialized Random Weights
        self.W = np.random.rand(A.shape[0], self.n_dim)
        self.C = np.random.rand(A.shape[0], self.n_dim)
        self.W_orig = self.W
        self.C_orig = self.C
        self.loss_ = np.zeros(self.n_walks)
        prev_loss = np.inf
        for N in range(self.n_walks):
            self.loss = 0
            self.lr = self.learning_rate/(1 + self.decay_rate*N)
            walk = self.__generate_random_walk(A, P)

            # make contexts
            samples = self.__generate_contexts(walk)
            
            for w_t, w_c in samples:
                # forward pass
                y_pred, h, u = self.__forward_pass(w_t)
                
                # Calculate Error
                EI = np.sum([np.subtract(y_pred, context) for context in w_c], axis=0)
                
                # Backpropagate
                self.__backpropagate(EI, h, w_t)
        
                # calculate loss
                self.loss += -np.sum([u[list(context).index(True)] for context in w_c]) + len(w_c)*np.log(np.sum(np.exp(u)))
            self.loss_[N] = self.loss
            if self.verbose:
                print(f'Walk: {N} Loss: {self.loss}')
            
        return self
    
    def __generate_random_walk(self, A, P):
        
        walk = np.zeros(self.walk_length)
        node = np.random.choice(A.shape[0])
        walk[0] = node

        for i in range(1, self.walk_length):
            probs = P[node]
            node = np.random.choice(A.shape[0], p=probs)
            walk[i] = node
            
        return walk
    
    def __to_onehot(self, word):
        
        onehot = np.zeros(self.__n, dtype=bool)
        onehot[int(word)] = True
    
        return onehot
            
    def __generate_contexts(self, walk):
        sample = []
        for i in range(self.walk_length):
            center_word = self.__to_onehot(walk[i])
            contexts = []
            for j in range(i-self.window_size, i+self.window_size+1):
                if (j >= 0) and (i != j) and (j < self.walk_length):
                    contexts.append(self.__to_onehot(walk[j]))
            sample.append([center_word, contexts])
        return sample
        
    def __forward_pass(self, w):
        
        h = self.W.T @ w
        u = self.C @ h
        y_c = softmax(u)
        
        return y_c, h, u
    
    
    def __backpropagate(self, error, h, x):
        
        dL_dC = np.outer(error, h) # shape (n,d)
        dL_dW = np.outer(x, np.dot(self.C.T, error))
       
        self.W = self.W - self.lr*dL_dW
        self.C = self.C - self.lr*dL_dC
        
        
        

In [8]:
dw = DeepWalk(walk_length=100, window_size=5, n_walks=1000, n_dim=2, learning_rate=0.01, decay_rate=0.001, verbose=False).fit(G)

MemoryError: Unable to allocate 2.69 TiB for an array with shape (607565, 607565) and data type float64