In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
def pagerank(df, beta, threshold=10**(-20), epochs=100):
    """Compute PageRank scores for the directed graph of src_ip -> dst_ip edges.

    Every row of ``df`` contributes one edge. Repeated rows are kept as
    parallel edges, so a destination that appears k times for the same source
    receives k shares of that source's rank.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'src_ip'`` and ``'dst_ip'``.
    beta : float
        Teleport probability: rank is propagated along outgoing edges with
        weight ``1 - beta``. Note this is the complement of what is often
        called the damping factor.
    threshold : float, optional
        Iteration stops early once the L1 distance between two successive
        score vectors drops below this value.
    epochs : int, optional
        Maximum number of power-iteration steps.

    Returns
    -------
    r : list of float
        ``r[i]`` is the score of ``ip_list[i]``; the scores sum to 1 up to
        floating-point rounding.
    ip_list : list
        Distinct addresses in order of first appearance (all sources in row
        order, followed by destinations not seen before).

    Both lists are empty when ``df`` has no rows.
    """
    src_ips = df['src_ip'].tolist()
    dst_ips = df['dst_ip'].tolist()

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # node indexing is reproducible across runs (a set of strings is not).
    ip_list = list(dict.fromkeys(src_ips + dst_ips))
    num_nodes = len(ip_list)
    if num_nodes == 0:
        # No edges: avoid the 1/num_nodes division below.
        return [], []

    ip_dict = {ip: idx for idx, ip in enumerate(ip_list)}

    # Adjacency list of out-edges, built straight from the two columns
    # instead of materialising a Series per row with iterrows().
    neighbors = [[] for _ in range(num_nodes)]
    for src, dst in zip(src_ips, dst_ips):
        neighbors[ip_dict[src]].append(ip_dict[dst])

    # Start from the uniform distribution.
    r = [1/num_nodes] * num_nodes

    for _ in range(epochs):
        r_next = [0] * num_nodes

        for u, targets in enumerate(neighbors):
            if not targets:
                # Dead end: its rank is redistributed by the teleport step.
                continue
            share = (1-beta) * r[u] / len(targets)
            for v in targets:
                r_next[v] += share

        # Whatever mass was not propagated (teleport fraction plus the rank
        # sitting on dead ends) is spread evenly over all nodes.
        teleport_prob = 1 - sum(r_next)
        for u in range(num_nodes):
            r_next[u] += teleport_prob / num_nodes

        delta = sum(abs(a-b) for a, b in zip(r, r_next))

        r = r_next

        if delta < threshold:
            break

    return r, ip_list

In [3]:
# Load the grouped flow records and preview the first rows
# (head() shows 5 rows by default).
total_df = pd.read_csv('grouped.csv')
total_df.head()

Unnamed: 0,Rdate,src_ip,dst_ip,Proto,src_port,dst_port,Action,src_country,dst_country,direction,group
0,20210410000000.0,154.58.159.102,103.177.12.42,6,52897,445,2,,US,outbound,11
1,20210410000000.0,154.58.159.20,125.66.92.196,6,60579,445,2,,DE,outbound,11
2,20210410000000.0,154.58.159.164,117.121.178.223,6,63831,445,2,,US,outbound,11
3,20210410000000.0,154.58.159.165,205.34.95.97,6,55241,445,2,,US,outbound,11
4,20210410000000.0,154.58.159.102,93.56.164.131,6,52898,445,2,,US,outbound,11


In [4]:
# Group ids that occur in outbound rows, ordered by descending row count
# (value_counts sorts by frequency by default).
total_df['group'][total_df['direction'] == 'outbound'].value_counts().index

Int64Index([11, 13, 59,  0,  6,  1, 45, 23, 55,  2, 18, 28, 35, 50, 24, 48, 14,
            30, 39, 41, 65, 17, 58, 38, 60, 69, 32,  5, 31, 64, 51, 56, 34, 52,
            26, 44, 22, 54, 10, 46,  7, 27, 42, 53, 47, 40, 61, 20,  8, 33, 21,
            37, 15,  9, 62, 36, 68, 67, 19, 57, 12, 63, 16, 29,  4,  3, 25],
           dtype='int64')

In [5]:
# Groups scored with pagerank() in this notebook, in the same order as the
# outbound value_counts listing minus its first four entries (11, 13, 59, 0).
# NOTE(review): groups 0, 13 and 59 are labelled from PageRank_group_*.csv
# files in a later cell, but group 11 appears in neither list — confirm that
# leaving it out is intentional.
grouparr = [
    6, 1, 45, 23, 55, 2, 18, 28, 35,
    50, 24, 48, 14, 30, 39, 41, 65, 17,
    58, 38, 60, 69, 32, 5, 31, 64, 51,
    56, 34, 52, 26, 44, 22, 54, 10, 46,
    7, 27, 42, 53, 47, 40, 61, 20, 8,
    33, 21, 37, 15, 9, 62, 36, 68, 67,
    19, 57, 12, 63, 16, 29, 4, 3, 25,
]

In [6]:
# group id -> {ip: label}; filled in by the labelling cells below.
label_dict = {}

In [7]:
# Score every group in grouparr with pagerank() and label each IP:
# 0 when its score lies within mean +/- 6 sigma of the group's scores,
# 1 (and printed) otherwise.
# Note: np.std defaults to the population standard deviation (ddof=0), for
# which no value can lie more than sqrt(n-1) sigmas from the mean, so a group
# needs at least 38 IPs before this test can flag anything.
for group in grouparr:
    label_dict[group] = labels = dict()
    df = total_df[total_df['group'] == group]
    r, ip = pagerank(df, 0.15)

    score = np.array(r)
    mean = np.mean(score)
    sigma = np.std(score)
    lower, upper = mean - 6*sigma, mean + 6*sigma

    for address, rank in zip(ip, r):
        is_outlier = not (lower <= rank <= upper)
        if is_outlier:
            print(group, address)
        labels[address] = int(is_outlier)

6 173.224.148.8
6 52.77.83.1
45 72.192.214.24
55 110.73.10.1
2 8.228.177.105
2 173.224.148.8
2 52.77.83.1
18 52.77.83.1
50 52.77.83.1
24 52.77.83.1
48 173.224.148.8
48 140.65.156.1
14 74.234.255.4
14 178.35.241.145
30 173.224.148.8
41 173.224.148.8
65 173.224.148.8
17 134.15.114.2
38 33.59.107.227
60 173.224.148.8
69 52.77.83.1
69 45.111.16.233
69 72.97.108.152
32 37.62.217.50
31 173.224.148.8
64 52.77.83.1
64 45.111.16.201
34 52.77.83.1
44 52.77.83.1
44 45.111.16.231
22 173.224.148.8
54 173.224.148.8
10 173.224.148.8
46 52.77.83.1
27 173.224.148.8
42 72.192.214.72
42 175.218.0.117
53 52.77.83.1
47 45.111.16.102
61 140.65.156.1
8 52.77.83.1
67 45.111.16.61


In [8]:
# Groups 0, 13 and 59 are labelled from scores stored in
# PageRank_group_<group>.csv (columns 'Id' and 'pageranks') instead of calling
# pagerank() — presumably exported by another tool; confirm the provenance.
# The labelling rule is the same 6-sigma test: 0 inside the band, 1 (and
# printed) outside it.
for group in (0, 13, 59):
    label_dict[group] = labels = dict()
    df = pd.read_csv(f'PageRank_group_{group}.csv')
    r = df['pageranks'].tolist()
    ip = df['Id'].tolist()

    score = np.array(r)
    mean = np.mean(score)
    sigma = np.std(score)
    lower, upper = mean - 6*sigma, mean + 6*sigma

    for address, rank in zip(ip, r):
        is_outlier = not (lower <= rank <= upper)
        if is_outlier:
            print(group, address)
        labels[address] = int(is_outlier)

0 52.77.83.1
0 173.224.148.8
13 52.77.83.1
59 72.192.214.61
59 173.224.148.8


In [9]:
# Display the complete mapping group -> {ip: 0 or 1}.
# NOTE(review): this renders every IP of every group in the notebook output;
# a per-group summary would be easier to read and keep the file small.
label_dict

{6: {'196.131.96.28': 0,
  '236.133.104.73': 0,
  '26.26.96.57': 0,
  '76.134.241.116': 0,
  '217.18.227.18': 0,
  '60.11.29.144': 0,
  '186.68.226.25': 0,
  '101.160.224.130': 0,
  '207.161.122.47': 0,
  '245.181.148.19': 0,
  '81.139.23.68': 0,
  '166.238.144.23': 0,
  '77.8.62.239': 0,
  '175.93.127.138': 0,
  '161.45.169.147': 0,
  '147.226.233.228': 0,
  '16.220.17.57': 0,
  '255.221.16.10': 0,
  '198.151.106.82': 0,
  '158.209.104.254': 0,
  '173.163.232.109': 0,
  '98.160.135.67': 0,
  '113.44.138.60': 0,
  '77.34.200.28': 0,
  '241.150.159.19': 0,
  '112.152.119.138': 0,
  '101.250.109.254': 0,
  '118.129.82.26': 0,
  '223.223.146.98': 0,
  '160.73.79.232': 0,
  '136.131.162.227': 0,
  '37.195.36.219': 0,
  '113.44.138.183': 0,
  '105.44.91.199': 0,
  '166.238.144.22': 0,
  '184.44.42.31': 0,
  '112.152.119.70': 0,
  '35.192.76.36': 0,
  '29.226.124.67': 0,
  '93.147.91.227': 0,
  '139.209.16.164': 0,
  '56.138.41.132': 0,
  '60.65.88.48': 0,
  '37.141.236.228': 0,
  '181.159.1

In [10]:
# Persist the labels for downstream use; the file is overwritten on each run.
with open('label_dict.pkl', mode='wb') as out_file:
    pickle.dump(label_dict, out_file)