In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
import hashlib
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta as rdelta
import seaborn as sns

In [2]:
def hashing(ip, h_f):
    """Hash a string with ``h_f`` and return the digest as a non-negative int.

    ip  -- text to hash; encoded to bytes with ``str.encode()`` first.
    h_f -- hashlib-style constructor (e.g. ``hashlib.sha512``): it is called
           with the bytes and must return an object exposing ``hexdigest()``.
    """
    hex_digest = h_f(ip.encode()).hexdigest()
    # Interpret the hexadecimal digest as one big base-16 number.
    return int(hex_digest, base=16)

In [3]:
def bit_jacard(b1, b2):
    """Jaccard similarity of two boolean masks: |b1 AND b2| / |b1 OR b2|.

    Both counts come from NumPy's element-wise logical ops, so the inputs
    must be broadcastable against each other. When neither mask has a set
    position the ratio is 0/0, which NumPy evaluates to ``nan`` (with a
    runtime warning) instead of raising.
    """
    shared = np.logical_and(b1, b2).sum()
    combined = np.logical_or(b1, b2).sum()
    return shared / combined

In [4]:
def bit_wise(data, n_feat, h_f):
    """Return a fresh boolean array of length ``n_feat`` with one slot set.

    The slot is ``hashing(data, h_f) % n_feat``, i.e. the hash of ``data``
    folded into the range ``[0, n_feat)``. Every other position is False.
    """
    slot = hashing(data, h_f) % n_feat
    bitmap = np.zeros(n_feat, dtype=bool)
    bitmap[slot] = True
    return bitmap

In [5]:
def jacard(a, b):
    """Jaccard similarity of two iterables, compared as sets.

    Returns ``len(A & B) / len(A | B)`` where ``A = set(a)`` and
    ``B = set(b)``. Raises ``ZeroDivisionError`` when both inputs are empty,
    because the union then has no elements.
    """
    left, right = set(a), set(b)
    return len(left & right) / len(left | right)

In [6]:
def min_jacard(l):
    """For each entry, find the earlier entry it is least similar to.

    Every element of ``l`` must be indexable, with a collection at index 3;
    similarity is ``jacard(l[k][3], l[j][3])`` for all ``j < k``.

    Returns a list with one tuple per entry of ``l``:
      * position 0 always yields ``(0, 0)`` (there is nothing to compare with);
      * position k > 0 yields ``(argmin + 1, minimum similarity)``, where
        ``argmin`` is the index (within ``l[:k]``) of the lowest score.
    """
    if len(l) == 0:
        return []

    results = [(0, 0)]
    for pos in range(1, len(l)):
        scores = [jacard(l[pos][3], earlier[3]) for earlier in l[:pos]]
        lowest = np.argmin(scores)
        results.append((lowest + 1, scores[lowest]))
    return results

In [7]:
def hitter_bit(d, inter, outer, n_feat, h_f):
    """Record ``outer`` in the bitmap stored under ``d[inter]`` (in place).

    A key seen for the first time gets a new ``bit_wise`` bitmap with the
    slot for ``outer`` set; for a known key the slot
    ``hashing(outer, h_f) % n_feat`` of the existing bitmap is switched on.
    Nothing is returned; ``d`` is mutated.
    """
    if inter in d:
        d[inter][hashing(outer, h_f=h_f) % n_feat] = 1
        return
    d[inter] = bit_wise(data=outer, n_feat=n_feat, h_f=h_f)

In [8]:
def hitter_original(d, inter, outer, n_feat, h_f):
    """Add ``outer`` to the set stored under ``d[inter]`` (in place).

    The set is created on first sight of ``inter``. ``n_feat`` and ``h_f``
    are accepted only so the signature matches ``hitter_bit``; they are not
    used here. Nothing is returned; ``d`` is mutated.
    """
    d.setdefault(inter, set()).add(outer)

In [9]:
def df2bitset(df, n_feat, hash_func, bit=True):
    """Group the peers of every ``inter`` IP found in ``df``.

    For a row whose ``direction`` is ``'outbound'`` the ``src_ip`` is the key
    (``inter``) and the ``dst_ip`` is the recorded peer (``outer``); for any
    other direction the roles are swapped.

    df        -- frame with ``direction``, ``src_ip`` and ``dst_ip`` columns.
    n_feat    -- bitmap length, forwarded to the hitter.
    hash_func -- hashlib-style constructor, forwarded to the hitter.
    bit       -- True: values are boolean bitmaps built by ``hitter_bit``;
                 False: values are plain sets built by ``hitter_original``.

    Returns a dict mapping each ``inter`` IP to its bitmap / set.
    """
    hitter = hitter_bit if bit else hitter_original
    d = dict()

    # Walk the three needed columns in lockstep instead of df.iterrows():
    # iterrows builds a Series per row, which dominates the runtime on large
    # frames. Passing the length lets tqdm show real progress and an ETA.
    rows = zip(df['direction'], df['src_ip'], df['dst_ip'])
    for direction, src_ip, dst_ip in tqdm(rows, total=len(df)):
        if direction == 'outbound':
            inter, outer = src_ip, dst_ip
        else:
            inter, outer = dst_ip, src_ip
        hitter(d, inter, outer, n_feat, hash_func)

    return d

In [10]:
def preprocess(df):
    """Parse ``Rdate`` and split ``df`` into its first and second 24-hour windows.

    ``Rdate`` is read as text whose first 14 characters are ``YYYYMMDDHHMMSS``.
    The column is converted to datetimes **in place** on the frame passed in.

    The windows are measured from the ``Rdate`` of the first row, so the frame
    is expected to start with its earliest record:
      * df1 -- rows with  start          <= Rdate < start + 1 day
               (strictly: every row earlier than start + 1 day);
      * df2 -- rows with  start + 1 day  <= Rdate < start + 2 days.

    Returns ``(df1, df2)``.
    """
    # Vectorised parse; only the first 14 characters carry the timestamp.
    stamps = df['Rdate'].astype('str').str[:14]
    df['Rdate'] = pd.to_datetime(stamps, format='%Y%m%d%H%M%S')

    # Address the column by name rather than assuming it sits at position 0.
    start = df['Rdate'].iloc[0]
    day1_end = start + pd.Timedelta(days=1)
    day2_end = start + pd.Timedelta(days=2)

    df1 = df[df['Rdate'] < day1_end]
    # Lower bound is inclusive so a row stamped exactly at start + 1 day
    # lands in df2 instead of being dropped from both windows.
    df2 = df[(df['Rdate'] >= day1_end) & (df['Rdate'] < day2_end)]
    return df1, df2

In [11]:
def original(df1, df2, n_feat, hash_func):
    """Compare each IP's peer set in ``df2`` with its peer set in ``df1``.

    Both frames are reduced to ``{ip: set(peers)}`` via
    ``df2bitset(..., bit=False)``. For every IP present in ``df2`` the result
    holds ``[jaccard similarity to its df1 peers, number of df2 peers]``;
    an IP missing from ``df1`` is compared against an empty set.

    Returns a dict keyed by IP.
    """
    first_peers = df2bitset(df1, n_feat, hash_func, bit=False)
    second_peers = df2bitset(df2, n_feat, hash_func, bit=False)

    res = {}
    for ip, peers in tqdm(second_peers.items()):
        res[ip] = [jacard(first_peers.get(ip, set()), peers), len(peers)]
    return res

In [12]:
# Load the connection log; later cells read its 'direction', 'src_ip'
# and 'dst_ip' columns.
df= pd.read_csv('./directed_data.csv')

In [13]:
# One 500000-slot boolean bitmap per key IP, with a slot switched on for
# every peer IP (SHA-512 hash modulo the bitmap length).
dfall_d = df2bitset(df,500000,hashlib.sha512)

0it [00:00, ?it/s]

In [14]:
# Pairwise bitmap Jaccard similarity: ll[ip] lists ip's similarity to every
# key of dfall_d, in dfall_d's iteration order (so the diagonal is included).
ll = {}
for ip_a, bits_a in tqdm(dfall_d.items()):
    ll[ip_a] = [bit_jacard(bits_a, bits_b) for bits_b in dfall_d.values()]

  0%|          | 0/701 [00:00<?, ?it/s]

In [15]:
# Square similarity table: one row per IP (index) and one column per IP.
alldf = pd.DataFrame(ll).transpose()
alldf.columns = list(dfall_d)

In [17]:
# Convert similarity to distance (1 - Jaccard similarity).
# NOTE: a later cell adds a 'group' column to alldf; re-running this cell
# after that point would pull that column into the matrix as well.
dist = 1 - alldf.to_numpy()

In [18]:
# Inspect the pairwise distance matrix (0 on the diagonal).
dist

array([[0.        , 0.50384006, 0.50094537, ..., 0.99999698, 1.        ,
        0.99999698],
       [0.50384006, 0.        , 0.5008941 , ..., 0.99999699, 0.99999699,
        0.99999699],
       [0.50094537, 0.5008941 , 0.        , ..., 0.999997  , 0.999997  ,
        0.999997  ],
       ...,
       [0.99999698, 0.99999699, 0.999997  , ..., 0.        , 1.        ,
        1.        ],
       [1.        , 0.99999699, 0.999997  , ..., 1.        , 0.        ,
        1.        ],
       [0.99999698, 0.99999699, 0.999997  , ..., 1.        , 1.        ,
        0.        ]])

In [19]:
from sklearn.cluster import AgglomerativeClustering

# Single-linkage agglomerative clustering on the precomputed distance matrix.
# n_clusters=None + distance_threshold lets the data decide the cluster count:
# clusters whose linkage distance is at or above 0.8 are not merged.
cluster_kwargs = dict(n_clusters=None, distance_threshold=0.8, linkage='single')
try:
    # scikit-learn >= 1.2 calls this parameter `metric`; the old `affinity`
    # spelling was deprecated in 1.2 and removed in 1.4.
    clusterer = AgglomerativeClustering(metric='precomputed', **cluster_kwargs)
except TypeError:
    # Older scikit-learn releases only accept `affinity`.
    clusterer = AgglomerativeClustering(affinity='precomputed', **cluster_kwargs)
clustering = clusterer.fit(dist)

In [20]:
# Cluster label for each row of dist (rows follow the key order of dfall_d).
clustering.labels_

array([11, 11, 11, 11,  0, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, 59,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, 67,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 45,  0,  0,  0,
        0,  0, 13,  6, 13,  6, 14,  0,  6, 14, 23,  6, 50,  6, 35,  6, 13,
       69, 53,  0,  0,  0,  0,  0,  0,  0,  0,  0, 42,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  1, 13,  0, 44,  0, 28,  0,  6,  6,  6, 13,  6,  0,
        0,  0,  0,  6,  6,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  6, 13,  6,  6, 68,  0,  0,  0,  0, 13,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, 64,  0, 43,  6, 13, 55, 65,  0,
       66,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 47,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0, 13,  6,  6,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  6,  0,
        0,  0,  0,  6, 13

In [21]:
# Attach each IP's cluster label to its row (alldf is indexed by IP).
alldf['group'] = clustering.labels_

In [22]:
# Lookup table: IP -> cluster label.
group_idx = alldf['group'].to_dict()

In [23]:
def group_segmentation(x):
    """Return the cluster label for one log row.

    ``x`` must provide ``direction``, ``src_ip`` and ``dst_ip``. The label is
    looked up in the module-level ``group_idx`` using ``src_ip`` for
    ``'outbound'`` rows and ``dst_ip`` for every other direction. Raises
    ``KeyError`` if that IP has no entry in ``group_idx``.
    """
    ip_field = 'src_ip' if x['direction'] == 'outbound' else 'dst_ip'
    return group_idx[x[ip_field]]

In [25]:
# Label every log row with the cluster of its key IP. This is the vectorised
# equivalent of applying group_segmentation row by row, which is very slow on
# a multi-million-row frame: take src_ip for 'outbound' rows and dst_ip for
# all others, then translate through group_idx.
key_ip = df['src_ip'].where(df['direction'] == 'outbound', df['dst_ip'])
row_groups = key_ip.map(group_idx)
# Series.map yields NaN for unknown keys; keep the original fail-fast behaviour.
if row_groups.isna().any():
    raise KeyError('row IP without a cluster label in group_idx')
df['group'] = row_groups

In [26]:
# Number of log rows that fall into each cluster, largest first.
df['group'].value_counts()

11    2736285
13    1776675
59     581104
0      534446
6       47324
       ...   
16          8
4           4
29          4
3           3
25          2
Name: group, Length: 70, dtype: int64

In [27]:
# Persist the labelled log without writing the row index.
df.to_csv('./grouped.csv', index=False)