# IRC Behavioral Analysis

### Imports

In [13]:
import zat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from zat.log_to_dataframe import LogToDataFrame
from zat.bro_log_reader import BroLogReader
from collections import defaultdict
import numba

## Loading Data

In [14]:
import os

# Capture folders (sub-directories of log_dir) to process.
# Alternative capture list kept for reference:
# ['03','04','34','39','42','51','56','62']
log_names = ['irc1']

# Root of the project checkout; input logs and output files live below it.
project_dir = '/Users/preneond/Documents/Work/Stratosphere/IRC-Research/IRC-Behavioral-Analysis/'
log_dir = os.path.join(project_dir, 'zeek/logs/')
out_dir = os.path.join(project_dir, 'python/out/')

# File names of the per-capture result files written under out_dir.
fileout_join_freq = 'join_freq.log'
fileout_lev_dist = 'lev_dist.log'

# One input path per capture, in the same order as log_names.
logs_fn_join = [
    os.path.join(log_dir, capture, 'irc_join.log') for capture in log_names
]
logs_fn_privmsg = [
    os.path.join(log_dir, capture, 'irc_privmsg.log') for capture in log_names
]

In [15]:
def load_logs(file):
    """Read all rows of a Zeek/Bro log file into a list.

    Args:
        file: Path of the log file to read.

    Returns:
        A list with every row produced by ``BroLogReader(file).readrows()``
        in reading order, or an empty list when ``file`` is not an existing
        regular file.
    """
    if os.path.isfile(file):
        # Drain the row generator in one go.
        return list(BroLogReader(file).readrows())
    return []

In [7]:
# Load the JOIN and PRIVMSG rows of every capture; each result list has one
# entry per path, in the same order as logs_fn_join / logs_fn_privmsg.
logs_join = [load_logs(path) for path in logs_fn_join]
# logs_join = list(filter(lambda x: len(x) != 0 ,logs_join))

logs_privmsg = [load_logs(path) for path in logs_fn_privmsg]
# logs_privmsg = list(filter(lambda x: len(x) != 0 ,logs_privmsg))

Successfully monitoring /Users/preneond/Documents/Work/Stratosphere/IRC-Research/IRC-Behavioral-Analysis/zeek/logs/irc1/irc_join.log...
Successfully monitoring /Users/preneond/Documents/Work/Stratosphere/IRC-Research/IRC-Behavioral-Analysis/zeek/logs/irc1/irc_privmsg.log...


### Divide logs by channels

In [16]:
from collections import defaultdict
# For every capture, bucket its JOIN rows by the value of their 'channel'
# field; the result holds one mapping (channel -> rows) per capture.
logs_join_divided = []
for logs in logs_join:
    logs_per_channel = defaultdict(list)
    for log in logs:
        channel_rows = logs_per_channel[log['channel']]
        channel_rows.append(log)
    logs_join_divided.append(logs_per_channel)

In [17]:
# For every capture, bucket its PRIVMSG rows by the value of their 'target'
# field; the result holds one mapping (target -> rows) per capture.
logs_privmsg_divided = []
for logs in logs_privmsg:
    logs_per_channel = defaultdict(list)
    for log in logs:
        target_rows = logs_per_channel[log['target']]
        target_rows.append(log)
    logs_privmsg_divided.append(logs_per_channel)

## Number of Users in Channel per Day

In [18]:
import json

def ircjoin_compute(logs):
    """Compute the running total of join records for each calendar day.

    Args:
        logs: Sequence of records whose ``'ts'`` entry offers ``.date()``.

    Returns:
        ``(dates, dates_count)``: ``dates`` lists every day from the earliest
        to the latest record (inclusive, days without records included) and
        ``dates_count[i]`` is the number of records dated on or before
        ``dates[i]``. Returns ``(None, None)`` when ``logs`` is empty.
    """
    if len(logs) == 0:
        return None, None

    join_days = [record['ts'].date() for record in logs]
    first_day = min(join_days)
    last_day = max(join_days)

    # Number of records that fall on each individual day.
    joins_on_day = defaultdict(int)
    for day in join_days:
        joins_on_day[day] += 1

    # Walk the whole day range once, accumulating the running total.
    dates = []
    dates_count = []
    running_total = 0
    for offset in range((last_day - first_day).days + 1):
        day = first_day + timedelta(days=offset)
        running_total += joins_on_day[day]
        dates.append(day)
        dates_count.append(running_total)

    return dates, dates_count


def ircjoin_visualize(dates, dates_count):
    """Show a bar chart with ``dates`` on the x axis and ``dates_count`` as bar heights."""
    current_axes = plt.gca()
    current_axes.bar(dates, dates_count)
    plt.show()
    
def irc_save2file(data, file_out):
    """Write ``data`` to ``file_out`` as JSON, replacing any existing file.

    Missing parent directories of ``file_out`` are created first, so a
    capture that has no output folder yet does not abort the export.

    Args:
        data: JSON-serialisable object. Values the ``json`` module cannot
            encode natively are written as ``str(value)``.
        file_out: Destination file path.
    """
    parent_dir = os.path.dirname(file_out)
    if parent_dir:
        # os.makedirs('') raises, so only create a directory when the path has one.
        os.makedirs(parent_dir, exist_ok=True)

    # Plain write mode is enough: the file is never read back here.
    with open(file_out, 'w', encoding='utf-8') as f:
        json.dump(data, f, default=str)

In [19]:
# For each capture, write one entry per channel to the join-frequency file:
# [channel, [(day, running join total), ...]], or [] when no series exists.
for ln, l in zip(log_names, logs_join_divided):
    fn = os.path.join(out_dir, ln, fileout_join_freq)
    print(fn)
    data = []
    for l_k, log in l.items():
        d, dc = ircjoin_compute(log)
        # ircjoin_visualize(d, dc)
        data.append([] if d is None or dc is None else [l_k, list(zip(d, dc))])

    irc_save2file(data, fn)

# import random
# log = logs_join[2]
# ircjoin_visualize(log)

/Users/preneond/Documents/Work/Stratosphere/IRC-Research/IRC-Behavioral-Analysis/python/out/irc1/join_freq.log


## Levenshtein Distance of Messages in Channel

In [20]:
import itertools
from Levenshtein import distance as levenshtein_distance

def compute_levenshtein_distance(logs_msg):
    """Return the Levenshtein distance of every unordered pair of messages.

    Args:
        logs_msg: Iterable of messages.

    Returns:
        A list with one distance per 2-element combination of ``logs_msg``,
        in ``itertools.combinations`` order; empty when fewer than two
        messages are given.
    """
    return [
        levenshtein_distance(first, second)
        for first, second in itertools.combinations(logs_msg, 2)
    ]

### Bubble plot

In [23]:
from multiprocessing import Pool

# Number of loaded captures; not used by the code in this cell.
n = len(logs_privmsg)


def compute_lev_dist_per_channel(l_k, channel_logs=None):
    """Compute the pairwise Levenshtein distances of one channel's messages.

    Args:
        l_k: Channel key (the PRIVMSG ``target`` the records were grouped by).
        channel_logs: The PRIVMSG records of that channel; each record must
            provide a ``'msg'`` entry. When omitted, the records are looked
            up as ``logs[l_k]`` in the module-level ``logs`` mapping, which
            keeps the original single-argument call working.

    Returns:
        ``[l_k, distances]`` where ``distances`` is the list returned by
        ``compute_levenshtein_distance`` for the channel's messages.
    """
    if channel_logs is None:
        # Legacy path: relies on the global set by the loop below, which a
        # worker process only sees if it inherited the parent's memory.
        channel_logs = logs[l_k]

    print('channel: ', l_k)
    logs_msg = [log['msg'] for log in channel_logs]
    return [l_k, compute_levenshtein_distance(logs_msg)]


for ln, logs in zip(log_names, logs_privmsg_divided):
    fn = os.path.join(out_dir, ln, fileout_lev_dist)
    with Pool() as pool:
        # Hand every worker its channel's records explicitly instead of
        # having it read the loop variable `logs` from global state.
        data = pool.starmap(compute_lev_dist_per_channel, logs.items())
    irc_save2file(data, fn)

channel:  #worldchat
channel:  #linux-cs
channel:  #red_rulz
channel:  203FAD02P
channel:  #Kiev
channel:  Smetana
channel:  #dump
channel:  #cz
channel:  #idlerpg.hu
channel:  #programatori
channel:  otis
channel:  #adom
channel:  #news.cz
channel:  #Cybercafe
channel:  PR0FF0NEONE
channel:  Mik3
channel:  X@channels.undernet.org
channel:  #styja
channel:  kim^
channel:  F0xTr0T
channel:  #linux.cz
channel:  Umana`
channel:  #quizer
channel:  #atw
channel:  #take-over
channel:  #NetBSD
channel:  #cd1
channel:  #zcu
channel:  #romania
channel:  #OpenBSD
channel:  #9x9pub
channel:  #WORLDCHAT
channel:  #ddos
channel:  #kiev
channel:  izel
channel:  #freebsd
channel:  #beginner
channel:  #Jakarta
channel:  #ATW
channel:  #turks
channel:  #Beginner
channel:  #..
channel:  Keso
channel:  #amigacs
channel:  #sscait
channel:  #networker
channel:  #dupa_maryny
channel:  #kosice
channel:  #obsd
channel:  #Becka
channel:  #u
channel:  elisabetta`
channel:  #7gods
channel:  #Kosice
channel:  #du

In [None]:
# NOTE(review): logs_sources, logs_messages and logs_lev_dist are not assigned
# in the cells visible here - confirm where they are defined before running.
summary_template = '# Sources:\t{} \n# Messages:\t\t{} \nLev dist:\t{}'
print(summary_template.format(logs_sources, logs_messages, logs_lev_dist))

In [None]:
# Bubble plot: one marker per entry of logs_sources / logs_messages /
# logs_lev_dist.
# NOTE(review): these three names are not assigned in the cells visible
# here - confirm where they are defined before running.

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap performs the same lookup and exists in old and new releases.
cm = plt.get_cmap('jet')

x = np.array(logs_sources)
y = np.array(logs_messages)
# Marker size and colour both scale with the squared distance value.
z = 5 * np.power(np.array(logs_lev_dist), 2)

fig, ax = plt.subplots()
sc = ax.scatter(x, y, s=z, c=z, cmap=cm)
ax.grid(alpha=0.5)
fig.colorbar(sc)
# Label the scatter axes explicitly rather than whatever axes is "current".
ax.set_xlabel('Number of Users in Channel')
ax.set_ylabel('Number of Messages')
ax.set_title('Levenshtein Distance of Messages per Capture')
plt.show()

## 