# CMSC Session Extraction

Comprehensive, Multi-Source Cybersecurity Events Dataset (https://csr.lanl.gov/data/cyber1/), also found https://www.secrepo.com)

Explore temporal token Gram

`$ gzip -dc auth.txt.gz | gsplit -l 50000000 -d - auth_part_ --filter='gzip > $FILE.csv.gz'`
 
<b>Note: This notebook requires pandas 0.25.3 (python 3.6+) to run.</b>

In [2]:
import pandas as pd
cmsc_dataset = {
    "redteam": {
        'filename': './CMSC/redteam.txt.gz',
        'options': {'compression': 'gzip'},
        'columns': ['attack_time', 'src_user', 'src_machine', 'dst_machine']
    },

    "auth" : {
        'filename': './CMSC/auth_part_04.csv.gz',
        'options': {'compression': 'gzip'},
        'columns': ['time', 'src_user', 'dst_user', 'src_machine', 'dst_machine', 
                    'auth_type', 'logon_type', 'orientation', 'result']
    },
    
    "auth_100k" : {
        'filename': './CMSC/auth_100k.txt',
        'options': {},
        'columns': ['time', 'src_user', 'dst_user', 'src_machine', 'dst_machine', 
                    'auth_type', 'logon_type', 'orentiation', 'result']
    },

    "proc" : {
        'filename' :  './CMSC/proc.txt.gz',
        'options': {'compression': 'gzip'},
        'columns' : ["time","user_domain","computer","process" "name","action"]
    }
}

def load_logs(name):
    try:
        dataset =  cmsc_dataset[name]
        return pd.read_csv(dataset['filename'], **dataset['options'], header=None, names=dataset['columns'])
    except KeyError:
        print("Dataset [{}] does not exist".format(name))

In [3]:
load_logs('noexists')

Dataset [noexists] does not exist


## Load Authentication Logs

In [86]:
auth = load_logs('auth')

In [87]:
auth = auth[(auth['orentiation'] == 'LogOn') | (auth['orientation'] == 'LogOff')]

In [88]:
auth.head(5)

Unnamed: 0,time,src_user,dst_user,src_machine,dst_machine,auth_type,logon_type,orentiation,result
0,1044281,C609$@DOM1,C609$@DOM1,C609,C528,Kerberos,Network,LogOn,Success
1,1044281,C612$@DOM1,C612$@DOM1,C612,C612,?,Network,LogOff,Success
2,1044281,C612$@DOM1,C612$@DOM1,C612,C612,Kerberos,Network,LogOn,Success
3,1044281,C6312$@DOM1,C6312$@DOM1,C6312,C612,Kerberos,Network,LogOn,Success
4,1044281,C664$@DOM1,C664$@DOM1,C612,C612,?,Network,LogOff,Success


### Extract Sessions by LogOn and LogOff event
- Use destination user and destination machine as key to find out all sessions with Logon and LogOff 
- Later we will look at processes created by each user on the destination machines as the events within a session

In [89]:
from enum import Enum

class State(Enum):
    Init = 0
    Started = 1
    Ended = 2

def isEnd(t):
    return t[0] == 0

def isStart(t):
    return t[1] == 0

"""
Suppress Event(a, b), start event if b == 0 else End event
"""
def suppress(events):
    output = []
    state = State.Init
    open_session = (0, 0)
    for e in events:
        
        if state == State.Init:
            if isEnd(e):
                open_session = (0, e[1])
            else:
                state = State.Started
                open_session = (e[0], 0)
                
        elif state == State.Started:
            if open_session[1] > 0:  
                output.append(open_session)
            
            if isEnd(e):
                open_session = (open_session[0], e[1])
                state = State.Ended
                
        else: 
            if isEnd(e):
                open_session = (open_session[0], e[1])  
            elif isStart(e):
                output.append(open_session)
                open_session = e
                state = State.Started
                
    if open_session[1] > 0 or open_session[0] > 0:
        output.append(open_session)
        
    return output

assert suppress([(71, 0), (81, 0), (0, 84), (0, 85)]) == [(71, 85)]
assert suppress([(71, 0), (81, 0), (0, 84)]) == [(71, 84)]
assert suppress([(71, 0), (81, 0)]) == [(71, 0)]
assert suppress([(0, 81)]) == [(0, 81)]

### Convert LogOn/LogOff field to a tuple (_, _)
- LogOn will be (_, 0) and logOff will be (0, _)

In [90]:
auth_sess = auth[(auth['orentiation'] == 'LogOn') | (auth['orentiation'] == 'LogOff')].assign(
    auth_time=auth.apply(lambda x: (x['time'], 0) if x['orentiation'] == 'LogOn' else (0, x['time']), axis=1),
).groupby(['dst_user', 'src_machine', 'dst_machine']).agg({'auth_time': lambda x: list(x)})

In [145]:
auth_sess.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,auth_time
dst_user,src_machine,dst_machine,Unnamed: 3_level_1
ANONYMOUS LOGON@C1,C1,C1,"[(1067590, 0)]"
ANONYMOUS LOGON@C10,C10,C10,"[(0, 1066242), (0, 1066253), (0, 1067934), (0,..."
ANONYMOUS LOGON@C10,C10037,C10,"[(1252299, 0)]"
ANONYMOUS LOGON@C10,C10144,C10,"[(1252416, 0)]"
ANONYMOUS LOGON@C10,C10328,C10,"[(1252274, 0), (1252278, 0)]"
ANONYMOUS LOGON@C10,C11082,C10,"[(1252789, 0)]"
ANONYMOUS LOGON@C10,C11303,C10,"[(1240219, 0), (1240221, 0)]"
ANONYMOUS LOGON@C10,C13107,C10,"[(1252564, 0)]"
ANONYMOUS LOGON@C10,C13214,C10,"[(1252542, 0)]"
ANONYMOUS LOGON@C10,C14005,C10,"[(1088608, 0)]"


### identify sessions and remove open session (0, 0) or (0, _) if any

In [91]:
sessions = auth_sess.assign(
    sessions=auth_sess['auth_time'].apply(
        lambda x: [a for a in suppress(x) if a[0] != 0 and a[1] != 0]   # remove open session
    )
).reset_index()

In [92]:
# filter out empty user and open-sessions
auth_sessions = sessions[
    (sessions['dst_user'] != '') & (sessions['sessions'].apply(lambda x: len(x) != 0))
][['dst_user', 'src_machine', 'dst_machine', 'sessions']]

In [144]:
auth_sessions.head(25)

Unnamed: 0,dst_user,src_machine,dst_machine,sessions
46,ANONYMOUS LOGON@C100,C100,C100,"[(1168334, 1228174)]"
48,ANONYMOUS LOGON@C1000,C1000,C1000,"[(1064333, 1197942)]"
87,ANONYMOUS LOGON@C10071,C10071,C10071,"[(1233821, 1246194)]"
95,ANONYMOUS LOGON@C10089,C10089,C10089,"[(1233699, 1251215)]"
102,ANONYMOUS LOGON@C10107,C10107,C10107,"[(1063702, 1246756)]"
106,ANONYMOUS LOGON@C10114,C10114,C10114,"[(1064084, 1088312), (1151129, 1151869), (1157..."
109,ANONYMOUS LOGON@C10118,C10118,C10118,"[(1062473, 1069278)]"
118,ANONYMOUS LOGON@C10136,C10136,C10136,"[(1233671, 1247378)]"
124,ANONYMOUS LOGON@C1015,C1015,C1015,"[(1134274, 1251502)]"
137,ANONYMOUS LOGON@C10150,C10150,C10150,"[(1077307, 1181568)]"


## Load Redteam Logs

In [4]:
redteam = load_logs('redteam')
redteam['label'] = '1'

In [11]:
redteam.groupby('src_user').agg({'dst_machine': 'count'}).reset_index().sort_values(by='dst_machine').tail(10)

Unnamed: 0,src_user,dst_machine
102,U9947@DOM1,15
48,U3635@DOM1,18
23,U1723@DOM1,19
98,U8946@DOM1,19
79,U748@DOM1,26
37,U293@DOM1,31
22,U1653@DOM1,31
77,U737@DOM1,32
38,U3005@DOM1,36
70,U66@DOM1,118


In [129]:
redteam.tail(100)

Unnamed: 0,attack_time,src_user,src_machine,dst_machine,label
649,1244695,U114@DOM1,C17693,C1710,1
650,1245222,U1106@DOM1,C17693,C4554,1
651,1245398,U1106@DOM1,C17693,C21919,1
652,1245678,U3575@DOM1,C17693,C5030,1
653,1246243,U3206@DOM1,C17693,C8585,1
...,...,...,...,...,...
744,2552687,U655@DOM1,C17693,C828,1
745,2552687,U655@DOM1,C17693,C828,1
746,2552687,U655@DOM1,C17693,C828,1
747,2552687,U655@DOM1,C17693,C828,1


In [138]:
labeled_auth = auth.join(redteam.set_index(['src_user', 'src_machine', 'dst_machine', 'attack_time']), 
                         on=['dst_user', 'src_machine', 'dst_machine', 'time'])

In [143]:
labeled_auth.head(10)

Unnamed: 0,time,src_user,dst_user,src_machine,dst_machine,auth_type,logon_type,orentiation,result,label
0,1044281,C609$@DOM1,C609$@DOM1,C609,C528,Kerberos,Network,LogOn,Success,
1,1044281,C612$@DOM1,C612$@DOM1,C612,C612,?,Network,LogOff,Success,
2,1044281,C612$@DOM1,C612$@DOM1,C612,C612,Kerberos,Network,LogOn,Success,
3,1044281,C6312$@DOM1,C6312$@DOM1,C6312,C612,Kerberos,Network,LogOn,Success,
4,1044281,C664$@DOM1,C664$@DOM1,C612,C612,?,Network,LogOff,Success,
5,1044281,C6828$@DOM1,C6828$@DOM1,C6828,C528,Kerberos,Network,LogOn,Success,
6,1044281,C6897$@DOM1,C6897$@DOM1,C1015,C1015,?,Network,LogOff,Success,
7,1044281,C729$@DOM1,C729$@DOM1,C1015,C1015,?,Network,LogOff,Success,
8,1044281,C743$@DOM1,C743$@DOM1,C586,C586,?,Network,LogOff,Success,
9,1044281,C743$@DOM1,C743$@DOM1,C743,C586,Kerberos,Network,LogOn,Success,


In [142]:
labeled_auth.label.fillna(0)

0           0
1           0
2           0
3           0
4           0
           ..
49999995    0
49999996    0
49999997    0
49999998    0
49999999    0
Name: label, Length: 42599254, dtype: object

In [147]:
labeled_auth[labeled_auth['label'] == '1']

Unnamed: 0,time,src_user,dst_user,src_machine,dst_machine,auth_type,logon_type,orentiation,result,label
4179730,1066394,U66@DOM1,U66@DOM1,C17693,C3610,NTLM,Network,LogOn,Success,1
4289405,1066725,U66@DOM1,U66@DOM1,C17693,C881,NTLM,Network,LogOn,Success,1
4292560,1066734,U66@DOM1,U66@DOM1,C17693,C1482,NTLM,Network,LogOn,Success,1
4309577,1066786,U66@DOM1,U66@DOM1,C17693,C2388,NTLM,Network,LogOn,Success,1
4836042,1068312,U12@DOM1,U12@DOM1,C17693,C366,NTLM,Network,LogOn,Success,1
...,...,...,...,...,...,...,...,...,...,...
48253180,1246653,U114@DOM1,U114@DOM1,C17693,C1710,NTLM,Network,LogOn,Success,1
49450524,1251013,U3549@DOM1,U3549@DOM1,C17693,C1887,NTLM,Network,LogOn,Success,1
49456293,1251033,U3549@DOM1,U3549@DOM1,C17693,C1887,NTLM,Network,LogOn,Success,1
49465247,1251067,U3549@DOM1,U3549@DOM1,C17693,C586,NTLM,Network,LogOn,Success,1


In [44]:
redteam_sessions = redteam.groupby(['src_user', 'src_machine', 'dst_machine']).agg(lambda x: list(x)).reset_index()

In [45]:
redteam_sessions.head(5)

Unnamed: 0,src_user,src_machine,dst_machine,time
0,U1025@DOM1,C17693,C1046,"[1153024, 1153118, 1153716]"
1,U1025@DOM1,C17693,C338,"[1152460, 1152644]"
2,U1025@DOM1,C17693,C3597,[1155729]
3,U10379@C3521,C17693,C3521,"[726137, 2303788]"
4,U1048@DOM1,C17693,C12320,"[768006, 2305762]"


In [49]:
auth_redteam = auth_sessions.join(redteam_sessions.set_index(['src_user', 'src_machine', 'dst_machine']), 
                                  on=['dst_user', 'src_machine', 'dst_machine'], 
                                  how='outer')

In [47]:
auth_sessions.head(10)

Unnamed: 0,dst_user,src_user,src_machine,dst_machine,sessions
46,ANONYMOUS LOGON@C100,ANONYMOUS LOGON@C100,C100,C100,"[(1168334, 1228174)]"
48,ANONYMOUS LOGON@C1000,ANONYMOUS LOGON@C1000,C1000,C1000,"[(1064333, 1197942)]"
87,ANONYMOUS LOGON@C10071,ANONYMOUS LOGON@C10071,C10071,C10071,"[(1233821, 1246194)]"
95,ANONYMOUS LOGON@C10089,ANONYMOUS LOGON@C10089,C10089,C10089,"[(1233699, 1251215)]"
102,ANONYMOUS LOGON@C10107,ANONYMOUS LOGON@C10107,C10107,C10107,"[(1063702, 1246756)]"
106,ANONYMOUS LOGON@C10114,ANONYMOUS LOGON@C10114,C10114,C10114,"[(1064084, 1088312), (1151129, 1151869), (1157..."
109,ANONYMOUS LOGON@C10118,ANONYMOUS LOGON@C10118,C10118,C10118,"[(1062473, 1069278)]"
118,ANONYMOUS LOGON@C10136,ANONYMOUS LOGON@C10136,C10136,C10136,"[(1233671, 1247378)]"
124,ANONYMOUS LOGON@C1015,ANONYMOUS LOGON@C1015,C1015,C1015,"[(1134274, 1251502)]"
137,ANONYMOUS LOGON@C10150,ANONYMOUS LOGON@C10150,C10150,C10150,"[(1077307, 1181568)]"


In [50]:
auth_redteam.head(10)

Unnamed: 0,dst_user,src_user,src_machine,dst_machine,sessions,time
46,ANONYMOUS LOGON@C100,ANONYMOUS LOGON@C100,C100,C100,"[(1168334, 1228174)]",
48,ANONYMOUS LOGON@C1000,ANONYMOUS LOGON@C1000,C1000,C1000,"[(1064333, 1197942)]",
87,ANONYMOUS LOGON@C10071,ANONYMOUS LOGON@C10071,C10071,C10071,"[(1233821, 1246194)]",
95,ANONYMOUS LOGON@C10089,ANONYMOUS LOGON@C10089,C10089,C10089,"[(1233699, 1251215)]",
102,ANONYMOUS LOGON@C10107,ANONYMOUS LOGON@C10107,C10107,C10107,"[(1063702, 1246756)]",
106,ANONYMOUS LOGON@C10114,ANONYMOUS LOGON@C10114,C10114,C10114,"[(1064084, 1088312), (1151129, 1151869), (1157...",
109,ANONYMOUS LOGON@C10118,ANONYMOUS LOGON@C10118,C10118,C10118,"[(1062473, 1069278)]",
118,ANONYMOUS LOGON@C10136,ANONYMOUS LOGON@C10136,C10136,C10136,"[(1233671, 1247378)]",
124,ANONYMOUS LOGON@C1015,ANONYMOUS LOGON@C1015,C1015,C1015,"[(1134274, 1251502)]",
137,ANONYMOUS LOGON@C10150,ANONYMOUS LOGON@C10150,C10150,C10150,"[(1077307, 1181568)]",


# Load Proc Logs

In [3]:
procs = load_logs('proc')

In [5]:
procs.head(10)

Unnamed: 0,time,user_domain,computer,processname,action
0,1,C1$@DOM1,C1,P16,Start
1,1,C1001$@DOM1,C1001,P4,Start
2,1,C1002$@DOM1,C1002,P4,Start
3,1,C1004$@DOM1,C1004,P4,Start
4,1,C1017$@DOM1,C1017,P4,Start
5,1,C1018$@DOM1,C1018,P4,Start
6,1,C1020$@DOM1,C1020,P3,Start
7,1,C1020$@DOM1,C1020,P4,Start
8,1,C1028$@DOM1,C1028,P16,End
9,1,C1029$@DOM1,C1029,P4,Start


In [11]:
procs['action'].unique()

array(['Start', 'End'], dtype=object)

In [7]:
allprocessnames = procs['processname'].unique()

In [8]:
len(allprocessnames)

62974

In [None]:
procs.groupBy(['user_domain', 'computer', 'processname']).agg()

In [22]:
testprocs = procs.head(10000)

In [55]:
def identifyProcess(proc_df):
    proc_action_time = proc_df[(proc_df['action']=='Start') | (proc_df['action']=='End')].assign(
        action_time=proc_df.apply(lambda x: (x['time'], 0) if x['action'] == 'Start' else (0, x['time']), axis=1)
    ).groupby(['user_domain', 'computer', 'processname']).agg({'action_time': lambda x: list(x)})
    
    print(proc_action_time.head(2))
    proc_clean_df = proc_action_time.assign(
        proc_dur=proc_action_time['action_time'].apply(
            lambda x: [a for a in suppress(x) if a[0] != 0 and a[1] != 0]
        )
    ).reset_index()
    
    return proc_clean_df[proc_clean_df['proc_dur'].apply(lambda x: len(x) != 0)]

In [71]:
testprocs[testprocs['computer'] == 'C1006']

Unnamed: 0,time,user_domain,computer,processname,action
448,2,C1006$@DOM1,C1006,P25,Start
1949,3,C1006$@DOM1,C1006,P25,End
3529,3,U20@DOM1,C1006,P20,Start
3531,3,U40@DOM1,C1006,P21,Start
3532,3,U40@DOM1,C1006,P259,End
3533,3,U40@DOM1,C1006,P259,Start
3534,3,U40@DOM1,C1006,P35,Start
4300,4,U40@DOM1,C1006,P21,End
4301,4,U40@DOM1,C1006,P35,End
5753,36,C1006$@DOM1,C1006,P25,Start


In [56]:
x=identifyProcess(testprocs)

                                          action_time
user_domain computer processname                     
C1$@DOM1    C1       P16             [(1, 0), (0, 6)]
C10$@DOM1   C10      P9           [(44, 0), (0, 122)]


In [83]:
x[x['computer'] == 'C1006']

Unnamed: 0,user_domain,computer,processname,action_time,proc_dur
0,C1$@DOM1,C1,P16,"[(1, 0), (0, 6)]","[(1, 6)]"
1,C10$@DOM1,C10,P9,"[(44, 0), (0, 122)]","[(44, 122)]"
22,C1006$@DOM1,C1006,P25,"[(2, 0), (0, 3), (36, 0), (0, 37), (37, 0), (0...","[(2, 3), (36, 37), (37, 38), (38, 97)]"
23,C1007$@DOM1,C1007,P47,"[(21, 0), (0, 29)]","[(21, 29)]"
30,C101$@DOM1,C101,P25,"[(0, 12), (0, 17), (17, 0), (60, 0), (0, 72), ...","[(17, 102), (102, 136), (136, 137), (137, 145)]"
...,...,...,...,...,...
6875,U90@DOM1,C1786,P79,"[(0, 9), (9, 0), (0, 67), (67, 0)]","[(9, 67)]"
6877,U92@DOM1,C730,P53,"[(7, 0), (0, 9)]","[(7, 9)]"
6878,U92@DOM1,C730,P80,"[(1, 0), (0, 7)]","[(1, 7)]"
6879,U95@DOM1,C2055,P105,"[(48, 0), (0, 62), (78, 0), (0, 111), (134, 0)...","[(48, 62), (78, 111), (134, 143)]"


In [81]:
t = x.groupby(['user_domain', 'computer']).apply(
    lambda x: list(zip(x['processname'], x['proc_dur']))).reset_index()

In [82]:
t[t['computer'] == 'C1006']

Unnamed: 0,user_domain,computer,0
2,C1006$@DOM1,C1006,"[(P25, [(2, 3), (36, 37), (37, 38), (38, 97)])]"
368,U20@DOM1,C1006,"[(P20, [(3, 67)])]"
389,U40@DOM1,C1006,"[(P21, [(3, 4)]), (P35, [(3, 4)])]"
