In [4]:
#import nltk; nltk.download('stopwords')

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional.
# Only messages at ERROR level or above are emitted, formatted as
# "<timestamp> : <level> : <message>".
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Silence DeprecationWarning messages for the rest of the session.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# NLTK stop words (English).
# NOTE(review): this needs the NLTK 'stopwords' corpus to be available locally;
# the commented-out nltk.download('stopwords') line at the top of this cell
# fetches it.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [8]:
# Load the Los Alamos authentication sample into a DataFrame.
# NOTE(review): DATA_PATH is an absolute path on the original author's machine;
# point it at your local copy of losalamosdata.csv before running.
DATA_PATH = '/Users/samanthawise/Documents/BRISTOL/MASTERS/data_science_toolbox/Workshops/data/losalamosdata.csv'

# on_bad_lines='warn' skips malformed rows and reports them, which is what
# error_bad_lines=False did (with its default warn_bad_lines=True) before that
# keyword was deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv(DATA_PATH, on_bad_lines='warn')
df.head()

Unnamed: 0.1,Unnamed: 0,source_computer,destination_computer,time,source_user@domain,destination_user@domain,auth_type,longon_type,auth_orien,success/fail
0,1,C2815,C625,3721,C2815$@DOM1,C2815$@DOM1,Kerberos,Network,LogOn,Success
1,2,C14046,C754,33492,ANONYMOUS LOGON@C754,ANONYMOUS LOGON@C754,NTLM,Network,LogOn,Success
2,3,C18769,C754,816792,C18769$@DOM1,C18769$@DOM1,NTLM,Network,LogOn,Success
3,4,C5808,C18781,1770007,ANONYMOUS LOGON@C18781,ANONYMOUS LOGON@C18781,NTLM,Network,LogOn,Success
4,5,C13050,C13050,30731,C13050$@DOM1,C13050$@DOM1,?,?,TGT,Success


In [79]:
# Just considering the users in source_user@domain.
#
# The user filter is a plain substring match on "U". That also matches
# "ANONYMOUS LOGON@...", so those rows are excluded explicitly.
# na=False treats a missing account name as "no match" instead of producing a
# mask with NaN in it, which cannot be used for boolean indexing.
source_account = df['source_user@domain']
has_u = source_account.str.contains("U", regex=False, na=False)
is_anonymous = source_account.str.contains("ANONYMOUS", regex=False, na=False)

# Select rows and columns in a single .loc call and take a copy, so the column
# assignment below writes to an independent frame rather than to a slice of a
# slice of df (which raises SettingWithCopyWarning).
users_df = df.loc[
    has_u & ~is_anonymous,
    ['time', 'destination_computer', 'source_user@domain'],
].copy()

# Keep only the part before '@', i.e. drop the domain suffix.
users_df['source_user@domain'] = users_df['source_user@domain'].str.split('@').str.get(0)

In [90]:
# Count the rows per source user, most frequent first.
# The top three users are U6836, U1653, U7998.
users_df.loc[:, 'source_user@domain'].value_counts()

U6836     6750
U1653     2849
U7998      815
U1723      483
U66        436
U4281      392
U3771      301
U463       259
U748       252
U293       250
U254       244
U636       234
U525       232
U737       219
U5002      203
U1430      201
U726       201
U1592      199
U1718      198
U307       196
U8849      195
U1916      189
U8601      187
U1522      184
U194       183
U162       183
U667       179
U1506      174
U12043     173
U3635      167
          ... 
U10987       1
U7302        1
U5759        1
U11492       1
U7323        1
U5797        1
U6963        1
U10704       1
U6644        1
U11707       1
U12422       1
U8371        1
U8471        1
U83          1
U12402       1
U11373       1
U8021        1
U6849        1
U11885       1
U7999        1
U137         1
U10477       1
U454         1
U11775       1
U7247        1
U6254        1
U1079        1
U7138        1
U1756        1
U9491        1
Name: source_user@domain, Length: 10001, dtype: int64

In [95]:
# Subsetting the data frame to the three most frequent users.

vals = ['U6836', 'U1653', 'U7998']

# isin(vals) selects exactly the rows whose user is one of `vals`, so the list
# above is the single place to edit when the set of users changes.
users_df1 = users_df.loc[users_df['source_user@domain'].isin(vals)]

users_df1

Unnamed: 0,time,destination_computer,source_user@domain
128,1250211,C7521,U6836
134,1253803,C2704,U6836
165,3056451,C5666,U1653
185,1253503,C12162,U6836
222,1252208,C12129,U6836
249,1252230,C7170,U6836
274,1250965,C11246,U6836
384,1254216,C3430,U6836
441,2901918,C5666,U1653
467,3110403,C5666,U1653


In [81]:
# Create a corpus, each element represents one day of connections.
# Every element is the slice of users_df whose "time" falls inside that day.

sec_day = 86400   # seconds in one day
num_days = 57     # number of consecutive day-long windows to build
corpus = []
for i in range(num_days):
    # Day i covers sec_day*i < time <= sec_day*(i+1); rows with time == 0 or
    # with time beyond the last window do not land in any element.
    day = (users_df["time"] <= sec_day*(i+1)) & (users_df["time"] > sec_day*i)
    # Deliberately not named `list`: rebinding that name would shadow the
    # builtin for every later cell in the kernel session.
    day_connections = users_df[day]
    corpus.append(day_connections)

In [82]:
# One document per row of users_df: the destination computer of that connection.
texts = users_df['destination_computer']

# Tokenise each document once; the same tokens feed both the dictionary and
# the bag-of-words encoding, so every token in the corpus has an id.
tokenized_texts = [text.split() for text in texts]

# Mapping between token ids and tokens.
id2word = corpora.Dictionary(tokenized_texts)

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corpus1 = [id2word.doc2bow(tokens) for tokens in tokenized_texts]

In [85]:
# Show the bag-of-words encoding of the first document only:
# a list of (token_id, count) pairs.
print(corpus1[:1])

[[(0, 1)]]


In [75]:
# Human readable format of corpus (term-frequency):
# each token id is looked up in id2word and paired with its count.
[
    [(id2word[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus1[:1]
]

[[('C2805', 1)]]

In [None]:
# Build LDA model on the bag-of-words corpus.
# gensim.models.LdaModel is the same class as gensim.models.ldamodel.LdaModel.

lda_model = gensim.models.LdaModel(
    corpus=corpus1,
    id2word=id2word,
    num_topics=20,          # number of topics to extract
    random_state=100,       # fixed seed so the run is repeatable
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Print the keywords of the model's topics (the model above is built with
# num_topics=20; print_topics() is called with its default arguments).
pprint(lda_model.print_topics())
# Index the trained model with the corpus to get its topic output per document.
# NOTE(review): with per_word_topics=True the result presumably also carries
# word-level topic information - confirm against the gensim docs.
doc_lda = lda_model[corpus1]