## novelty

#### steps

1. study the data
    - commonality check
    - add/update stopwords
2. import required libraries
3. tokenize the lines
4. flatten the list
5. Word2Vec modeling (set no. of dimensions)
6. model.wv.__getitem__(word)[0]]
7. outlier values can be detected easily
---

In [1]:
# load libraries

import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.svm import OneClassSVM

import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
# reading the log file
error_list = []

with open('sample.log', 'r') as f:
    for line in f:
        error_list.append(line.replace('\n',''))

In [3]:
# sample check
error_list[:5]

['normal abc-123-222',
 'user has been authenticated - continue',
 'normal abc-123-223',
 'user has been authenticated - continue',
 'normal abc-123-224']

In [4]:
# remove stopwords; use nltk or prepare a list of stopwords
# here '-', 'has', 'been' etc. can be removed!

stopwords = ['-', 'has', 'been']

error_list = []

with open('sample.log', 'r') as f:
    for line in f:
        for sw in stopwords:
            line = line.replace(sw,'')
        error_list.append(line.replace('\n',''))

In [5]:
error_list[:5]

['normal abc123222',
 'user   authenticated  continue',
 'normal abc123223',
 'user   authenticated  continue',
 'normal abc123224']

In [6]:
# import tokenizer or split using Python
from nltk import word_tokenize as wt

In [7]:
tokens = [wt(word) for word in error_list]
tokens[:10]

[['normal', 'abc123222'],
 ['user', 'authenticated', 'continue'],
 ['normal', 'abc123223'],
 ['user', 'authenticated', 'continue'],
 ['normal', 'abc123224'],
 ['user', 'authenticated', 'continue'],
 ['normal', 'abc123225'],
 ['user', 'authenticated', 'continue'],
 ['normal', 'abc123226'],
 ['user', 'authenticated', 'continue']]

In [8]:
# removing duplicates from the list of words 
flat_tokens = [item for sublist in tokens for item in sublist]

In [9]:
flat_tokens = list(set(flat_tokens))
flat_tokens = [word for word in flat_tokens if word not in stopwords]
flat_tokens

['abc123225',
 'abc123233',
 'abc123224',
 'abc123222',
 'def123224',
 'continue',
 'abc123235',
 'user',
 'abc123234',
 'abnormal',
 'def123221',
 'authenticated',
 'abc123227',
 'normal',
 'abc123226',
 'abc123232',
 'error',
 'abc123236',
 'abc123223',
 'abc123237',
 'def123256']

In [10]:
# building the W2V model
model = Word2Vec(tokens, min_count=1, size=1)
model

<gensim.models.word2vec.Word2Vec at 0x13e3e302588>

In [11]:
# preparing the training data

train = []

for word in flat_tokens:
    train.append([word, model.wv.__getitem__(word)[0]])

In [12]:
train

[['abc123225', 0.13382563],
 ['abc123233', 0.063248985],
 ['abc123224', 0.4479755],
 ['abc123222', 0.07106219],
 ['def123224', 0.91472024],
 ['continue', 0.62603045],
 ['abc123235', 0.46992135],
 ['user', 0.5984304],
 ['abc123234', 0.13310286],
 ['abnormal', 2.6557949],
 ['def123221', 2.14216],
 ['authenticated', 0.71153855],
 ['abc123227', 0.46044984],
 ['normal', 1.8947587],
 ['abc123226', 0.30117008],
 ['abc123232', 0.2307637],
 ['error', 2.4710228],
 ['abc123236', 0.24889427],
 ['abc123223', 0.08246646],
 ['abc123237', 0.35918424],
 ['def123256', -0.23639652]]

In [13]:
# outliers - as we can see from the above 
# few values are less than 2; 
[x for x in train if x[1] > 2]

[['abnormal', 2.6557949], ['def123221', 2.14216], ['error', 2.4710228]]

In [14]:
# we can prepare training and test datasets appropriately
# and conduct further modeling with additional data, if required
# can be used to calculate precision, recall, auc etc.

#### retraining 

In [15]:
# building the W2V model
# size (100) => number of dimensions of the embedding
# min_count (5) => words with an occurrence less than this count will be ignored
# window (5) => maximum distance between a target word and words around the target word
# workers (3) => number of threads to use while training
model = Word2Vec(tokens, min_count=1, size=1, window=1)
model

<gensim.models.word2vec.Word2Vec at 0x13e3f10eac8>

In [16]:
# preparing the training data
train = []

for word in flat_tokens:
    train.append([word, model.wv.__getitem__(word)[0]])

In [17]:
train

[['abc123225', -0.02716694],
 ['abc123233', 0.08637203],
 ['abc123224', 0.39668235],
 ['abc123222', 0.13047819],
 ['def123224', 0.9008598],
 ['continue', 0.615411],
 ['abc123235', 0.42098936],
 ['user', 0.575515],
 ['abc123234', 0.15301408],
 ['abnormal', 2.3441765],
 ['def123221', 1.9316148],
 ['authenticated', 0.707954],
 ['abc123227', 0.4160377],
 ['normal', 1.907468],
 ['abc123226', 0.19146249],
 ['abc123232', 0.18989928],
 ['error', 2.4708848],
 ['abc123236', 0.18698314],
 ['abc123223', 0.013917897],
 ['abc123237', 0.328511],
 ['def123256', -0.21214862]]

In [18]:
[x for x in train if x[1] > 2]

[['abnormal', 2.3441765], ['error', 2.4708848]]