In [367]:
%%capture

!python -m spacy download en_core_web_lg

In [59]:
%%capture

# -- External Import -- #
import pandas as pd
import numpy as np
import logging
import sys
import sklearn
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.lang.en import English


# -- Internal Import-- #
from libs.data.utils import get_data
from libs.data.corpus_preprocessing import clean_logs, data_preprocessing_pipeline

In [60]:
logging.basicConfig(
    format='%(asctime)s %(levelname)s | %(message)s',
    level=logging.INFO,
    stream=sys.stdout
)

logger = logging.getLogger('notebook_logger')

In [61]:
data = get_data('../../data/train_data')

../../data/train_data/2021-10-09_1633771683/SoaesbLogsPipeline_base.csv
../../data/train_data/2021-10-11_1633944527/SoaesbLogsPipeline_base.csv
../../data/train_data/2021-10-12_1634030876/SoaesbLogsPipeline_base.csv
../../data/train_data/2021-10-10_1633858095/SoaesbLogsPipeline_base.csv
../../data/train_data/2021-10-12_1634016461/SoaesbLogsPipeline_base.csv
../../data/train_data/2021-10-12_1634026757/SoaesbLogsPipeline_base.csv
../../data/train_data/2021-10-12_1634053503/SoaesbLogsPipeline_base.csv


In [62]:
cleaned_logs_dict = data_preprocessing_pipeline(data)

Walking through database...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [63]:
cat_logs = pd.concat(cleaned_logs_dict)
cat_logs = cat_logs.reset_index(drop=True)
print(f'Number of logs: {len(cat_logs.index)}')

cat_logs = cat_logs.drop_duplicates('log')
cat_logs = cat_logs.reset_index(drop=True)
print(f'Number of unique logs: {len(cat_logs.index)}')

Number of logs: 1581462
Number of unique logs: 42216


In [64]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

In [69]:
nlp = spacy.load("en_core_web_lg")
# parser = English()

In [80]:
cat_logs.head()

Unnamed: 0,container_name,log,label
0,1.master.broker.soaesb,ERROR org.apache.activemq.artemis.core.server...,healthy
1,1.backup.broker.soaesb,WARN org.apache.activemq.artemis.core.server ...,healthy
2,3.master.broker.soaesb,ERROR org.apache.activemq.artemis.core.server...,healthy
3,healthcheck,INFO HealthCheckService:106 UNHEALTHY unicorn...,healthy
4,healthcheck,INFO HealthCheckService:106 UNHEALTHY unicorn...,healthy


In [52]:
# doc = nlp(cat_logs['log'][2100])

# filtered_words = []

# for word in doc:
#     if word.is_stop==False:
#         filtered_words.append(word)
# #         print(word.text, word.pos_)
        
# filtered_words, cat_logs['log'][0]

In [71]:
import string

punctuations = string.punctuation

In [183]:
def tokenize_spacy(log):
    tokens = nlp(log)
    
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
        
    tokens = lemmas
    tokens = [tok for tok in tokens if tok not in STOP_WORDS]
    tokens = [tok for tok in tokens if tok not in punctuations]
    
    return tokens
    
#     tokens = [ word.lemma_.lower().strip() for word in tokens]
#     tokens = [ word.text.lower() for word in tokens ]
    
#     tokens = [ word for word in tokens if word not in STOP_WORDS and word not in punctuations ]
    
#     return tokens

In [357]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [294]:
cat_logs = cat_logs[~cat_logs['container_name'].str.contains('broker')]
cat_logs['container_name'].unique()

array(['healthcheck', '1.solr.soaesb', '2.solr.soaesb', '3.solr.soaesb',
       'ddf', 'security-handler-pki', 'org.apache.karaf.shell.core',
       'solr-dependencies', 'solr-factory-impl', 'catalog-solr-provider',
       'catalog-core-standardframework', 'persistence-core-listeners',
       '1.zookeeper.soaesb', '2.zookeeper.soaesb', 'soaesb',
       'org.apache.sshd.core', '3.zookeeper.soaesb',
       'org.apache.karaf.services.eventadmin', 'reflex-router-app',
       'mms-app', 'catalog-rest-impl', 'org.apache.aries.blueprint.core',
       'security-sts-ldapclaimshandler'], dtype=object)

In [369]:
cat_logs.head()

Unnamed: 0,container_name,log,label
3,healthcheck,INFO HealthCheckService:106 UNHEALTHY unicorn...,healthy
4,healthcheck,INFO HealthCheckService:106 UNHEALTHY unicorn...,healthy
5,healthcheck,INFO HealthCheckService:106 HEALTHY render /t...,healthy
6,healthcheck,INFO HealthCheckService:106,healthy
7,healthcheck,INFO HealthCheckService:106 UNHEALTHY rome ro...,healthy


In [371]:
count_vectorizer = CountVectorizer(tokenizer=tokenize_spacy, ngram_range=(3,3))

In [372]:
count_vectorizer.fit_transform(cat_logs['log'])

<20334x44796 sparse matrix of type '<class 'numpy.int64'>'
	with 760639 stored elements in Compressed Sparse Row format>

In [295]:
X = cat_logs['log']
y_labels = cat_logs['label']

In [297]:
cat_logs['label'].unique()

array(['healthy', 'locked-ldap-account', 'split-brain',
       'nitf-messaging-bundle-stopped', 'zookeeper-dead-containers',
       'core.soaesb-dead-soa-container', 'newscene-bundle-stopped',
       'unreachable-ldap', 'unreachable-unicorn',
       'core.soaesb-dead-soa-process', 'unreachable-render',
       'solr-dead-containers'], dtype=object)

In [298]:
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.2)

In [358]:
# classifier = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=None)
classifier = LogisticRegression(max_iter=5000, random_state=0, warm_start=True)
# classifier = MultinomialNB()
# classifier = SVC(kernel='poly', random_state=0)

# vectorizer = TfidfVectorizer(tokenizer=tokenize_spacy)
vectorizer = CountVectorizer(tokenizer=tokenize_spacy, ngram_range=(3,3))

# one_vs_rest = OneVsRestClassifier(classifier)

In [359]:
class predictor(sklearn.base.TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}
    
def clean_text(text):
    return text.strip().lower()

In [360]:
pipe = Pipeline([('cleaner', predictor()),
                ('vectorizer', vectorizer),
                ('classifier', classifier)])

In [361]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('cleaner', <__main__.predictor object at 0x7fdf2a899850>),
                ('vectorizer',
                 CountVectorizer(ngram_range=(3, 3),
                                 tokenizer=<function tokenize_spacy at 0x7fdf2df40790>)),
                ('classifier', SVC(kernel='poly', random_state=0))])

In [362]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [363]:
y_pred = pipe.predict(X_test)

In [364]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, average='macro')}")
print(f"Recall: {recall_score(y_test, y_pred, average='macro')}")
print(f"{confusion_matrix(y_test, y_pred)}")

Accuracy: 0.16006884681583478
Precision: 0.6080702056200173
Recall: 0.13403043542703533
[[ 20   0   3   0   2   0   0   0   4 207   2   0]
 [  0   0   0   0   0   0   0   0   0 175   0   0]
 [  0   0  33   0   0   0   0   0   1 332   1   0]
 [  0   0   0   0   0   0   0   0   0 178   0   0]
 [  0   0   2   0  32   8   0   0   1 395   0   0]
 [  0   0   1   0   1  24   0   0   8 347   1   0]
 [  0   0   0   0   0   5  23   0   1 270   1   0]
 [  0   0   1   0   2   2   0   4   0 274   3   0]
 [  0   0   0   0   1   1   1   0  43 398   1   0]
 [  0   0  13   0   0   3   0   0   2 408   0   0]
 [  0   0   3   0   1   0   0   1   1 417  36   0]
 [  0   0   0   0   0   0   0   0   4 341   0  28]]


  _warn_prf(average, modifier, msg_start, len(result))
