In [1]:
%%capture
!pip install drain3

In [13]:
from scipy import stats 
from drain3.drain import Drain
from matplotlib import pyplot as plt

import joblib
import numpy as np
import seaborn as sns
import pandas as pd

### Import Cluster Hit Arrays

In [29]:
# Hit matrices, indexed by the cells below as cluster_hits[collection][label][container];
# each matrix row is addressed with (Drain cluster id - 1).
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
cluster_hits = joblib.load('/home/jovyan/results/matrices_dict.joblib')

In [30]:
# Hit summaries, iterated below as [collection][label][container][cluster] -> hits,
# where hits[0] is used as a 1-based row id and hits[1] is compared with that row's sum.
cluster_hits_dict = joblib.load('/home/jovyan/results/hits_dict.joblib')

In [31]:
# Container names; used below as keys into the Drain model dict and the hit dicts.
containers = joblib.load('/home/jovyan/results/containers.joblib')

In [32]:
# Scenario labels (displayed further down as a set of strings such as 'healthy').
labels = joblib.load('/home/jovyan/results/labels.joblib')

In [33]:
# Collection indices (displayed further down as the set {0, ..., 14}).
# NOTE(review): this name would shadow the stdlib `collections` module if it were ever imported here.
collections = joblib.load('/home/jovyan/results/collections.joblib')

In [34]:
# Drain models keyed by container name; the cells below read .clusters and .id_to_cluster from them.
dd = joblib.load('/home/jovyan/results/drain_dict.joblib')

### Validate Data

In [45]:
# Number of clusters of each container multiplied by the number of labels and
# collections, accumulated over all containers.
# The accumulator deliberately is not called `sum`: that would shadow the builtin
# for every later cell executed in the same kernel.
expected_entries = 0
for container in containers:
    expected_entries += len(dd[container].clusters) * len(labels) * len(collections)

print(expected_entries)

20040


In [53]:
# Cross-check the two hit stores: for every cluster entry, hits[1] should equal the
# sum of the matching row of the hit matrix (row = hits[0] - 1).  Mismatching
# entries are printed; `counter` is the number of entries inspected.
counter = 0
for idx, cols in cluster_hits_dict.items():
    for label, l_vals in cols.items():
        for container, c_vals in l_vals.items():
            # Row totals of the hit matrix for this collection/label/container.
            sigma = np.sum(cluster_hits[idx][label][container], axis=1)
            for cluster, hits in c_vals.items():
                counter += 1
                recorded = hits[1]
                if recorded == sigma[hits[0] - 1]:
                    continue
                print(cluster)
                print(recorded)
                print(sigma[hits[0] - 1])

print(counter)

ID=18    : size=1604      : <*> <*> INFO 1 --- <*> m.a.g.st.mms.catalog.MmsCatalogHelper : <*> <*> mission metacard (ROME <*>
2
1.0
3434


In [54]:
# Template of the cluster that the validation cell above printed as a mismatch.
c = '<*> <*> INFO 1 --- <*> m.a.g.st.mms.catalog.MmsCatalogHelper : <*> <*> mission metacard (ROME <*>'
count = 0
for idx, cols in cluster_hits_dict.items():
    for label, l_vals in cols.items():
        for container, c_vals in l_vals.items():
            for cluster, hits in c_vals.items():
                # NOTE(review): with the template filter below commented out, `c` is unused
                # and hits[1] of every cluster is added, so the printed value is the total
                # over all clusters rather than the total for template `c`.
                # if cluster.get_template() == c:
                count += hits[1]
print(count)

270450


### Record Means

In [19]:
# means_dict mirrors the key structure of cluster_hits, [collection][label][container],
# and holds for each hit matrix the row-wise trimmed mean (proportiontocut=0.05).
means_dict = {
    idx: {
        label: {
            container: stats.trim_mean(c_val, axis=1, proportiontocut=0.05)
            for container, c_val in l_val.items()
        }
        for label, l_val in col_val.items()
    }
    for idx, col_val in cluster_hits.items()
}

In [9]:
labels

{'core.soaesb-dead-soa-process',
 'healthy',
 'newscene-bundle-stopped',
 'nitf-messaging-bundle-stopped'}

In [10]:
collections

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}

### Remove Clusters With Zero Variability

In [27]:
# For every (collection, label, container) collect the templates of the clusters
# whose row in the hit matrix is entirely zero.  Despite its name, var_clusters
# therefore holds the clusters with no hits in that slice.
var_clusters = {
    idx: {label: {container: set() for container in containers} for label in labels}
    for idx in collections
}

for idx, col_records in cluster_hits.items():
    for label, l_records in col_records.items():
        for container, hits in l_records.items():
            id_to_cluster = dd[container].id_to_cluster
            silent_rows = np.flatnonzero(~hits.any(axis=1))
            # Drain cluster ids are 1-based while the matrix rows are 0-based, hence row + 1.
            var_clusters[idx][label][container].update(
                id_to_cluster[row_idx + 1].get_template() for row_idx in silent_rows
            )

# Per container: the templates that are all-zero in every collection and every label.
# Start from all templates of the container and keep only those that are also
# present in each per-slice zero set.
non_variable_clusters = {}
for container in containers:
    always_zero = {c.get_template() for c in dd[container].clusters}
    for idx in collections:
        for label in labels:
            always_zero &= var_clusters[idx][label][container]
    non_variable_clusters[container] = always_zero

In [28]:
non_variable_clusters

{'prometheus.metrics.soaesb': set(),
 '1.zookeeper.soaesb': set(),
 'grafana.metrics.soaesb': set(),
 'brokerStack_broker-master-1.1.py5vm49suvnd58w3oo2z7s3h5': set(),
 'core.soaesb': set(),
 'filebeat': set(),
 '1.solr.soaesb': {'2021-01-21 <*> INFO <*> <*> c:metacard_cache s:shard1 <*> [c:metacard_cache s:shard1 <*> <*> <*> <*> <*> <*> <*> <*> main{ExitableDirectoryReader(UninvertingDirectoryReader(Uninverting(_1(7.4.0):C6/4:delGen=1) Uninverting(_2(7.4.0):C4/1) Uninverting(_3(7.4.0):C1)))}',
  '2021-01-21 <*> INFO <*> <*> s:shard1 <*> <*> o.a.s.s.SolrIndexSearcher Opening <*> <*>',
  '2021-01-21 <*> INFO <*> [ ] <*> <*> <*>',
  '2021-01-22 <*> INFO (searcherExecutor-24-thread-1) [ ] <*> <*> <*> <*> <*> Searcher@194973d4[catalog_shard1_replica_n2] main{ExitableDirectoryReader(UninvertingDirectoryReader(Uninverting(_p(7.4.0):C6984/6:delGen=2) Uninverting(_1(7.4.0):C2984/2:delGen=1) Uninverting(_4(7.4.0):C4035) Uninverting(_9l(7.4.0):c471/6:delGen=1) Uninverting(_a5(7.4.0):c517/6:delGe

In [26]:
# Print every cluster whose recorded hit count is zero.
# The entries are indexed elsewhere in this notebook as hits[0] (row id) and
# hits[1] (hit count), so the count at index 1 is tested; comparing the whole
# entry with 0 does not test the count.
# Iterating over .values() avoids rebinding `_`, which IPython uses for the last output.
for cols in cluster_hits_dict.values():
    for l_vals in cols.values():
        for c_vals in l_vals.values():
            for cluster, hits in c_vals.items():
                if hits[1] == 0:
                    print(cluster)

In [None]:
# Print one row of the hit matrix of a single container for every collection and label.
container = 'mmsStack_mms-app.1.g1bzbb75ulrenny5500oglgjg'
# Row of the hit matrix to inspect (matrix row = Drain cluster_id - 1).
# NOTE(review): presumably this is the row of the template
#   '2021-01-21 <*> INFO 1 --- <*> <*> : <*> <*> mission <*> <*> <*>'
# confirm by printing c.cluster_id for the cluster in dd[container].clusters
# whose get_template() equals that string.
row = 4
for idx in collections:
    for label in labels:
        print(cluster_hits[idx][label][container][row])

In [None]:
containers

### Construct Means DataFrame

In [None]:
# Long-format table with one row per (collection, label, container, cluster)
# holding the trimmed mean of that cluster's hits.
m_means = []
m_means_labels = []
m_clusters = []
m_indices = []
m_containers = []

for idx, col_val in cluster_hits.items():
    for label, l_val in col_val.items():
        for container in l_val:
            row_means = means_dict[idx][label][container]
            for cluster in dd[container].clusters:
                # Look the mean up by cluster id (1-based id -> 0-based row) instead of
                # relying on the iteration order of .clusters matching the row order,
                # so each template is always paired with its own mean.
                m_means.append(row_means[cluster.cluster_id - 1])
                m_clusters.append(cluster.get_template())
                m_means_labels.append(label)
                m_indices.append(idx)
                m_containers.append(container)

m_data = {'mean': m_means,
          'index': m_indices,
          'label': m_means_labels,
          'cluster': m_clusters,
          'container': m_containers}

mdf = pd.DataFrame(m_data)

In [None]:
# Scatter of the mean hits per cluster template, coloured by label.
dims = (30, 16)
fig, ax = plt.subplots(figsize=dims)
g = sns.scatterplot(
    data=mdf,
    x="cluster",
    y="mean",
    hue="label",
    ax=ax,  # the axes created just above, which is also the current axes
)
# The x tick labels are hidden.
ax.tick_params(labelbottom=False)

In [None]:
'''
We will encode each label as such:
    1 := 'core.soaesb-dead-soa-process'
    2 := 'healthy'
    3 := 'newscene-bundle-stopped'
    4 := 'nitf-messaging-bundle-stopped'
'''
# Note: the numeric encoding described above is not applied in this cell; the
# 'label' column keeps the original label values.

# Same long-format layout as the full means table, restricted to clusters whose
# trimmed mean is strictly positive.
z_means, z_means_labels, z_clusters, z_indices, z_containers = [], [], [], [], []

for idx, col_val in cluster_hits.items():
    for label, l_val in col_val.items():
        for container in l_val:
            row_means = means_dict[idx][label][container]
            for cluster in dd[container].clusters:
                # 1-based Drain cluster id -> 0-based row of the means array.
                mu = row_means[cluster.cluster_id - 1]
                if not mu > 0:
                    continue
                z_clusters.append(cluster.get_template())
                z_means.append(mu)
                z_indices.append(idx)
                z_means_labels.append(label)
                z_containers.append(container)

z_data = dict(zip(
    ('mean', 'index', 'label', 'cluster', 'container'),
    (z_means, z_indices, z_means_labels, z_clusters, z_containers),
))

zdf = pd.DataFrame(z_data)

In [None]:
# Long-format table of the raw hit values: one row per element of every cluster's
# row in every hit matrix, tagged with its collection, label, cluster template
# and container.
d_total_hits, d_labels, d_clusters, d_indices, d_containers = [], [], [], [], []

for idx, col_val in cluster_hits.items():
    for label, l_val in col_val.items():
        for container, c_val in l_val.items():
            for cluster in dd[container].clusters:
                # 1-based Drain cluster id -> 0-based matrix row.
                hits = list(c_val[cluster.cluster_id - 1])
                n = len(hits)
                d_total_hits.extend(hits)
                d_clusters.extend([cluster.get_template()] * n)
                d_indices.extend([idx] * n)
                d_labels.extend([label] * n)
                d_containers.extend([container] * n)

data = dict(zip(
    ('hits', 'index', 'label', 'cluster', 'container'),
    (d_total_hits, d_indices, d_labels, d_clusters, d_containers),
))

df = pd.DataFrame(data)

In [None]:
# Print the length of every column list in `data`.
# Iterating over .values() avoids rebinding `_`, which IPython uses for the last output.
for column_values in data.values():
    print(len(column_values))

In [None]:
df.head()

In [None]:
# Distribution of the hit values for one container, faceted by cluster (rows) and label (columns).
sdf = df[df['container']=='grafana.metrics.soaesb']
# NOTE(review): dims is only consumed by the commented-out plt.subplots call below,
# so it currently has no effect on the figure produced by sns.displot.
dims = (25, 12)
#fig, ax = plt.subplots(figsize=dims)
g = sns.displot(sdf, x="hits", row="cluster", col="label")
#ax.tick_params(labelbottom=False) 

### Display the Poisson distributions

#### Generate plot data

In [None]:
# Build, for one collection and one container, the x values and Poisson PMF values
# of every cluster template, per label: X[label][template] and Y[label][template].
# NOTE(review): p_dict is not defined in any visible cell of this notebook; it is read
# as p_dict[collection][label][container][template] and each entry must provide .mean().
collection = 1
container = 'grafana.metrics.soaesb'
drain_model = dd[container]
X = {}
Y = {}
for label in labels:
    X[label] = {}
    Y[label] = {}
    for cluster in drain_model.clusters:
        template = cluster.get_template()
        mu = p_dict[collection][label][container][template].mean()
        # `poisson` was never imported on its own; use it through the already
        # imported scipy.stats module.
        # x runs from the 1st to the 99th percentile (np.arange excludes the upper end).
        x = np.arange(stats.poisson.ppf(0.01, mu),
                      stats.poisson.ppf(0.99, mu))
        X[label][template] = x
        Y[label][template] = stats.poisson.pmf(x, mu)

#### Display plots

In [None]:
# One subplot per cluster template of `container`, with the Poisson PMF of every
# label drawn on it.
# The theme is set before the axes are created so that it applies to them.
sns.set()
templates = [cluster.get_template() for cluster in drain_model.clusters]
# squeeze=False keeps `axes` two-dimensional even when there is only one cluster.
fig, axes = plt.subplots(len(templates), 1, squeeze=False)
fig.suptitle(f'Poisson PMFs for {container}')

# enumerate() gives every cluster its own subplot; a counter reset inside the
# loop would draw every cluster onto the first axes.
for row, template in enumerate(templates):
    ax = axes[row, 0]
    for label in labels:
        # Bars are placed at their numeric x positions so that labels with different
        # x ranges stay aligned; alpha keeps overlapping labels visible.
        ax.bar(X[label][template], Y[label][template], alpha=0.5, label=label)
    ax.legend()

In [None]:
# PMF values of one template, concatenated over all labels.  The `for label`
# clause must come first so that `label` is bound by the comprehension itself
# rather than read from a variable left over from an earlier cell.
[y
 for label in labels
 for y in Y[label]['<*> lvl=info msg="Alert Rule returned no data" logger=alerting.evalContext ruleId=3 name="Panel Title alert" changing state to=keep_state']]