### Import packages

In [46]:
import json
import numpy as np
import pandas as pd
import os
import progressbar
import sys
from copy import deepcopy
import faiss

## Implementation of incident search and RCA (with extracted symptoms)

#### 1 ) Collecting Entities to create the database

In [2]:
df = pd.read_csv('./CSO_data/'+ 'CSO_entities_ensembled_Sign_bart-large-cnn-samsum.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,cso_number,symptom,root_cause,remediations,description,short_description,affected_services,child_cso,short_term_fix,customer_impacts
0,17536,The Adobe Sign Customized Email Template (CEMT...,Sign Business Intelligence team implemented a ...,There are none.,Adobe Sign Customized Email Template users are...,Adobe Sign - Customized Email Template microse...,"316391, 320691",,The BI change was rolled back and APO team cle...,Customized Email Template (CEMT) microservice ...
1,17522,Starting on 2022-05-24 between 11:29 UTC and 1...,canary deployment.,There are none.,Incident description: Cloudwatch check for 5xx...,Identity Management Services,"318561, 318560, 318205, 321727, 320998, 321962...",,There will be a rollback of the rollback.,downstream services could have experience re-...
2,17510,Microsoft Office365 Provider microservice expo...,The problem was caused by a misstep in convert...,Review the manual testing procedure with team ...,Microsoft Office365 Provider unable to run com...,Microsoft Office365 Provider,317501,,the data structure that is called.,Some commands in the Microsoft Office365 Provi...
3,17401,Some Adobe Sign users might have been unable t...,Some North America Adobe Sign users might have...,Review the SOP for DB nodes upgrades. Validate...,Incident description: Adobe Sign NA2 Compose P...,Adobe Sign North America 2,"321671, 323232, 321670, 323231, 323230, 317791...",17402.0,was set as primary node.,Some North America Adobe Sign users might have...
4,17368,The PDFcombiner worker for Sign North America ...,The Sign service operates as a distributed sys...,There is an underlying bug in JMS message hand...,Agreements are getting stuck,Adobe Sign - Adobe Sign North America 3,"322299, 318358",,instances were restarted.,3900 requests expired during the impact window...


In [49]:
from transformers import pipeline
from transformers import RobertaTokenizer, RobertaModel
import torch

In [50]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaModel.from_pretrained('roberta-large')


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [95]:
entities1 = ['symptom'] #, 'customer_impacts', 'short_description', 'description', 'remediations', 'root_cause', 'short_term_fix']

def embed_sentences():
    global sent_embed
    sent_embed = {}
    
    i = 0
    for cso in df['cso_number']:
        for entity in entities1:
            try:
                text = df[df['cso_number'] == cso][entity].iloc[0]
                text = text.strip()
                input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)
                with torch.no_grad():
                    outputs = model(input_ids)
                    last_hidden_states = outputs[0]
                
                token_embeddings = torch.squeeze(last_hidden_states, dim=0)
                sentence_embed = torch.mean(token_embeddings, axis=0).numpy()
                sent_dict = {'cso': cso, 'sent': text, 'embed': sentence_embed, 'tag': entity}
                sent_embed[i] = sent_dict
                i += 1
            except:
                pass

In [96]:
embed_sentences()

In [97]:
len(sent_embed)

182

In [98]:
sent_embed[0]['embed'].shape

(1024,)

In [99]:
def save_sent_embeddings(path):
    sent_ = {}
    for i in sent_embed:
        sent_[i] = sent_embed[i]
        sent_[i]['embed'] = sent_embed[i]['embed'].tolist()
    with open(path, 'w') as f:
        json.dump(sent_, f)

In [100]:
save_sent_embeddings('sent_embeddings_sign_ESRO.json')

In [101]:
## just load it the next time, already saved at 'sent_embeddings_sign/........'
def split_save_sent_embeddings(data_dict, folder_name, file_generic):
    l = len(data_dict.keys())
    size = int(l*(0.01))
    count = 0
    temp = {}
    ###### to show progress
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for key in data_dict:
        temp[key] = data_dict[key]
        if isinstance(data_dict[key]['embed'], np.ndarray):
            temp[key]['embed'] = data_dict[key]['embed'].tolist()
        else:
            temp[key] = data_dict[key]
        count += 1
        if (count%size == 0):
            file_name = folder_name + '/' + file_generic + '_' +  str(count-size) + '_' + str(count) + '.json'
            with open(file_name, 'w') as f:
                json.dump(temp, f)
            temp = {}
        bar.update(count)

In [102]:
split_save_sent_embeddings(sent_embed, 'FAISS - search/sent_embeddings_sign_ESRO', 'sent_embed')
# already saved, load the same as save or save different embeddings

 [elapsed time: 0:00:05] |********************************* | (ETA:   0:00:00) 

In [103]:
np.array(sent_embed[0]['embed']).shape

(1024,)

In [104]:
def load_sent_embeddings(path):
    global sent_embed
    sent_embed = {}
    with open(path, 'r') as f:
        sent_embed = json.load(f)
    l = len(sent_embed.keys())
    count = 0
    ###### to show progress
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for i in sent_embed:
        # sent_embed[i] = sent_[i]
        sent_embed[i]['embed'] = np.asarray(sent_embed[i]['embed'], dtype = 'float32')
        count += 1
        bar.update(count)

In [105]:
load_sent_embeddings('sent_embeddings_sign_ESRO.json')

 [elapsed time: 0:00:00] |                                  | (ETA:  --:--:--) 

In [106]:
np.array(sent_embed['0']['embed']).shape

(1024,)

In [107]:
def load_split_save_sent_embeddings(folder_name):
    # global sent_embed
    sent_embed = {}
    sent = {}
    print('reading files')
    print(folder_name)
    # sys.stdout.flush()
    l = len(os.listdir(folder_name))
    count = 0
    ###### to show progress
    # widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    # bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for file in os.listdir(folder_name):
        if file == '.ipynb_checkpoints':
            continue
        print(folder_name + '/' + file)
        with open(folder_name + '/' + file, 'r') as f:
            sent = {**sent, **json.load(f)}
        count += 1
        # bar.update(count)
    
    print('converting list to numpy array')
    # sys.stdout.flush()
    l = len(sent.keys())
    count = 0
    ###### to show progress
    # widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    # bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for i in sent:
        sent_embed[int(i)] = sent[i]
        sent_embed[int(i)]['embed'] = np.asarray(sent[i]['embed'], dtype = 'float32')
        count += 1
        # bar.update(count)
    # sys.stdout.flush()
    return sent_embed

In [108]:
## just run this line to load already calculated sentence embeddings in the directory sent_embed
sent_embed_sign = load_split_save_sent_embeddings('./FAISS - search/sent_embeddings_sign_ESRO')
# sent_embed_dc = load_split_save_sent_embeddings('./FAISS - search/sent_embeddings_dc')

reading files
./FAISS - search/sent_embeddings_sign_ESRO
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_52_53.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_167_168.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_1_2.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_124_125.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_162_163.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_21_22.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_50_51.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_145_146.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_149_150.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_77_78.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_139_140.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_131_132.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_56_57.json
./FAISS - search/sent_embeddings_sign_ESRO/sent_embed_23_24.json
./FAISS - search/sent

In [109]:
np.array(sent_embed['0']['embed']).shape

(1024,)

In [110]:
sent_embed_sign[0]['embed'].shape

(1024,)

In [111]:
sent_embed_sign[0]

{'cso': 17536,
 'sent': 'The Adobe Sign Customized Email Template (CEMT) microservice experienced a service outage between 2022-05-25 at 22:55 UTC and 2022- 05-26 at 21:19 UTC. Four enterprise users (NCR Corporation, Humana, HPE, and WarnerMedia) filed support tickets with Customer Experience. CEMT shares the IP address with the Business Intelligence engine, which inadvertently tripped a protection feature in APO, which locked it out.',
 'embed': array([ 0.0999089 , -0.04202587, -0.3661461 , ...,  0.04501874,
         0.07975553, -0.02031409], dtype=float32),
 'tag': 'symptom'}

#### 2 ) building the Index 

In [112]:
def create_numpy_array(s):
    nb = len(s.keys())
    xb = []
    for i, k in enumerate(s.keys()):
        xb.append(s[k]['embed'])
        
    return np.array(xb)

In [113]:
xb = create_numpy_array(sent_embed_sign)

In [114]:
xb.shape

(182, 1024)

In [115]:
def normalize(xb):
    global xb_normalized
    xb_normalized = deepcopy(xb)
    faiss.normalize_L2(xb_normalized)
    
normalize(xb)

In [116]:
d = 1024
Index_L2 = faiss.IndexFlatL2(d)
Index_IP = faiss.IndexFlatIP(d)


Index_L2.add(xb)
Index_IP.add(xb_normalized)

In [117]:
def find_top_k_similar(xq, k, basis = 'both'):
    xq_normalized = deepcopy(xq)
    faiss.normalize_L2(xq_normalized)
    
    D_L2, I_L2 = Index_L2.search(xq, k)
    D_IP, I_IP = Index_IP.search(xq_normalized, k)
    if basis == 'L2':
        return D_L2, I_L2
    if basis == 'IP':
        return D_IP, I_IP
    return D_L2, I_L2, D_IP, I_IP

In [120]:
def rank_cso(query):
    # split the query into sentences
    # assuming text input
    
    ranked_cso_dict = dict()
    
    input_ids = torch.tensor(tokenizer.encode(query, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
        last_hidden_states = outputs[0]

    token_embeddings = torch.squeeze(last_hidden_states, dim=0)
    sentence_embed = torch.mean(token_embeddings, axis=0).numpy().reshape(1,-1)
    k = 12
#     nq = len(list_of_sent)
#     xq = np.zeros((nq, d), dtype = 'float32')
    
#     for i, sent in enumerate(list_of_sent):
#         sent_embedding = get_embedding(sent).reshape((1,-1))
#         xq[i,:] = sent_embedding
        
    # print(xq.shape)
    D_IP, I_IP = find_top_k_similar(sentence_embed, k, 'IP')  ## shape (nq, k)
    # print(D_IP)
    for i in range(1):
        for j in range(k):
            cso_sent_dict = sent_embed_sign[I_IP[i,j]]
            
            if cso_sent_dict['cso'] in ranked_cso_dict.keys():
                ranked_cso_dict[cso_sent_dict['cso']]['score'] += D_IP[i, j]
                ranked_cso_dict[cso_sent_dict['cso']]['sent'].append(cso_sent_dict['sent'])
            else:
                temp = {'score': D_IP[i, j], 'cso':cso_sent_dict['cso'], 'sent': [cso_sent_dict['sent']]}
                ranked_cso_dict[cso_sent_dict['cso']] = temp
    
    # print(ranked_cso_dict.items())
    return dict(sorted(ranked_cso_dict.items(), key=lambda x: x[1]['score'], reverse=True))

In [121]:
query = 'Adobe Sign users in India are facing errors trying to log in. If they login somehow, it is taking too long to get their documents signed.'
ans_dict = rank_cso(query)
ans_dict

{14452: {'score': 0.9972952,
  'cso': 14452,
  'sent': ['Some API calls to Adobe Sign integration API calls in the AU1 region failed on May 11th between 01:35 UTC and 02:02 UTC. The CSO was automatically launched for the alarm "Adobe_Sign_Secure_Login_AU1". The Adobe sign team quickly joined the call and restored the previous certificates on the apache servers in AU1.']},
 15934: {'score': 0.9970032,
  'cso': 15934,
  'sent': ['On 2021-11-15, multiple Identity client services experienced various levels of impact as they were unable to log in or receive errors. Over 800 customers contacted Adobe Support to report issues with various products and services. The most recent changes made were some rate-limiting config changes made about 7 hours before the CSO launched and a deployment performed on Friday 2021/11-12 were rolled back. The capacity of the Identity instances was doubled.']},
 6065: {'score': 0.9969669,
  'cso': 6065,
  'sent': ['On 02/10/2018, between 5:44AM PT and 5:56AM PT, t