### Import packages

In [46]:
import json
import numpy as np
import pandas as pd
import os
import progressbar
import sys
from copy import deepcopy
import faiss
from bs4 import BeautifulSoup
from transformers import pipeline
from transformers import RobertaTokenizer, RobertaModel
import torch

## Implementation of incident search and RCA (with extracted symptoms)

#### 1) Collecting entities to create the database

In [2]:
# Load the CSO entity table; the first CSV column becomes the DataFrame index.
cso_df = pd.read_csv('./CSO_data/'+ 'CSO_entities_ensembled_Sign_bart-large.csv', index_col=0)

In [None]:
# Parse the scraped CSO JSON file into json_data.
json_file = './CSO_data/CSO_all_scraped_Sign.json'
with open(json_file, 'r') as f:
    json_data = json.load(f)

In [50]:
# Pretrained 'roberta-large' tokenizer and encoder; both are used below to
# turn a text into token ids and then into hidden states.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaModel.from_pretrained('roberta-large')


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [95]:
# Create Roberta embeddings for the outage symptoms

entities1 = ['symptom'] #, 'customer_impacts', 'short_description', 'description', 'remediations', 'root_cause', 'short_term_fix']
sent_embed = {}

def embed_sentences():
    """Fill the global ``sent_embed`` with one embedding per (CSO, entity) pair.

    For every value of ``cso_df['cso_number']`` and every column named in
    ``entities1`` the text of the first row carrying that CSO number is
    stripped, encoded with ``tokenizer``, passed through ``model`` and
    mean-pooled over the token axis of the last hidden state.

    Each stored entry is ``{'cso', 'sent', 'embed', 'tag'}`` and is keyed by a
    running integer that starts at 0 on every call.

    Pairs that cannot be embedded (missing column/row, non-string cell such as
    NaN, or an error raised by the tokenizer/model) are skipped, as before,
    but the number of skipped pairs is now reported instead of being hidden
    by a bare ``except``.

    Returns:
        None. The result is written into ``sent_embed``.
    """
    global sent_embed

    i = 0
    skipped = 0
    for cso in cso_df['cso_number']:
        for entity in entities1:
            try:
                text = cso_df[cso_df['cso_number'] == cso][entity].iloc[0]
            except (KeyError, IndexError):
                # Unknown column or no row for this CSO number.
                skipped += 1
                continue

            if not isinstance(text, str):
                # Empty cells are read as NaN (a float) and have no text to embed.
                skipped += 1
                continue
            text = text.strip()

            try:
                input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)
                with torch.no_grad():
                    last_hidden_states = model(input_ids)[0]
            except (IndexError, RuntimeError, ValueError):
                # Keep the original best-effort behaviour: a text the model
                # cannot process is left out of the database.
                skipped += 1
                continue

            # (1, tokens, hidden) -> (tokens, hidden) -> (hidden,)
            token_embeddings = torch.squeeze(last_hidden_states, dim=0)
            sentence_embed = torch.mean(token_embeddings, dim=0).numpy()
            sent_embed[i] = {'cso': cso, 'sent': text, 'embed': sentence_embed, 'tag': entity}
            i += 1

    if skipped:
        print('embed_sentences: skipped ' + str(skipped) + ' (cso, entity) pairs that could not be embedded')

In [96]:
# Populate the global sent_embed (one model forward pass per embedded text).
embed_sentences()

In [97]:
# Number of entries that were embedded and stored.
len(sent_embed)

182

In [99]:
# Save sentence embeddings (takes time)
def save_sent_embeddings(path):
    """Write the global ``sent_embed`` to ``path`` as a single JSON file.

    Every entry is shallow-copied and its ``'embed'`` value converted to a
    plain list so it can be serialised. The in-memory entries are left
    untouched, so their embeddings stay numpy arrays and the function can be
    called more than once.

    Args:
        path: Destination file; it is overwritten if it exists.
    """
    serializable = {}
    for key, entry in sent_embed.items():
        record = dict(entry)  # copy: do not replace the caller's ndarray
        # np.asarray also accepts an embedding that is already a list.
        record['embed'] = np.asarray(entry['embed']).tolist()
        serializable[key] = record
    with open(path, 'w') as f:
        json.dump(serializable, f)

In [100]:
# Persist all embeddings into one JSON file in the working directory.
save_sent_embeddings('Roberta_sent_embeddings_ESRO.json')

In [101]:
## Split the sentence embeddings into chunks and store each chunk in its own file
def split_save_sent_embeddings(data_dict, folder_name, file_generic, chunk_fraction=0.01):
    """Write ``data_dict`` to ``folder_name`` as a series of JSON chunk files.

    Entries are taken in dictionary order. Each file is named
    ``<file_generic>_<start>_<end>.json`` where ``start``/``end`` are the
    running entry counts before and after the chunk.

    Args:
        data_dict: Mapping of key -> entry dict holding an ``'embed'`` value
            (numpy array or list).
        folder_name: Target directory; created if it does not exist.
        file_generic: Common file-name prefix of the chunk files.
        chunk_fraction: Fraction of the entries stored per file. The chunk
            size is never smaller than one entry, so small dictionaries no
            longer end in a division by zero.

    The entries of ``data_dict`` are not modified, and a trailing partial
    chunk is written as well instead of being dropped.
    """
    total = len(data_dict)
    size = max(1, int(total * chunk_fraction))
    os.makedirs(folder_name, exist_ok=True)

    def _flush(chunk, end):
        # The chunk covers the entries numbered (end - len(chunk)) .. end.
        file_name = folder_name + '/' + file_generic + '_' + str(end - len(chunk)) + '_' + str(end) + '.json'
        with open(file_name, 'w') as f:
            json.dump(chunk, f)

    count = 0
    temp = {}
    for key, entry in data_dict.items():
        record = dict(entry)  # copy so the caller keeps its ndarray
        if isinstance(record['embed'], np.ndarray):
            record['embed'] = record['embed'].tolist()
        temp[key] = record
        count += 1
        if len(temp) == size:
            _flush(temp, count)
            temp = {}

    if temp:
        _flush(temp, count)


In [102]:
# Write the embeddings as chunk files into the FAISS search folder.
split_save_sent_embeddings(sent_embed, 'FAISS - search/Roberta_sent_embeddings_ESRO', 'sent_embed')
# Skip this cell if the chunk files already exist and only need to be loaded;
# run it again only to store a different set of embeddings.

 [elapsed time: 0:00:05] |********************************* | (ETA:   0:00:00) 

In [104]:
# Load the sentence embeddings that were stored in a single file (takes time)
def load_sent_embeddings(path):
    """Load a single-file embedding dump into the global ``sent_embed``.

    JSON stores dictionary keys as strings, so the keys are converted back to
    ``int`` to match the integer keys produced when the embeddings are
    computed. Each ``'embed'`` list becomes a float32 numpy array.

    Args:
        path: JSON file containing a mapping of key -> entry dict.

    Returns:
        None. The result replaces the global ``sent_embed``.
    """
    global sent_embed
    with open(path, 'r') as f:
        raw = json.load(f)

    loaded = {}
    for key, entry in raw.items():
        entry['embed'] = np.asarray(entry['embed'], dtype='float32')
        loaded[int(key)] = entry
    sent_embed = loaded

In [105]:
# Reload the single-file dump into the global sent_embed.
load_sent_embeddings('Roberta_sent_embeddings_ESRO.json')

 [elapsed time: 0:00:00] |                                  | (ETA:  --:--:--) 

In [107]:
# Load the sentence embeddings that were split into chunk files (less time)
def load_split_save_sent_embeddings(folder_name):
    """Load all JSON chunk files of ``folder_name`` into one dictionary.

    Only regular ``*.json`` files are read; notebook checkpoint folders and
    other stray entries are ignored. The chunks are merged, the string keys
    are converted to ``int`` and every ``'embed'`` list becomes a float32
    numpy array.

    The returned dictionary is ordered by ascending integer key. The order
    matters: the search index is built from the entries in iteration order
    and search hits are mapped back through their row position, so position
    and key have to agree regardless of the order in which the directory
    listing returns the files.

    Args:
        folder_name: Directory holding the chunk files.

    Returns:
        dict mapping int key -> entry dict.
    """
    print('reading files')
    print(folder_name)

    merged = {}
    for file in sorted(os.listdir(folder_name)):
        path = folder_name + '/' + file
        if file == '.ipynb_checkpoints' or not file.endswith('.json') or not os.path.isfile(path):
            continue
        print(path)
        with open(path, 'r') as f:
            merged.update(json.load(f))

    print('converting list to numpy array')
    sent_embed = {}
    for key in sorted(merged, key=int):
        entry = merged[key]
        entry['embed'] = np.asarray(entry['embed'], dtype='float32')
        sent_embed[int(key)] = entry

    return sent_embed

In [None]:
## Run only this line to load the already computed sentence embeddings from
## the chunk files in './FAISS - search/Roberta_sent_embeddings_ESRO'.
sent_embed_sign = load_split_save_sent_embeddings('./FAISS - search/Roberta_sent_embeddings_ESRO')

#### 2) Building the FAISS index

In [112]:
def create_numpy_array(s):
    """Stack the ``'embed'`` value of every entry of ``s`` into one array.

    Rows follow the iteration order of ``s``.
    """
    embeddings = [entry['embed'] for entry in s.values()]
    return np.array(embeddings)

In [113]:
# Create the numpy matrix of sentence embeddings (one row per stored entry)
xb = create_numpy_array(sent_embed_sign)

In [114]:
# (number of entries, embedding width)
xb.shape

(182, 1024)

In [115]:
def normalize(xb):
    """Store an L2-normalised copy of ``xb`` in the global ``xb_normalized``.

    ``xb`` itself is not changed: it is deep-copied first and
    ``faiss.normalize_L2`` is applied to the copy. Nothing is returned.
    """
    global xb_normalized
    xb_normalized = deepcopy(xb)
    faiss.normalize_L2(xb_normalized)
    
normalize(xb)

In [116]:
# Building FAISS index
# d is the vector width the indexes are created for; it has to equal
# xb.shape[1] (1024 in the run shown above).
d = 1024
Index_L2 = faiss.IndexFlatL2(d)
Index_IP = faiss.IndexFlatIP(d)


# The L2 index holds the raw embeddings; the inner-product index holds the
# L2-normalised copies, so its scores are cosine similarities.
Index_L2.add(xb)
Index_IP.add(xb_normalized)

In [117]:
# FAISS index top-k search for a given query matrix
def find_top_k_similar(xq, k, basis = 'both'):
    """Return the ``k`` nearest database entries for every query row.

    Args:
        xq: Query matrix, one embedding per row. It is never modified; the
            inner-product search works on a normalised copy.
        k: Number of neighbours to return per query.
        basis: ``'L2'`` searches ``Index_L2`` with the raw queries, ``'IP'``
            searches ``Index_IP`` with the L2-normalised queries, any other
            value (default ``'both'``) does both.

    Returns:
        ``(D, I)`` for ``'L2'`` or ``'IP'``, otherwise
        ``(D_L2, I_L2, D_IP, I_IP)``.

    Only the searches that are actually returned are executed.
    """
    if basis == 'L2':
        D_L2, I_L2 = Index_L2.search(xq, k)
        return D_L2, I_L2

    xq_normalized = deepcopy(xq)
    faiss.normalize_L2(xq_normalized)
    D_IP, I_IP = Index_IP.search(xq_normalized, k)
    if basis == 'IP':
        return D_IP, I_IP

    D_L2, I_L2 = Index_L2.search(xq, k)
    return D_L2, I_L2, D_IP, I_IP

In [120]:
# Given a text representing the symptom of an outage, rank the top-k symptoms
def rank_cso(query, k=12):
    """Rank stored CSOs by similarity of their sentences to ``query``.

    The query is embedded the same way as the stored sentences (mean over the
    token axis of the model's last hidden state) and looked up in the
    inner-product index. Hits that belong to the same CSO are merged: their
    scores are summed and their sentences collected.

    Args:
        query: Symptom text to search for.
        k: Number of nearest sentences to retrieve (default 12, the value
            that used to be hard-coded).

    Returns:
        dict mapping CSO number -> ``{'score', 'cso', 'sent'}``, ordered by
        descending score.
    """
    input_ids = torch.tensor(tokenizer.encode(query, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]

    token_embeddings = torch.squeeze(last_hidden_states, dim=0)
    sentence_embed = torch.mean(token_embeddings, dim=0).numpy().reshape(1, -1)

    D_IP, I_IP = find_top_k_similar(sentence_embed, k, 'IP')  ## shape (nq, k)

    ranked_cso_dict = {}
    # There is exactly one query row, hence row 0.
    for score, position in zip(D_IP[0], I_IP[0]):
        if position < 0:
            # FAISS pads the result with label -1 when fewer than k vectors
            # are available; there is no stored entry for such a hit.
            continue
        cso_sent_dict = sent_embed_sign[int(position)]
        cso = cso_sent_dict['cso']

        if cso in ranked_cso_dict:
            ranked_cso_dict[cso]['score'] += score
            ranked_cso_dict[cso]['sent'].append(cso_sent_dict['sent'])
        else:
            ranked_cso_dict[cso] = {'score': score, 'cso': cso, 'sent': [cso_sent_dict['sent']]}

    return dict(sorted(ranked_cso_dict.items(), key=lambda x: x[1]['score'], reverse=True))

In [None]:
def get_faiss_rank(query):
    """Return ``{int(cso_number): score}`` for the CSOs ranked for ``query``.

    Thin wrapper around ``rank_cso`` that keeps only the accumulated score
    of every CSO, in ranking order.
    """
    ranking = rank_cso(query)
    return {int(cso): details['score'] for cso, details in ranking.items()}

## Evaluation Functions

In [None]:
def html_stripper(code):
    """Return the visible text of the HTML fragment ``code`` on a single line.

    Carriage returns, newlines and non-breaking spaces are each replaced by
    a plain space.

    The parser is named explicitly: without it BeautifulSoup picks whichever
    parser happens to be installed, which makes the result depend on the
    environment and triggers a warning.
    """
    text = BeautifulSoup(code, 'html.parser').get_text()
    return text.replace('\r',' ').replace('\xa0',' ').replace('\n',' ')

In [None]:
def remove_cso_from_dict(cso_dict1, exception_cso_number):
    """Return a shallow copy of ``cso_dict1`` without the given CSO.

    Both the ``int`` and the ``str`` form of ``exception_cso_number`` are
    dropped if present. The input dictionary is left unchanged.
    """
    remaining = cso_dict1.copy()
    remaining.pop(int(exception_cso_number), None)
    remaining.pop(str(exception_cso_number), None)
    return remaining

In [None]:
def get_top_n_rc(cso_dict, n):
    """Return the ``n`` highest-valued items of ``cso_dict`` as a new dict.

    Items are ordered by descending value; equal values keep their original
    relative order.
    """
    ranked = sorted(cso_dict.items(), key=lambda pair: pair[1], reverse=True)
    best = ranked[:n]
    return dict(best)

### ROUGE Score

In [None]:
from rouge import Rouge
# Shared scorer instance used by the evaluation code below.
rouge = Rouge()

In [None]:
def max_rouge(cso_dict, actual_rc, actual_rem, type_rpf, n):
    """Best ROUGE-L agreement of the top-``n`` ranked CSOs with the truth.

    For each of the ``n`` best-scored CSOs in ``cso_dict`` the stored root
    cause and remediation are compared with ``actual_rc`` / ``actual_rem``.

    Args:
        cso_dict: Mapping of CSO number -> ranking score.
        actual_rc: Reference root-cause text.
        actual_rem: Reference remediation text.
        type_rpf: Key selecting the value taken from the ``'rouge-l'`` result
            (the caller in this file passes ``'f'``).
        n: Number of top-ranked CSOs to inspect.

    Returns:
        ``(max_rc, max_rem, rc_cso, rem_cso)``: the best root-cause and
        remediation scores and the CSO numbers that achieved them (0 when no
        candidate qualified). Scores of exactly 1 are ignored, and on a tie
        the later candidate wins.

    Raises:
        IndexError: if a ranked CSO number is not present in ``cso_df``.
    """
    max_rc = 0
    max_rem = 0
    rc_cso = 0
    rem_cso = 0
    for cso in get_top_n_rc(cso_dict, n):
        # Select the row through a boolean mask and take the first match by
        # position. The previous code fed an index *label* to .iloc, which
        # only works when the labels happen to equal the row positions.
        candidate = cso_df.loc[cso_df['cso_number'] == int(cso)].iloc[0]

        trc = rouge.get_scores(str(candidate['root_cause']), str(actual_rc))[0]['rouge-l'][type_rpf]
        if trc < 1 and trc >= max_rc:
            rc_cso = cso
            max_rc = trc

        trm = rouge.get_scores(str(candidate['remediations']), str(actual_rem))[0]['rouge-l'][type_rpf]
        if trm < 1 and trm >= max_rem:
            rem_cso = cso
            max_rem = trm
    return max_rc, max_rem, rc_cso, rem_cso

### Experiment

In [None]:
# CSO numbers used as evaluation queries. The list has 50 entries, but 14886
# and 6704 each appear twice, so those two CSOs are evaluated twice.
random_cso_list = [14757,
 6704,
 15019,
 15126,
 8365,
 12070,
 14965,
 6524,
 14886,
 16742,
 9131,
 6119,
 9144,
 15484,
 16516,
 6894,
 13738,
 9560,
 10190,
 7599,
 9242,
 6920,
 17510,
 9060,
 9828,
 15215,
 15005,
 15558,
 12686,
 8548,
 9139,
 7653,
 8653,
 13678,
 15461,
 8754,
 14055,
 10999,
 15334,
 7872,
 9624,
 6577,
 14886,
 14902,
 14797,
 10384,
 10961,
 12052,
 9563,
 6704]

### Input Type - Description at the time of Outage

In [None]:
# Alert table; its 'cso_number' and 'description' columns are read below.
cso_descr = pd.read_csv("./CSO_data/cso_alert.csv")
len(cso_descr)

In [None]:
def slicer(my_str,sub):
    """Return the part of ``my_str`` that follows the first ``sub``.

    The cut position is derived from ``len(sub)``; the previous fixed offset
    of 8 was only correct for an 8-character separator such as ``<br><br>``.

    Raises:
        ValueError: if ``sub`` does not occur in ``my_str``. ``ValueError``
            is an ``Exception`` subclass, so existing handlers still match.
    """
    index = my_str.find(sub)
    if index == -1:
        raise ValueError(my_str,' -----Sub string not found!')
    return my_str[index + len(sub):]

In [None]:
# Keep, for every alert, only the description text that follows the first
# "<br><br>" separator. Rows without a usable description are left out.
cso_descr_data = []
for index, row in cso_descr.iterrows():
    z = row['cso_number']
    x = row['description']
    if not isinstance(x, str):
        # Empty cells are read as NaN and cannot be sliced.
        continue
    try:
        y = slicer(x,"<br><br>")
    except Exception:
        # slicer signals a missing separator by raising; skip that row.
        # Unlike a bare except this lets KeyboardInterrupt through.
        continue
    cso_descr_data.append([z,y])

cso_descr_df = pd.DataFrame(cso_descr_data, columns=['cso_number', 'descr'])

In [None]:
# For every evaluation CSO: search with its own symptom, drop the CSO itself
# from the ranking and record the best ROUGE-L F-score that the top 5
# remaining CSOs reach for root cause and remediation.
faiss_rc_rouge = []
faiss_rem_rouge = []
failed_csos = []
for cso in random_cso_list:
    # Look the row up once through a boolean mask and take the first match by
    # position. The previous code mixed label-based and .iloc access, which
    # only agree when the index labels equal the row positions.
    matches = cso_df.loc[cso_df['cso_number'] == int(cso)]
    if matches.empty:
        failed_csos.append((cso, 'not found in cso_df'))
        continue
    record = matches.iloc[0]
    actual_rc = record['root_cause']
    actual_rem = record['remediations']

    try:
        cso_dict = get_faiss_rank(record['symptom'])
        a,b,c,d = max_rouge(remove_cso_from_dict(cso_dict, int(cso)), actual_rc, actual_rem, 'f', 5)
    except Exception as err:
        # Still best effort, but failures are recorded instead of vanishing.
        failed_csos.append((cso, repr(err)))
        continue
    faiss_rc_rouge.append(a)
    faiss_rem_rouge.append(b)

if failed_csos:
    print("Skipped", len(failed_csos), "CSOs:", failed_csos)
print("Average of FAISS RC= ", np.mean(faiss_rc_rouge), "\nAverage of FAISS Rem= ", np.mean(faiss_rem_rouge))