# Inference & Evaluation for Salesforce Paper 

## Import packages

In [1]:
!pip install progressbar2



In [55]:
import json
import numpy as np
import os
import progressbar
import sys
from copy import deepcopy
import faiss
import pickle
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import(word_tokenize, sent_tokenize, TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer, MWETokenizer)
from statistics import mean
import pandas as pd
from io import StringIO
import time
import re
import pandas as pd
import networkx as nx
from transformers import pipeline
from transformers import RobertaTokenizer, RobertaModel
import torch

In [56]:
# cso_df = pd.read_csv("./CSO_data/CSO_salesforce_extracted_entities.csv", index_col=0)
cso_df = pd.read_csv("CSO_data/CSO_entities_ensembled_Sign_bart-large-cnn-samsum.csv", index_col=0)

In [57]:
json_file = './CSO_data/CSO_all_scraped_Sign.json'
with open(json_file, 'r') as f:
    json_data = json.load(f)

In [58]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaModel.from_pretrained('roberta-large')

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## FAISS Incident Search Functions

In [59]:
def get_embedding(sent, d = 1024): ## ignores the words not present in the vocabulary, returns 0 vector in case of empty string or string in which no word has any embedding
    res = np.zeros((d,), dtype = 'float32')
    count = 0
    words = list(word_tokenize(sent))
    for word in words:
        try:
            res = res + embed_directory[word]
            count += 1
        except:
            continue
    if (count > 0):
        res = res / count
    return res

In [60]:
## saved Glove embeddings at 'glove.42B.300d/word_embed_0_19174.json'
## just load them
def split_save_embeddings(data_dict, folder_name, file_generic):
    l = len(data_dict.keys())
    size = int(l*(0.01))
    count = 0
    temp = {}
    ###### to show progress
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for key in data_dict:
        if isinstance(data_dict[key], np.ndarray):
            temp[key] = data_dict[key].tolist()
        else:
            temp[key] = data_dict[key]
        count += 1
        if (count%size == 0):
            file_name = folder_name + '/' + file_generic + '_' +  str(count-size) + '_' + str(count) + '.json'
            with open(file_name, 'w') as f:
                json.dump(temp, f)
            temp = {}
        bar.update(count)
def load_split_saved_embeddings(folder_name):
    global embed_directory
    embed_directory = {}
    
    print('reading files')
    sys.stdout.flush()
    
    ###### to show progress
    l = len(os.listdir(folder_name))
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    count = 0
    ###### to show progress
    
    
    for file in os.listdir(folder_name):
        if file == '.ipynb_checkpoints':
            continue
        with open(folder_name + '/' + file , 'r') as f:
            embed_directory = {**embed_directory, **json.load(f)}
        count += 1
        bar.update(count)
    
    print('converting to numpy array')
    sys.stdout.flush()
    
    ###### to show progress
    l = len(embed_directory.keys())
    count = 0
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    
    for word in embed_directory:
        embed_directory[word] = np.asarray(embed_directory[word], dtype = 'float32')
        count += 1
        bar.update(count)

In [61]:
load_split_saved_embeddings('glove.42B.300d')

reading files


 [elapsed time: 0:33:10] |**********************************| (ETA:  00:00:00) 

converting to numpy array


 [elapsed time: 0:00:35] |**********************************| (ETA:  00:00:00) 

In [62]:
def load_split_save_sent_embeddings(folder_name):
    # global sent_embed
    sent_embed = {}
    sent = {}
    print('reading files')
    sys.stdout.flush()
    l = len(os.listdir(folder_name))
    count = 0
    ###### to show progress
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for file in os.listdir(folder_name):
        if file == '.ipynb_checkpoints':
            continue
        with open(folder_name + '/' + file, 'r') as f:
            sent = {**sent, **json.load(f)}
        count += 1
        bar.update(count)
    
    print('converting list to numpy array')
    sys.stdout.flush()
    l = len(sent.keys())
    count = 0
    ###### to show progress
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for i in sent:
        sent_embed[int(i)] = sent[i]
        sent_embed[int(i)]['embed'] = np.asarray(sent[i]['embed'], dtype = 'float32')
        count += 1
        bar.update(count)
    sys.stdout.flush()
    return sent_embed

In [63]:
## just run this line to load already calculated sentence embeddings in the directory sent_embed
sent_embed = load_split_save_sent_embeddings('./FAISS - search/sent_embeddings_sign_ESRO')
# sent_embed = load_split_save_sent_embeddings('/FAISS - search/sent_embeddings_dc')

reading files


 [elapsed time: 0:00:04] |********************************* | (ETA:   0:00:00) 

converting list to numpy array


 [elapsed time: 0:00:00] |                                  | (ETA:  --:--:--) 

#### 2 ) building the Index 

In [64]:
def create_numpy_array(s):
    nb = len(s.keys())
    xb = []
    for i, k in enumerate(s.keys()):
        xb.append(s[k]['embed'])
        
    return np.array(xb)

In [65]:
xb = create_numpy_array(sent_embed)

In [66]:
xb.shape

(182, 1024)

In [67]:
def normalize(xb):
    global xb_normalized
    xb_normalized = deepcopy(xb)
    faiss.normalize_L2(xb_normalized)
    
normalize(xb)

In [68]:
d = 1024
Index_L2 = faiss.IndexFlatL2(d)
Index_IP = faiss.IndexFlatIP(d)


Index_L2.add(xb)
Index_IP.add(xb_normalized)

In [69]:
def find_top_k_similar(xq, k, basis = 'both'):
    xq_normalized = deepcopy(xq)
    faiss.normalize_L2(xq_normalized)
    
    D_L2, I_L2 = Index_L2.search(xq, k)
    D_IP, I_IP = Index_IP.search(xq_normalized, k)
    if basis == 'L2':
        return D_L2, I_L2
    if basis == 'IP':
        return D_IP, I_IP
    return D_L2, I_L2, D_IP, I_IP

In [70]:
def rank_cso(query):
    # split the query into sentences
    # assuming text input
    
    ranked_cso_dict = dict()
    
    input_ids = torch.tensor(tokenizer.encode(query, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
        last_hidden_states = outputs[0]

    token_embeddings = torch.squeeze(last_hidden_states, dim=0)
    sentence_embed = torch.mean(token_embeddings, axis=0).numpy().reshape(1,-1)
    k = 12
        
    # print(xq.shape)
    D_IP, I_IP = find_top_k_similar(sentence_embed, k, 'IP')  ## shape (nq, k)
    # print(D_IP)
    for i in range(1):
        for j in range(k):
            cso_sent_dict = sent_embed[I_IP[i,j]]
            if cso_sent_dict['cso'] in ranked_cso_dict.keys():
                ranked_cso_dict[cso_sent_dict['cso']]['score'] += D_IP[i, j]
                ranked_cso_dict[cso_sent_dict['cso']]['sent'].append(cso_sent_dict['sent'])
            else:
                temp = {'score': D_IP[i, j], 'cso': cso_sent_dict['cso'], 'sent': [cso_sent_dict['sent']]}
                ranked_cso_dict[cso_sent_dict['cso']] = temp
    
    # print(ranked_cso_dict.items())
    return dict(sorted(ranked_cso_dict.items(), key=lambda x: x[1]['score'], reverse=True))

In [13]:
'''
def rank_cso(query):
    # split the query into sentences
    # assuming text input
    
    ranked_cso_dict = dict()
    list_of_sent = sent_tokenize(query)
    k = 20
    nq = len(list_of_sent)
    xq = np.zeros((nq, 300), dtype = 'float32')
    
    for i, sent in enumerate(list_of_sent):
        sent_embedding = get_embedding(sent).reshape((1,-1))
        xq[i,:] = sent_embedding
        
    # print(xq.shape)
    D_IP, I_IP = find_top_k_similar(xq, k, 'IP')  ## shape (nq, k)
    # print(D_IP)
    for i in range(nq):
        for j in range(k):
            cso_sent_dict = sent_embed[I_IP[i,j]]
            if cso_sent_dict['cso'] in ranked_cso_dict.keys():
                ranked_cso_dict[cso_sent_dict['cso']]['score'] += D_IP[i, j]
                ranked_cso_dict[cso_sent_dict['cso']]['sent'].append(cso_sent_dict['sent'])
            else:
                temp = {'score': D_IP[i, j], 'sent': [cso_sent_dict['sent']]}
                ranked_cso_dict[cso_sent_dict['cso']] = temp
    
    # print(ranked_cso_dict.items())
    return dict(sorted(ranked_cso_dict.items(), key=lambda x: x[1]['score'], reverse=True))
'''

"\ndef rank_cso(query):\n    # split the query into sentences\n    # assuming text input\n    \n    ranked_cso_dict = dict()\n    list_of_sent = sent_tokenize(query)\n    k = 20\n    nq = len(list_of_sent)\n    xq = np.zeros((nq, 300), dtype = 'float32')\n    \n    for i, sent in enumerate(list_of_sent):\n        sent_embedding = get_embedding(sent).reshape((1,-1))\n        xq[i,:] = sent_embedding\n        \n    # print(xq.shape)\n    D_IP, I_IP = find_top_k_similar(xq, k, 'IP')  ## shape (nq, k)\n    # print(D_IP)\n    for i in range(nq):\n        for j in range(k):\n            cso_sent_dict = sent_embed[I_IP[i,j]]\n            if cso_sent_dict['cso'] in ranked_cso_dict.keys():\n                ranked_cso_dict[cso_sent_dict['cso']]['score'] += D_IP[i, j]\n                ranked_cso_dict[cso_sent_dict['cso']]['sent'].append(cso_sent_dict['sent'])\n            else:\n                temp = {'score': D_IP[i, j], 'sent': [cso_sent_dict['sent']]}\n                ranked_cso_dict[cso_sent

In [80]:
def get_faiss_rank(query):
    t2 = rank_cso(query)
    cso_dict = {}
    for i in t2:
        cso_dict[int(i)] = t2[i]['score']
    return cso_dict

In [16]:
def rank_cso_averaged(query):
    # split the query into sentences
    # assuming text input
    
    ranked_cso_dict = dict()
    list_of_sent = sent_tokenize(query)
    k = 12
    nq = len(list_of_sent)
    xq = np.zeros((nq, 300), dtype = 'float32')
    
    for i, sent in enumerate(list_of_sent):
        sent_embedding = get_embedding(sent).reshape((1,-1))
        xq[i,:] = sent_embedding
        
    # print(xq.shape)
    D_IP, I_IP = find_top_k_similar(xq, k, 'IP')  ## shape (nq, k)
    # print(D_IP)
    for i in range(nq):
        for j in range(k):
            cso_sent_dict = sent_embed[I_IP[i,j]]
            if cso_sent_dict['cso'] in ranked_cso_dict.keys():
                ranked_cso_dict[cso_sent_dict['cso']]['score'] += D_IP[i, j]
                ranked_cso_dict[cso_sent_dict['cso']]['sent'].append(cso_sent_dict['sent'])
            else:
                temp = {'score': D_IP[i, j], 'sent': [cso_sent_dict['sent']]}
                ranked_cso_dict[cso_sent_dict['cso']] = temp
    
    # print(ranked_cso_dict.items())
    for cso in ranked_cso_dict:
        ranked_cso_dict[cso]['score'] = ranked_cso_dict[cso]['score']/ len(ranked_cso_dict[cso]['sent'])
    return dict(sorted(ranked_cso_dict.items(), key=lambda x: x[1]['score'], reverse=True))

In [21]:
query = 'Adobe Sign users in India are facing errors trying to log in. If they login somehow, it is taking too long to get their documents signed.'
ans_dict = get_faiss_rank(query)
ans_dict

{14932: 1.9938815,
 8005: 0.9974503,
 8468: 0.9973313,
 13840: 0.9972952,
 14449: 0.9971793,
 9632: 0.9969669,
 10190: 0.9969669,
 14385: 0.9969669,
 14961: 0.9969669,
 12187: 0.9968934,
 15873: 0.9968875}

## GCN Functions

In [108]:
G = pickle.load(open('./gcn_data/bipartite-ckg.gpickle', 'rb'))

In [118]:
g = G.copy()

nodelist = list(g.nodes())
available_cso = list(cso_df['cso_number'].unique())
for n in nodelist:
    cso = int(n.split('_')[0])
    
    if cso not in available_cso:
        g.remove_node(n)
        
A = np.array(list(g.nodes()))
A = np.reshape(A, (182,2))

In [119]:
def get_rank_list(symptom, all_predictions ):
    sym = symptom + '_sym'
    for i in range(A.shape[0]):
        for j in range(A.shape[1]):
            if A[i,j] == sym:
                sym_emb = all_predictions[i, j]
                
    li = []
    for i in range(A.shape[0]):
        for j in range(A.shape[1]):
            s = A[i, j]
            if s[-2:] == 'rc':
                li_sub = [s]
                rc_emb = all_predictions[i, j]
                li_sub.append(rc_emb)
                li.append(li_sub)
                
    scores_list = []
    for sub_list in li:
        arr = sub_list[1]
        dot = np.dot(sym_emb, arr)
        ele = [sub_list[0]]
        ele.append(dot)
        scores_list.append(ele)
        
    return scores_list

In [120]:
def get_gcn_rank(cso_number,embedding_path):
    all_predictions = np.load(embedding_path)
    t1 = get_rank_list(str(cso_number),all_predictions)
    td1 = {}
    for i in t1:
        td1[int(i[0].replace("_rc",""))] = i[1]
    return td1

## Evaluation Functions

In [22]:
def html_stripper(code):
    return BeautifulSoup(code).get_text().replace('\r',' ').replace('\xa0',' ').replace('\n',' ')

In [23]:
def remove_cso_from_dict(cso_dict1, exception_cso_number):
    cso_dict = cso_dict1.copy()
    if int(exception_cso_number) in cso_dict.keys():
        del cso_dict[int(exception_cso_number)]
    if str(exception_cso_number) in cso_dict.keys():
        del cso_dict[str(exception_cso_number)]
    return cso_dict

In [24]:
def get_top_n_rc(cso_dict, n):
    return dict(sorted(cso_dict.items(), key=lambda x: x[1], reverse=True)[:n])

### ROUGE Score

In [25]:
from rouge import Rouge
rouge = Rouge()

In [122]:
def max_rouge(cso_dict, actual_rc, actual_rem, type_rpf, n):
    max_rc = 0
    max_rem = 0
    rc_cso = 0
    rem_cso = 0
    top_dic = get_top_n_rc(cso_dict,n)
    for i in top_dic.keys():
        foo = cso_df.iloc[cso_df.index[cso_df['cso_number']==int(i)].tolist()[0]]
        trc = rouge.get_scores(str(foo['root_cause']), str(actual_rc))[0]['rouge-l'][type_rpf]
        if trc<1 and trc>=max_rc:
            rc_cso = i
            max_rc = trc
        trm = rouge.get_scores(str(foo['remediations']), str(actual_rem))[0]['rouge-l'][type_rpf]
        if trm<1 and trm>= max_rem:
            rem_cso = i
            max_rem = trm
    return max_rc, max_rem, rc_cso, rem_cso

In [27]:
def max_rouge_og(cso_dict, actual_rc, actual_rem, type_rpf, n):
    max_rc = 0
    max_rem = 0
    rc_cso = 0
    rem_cso = 0
    top_dic = get_top_n_rc(cso_dict,n)
    for i in top_dic.keys():
        rc = html_stripper(str(json_data[str(i)]['problems'][0]['u_root_cause_description']))
        trc = rouge.get_scores(str(rc), str(actual_rc))[0]['rouge-l'][type_rpf]
        if trc<1 and trc>=max_rc:
            rc_cso = i
            max_rc = trc
        rem = html_stripper(str(json_data[str(i)]['problems'][0]['u_permanent_solution']))
        trm = rouge.get_scores(str(rem), str(actual_rem))[0]['rouge-l'][type_rpf]
        if trm<1 and trm>= max_rem:
            rem_cso = i
            max_rem = trm
    return max_rc, max_rem, rc_cso, rem_cso

## Experiment

In [28]:
random_cso_list = [14757,
 6704,
 15019,
 15126,
 8365,
 12070,
 14965,
 6524,
 14886,
 16742,
 9131,
 6119,
 9144,
 15484,
 16516,
 6894,
 13738,
 9560,
 10190,
 7599,
 9242,
 6920,
 17510,
 9060,
 9828,
 15215,
 15005,
 15558,
 12686,
 8548,
 9139,
 7653,
 8653,
 13678,
 15461,
 8754,
 14055,
 10999,
 15334,
 7872,
 9624,
 6577,
 14886,
 14902,
 14797,
 10384,
 10961,
 12052,
 9563,
 6704]

### Input Type 1 - Description at the time of Outage

In [29]:
cso_descr = pd.read_csv("./CSO_data/cso_alert.csv")
len(cso_descr)

76

In [30]:
def slicer(my_str,sub):
    index=my_str.find(sub)
    if index !=-1 :
        return my_str[index+8:] 
    else :
        raise Exception(my_str,' -----Sub string not found!')

In [31]:
cso_descr_data = []
for index, row in cso_descr.iterrows():
    #print(z)
    z = row['cso_number']
    x = row['description']
    try:
        y = slicer(x,"<br><br>")
        cso_descr_data.append([z,y])
    except:
        continue
cso_descr_df = pd.DataFrame(cso_descr_data, columns=['cso_number', 'descr'])

In [32]:
cso_descr_df

Unnamed: 0,cso_number,descr
0,17368,Agreements are getting stuck
1,15586,Incident description: Pingdom check of Adobe S...
2,13750,Incident description: Pingdom check of Adobe S...
3,14774,Some customers are unable to login to Adobe Si...
4,15558,KBA auth is failing as KBA was down
...,...,...
69,16536,S3 Storage Outage
70,15666,Issues when sending bulk agreements
71,15663,Webhook is not working in HSBC
72,15007,EU sign in problems


In [88]:
simpred_rc_rouge = []
simpred_rem_rouge = []
for cso in random_cso_list:
    index = cso_df.index[cso_df['cso_number'] == int(cso)].tolist()[0]
    actual_rc = cso_df['root_cause'][index]
    actual_rem = cso_df['remediations'][index]
    # actual_rc = html_stripper(str(data[str(cso)]['problems'][0]['u_root_cause_description']))
    # actual_rem = html_stripper(str(data[str(cso)]['problems'][0]['u_permanent_solution']))
    
    #symptom = cso_df['symptom'][index]
    try:
        index2 = cso_df.index[cso_df['cso_number'] == cso].tolist()[0]
        symptom = cso_df.iloc[index2]['symptom']
        cso_dict = get_faiss_rank(symptom)
        a,b,c,d = max_rouge(remove_cso_from_dict(cso_dict, int(cso)), actual_rc, actual_rem, 'f', 5)
        simpred_rc_rouge.append(a)
        simpred_rem_rouge.append(b)
    except:
        pass
    
print(len(simpred_rc_rouge))
print("Average of simpred RC= ", mean(simpred_rc_rouge), "\nAverage of simpred Rem= ", mean(simpred_rem_rouge))

50
Average of simpred RC=  0.2023700203374191 
Average of simpred Rem=  0.15676940348564283


In [53]:
mode = 'f'
z = 5

simpred_rc_rouge = []
simpred_rem_rouge = []
for cso in random_cso_list:
    #print(cso," doing")
    index = cso_df.index[cso_df['cso_number']==int(cso)].tolist()[0]
    #actual_rc = cso_df['root_cause'][index]
    #actual_rem = cso_df['remediations'][index]
    actual_rc = html_stripper(str(json_data[str(cso)]['problems'][0]['u_root_cause_description']))
    actual_rem = html_stripper(str(json_data[str(cso)]['problems'][0]['u_permanent_solution']))
    
    symptom = csos_descr[cso]
    cso_dict = get_faiss_rank(symptom)
    a,b,c,d = max_rouge_og(remove_cso_from_dict(cso_dict,int(cso)),actual_rc, actual_rem,mode,z)
    #a,b,c,d = max_rouge(remove_cso_from_dict(cso_dict,int(cso)),actual_rc, actual_rem,mode,z)
    simpred_rc_rouge.append(a)
    simpred_rem_rouge.append(b)
    #print(cso," Done")
print("Average of simpred RC= ",mean(simpred_rc_rouge),"\nAverage of simpred Rem= ",mean(simpred_rem_rouge))

NameError: name 'csos_descr' is not defined

### GCN Results

In [123]:
mode = 'f'
z = 5
embeddings_path = 'gcn_data/gcn_embeddings_5.npy'
gcn_rc_rouge = []
gcn_rem_rouge = []
for cso in random_cso_list:
    #print(cso," doing")
    index = cso_df.index[cso_df['cso_number']==int(cso)].tolist()[0]
    actual_rc = cso_df['root_cause'][index]
    actual_rem = cso_df['remediations'][index]
    # actual_rc = html_stripper(str(json_data[str(cso)]['problems'][0]['u_root_cause_description']))
    # actual_rem = html_stripper(str(json_data[str(cso)]['problems'][0]['u_permanent_solution']))
    
    cso_dict =  get_gcn_rank(cso, embeddings_path)
    a,b,c,d = max_rouge(remove_cso_from_dict(cso_dict, int(cso)),actual_rc, actual_rem,mode,z)
    #a,b,c,d = max_rouge(remove_cso_from_dict(cso_dict,int(cso)),actual_rc, actual_rem,mode,z)
    gcn_rc_rouge.append(a)
    gcn_rem_rouge.append(b)
    # except:
    #     pass
print("Average of gcn pred RC= ",mean(gcn_rc_rouge),"\nAverage of gcn pred Rem= ",mean(gcn_rem_rouge))

Average of gcn pred RC=  0.16506580376526678 
Average of gcn pred Rem=  0.14748898911550004
