### Import packages

In [34]:
import json
import numpy as np
import pandas as pd
import os
import progressbar
import sys
from copy import deepcopy
import faiss

## Implementation of incident search and RCA

#### 1 ) Collecting Entities to create the database

In [2]:
## reading from a json file, keys are string

In [3]:
cso_dc_file = './CSO_data/CSO_all_scraped_DC.json'
cso_sign_file = './CSO_data/CSO_all_scraped_Sign.json'

In [4]:
def load_cso_json(file):
    global cso_json
    with open(file, 'r') as f:
        cso_json = json.load(f)

In [5]:
## using a pretrained embedding model to compute embedding vectors

In [6]:
# ## dowloading Glove Embeddings
# !wget https://nlp.stanford.edu/data/glove.42B.300d.zip

# !unzip glove*.zip

In [7]:
## creating embedding directory for later use
## jsut load the embeeddings next time
def create_embed_directory(file):
    global embed_directory
    embed_directory = {}
    with open(file, encoding = 'utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embed_directory[word] = vector
            
            
def save_embeddings(path):
    embed2= {}
    for word in embed_directory:
        embed2[word] = embed_directory[word].tolist()
    with open(path, 'w') as f:
        json.dump(embed2, f)
def load_embeddings(path):
    global embed_directory
    embed_directory = {}
    with open(path, 'r') as f:
        embed2 = json.load(f)
    for word in embed2:
        embed_directory[word] = np.asarray(embed2[word], dtype = 'float32')

In [8]:
# create_embed_directory('glove.42B.300d.txt')

In [9]:
# save_embeddings('glove.42B.300d.json')

In [10]:
# load_embeddings('glove.42B.300d.json')
# for reliability, use load_split_... function below

In [11]:
## saved Glove embeddings at 'glove.42B.300d/word_embed_0_19174.json'
## just load them
def split_save_embeddings(data_dict, folder_name, file_generic):
    l = len(data_dict.keys())
    size = int(l*(0.01))
    count = 0
    temp = {}
    ###### to show progress
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for key in data_dict:
        if isinstance(data_dict[key], np.ndarray):
            temp[key] = data_dict[key].tolist()
        else:
            temp[key] = data_dict[key]
        count += 1
        if (count%size == 0):
            file_name = folder_name + '/' + file_generic + '_' +  str(count-size) + '_' + str(count) + '.json'
            with open(file_name, 'w') as f:
                json.dump(temp, f)
            temp = {}
        bar.update(count)
def load_split_saved_embeddings(folder_name):
    global embed_directory
    embed_directory = {}
    
    print('reading files')
    sys.stdout.flush()
    
    ###### to show progress
    l = len(os.listdir(folder_name))
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    count = 0
    ###### to show progress
    
    
    for file in os.listdir(folder_name):
        if file == '.ipynb_checkpoints':
            continue
        with open(folder_name + '/' + file , 'r') as f:
            embed_directory = {**embed_directory, **json.load(f)}
        count += 1
        bar.update(count)
    
    print('converting to numpy array')
    sys.stdout.flush()
    
    ###### to show progress
    l = len(embed_directory.keys())
    count = 0
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    
    for word in embed_directory:
        embed_directory[word] = np.asarray(embed_directory[word], dtype = 'float32')
        count += 1
        bar.update(count)

In [12]:
# split_save_embeddings(embed_directory, 'glove.42B.300d', 'word_embed')

In [13]:
load_split_saved_embeddings('./FAISS - search/glove.42B.300d')

reading files


 [elapsed time: 0:04:48] |********************************* | (ETA:   0:00:02) 

converting to numpy array


 [elapsed time: 0:00:32] |**********************************| (ETA:  00:00:00) 

In [14]:
load_cso_json(cso_dc_file)

In [15]:
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import(word_tokenize, sent_tokenize, TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer, MWETokenizer)

In [16]:
def get_embedding(sent, d = 300): ## ignores the words not present in the vocabulary, returns 0 vector in case of empty string or string in which no word has any embedding
    res = np.zeros((d,), dtype = 'float32')
    count = 0
    words = list(word_tokenize(sent))
    for word in words:
        try:
            res = res + embed_directory[word]
            count += 1
        except:
            continue
    if (count > 0):
        res = res / count
    return res

In [17]:
## create a new object damn it!
## 
entities1 = ['u_customer_impacts', 'short_description', 'description', 'u_cso_summary', 'u_cso_timeline']
entities2 = ['u_permanent_solution', 'u_root_cause_description', 'u_problem_summary', 'u_short_term_fix']
def embed_sentences():
    global sent_embed
    sent_embed = {}
    i = 0
    ###### to show progress
    count = 0
    l = len(cso_json.keys())
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    
    for cso in cso_json:
        for entity in entities1:
            try:
                text = BeautifulSoup(cso_json[cso]['primaryIncident'][entity]).get_text()
                text = ' '.join(text.split('\n'))
                text = ' '.join(text.split('\xa0'))
                text = ' '.join(text.split())
                text = text.strip()
                list_of_sent = list(sent_tokenize(text))
                for sent in list_of_sent:
                    sent_dict = {'cso':cso, 'sent': '', 'embed': [], 'tag':''}
                    try:
                        sent_dict['sent'] = sent
                        sent_dict['embed'] = get_embedding(sent)
                        sent_dict['tag'] = entity
                        sent_embed[i] = sent_dict
                        i += 1
                    except Exception as e:
                        print('idhar', e)
                        print(sent)
                        pass
            except Exception as e:
                print('bahar1', e)
                pass
        for entity in entities2:
            try:
                text = BeautifulSoup(cso_json[cso]['problems'][0][entity]).get_text()
                text = ' '.join(text.split('\n'))
                text = ' '.join(text.split('\xa0'))
                text = ' '.join(text.split())
                text = text.strip()
                list_of_sent = list(sent_tokenize(text))
                for sent in list_of_sent:
                    sent_dict = {'cso':cso, 'sent': '', 'embed': [], 'tag':''}
                    try:
                        sent_dict['sent'] = sent
                        sent_dict['embed'] = get_embedding(sent)
                        sent_dict['tag'] = entity
                        sent_embed[i] = sent_dict
                        i += 1
                    except Exception as e:
                        print('udhar', e)
                        print(sent)
                        pass
            except Exception as e:
                print('bahar2', e)
                pass
        count += 1
        bar.update(count)

In [18]:
embed_sentences()

 [elapsed time: 0:00:00] |*                                 | (ETA:   0:00:07) 

bahar2 object of type 'NoneType' has no len()
bahar2 object of type 'NoneType' has no len()
bahar2 object of type 'NoneType' has no len()
bahar2 object of type 'NoneType' has no len()
bahar2 object of type 'NoneType' has no len()
bahar2 object of type 'NoneType' has no len()
bahar2 object of type 'NoneType' has no len()


 [elapsed time: 0:00:00] |**                                | (ETA:   0:00:07) 

bahar1 object of type 'NoneType' has no len()
bahar2 object of type 'NoneType' has no len()
bahar1 object of type 'NoneType' has no len()


 [elapsed time: 0:00:01] |*****                             | (ETA:   0:00:07) 

bahar2 object of type 'NoneType' has no len()
bahar2 object of type 'NoneType' has no len()


 [elapsed time: 0:00:03] |************                      | (ETA:   0:00:06) 

bahar2 object of type 'NoneType' has no len()


 [elapsed time: 0:00:08] |********************************* | (ETA:   0:00:00) 

bahar2 object of type 'NoneType' has no len()
bahar2 object of type 'NoneType' has no len()


In [39]:
sent_embed[25043]

{'cso': '9670',
 'sent': 'For most, if not all of these, the agreement should have woken up when we cleared the poison messages.',
 'embed': array([-5.29725216e-02, -3.25934365e-02, -6.27339035e-02, -6.88329563e-02,
         5.25446162e-02,  8.45563039e-02, -3.63912392e+00,  4.12863463e-01,
         6.68250844e-02, -4.45116937e-01,  4.74666581e-02,  1.30206913e-01,
        -1.09079042e-02, -8.51118118e-02,  5.06541058e-02, -1.08435765e-01,
        -1.30797178e-01, -6.03506453e-02,  8.97945240e-02, -6.89047202e-02,
        -1.94520317e-02, -1.06669143e-01,  4.63088676e-02,  7.18541369e-02,
        -1.29691154e-01, -3.30353156e-02, -4.28823829e-02, -1.74532488e-01,
        -4.82032858e-02,  5.49510401e-03, -2.38686681e-01,  8.90693367e-02,
         6.61032572e-02, -1.34959623e-01, -2.00491883e-02, -9.97144282e-02,
        -1.49898857e-01, -1.31189287e-01, -5.34908548e-02, -1.24764614e-01,
         5.39006963e-02,  2.16018677e-01, -6.58996552e-02, -1.39826894e-01,
        -2.74278801e-02,

In [19]:
def save_sent_embeddings(path):
    sent_ = {}
    for i in sent_embed:
        sent_[i] = sent_embed[i]
        sent_[i]['embed'] = sent_embed['embed'].tolist()
    with open(path, 'r') as f:
        json.dump(sent_,f)

In [25]:
# save_embeddings('sent_embeddings_sign.json')

In [20]:
## just load it the next time, already saved at 'sent_embeddings_sign/........'
def split_save_sent_embeddings(data_dict, folder_name, file_generic):
    l = len(data_dict.keys())
    size = int(l*(0.01))
    count = 0
    temp = {}
    ###### to show progress
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for key in data_dict:
        temp[key] = data_dict[key]
        if isinstance(data_dict[key]['embed'], np.ndarray):
            temp[key]['embed'] = data_dict[key]['embed'].tolist()
        else:
            temp[key] = data_dict[key]
        count += 1
        if (count%size == 0):
            file_name = folder_name + '/' + file_generic + '_' +  str(count-size) + '_' + str(count) + '.json'
            with open(file_name, 'w') as f:
                json.dump(temp, f)
            temp = {}
        bar.update(count)

In [21]:
split_save_sent_embeddings(sent_embed, './FAISS - search/sent_embeddings_dc', 'sent_embed')
# already saved, load the same as save or save different embeddings

 [elapsed time: 0:00:14] |********************************* | (ETA:   0:00:00) 

In [22]:
def load_sent_embeddings(path):
    global sent_embed
    sent_embed = {}
    with open(path, 'r') as f:
        sent_embed = json.load(f)
    l = len(sent_embed.keys())
    count = 0
    ###### to show progress
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for i in sent_embed:
        # sent_embed[i] = sent_[i]
        sent_embed[i]['embed'] = np.asarray(sent_embed[i]['embed'], dtype = 'float32')
        count += 1
        bar.update(count)

In [23]:
# load_sent_embeddings('sent_embeddings_sign.json')

In [24]:
def load_split_save_sent_embeddings(folder_name):
    # global sent_embed
    sent_embed = {}
    sent = {}
    print('reading files')
    sys.stdout.flush()
    l = len(os.listdir(folder_name))
    count = 0
    ###### to show progress
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for file in os.listdir(folder_name):
        if file == '.ipynb_checkpoints':
            continue
        with open(folder_name + '/' + file, 'r') as f:
            sent = {**sent, **json.load(f)}
        count += 1
        bar.update(count)
    
    print('converting list to numpy array')
    sys.stdout.flush()
    l = len(sent.keys())
    count = 0
    ###### to show progress
    widgets = [' [', progressbar.Timer(format= 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('*'),' (', progressbar.ETA(), ') ', ]
    bar = progressbar.ProgressBar(max_value=l, widgets=widgets).start()
    ###### to show progress
    for i in sent:
        sent_embed[int(i)] = sent[i]
        sent_embed[int(i)]['embed'] = np.asarray(sent[i]['embed'], dtype = 'float32')
        count += 1
        bar.update(count)
    sys.stdout.flush()
    return sent_embed

In [25]:
## just run this line to load already calculated sentence embeddings in the directory sent_embed
sent_embed_sign = load_split_save_sent_embeddings('./FAISS - search/sent_embeddings_sign')
sent_embed = load_split_save_sent_embeddings('./FAISS - search/sent_embeddings_dc')

reading files


 [elapsed time: 0:00:04] |********************************* | (ETA:   0:00:00) 

converting list to numpy array


 [elapsed time: 0:00:00] |***************************       | (ETA:   0:00:00) 

reading files


 [elapsed time: 0:00:05] |********************************* | (ETA:   0:00:00) 

converting list to numpy array


 [elapsed time: 0:00:00] |******************************    | (ETA:   0:00:00) 

#### 2 ) building the Index 

In [26]:


def create_numpy_array(sent_embed):
    nb = len(sent_embed.keys())
    d = 300 ## the dimension of the embeddings
    global xb
    xb = np.zeros((nb,d), dtype = 'float32')
    for i in range(nb):
        xb[i,:] = sent_embed[i]['embed']

In [27]:
create_numpy_array(sent_embed)

In [28]:
def normalize(xb):
    global xb_normalized
    xb_normalized = deepcopy(xb)
    faiss.normalize_L2(xb_normalized)
    
normalize(xb)

In [29]:
d = 300
Index_L2 = faiss.IndexFlatL2(d)
Index_IP = faiss.IndexFlatIP(d)


Index_L2.add(xb)
Index_IP.add(xb_normalized)

In [30]:
def find_top_k_similar(xq, k, basis = 'both'):
    xq_normalized = deepcopy(xq)
    faiss.normalize_L2(xq_normalized)
    
    D_L2, I_L2 = Index_L2.search(xq, k)
    D_IP, I_IP = Index_IP.search(xq_normalized, k)
    if basis == 'L2':
        return D_L2, I_L2
    if basis == 'IP':
        return D_IP, I_IP
    return D_L2, I_L2, D_IP, I_IP

In [31]:
def rank_cso(query):
    # split the query into sentences
    # assuming text input
    
    ranked_cso_dict = dict()
    list_of_sent = sent_tokenize(query)
    k = 12
    nq = len(list_of_sent)
    xq = np.zeros((nq, d), dtype = 'float32')
    
    for i, sent in enumerate(list_of_sent):
        sent_embedding = get_embedding(sent).reshape((1,-1))
        xq[i,:] = sent_embedding
        
    # print(xq.shape)
    D_IP, I_IP = find_top_k_similar(xq, k, 'IP')  ## shape (nq, k)
    # print(D_IP)
    for i in range(nq):
        for j in range(k):
            cso_sent_dict = sent_embed[I_IP[i,j]]
            if cso_sent_dict['cso'] in ranked_cso_dict.keys():
                ranked_cso_dict[cso_sent_dict['cso']]['score'] += D_IP[i, j]
                ranked_cso_dict[cso_sent_dict['cso']]['sent'].append(cso_sent_dict['sent'])
            else:
                temp = {'score': D_IP[i, j], 'sent': [cso_sent_dict['sent']]}
                ranked_cso_dict[cso_sent_dict['cso']] = temp
    
    # print(ranked_cso_dict.items())
    return dict(sorted(ranked_cso_dict.items(), key=lambda x: x[1]['score'], reverse=True))

In [32]:
query = 'Adobe Sign users in India are facing errors trying to log in. If they login somehow, it is taking too long to get their documents signed.'
ans_dict = rank_cso(query)
ans_dict

{'16649': {'score': 2.903912,
  'sent': ['Adobe Sign users are unable to log in.',
   'Adobe Sign users are unable to log in',
   'Between 2022-01-18 and 2022-02-10 at 23:03 UTC, some users attempting to log in to Adobe Sign were unable to.']},
 '14932': {'score': 2.8971612,
  'sent': ['On 2021-07-15 between 19:04 and 22:10 UTC, Outlook/WEP users attempting to login to Adobe Sign using the Office 365 integration were unable to sign in, preventing the users from accessing Adobe Sign.When attempting to log in the users would have experienced the error message “Sorry, an unexpected error occurred.',
   'Impact: On 2021-07-15 between 19:04 and 22:10 UTC, Outlook/WEP users attempting to login to Adobe Sign using the Office 365 integration were unable to sign in, preventing the users from accessing Adobe Sign.When attempting to log in the users would have experienced the error message “Sorry, an unexpected error occurred.',
   'It is not logged either on our end when we make the request to s