## Find similar incidents using Document Similarity doc2bow (doc to bag of words) process
#### This process can be particularly useful to find similar incidents reported by end users in an Enterprise. Once we find similar incidents the highest matching incident's resolution is most likely be helpful to resolve the current issue as well

#### The code below was used for back end engine to provide a similarity index of incidents.


### The structure of the code below follows Jonathan Mugan's 
#### Learn how to use the gensim Python library to determine the similarity between two or more documents
https://www.oreilly.com/learning/how-do-i-compare-document-similarity-using-python 

In [2]:
import pandas as pd
import re # to be used for regular Expression
import gensim  # python Library to be used to find similar documents
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import requests
import json
import time


In [3]:
df_raw=pd.read_csv('./incidentMerged1.csv')


In [4]:
df_raw.head(2)

Unnamed: 0,number,short_description
0,INC0212367,unable to send webex invite with 2 different c...
1,INC0208378,Needs help getting Webex dial conference room ...


In [5]:
len(df_raw)

8739

In [6]:
df_inp=df_raw.copy()

In [7]:
df_inp.head(1)

Unnamed: 0,number,short_description
0,INC0212367,unable to send webex invite with 2 different c...


In [8]:
df_filtered=df_inp.dropna(how='any',inplace=True)  ## Drop any record with value as NAN at any column

In [9]:
#df_inp['all_text']=df_inp['short_description']+' ' +df_inp['description']
#df_inp['all_text']=df_inp[['description']].apply(lambda x: ' '.join(x), axis=1)
df_inp['all_text']=df_inp['short_description']

In [10]:
df_inp.ix[0,'all_text']

'unable to send webex invite with 2 different conference rooms'

In [11]:
df_inp.iloc[2335]

number                                                 INC0029541
short_description    It seems that my AD Password is locked again
all_text             It seems that my AD Password is locked again
Name: 2335, dtype: object

In [274]:
## drop all non text Columns
#df_inp.drop(['sys_created_by','caller_id','sys_created_on','state','priority','opened_at','incident_state','knowledge'], axis=1, inplace=True)  
#df_inp.drop(['first_assigned','u_incident_type','parent_incident','parent'], axis=1, inplace=True)
#df_inp.drop(['u_parent_business_service','assigned_to','u_resolved_by'], axis=1, inplace=True)

In [12]:
#### Reg EX to remove special Characaters and repalce with a space
print(re.sub('[^A-Za-z0-9]+', ' ', df_inp.ix[0,-1]))
df_inp.ix[0,-1]

unable to send webex invite with 2 different conference rooms


'unable to send webex invite with 2 different conference rooms'

In [13]:
#df_inp['text_clean']=df_inp['all_text'].map(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x).lower())
#df_inp['all_text']=df_inp['all_text'].str.lower()
df_inp['text_clean']=df_inp['all_text'].map(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x).lower())

In [14]:
df_inp.ix[0,'text_clean']

'unable to send webex invite with 2 different conference rooms'

In [278]:
# #' '.join(removeStop(df_inp.ix[0,'text_clean'].str)

# filtered_words=[word for word in df_inp.ix[0,'text_clean'] if word not in stopwords.words('english')]
# ''.join(filtered_words)

In [15]:
def removeStop(inp_str):
    clean_word=''
    for word in inp_str.split():
        if word not in stopwords.words('english'):
            clean_word=clean_word+' '+word
    return clean_word  

In [16]:
removeStop(df_inp.ix[0,'text_clean'])

' unable send webex invite 2 different conference rooms'

In [17]:
len(df_inp['text_clean'])

8739

In [18]:
##### Commented out Stop Words removal
#### df_inp['text_clean_stop']=df_inp['text_clean'].map(lambda x: removeStop(x))
df_inp['text_clean_stop']=df_inp['text_clean']

In [19]:
df_inp.ix[0,'text_clean_stop']

'unable to send webex invite with 2 different conference rooms'

In [20]:
df_inp['tokenized']=df_inp['text_clean_stop'].map(lambda x: word_tokenize(x))

### We will use NLTK to tokenize.

A document will now be a list of tokens

In [285]:
#df_inp.ix[0:3,'tokenized'].tolist()

### We will create a dictionary from a list of documents. A dictionary maps every word to a number.

In [21]:
dict_t = gensim.corpora.Dictionary(df_inp['tokenized'])
print(dict_t[7])
print(dict_t.token2id['longer'])
print("Number of words in dictionary:",len(dict_t))
# for i in range(len(dict_t)):
#     print(i, dict_t[i])

rooms
155
('Number of words in dictionary:', 3885)


In [22]:
print(dict_t.token2id['see'])

370


In [288]:
#dict_t.values()

### Now we will create a corpus. 
A corpus is a list of bags of words. A bag-of-words representation for a document just lists the number of times each word occurs in the document.

In [23]:
corpus = [dict_t.doc2bow(gen_doc) for gen_doc in df_inp['tokenized']]
print(df_inp['tokenized'][0])
print(corpus[0])
len(corpus[0])

['unable', 'to', 'send', 'webex', 'invite', 'with', '2', 'different', 'conference', 'rooms']
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


10

### Now we create a tf-idf model from the corpus. 
Note that num_nnz is the number of tokens.

In [24]:
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)
s = 0
for i in corpus:
    s += len(i)
print(s)

TfidfModel(num_docs=8739, num_nnz=61961)
61961


### Now we will create a similarity measure object in tf-idf space.
tf-idf stands for term frequency-inverse document frequency. Term frequency is how often the word shows up in the document and inverse document fequency scales the value by how rare the word is in the corpus.

In [29]:
sims = gensim.similarities.Similarity('/Users/spandan.chakraborty/Documents/UML/Hackathon/',tf_idf[corpus],
                                      num_features=len(dict_t))
print(sims)
print(type(sims))

Similarity index with 8739 documents in 0 shards (stored under /Users/spandan.chakraborty/Documents/UML/Hackathon/)
<class 'gensim.similarities.docsim.Similarity'>


### Now take the second incident  and convert it to tf-idf.

In [292]:
# query_doc = [w.lower() for w in word_tokenize(df_inp.ix[1,'text_clean_stop'])]
# print(query_doc)
# query_doc_bow = dict_t.doc2bow(query_doc)
# print(query_doc_bow)
# query_doc_tf_idf = tf_idf[query_doc_bow]
# print(query_doc_tf_idf)

In [293]:
# sims[query_doc_tf_idf]
# len(sims[query_doc_tf_idf])

In [294]:
# temp=sims[query_doc_tf_idf]
# temp2=sorted(range(len(temp)), key=lambda i: temp[i], reverse=True)[:5]

In [295]:
# temp2

In [296]:
# [temp[x]*100 for x in temp2]

In [297]:
# df_inp.ix[[1,122,92,244,151],'text_clean_stop']

In [298]:
# df_inp.ix[temp2,'text_clean_stop']

In [26]:
#dict(zip(df_inp.ix[[1,122,92,244,151],'number'],[temp[x]*100 for x in temp2]))

#### Following code was used to connect a ServiceNow instance to get the input Incident . The Input incident to be compared against the incidents we have got  in a CSV file 

In [300]:
# Set polling interval



# Set instance username/password
user = 'surfers.hackathon'
pwd = 'surfers.hackathon'

# Set the request parameters
getUrl = 'https://surfhackathon103.service-now.com/api/now/table/u_incident_matching?sysparm_query=u_responded=false'

# Set proper headers
getHeaders = {"Accept": "application/json"}
putHeaders = {"Content-Type": "application/json",
                  "Accept": "application/json"}

def poll():
    # Do the HTTP request
    response = requests.get(getUrl, auth=(user, pwd), headers=getHeaders)

    # Check for HTTP codes other than 200
    if response.status_code == 200:

        responseJson = response.json()

        for i in responseJson["result"]:
            sysID = i["sys_id"]

            # call ml function here and assign return value to responseText
            responseText = "Responded to " + i["u_incident"]
            responseJson = {}
            
            ### Call the mdodel
            responseText=processInString(i["u_description"])
            
            
            
            responseJson["u_response"] = responseText
            responseJson["u_responded"] = "true"
            responseString = json.dumps(responseJson)
            putUrl = 'https://surfhackathon103.service-now.com/api/now/table/u_incident_matching/' + sysID

            update = requests.put(putUrl, auth=(
                user, pwd), headers=putHeaders, data=responseString)
            print( i["u_description"])
            print(responseText)





In [302]:
# Execute thread on polling interval
pollInterval = 1.0
while True:
    poll()
    time.sleep(pollInterval)

[3249, 3277, 3351, 2962, 3253]
Cannot connect to outlook Cannot connect to outlook
{'INC0048176': 66.128993034362793, 'INC0208136': 62.239706516265869, 'INC0003602': 62.240928411483765, 'INC0063836': 61.903637647628784, 'INC0065500': 66.128993034362793}
[3254, 3242, 3348, 3065, 3051]
Cannot log into okta Cannot log into okta
{'INC0005714': 59.45172905921936, 'INC0062501': 60.741674900054932, 'INC0165344': 57.727164030075073, 'INC0071616': 59.45172905921936, 'INC0157032': 58.146721124649048}
[3242, 3348, 3051, 3056, 3098]
Cannot login to okta Cannot login to okta
{'INC0162012': 97.099208831787109, 'INC0005714': 100.0, 'INC0165344': 97.099208831787109, 'INC0071616': 100.0, 'INC0146708': 97.099208831787109}
[6375, 6479, 6515, 6272, 6621]
Need to reset VPN token Need to reset VPN token
{'INC0084903': 86.938679218292236, 'INC0064544': 84.231770038604736, 'INC0145463': 86.938679218292236, 'INC0003697': 76.730334758758545, 'INC0004114': 76.730334758758545}
[8072, 8067, 8068, 8069, 8070]
Video

ConnectionError: HTTPSConnectionPool(host='surfhackathon103.service-now.com', port=443): Max retries exceeded with url: /api/now/table/u_incident_matching?sysparm_query=u_responded=false (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x11cbc9950>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))

In [None]:
#poll()

In [27]:
def processInString(inp):
    ### Pre Process & sanitize words
    query_doc=re.sub('[^A-Za-z0-9]+', ' ', inp).lower()
    query_doc=removeStop(query_doc)
    
    query_doc=removeStop(query_doc)
    
    query_doc = [w.lower() for w in word_tokenize(query_doc)]  ### Later Santize Inp with a Cleansing Function 
    
    #print(query_doc)
    query_doc_bow = dict_t.doc2bow(query_doc)
    #print(query_doc_bow)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    #print(query_doc_tf_idf)
    
    ### GEt the Match
    sims[query_doc_tf_idf]
    #len(sims[query_doc_tf_idf])
    temp=sims[query_doc_tf_idf]
    temp2=sorted(range(len(temp)), key=lambda i: temp[i], reverse=True)[:5]
    
    new_list=[t3+1 for t3 in temp2]
    print(new_list)
    
    ### Get the response
    return dict(zip(df_inp.ix[new_list,'number'],[temp[x]*100 for x in temp2]))

### The code below test Similarity %  for a test input

In [30]:
## test 
test_inp_short_desc='need kitchen clean up'
test_inp_desc='vpn pin expired'

processInString(test_inp_short_desc+' '+test_inp_desc)

[4887, 8449, 8444, 8309, 2792]


{'INC0088226': 32.834708690643311,
 'INC0096006': 41.180235147476196,
 'INC0129840': 32.943344116210938,
 'INC0200067': 43.665087223052979,
 'INC0201005': 33.520039916038513}