In [1]:
import numpy as np
import pandas as pd
from spacy.lang.en import English
import spacy
from collections import Counter
from math import sqrt
from nltk.corpus import wordnet
#from py_thesaurus import Thesaurus

In [2]:
error_Msg = "Address is wrong"

In [3]:
data_Dict = pd.read_csv("Data Dictionary.csv")

In [4]:
data_Dict

Unnamed: 0,Primary Object,Field Name,Field description,Canonical Values
0,Customer,First Name,First Name of the Applicant,
1,,Last Name,Last Name of the Applicant,
2,,Address Line1,Address Line1 of the Applicant,Address
3,,Address Line2,Address Line2 of the Applicant,Address
4,,Zipcode,Social Security Number of the Applicant,"pincode, area code"
5,,SSN,Social Security Number of the Applicant,"Social security number, unique id"
6,Claim Details,Revenue Code,Revenue Code for the Claim,
7,,Claim Amount,Total Claim Amount of the Claim,


### Checking the list of stopwords in spacy

In [5]:
spacy_Stopwords = spacy.lang.en.stop_words.STOP_WORDS

print('Number of stop words: %d' % len(spacy_Stopwords))
print('First ten stop words: %s' % list(spacy_Stopwords)[:10])

Number of stop words: 326
First ten stop words: ['your', 'if', 'indeed', 'sometime', 'him', 'rather', 'move', 'once', 'among', 'seem']


### "nlp" Object is used to create documents with linguistic annotations.

In [6]:
nlp = English()

In [7]:
my_msg = nlp(error_Msg)
my_msg

Address is wrong

### Creating list of tokens from the error message

In [8]:
token_list = []
for token in my_msg:
    token_list.append(token.text)

In [9]:
token_list

['Address', 'is', 'wrong']

### Creating list of tokens after removing stopwords


In [10]:
filtered_errorMsg = []
for word in token_list:
    lexeme = nlp.vocab[word]
    if lexeme.is_stop == False:
        filtered_errorMsg.append(word)

In [11]:
print(token_list)

['Address', 'is', 'wrong']


In [12]:
print(filtered_errorMsg)

['Address', 'wrong']


In [13]:
print(" ".join(filtered_errorMsg))

Address wrong


### Lemmatization

In [14]:
filtered_Msg = []
new_myMsg = ""
for word in filtered_errorMsg:
    new_myMsg += word + " "
new_myMsg = nlp(new_myMsg)
new_myMsg

Address wrong 

In [15]:
for lem_word in new_myMsg:
    filtered_Msg.append(lem_word.lemma_)

In [16]:
filtered_Msg

['Address', 'wrong']

In [17]:
print( " ".join(filtered_Msg))

Address wrong


### Similarity between words

In [18]:
data_Dict.columns

Index(['Primary Object', 'Field Name ', 'Field description',
       'Canonical Values'],
      dtype='object')

In [19]:
# removing white space at the end of the column name 
data_Dict.columns = data_Dict.columns.str.rstrip()

In [20]:
data_Dict.columns

Index(['Primary Object', 'Field Name', 'Field description',
       'Canonical Values'],
      dtype='object')

In [21]:
field_Name= data_Dict['Field Name'].to_list()

In [22]:
field_Name

['First Name ',
 'Last Name ',
 'Address Line1',
 'Address Line2',
 'Zipcode ',
 'SSN',
 'Revenue Code',
 'Claim Amount ']

In [26]:
def word2vec(word):
    
    # count the characters in word
    cw = Counter(word)
    
    # set of the different characters
    sw = set(cw)
    
    # precomputes the "length" of the word vector
    lw = sqrt(sum(c*c for c in cw.values()))
    
    # return a tuple
    return cw, sw, lw

def cosdis(v1, v2):
    # which characters are common to the two words
    common = v1[1].intersection(v2[1])
   
    # by definition of cosine distance
    return sum(v1[0][ch]*v2[0][ch] for ch in common)/v1[2]/v2[2]

threshold = 0.80    
for key in filtered_Msg:
    for word in field_Name:
        try:
#           Cosine simililarity
            res = cosdis(word2vec(word), word2vec(key))
#           print("The cosine similarity between : {} and : {} is: {}".format(word, key, res*100))
            if res > threshold:
                print("Found a matching word {} with original word: {}".format(word, key))
                new_word = word              
                print("The similar word from Field names is: ", new_word)
        except IndexError:
            pass


Found a matching word Address Line1 with original word: Address
The similar word from Field names is:  Address Line1
Found a matching word Address Line2 with original word: Address
The similar word from Field names is:  Address Line2


In [55]:
### Finding the synonym of the error word

In [56]:
synonyms = []
for syn in wordnet.synsets(new_word):
    for lm in syn.lemmas():
             synonyms.append(lm.name())#adding into synonyms
syn_List = list(set(synonyms))
print (syn_List)

['come_up_to', 'cover', 'treat', 'computer_address', 'speak', 'accost', 'destination', 'turn_to', 'speech', 'direct', 'call', 'plow', 'reference', 'name_and_address', 'handle', 'deal', 'address', 'savoir-faire']


In [57]:
print("Above the function")
def word2vec_syn(word):
    
    # count the characters in word
    cw = Counter(word)
    
    # set of the different characters
    sw = set(cw)
    
    # precomputes the "length" of the word vector
    lw = sqrt(sum(c*c for c in cw.values()))
#     print("Inside the function")

    # return a tuple
    return cw, sw, lw

def cosdis_syn(v1, v2):
    # which characters are common to the two words
    common = v1[1].intersection(v2[1])
   
    # by definition of cosine distance
    return sum(v1[0][ch]*v2[0][ch] for ch in common)/v1[2]/v2[2]

threshold = 0.75    
for key in syn_List:
    for word in field_Name:
        try:
#           Cosine simililarity
            res = cosdis_syn(word2vec_syn(word), word2vec_syn(key))
#           print("The cosine similarity between : {} and : {} is: {}".format(word, key, res*100))
            if res > threshold:
                print("Found a matching word {} with original word: {}".format(word, key))
#                 new_word_syn = key
#                 print("The new word is ", new_word_syn)
        except IndexError:
            pass

Above the function
Found a matching word Address Line1 with original word: address
Found a matching word Address Line2 with original word: address


In [59]:
adjSyn = pd.read_csv("ErrorAdjectivesSynonym.txt", sep = '\t')

In [60]:
adjSyn

Unnamed: 0,Adjectives,Synonyms
0,Invalid,faulty incorrect not-working wrong false defec...
1,Required,Excpected mandatory needed compulsory
2,Match,equal duplicate equivalent
3,missing,incomplete misplaced removed short not-present
