## Ontological WSD Systems - Simple Lesk

In [None]:
import bs4
from bs4 import BeautifulSoup
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
porter=PorterStemmer()
import re
import pandas as pd

Loading the files into environment:

In [None]:
infile = open("data/dictionary.xml","r")
train = open("data/train.data","r")
test = open("data/test.data","r")
validate = open("data/validate.data","r")

Reading dictionary xmlfile using beautifulsoup:

In [None]:
soup = BeautifulSoup(infile,'xml')

Creating dictionary with key as words and value as meaning, i.e. gloss:

In [None]:
dict_gloss=dict()
for i in soup.find_all("lexelt"): 
    x=(i.get('item'))
    for j in i.find_all("sense"):  
        z = j.get('id')
        y=(j.get('gloss'))
        dict_gloss.setdefault(x,[]).append(y) 

Creating dictionary with key as words and value as examples:

In [None]:
dict_eg=dict()
for i in soup.find_all("lexelt"): 
    x=(i.get('item'))
    for j in i.find_all("sense"):  
        y=(j.get('examples'))
        dict_eg.setdefault(x,[]).append(y)

Reading list with all the training data:

In [None]:
nw_train=train
train_List = [line.split('\n') for line in nw_train]

Defining funtion to get the word and it's sense from training data:

In [None]:
def get_trainWord(inpList):
    word=re.sub(r'[^a-zA-Z\.]','',(str((inpList)).split("|"))[0])
    word_sense=re.sub(r'[^\d]','',(str((inpList)).split("|"))[1])
    return word,word_sense

Cleaning and pre-processing steps:
Removing stop words (We have used built-in NLTK module for removing stop words).
Stemming and lemmatization will also be done here.

In [None]:
ad_s=('i','I','my','me','mine') # creating custom stop words to be removed
stopwords=nltk.corpus.stopwords.words('english')
stopwords.append(ad_s)

def clean_up(inp):
    clean_dt=[]
    for f in str(inp).split(" "):
        w=re.sub(r'[^a-zA-Z\.\%%]','', f)
        w=w.lower()
        if w not in stopwords:
            if w != '':
                w = porter.stem(w)
                clean_dt.append(w)
    return clean_dt

Creating dataframes to store the predicted output:

In [None]:
df_predict=pd.DataFrame(columns=['Given','Predicted'])
df_predict_Eg=pd.DataFrame(columns=('Given','Predicted'))
df_predict_Gloss=pd.DataFrame(columns=('Given','Predicted'))

#### Model 1 - Predicting senses for each training words using gloss and example

In [174]:
cnt=0
for each_entry in train_List:
    w,ws=get_trainWord(each_entry) 
    clean_entry=(clean_up(each_entry))
    p=0
    max_ov=0
    given=w+"."+str(ws)
    for i in range(0,len(dict_gloss[w])):
        
        ov= set(clean_up(dict_gloss[w][i])).intersection(clean_entry)
        ov1=set(clean_up( dict_eg[w][i])).intersection(clean_entry)
        nov=len(ov)+len(ov1)
        if nov > max_ov:
            max_ov=nov
            p=i
    predicted=w+"."+str(p)
    
    #storing in python data frame
    df_predict.loc[cnt]=given,predicted
    cnt+=1

print df_predict

            Given   Predicted
0      affect.v.1  affect.v.0
1      affect.v.1  affect.v.1
2      affect.v.1  affect.v.2
3      affect.v.1  affect.v.0
4      affect.v.1  affect.v.1
5      affect.v.1  affect.v.1
6      affect.v.1  affect.v.0
7      affect.v.1  affect.v.2
8      affect.v.1  affect.v.0
9      affect.v.1  affect.v.0
10     affect.v.1  affect.v.0
11     affect.v.1  affect.v.0
12     affect.v.1  affect.v.0
13     affect.v.1  affect.v.0
14     affect.v.1  affect.v.2
15     affect.v.1  affect.v.0
16     affect.v.1  affect.v.0
17     affect.v.1  affect.v.0
18     affect.v.1  affect.v.0
19     affect.v.1  affect.v.1
20     affect.v.1  affect.v.0
21     affect.v.1  affect.v.0
22     affect.v.1  affect.v.1
23     affect.v.1  affect.v.0
24     affect.v.1  affect.v.0
25     affect.v.1  affect.v.1
26     affect.v.1  affect.v.1
27     affect.v.1  affect.v.0
28     affect.v.1  affect.v.0
29     affect.v.1  affect.v.0
...           ...         ...
22251    work.v.1    work.v.2
22252    w

In [145]:
df_predict.to_csv('OntologicalModel1.csv') # Exporting to csv

#### Model 2 - Predicting senses for each training word using only gloss from dictionary

In [None]:
cnt=0
for each_entry in train_List:
    w,ws=get_trainWord(each_entry) 
    clean_entry=(clean_up(each_entry))
    p=0
    max_ov=0
    given=w+"."+str(ws)
    for i in range(0,len(dict_gloss[w])):
        
        ov= set(clean_up(dict_gloss[w][i])).intersection(clean_entry)
        nov=len(ov)
        if nov > max_ov:
            max_ov=nov
            p=i
    predicted=w+"."+str(p)
    
    #store in python data frame
    df_predict_Gloss.loc[cnt]=given,predicted
    cnt+=1

In [196]:
print df_predict_Gloss.head()

        Given   Predicted
0  affect.v.1  affect.v.0
1  affect.v.1  affect.v.0
2  affect.v.1  affect.v.2
3  affect.v.1  affect.v.0
4  affect.v.1  affect.v.0


In [146]:
df_predict_Gloss.to_csv('OntologicalModel2.csv') # Exporting to csv

#### Model 3 - Predicting senses for each training word using only examples of dictionary

In [None]:
cnt=0

for each_entry in train_List:
    w,ws=get_trainWord(each_entry) 
    clean_entry=(clean_up(each_entry))
    p=0
    max_ov=0
    given=w+"."+str(ws)
    for i in range(0,len(dict_gloss[w])):
        
        ov1=set(clean_up( dict_eg[w][i])).intersection(clean_entry)
        nov=len(ov1)
        if nov > max_ov:
            max_ov=nov
            p=i
    predicted=w+"."+str(p)
    
    #store in python data frame
    df_predict_Eg.loc[cnt]=given,predicted
    cnt+=1
print df_predict_Eg

In [197]:
print df_predict_Eg.head()

        Given   Predicted
0  affect.v.1  affect.v.0
1  affect.v.1  affect.v.1
2  affect.v.1  affect.v.0
3  affect.v.1  affect.v.0
4  affect.v.1  affect.v.1


In [147]:
df_predict_Eg.to_csv('OntologicalModel3.csv') # Exporting to csv

#### Testing Model 1 on validation data

In [259]:
# Reading list with all the validation data
nw_validate=validate
validate_List = [line.split('\n') for line in nw_validate]

df_predict_validate = pd.DataFrame(columns=('Given','Predicted'))

cnt=0
for each_entry in validate_List:
    w,ws=get_trainWord(each_entry) 
    clean_entry=(clean_up(each_entry))
    p=0
    max_ov=0
    given=w+"."+str(ws)
    for i in range(0,len(dict_gloss[w])):
        
        ov= set(clean_up(dict_gloss[w][i])).intersection(clean_entry)
        ov1=set(clean_up(dict_eg[w][i])).intersection(clean_entry)
        nov=len(ov)+len(ov1)
        if nov > max_ov:
            max_ov=nov
            p=i
    predicted=w+"."+str(p)
    
    #store in python data frame
    df_predict_validate.loc[cnt]=given,predicted
    cnt+=1

print df_predict_validate

            Given     Predicted
0     capital.n.1   capital.n.3
1     capital.n.1   capital.n.3
2     capital.n.1   capital.n.0
3     capital.n.1   capital.n.0
4     capital.n.1   capital.n.0
5     capital.n.1   capital.n.0
6     capital.n.1   capital.n.3
7     capital.n.1   capital.n.0
8     capital.n.1   capital.n.0
9     capital.n.1   capital.n.0
10    capital.n.1   capital.n.1
11   exchange.n.3  exchange.n.2
12   exchange.n.3  exchange.n.2
13   exchange.n.5  exchange.n.0
14   exchange.n.3  exchange.n.2
15   exchange.n.3  exchange.n.2
16   exchange.n.3  exchange.n.0
17   exchange.n.3  exchange.n.2
18   exchange.n.3  exchange.n.2
19   exchange.n.3  exchange.n.2
20   exchange.n.1  exchange.n.2
21   exchange.n.1  exchange.n.1
22   exchange.n.5  exchange.n.0
23     remove.v.1    remove.v.0
24     remove.v.1    remove.v.0
25     remove.v.1    remove.v.0
26       feel.v.1      feel.v.0
27       feel.v.3      feel.v.0
28       feel.v.1      feel.v.0
29       feel.v.1      feel.v.1
..      

In [None]:
df_predict_validate.to_csv('OntologicalModel_ValidatePrediction.csv') # Exporting to csv

Defining function to calculate accuracy:

In [287]:
accuracies = {}

def accuracy(df):
    res = (df['Given'] == df['Predicted']).values
    return ((res==True).sum()/float(len(res))) * 100  

# Finding accuracy on train data and validate data:
accuracies['model1'] = accuracy(df_predict)
accuracies['model2'] = accuracy(df_predict_Gloss)
accuracies['model3'] = accuracy(df_predict_Eg)
accuracies['model4'] = accuracy(df_predict_validate)
accuracies['model5'] = accuracy(df_predict_validate_CL)

print("{" + "\n".join("{}: {}".format(k, v) for k, v in accuracies.items()) + "}")

{model3: 9.32184372335
model2: 6.64691889951
model1: 9.88734796463
model5: 4.8231511254
model4: 7.82422293676}


Reading list with all the test data:

In [None]:
nw_test=test
test_List = [line.split('\n') for line in nw_test]

#### Prediction on test data to determine word sense using Model 3

In [236]:
# Predicting test data sense for each word using the examples (Model 3)
df_test_predict=pd.DataFrame(columns=())

cnt=0
for each_entry in test_List:
    w,ws=get_trainWord(each_entry) 
    clean_entry=(clean_up(each_entry))
    
    p=0
    max_ov=0
    given=w
    
    if dict_eg[w] != "":
        for i in range(0,len(dict_eg[w])):
            #print(given,dict_gloss[w])

            #ov= set(clean_up(dict_gloss[w][i])).intersection(clean_entry)
            ov1=set(clean_up(dict_eg[w][i])).intersection(clean_entry)
            nov=len(ov1)#+len(ov1)
            if nov > max_ov:
                max_ov=nov
                p=i
        predicted=w+"."+str(p)

        #store in python data frame
        if (dict_eg[w]):
            df_test_predict[cnt]=given,predicted
            cnt+=1
        
print df_test_predict

          0            1            2            3            4     \
0    capital.n    capital.n    capital.n    capital.n    capital.n   
1  capital.n.1  capital.n.0  capital.n.1  capital.n.1  capital.n.1   

          5            6            7            8            9     \
0    capital.n    capital.n    capital.n    capital.n    capital.n   
1  capital.n.0  capital.n.0  capital.n.1  capital.n.0  capital.n.3   

       ...           3908      3909          3910          3911          3912  \
0      ...         keep.v    keep.v    maintain.v    maintain.v    maintain.v   
1      ...       keep.v.7  keep.v.3  maintain.v.2  maintain.v.0  maintain.v.1   

           3913          3914          3915          3916          3917  
0    maintain.v    maintain.v    maintain.v    maintain.v    maintain.v  
1  maintain.v.0  maintain.v.0  maintain.v.0  maintain.v.0  maintain.v.0  

[2 rows x 3918 columns]


In [148]:
df_test_predict.to_csv('OntologicalModel_TestPrediction.csv') # Export to csv

In [198]:
print df_test_predict.head()

          0            1            2            3            4     \
0    capital.n    capital.n    capital.n    capital.n    capital.n   
1  capital.n.1  capital.n.0  capital.n.1  capital.n.0  capital.n.0   

          5            6            7            8            9     \
0    capital.n    capital.n    capital.n    capital.n    capital.n   
1  capital.n.0  capital.n.0  capital.n.0  capital.n.0  capital.n.0   

       ...           3908      3909          3910          3911          3912  \
0      ...         keep.v    keep.v    maintain.v    maintain.v    maintain.v   
1      ...       keep.v.0  keep.v.3  maintain.v.1  maintain.v.0  maintain.v.0   

           3913          3914          3915          3916          3917  
0    maintain.v    maintain.v    maintain.v    maintain.v    maintain.v  
1  maintain.v.3  maintain.v.2  maintain.v.0  maintain.v.0  maintain.v.0  

[2 rows x 3918 columns]


## Ontological WSD Systems - Corpus Lesk

Building new dictionary for Corpus Lesk using training data:

In [191]:
dict_eg_new = dict()

def get_trainWord_new(inpList):
    word=re.sub(r'[^a-zA-Z\.]','',(str((inpList)).split("|"))[0])
    return word

for each_entry in train_List:
    w = get_trainWord_new(each_entry)
    clean_entry = (clean_up(each_entry))
    dict_eg_new[w] = clean_entry

print dict_eg_new

{'begin.v': ['begin.v', u'amend', u'aim', u'skirt', u'suprem', 'court', u'rule', 'threw', u'convict', u'texa', u'flagburn', u'ground', 'freedom', 'speech', u'violat', '.', u'feder', u'research', 'said', u'lungcanc', u'mortal', u'rate', u'peopl', u'year', 'age', 'begun', u'declin', u'particularli', 'white', u'male', '.', u'nation', 'cancer', u'institut', 'also', u'project', u'overal', 'u.s.', u'mortal', u'rate', 'lung', 'cancer', '%%', 'begin', '%%', 'drop', u'sever', u'year', u'cigarett', u'smoke', u'continu', u'abat', '.', 'bush', 'met', 'south', 'korean', u'presid', 'roh', u'indic', 'seoul', u'plan', u'eas', 'trade', u'rule', u'ensur', u'economi', u'becom', 'open', u'industri', u'nation', u'mid', '.', 'bush', u'assur', 'roh', 'u.s.', 'would', 'stand', u'secur', u'commit', 'long', 'threat', 'communist', 'north', 'korea', '.'], 'complain.v': ['complain.v', 'old', 'order', u'act', u'aton', 'sung', 'grunnfeu', u'arapaci', u'love', 'serbantian', 'import', u'enter', u'deliv', 'wellknown', 

Combining all the 3 dictionaries:

In [252]:
ndic = dict(dict_eg.items() + dict_eg_new.items() + dict_gloss.items())

In [249]:
df_predict_validate_CL = pd.DataFrame(columns=('Given','Predicted'))

Implementing Corpus Lesk on validation data using augmented dictionary:

In [263]:
cnt=0
for each_entry in validate_List:
    w,ws=get_trainWord(each_entry) 
    clean_entry=(clean_up(each_entry))
    p=0
    max_ov=0
    given=w+"."+str(ws)
    for i in range(0,len(ndic[w])):
        ov= set(clean_up(ndic[w][i])).intersection(clean_entry)
        nov=len(ov)
        if nov > max_ov:
            max_ov=nov
            p=i
    predicted=w+"."+str(p)
    
    #store in python data frame
    df_predict_validate_CL.loc[cnt]=given,predicted
    cnt+=1

print df_predict_validate_CL

            Given     Predicted
0     capital.n.1   capital.n.0
1     capital.n.1   capital.n.0
2     capital.n.1   capital.n.0
3     capital.n.1   capital.n.0
4     capital.n.1   capital.n.0
5     capital.n.1   capital.n.0
6     capital.n.1   capital.n.0
7     capital.n.1   capital.n.0
8     capital.n.1   capital.n.0
9     capital.n.1   capital.n.0
10    capital.n.1   capital.n.0
11   exchange.n.3  exchange.n.4
12   exchange.n.3  exchange.n.0
13   exchange.n.5  exchange.n.0
14   exchange.n.3  exchange.n.0
15   exchange.n.3  exchange.n.0
16   exchange.n.3  exchange.n.0
17   exchange.n.3  exchange.n.0
18   exchange.n.3  exchange.n.0
19   exchange.n.3  exchange.n.0
20   exchange.n.1  exchange.n.0
21   exchange.n.1  exchange.n.0
22   exchange.n.5  exchange.n.0
23     remove.v.1    remove.v.0
24     remove.v.1    remove.v.0
25     remove.v.1    remove.v.0
26       feel.v.1      feel.v.0
27       feel.v.3      feel.v.0
28       feel.v.1      feel.v.0
29       feel.v.1      feel.v.0
..      

In [265]:
df_predict_validate_CL.to_csv('OntologicalModel_ValidatePrediction_CorpusLesk.csv') # Exporting to csv

Implementing Corpus Lesk on test data:

In [234]:
df_test_predict_CL=pd.DataFrame(columns=())

cnt=0
for each_entry in test_List:
    w,ws=get_trainWord(each_entry) 
    clean_entry=(clean_up(each_entry))
    
    p=0
    max_ov=0
    given=w
    
    if ndic[w] != "":
        for i in range(0,len(ndic[w])):

            ov= set(clean_up(ndic[w][i])).intersection(clean_entry)
            nov=len(ov)
            if nov > max_ov:
                max_ov=nov
                p=i
        predicted=w+"."+str(p)

        #store in python data frame
        if (ndic[w]):
            df_test_predict_CL[cnt]=given,predicted
            cnt+=1
        
print df_test_predict_CL


          0            1            2            3            4     \
0    capital.n    capital.n    capital.n    capital.n    capital.n   
1  capital.n.1  capital.n.0  capital.n.1  capital.n.0  capital.n.0   

          5            6            7            8            9     \
0    capital.n    capital.n    capital.n    capital.n    capital.n   
1  capital.n.0  capital.n.0  capital.n.0  capital.n.0  capital.n.0   

       ...           3908      3909          3910          3911          3912  \
0      ...         keep.v    keep.v    maintain.v    maintain.v    maintain.v   
1      ...       keep.v.0  keep.v.3  maintain.v.1  maintain.v.0  maintain.v.0   

           3913          3914          3915          3916          3917  
0    maintain.v    maintain.v    maintain.v    maintain.v    maintain.v  
1  maintain.v.3  maintain.v.2  maintain.v.0  maintain.v.0  maintain.v.0  

[2 rows x 3918 columns]


In [235]:
df_test_predict_CL.to_csv('OntologicalModel_TestPrediction_CorpusLesk.csv') # Export to csv

Defining function to build consecutive words:

In [None]:
def find_bigrams(input_list):
    return zip(input_list, input_list[1:])


Defining clean up function for consecutive words:

In [None]:
def clean_up_new(inp):
    clean_dt_new=[]
    for f in str(inp).split(" "):
        w=re.sub(r'[^a-zA-Z\.\%%]','', f)
        w=w.lower()
        if w not in stopwords:
            if w != '':
                w = porter.stem(w)
                clean_dt_new.append(w)
    clean_dt_new = find_bigrams(clean_dt_new)
    return clean_dt_new

In [280]:
# nw_validate=validate
# validate_List = [line.split('\n') for line in nw_validate]

df_o = pd.DataFrame(columns=('Given','Predicted'))

cnt=0
for each_entry in validate_List:
    w,ws=get_trainWord(each_entry) 
    clean_entry=(clean_up_new(each_entry))
    p=0
    max_ov=0
    given=w+"."+str(ws)
    for i in range(0,len(dict_gloss[w])):
        
        
        ov= set(find_bigrams(clean_up(dict_gloss[w][i]))).intersection(clean_entry)
        #ov1=set(clean_up_new(dict_eg[w][i])).intersection(clean_entry)
        nov=len(ov)
        if nov > max_ov:
            max_ov=nov
            p=i
    predicted=w+"."+str(p)
    
    #store in python data frame
    df_o.loc[cnt]=given,predicted
    cnt+=1

print df_o

            Given     Predicted
0     capital.n.1   capital.n.0
1     capital.n.1   capital.n.0
2     capital.n.1   capital.n.0
3     capital.n.1   capital.n.0
4     capital.n.1   capital.n.0
5     capital.n.1   capital.n.0
6     capital.n.1   capital.n.0
7     capital.n.1   capital.n.0
8     capital.n.1   capital.n.0
9     capital.n.1   capital.n.0
10    capital.n.1   capital.n.0
11   exchange.n.3  exchange.n.0
12   exchange.n.3  exchange.n.0
13   exchange.n.5  exchange.n.0
14   exchange.n.3  exchange.n.0
15   exchange.n.3  exchange.n.0
16   exchange.n.3  exchange.n.0
17   exchange.n.3  exchange.n.0
18   exchange.n.3  exchange.n.0
19   exchange.n.3  exchange.n.0
20   exchange.n.1  exchange.n.0
21   exchange.n.1  exchange.n.0
22   exchange.n.5  exchange.n.0
23     remove.v.1    remove.v.0
24     remove.v.1    remove.v.0
25     remove.v.1    remove.v.0
26       feel.v.1      feel.v.0
27       feel.v.3      feel.v.0
28       feel.v.1      feel.v.0
29       feel.v.1      feel.v.0
..      