### Practical 7

### Read text from files and perform preprocessing activities such as word sense disambiguation, tokenization and stop-word removal. Create a question-answer context in which the program reads queries and responds with the correct meaning of the input word.

In [1]:
import nltk
import codecs # defines a set of base classes which define the interface
from nltk.tokenize import PunktSentenceTokenizer # abstract class for the default sentence tokenizer, i.e. sent_tokenize()
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet

In [2]:
# Stop words in nltk: print the English stop-word list; the filtering
# functions defined later in this notebook drop any token found in this list.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
# Remove the stop words, word stemming, and get a list of word tokens
def filtered_sent(sentence):
    """Tokenize ``sentence``, drop stop words and expand the rest with synonyms.

    For every token that is not an English stop word, the result contains:

    * the token after Porter stemming followed by WordNet lemmatization, and
    * every synonym returned by ``synonyms_creator`` for the *unstemmed* token.

    Stop words contribute nothing to the result. In particular their synonyms
    are not added, so the match list is not flooded with unrelated senses of
    function words.

    Stop-word matching is case-sensitive and the NLTK list is lower-case, so
    callers are expected to pass lower-cased text.

    Args:
        sentence: Text to preprocess.

    Returns:
        A list of strings (duplicates preserved, in token order).
    """
    lemmatizer = WordNetLemmatizer()  # lemmatizes the words
    stemmer = PorterStemmer()  # Stemmer stems the root of the word
    stop_words = set(stopwords.words('english'))

    filtered_sent_list = []
    for word in word_tokenize(sentence):
        if word in stop_words:
            # Skip the stop word entirely, including its synonym expansion.
            continue
        filtered_sent_list.append(lemmatizer.lemmatize(stemmer.stem(word)))
        filtered_sent_list.extend(synonyms_creator(word))
    return filtered_sent_list
            
# Add synonyms to the match list
def synonyms_creator(word):
    """Return the lemma names of every WordNet synset found for ``word``.

    Names are listed synset by synset, in the order WordNet yields them, and
    duplicates are kept. A word with no synsets produces an empty list.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]

In [4]:
def similarity_check(word1, word2):
    """Return the Wu-Palmer similarity of the first noun senses of two words.

    Each word is turned into the synset name ``<word>.n.01`` (format
    ``word.pos.nn``) and looked up in WordNet. The score denotes how similar
    the two senses are, based on the depth of the two senses in the taxonomy
    and that of their Least Common Subsumer (most specific ancestor node).

    Args:
        word1: First word (lemma) to compare.
        word2: Second word (lemma) to compare.

    Returns:
        The similarity score, or ``0`` when either word has no first noun
        sense in WordNet or when WordNet reports no score for the pair.

    Only WordNet lookup failures are mapped to ``0``. Other errors (for
    example a kernel interrupt, or a missing WordNet corpus) propagate to the
    caller instead of being silently turned into a score.
    """
    # Local import keeps this definition self-contained; after the first call
    # it is only a cheap lookup in sys.modules.
    from nltk.corpus.reader.wordnet import WordNetError

    try:
        w1 = wordnet.synset(word1 + '.n.01')  # word.pos.nn
        w2 = wordnet.synset(word2 + '.n.01')
        score = w1.wup_similarity(w2)
    except WordNetError:
        # Unknown lemma, or no first noun sense for it.
        return 0
    # Callers add the result to a running total, so never hand back None.
    return 0 if score is None else score

In [5]:
# Quick check of the fallback path: the recorded output is 0, presumably
# because the misspelled 'similaritp' has no WordNet noun entry - TODO confirm.
print(similarity_check('similaritp', 'similarity'))

0


In [6]:
def simple_filter(sentence):
    """Tokenize ``sentence`` and return its lemmatized non-stop-word tokens.

    Tokens found in NLTK's English stop-word list are discarded; every other
    token is passed through the WordNet lemmatizer. Token order and
    duplicates are preserved.
    """
    wnl = WordNetLemmatizer()
    ignored = set(stopwords.words('english'))
    return [
        wnl.lemmatize(token)
        for token in word_tokenize(sentence)
        if token not in ignored
    ]

In [7]:
if __name__ == '__main__':
    # Load the two reference texts. The `with` blocks close each file handle
    # as soon as its contents have been read.
    with codecs.open('cricketbat.txt', 'r', 'utf-8') as cricfile:
        sent2 = cricfile.read().lower()
    with codecs.open('vampirebat.txt', 'r', 'utf-8') as vampirefile:
        sent1 = vampirefile.read().lower()

    # The reference texts never change between queries, so preprocess them
    # once instead of on every loop iteration.
    simple_sent1 = simple_filter(sent1)      # tokens for the similarity score
    simple_sent2 = simple_filter(sent2)
    filtered_sent1 = filtered_sent(sent1)    # synonym-expanded tokens for exact matching
    filtered_sent2 = filtered_sent(sent2)

    sent3 = input('Enter query ').lower()

    # Answer queries until the user types 'end'.
    while sent3 != 'end':
        # Score 1: summed similarity of every query token against every token
        # of each reference text.
        sent31_similarity = 0
        sent32_similarity = 0
        for i in simple_filter(sent3):
            for j in simple_sent1:
                sent31_similarity = sent31_similarity + similarity_check(i, j)
            for j in simple_sent2:
                sent32_similarity = sent32_similarity + similarity_check(i, j)

        # Score 2: number of exact (query token, reference token) matches
        # after synonym expansion.
        sent1_count = 0
        sent2_count = 0
        for i in filtered_sent(sent3):
            sent1_count = sent1_count + filtered_sent1.count(i)
            sent2_count = sent2_count + filtered_sent2.count(i)

        # sent1 is the vampire-bat text and sent2 the cricket-bat text;
        # ties go to the cricket sense.
        if (sent1_count + sent31_similarity) > (sent2_count + sent32_similarity):
            print('Mammal Bat')
        else:
            print('Cricket Bat')

        sent3 = input('Enter query ').lower()

print('Terminated')

Enter query bats fly
Mammal Bat
Enter query I play with bat
Cricket Bat
Enter query bat is an animal
Cricket Bat
Enter query i hit a sixer with a bat
Cricket Bat
Enter query bats have eyes
Mammal Bat
Enter query end
Terminated
