In [None]:
import os
import math
import math as m
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

import matplotlib.pyplot as plt
from string import punctuation as puncs
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
def preprocessing(text):
    """Normalize raw text into a list of stemmed, lemmatized tokens.

    Steps, in order: lowercase, split on whitespace, strip ASCII punctuation
    from each token, drop English stop words and empty tokens, then apply
    Porter stemming followed by WordNet lemmatization.

    Parameters
    ----------
    text : str
        Raw text (a whole document or a short query).

    Returns
    -------
    list of str
        Processed tokens in their original order; duplicates are kept.
    """
    # Word tokenization. split() with no argument splits on any run of
    # whitespace, so newlines and tabs in multi-line documents separate words
    # and repeated spaces do not produce empty tokens.
    words = text.lower().split()

    # Set of all English stop words from the NLTK stopwords corpus.
    stop_words = set(stopwords.words("english"))

    # Delete every character listed in string.punctuation from each token.
    trans = str.maketrans("", "", puncs)
    words = [w.translate(trans) for w in words]

    # Remove stop words, and tokens that were pure punctuation (now empty).
    words = [w for w in words if w and w not in stop_words]

    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer()

    # Stem each token first, then lemmatize the resulting stem.
    return [lemmatizer.lemmatize(ps.stem(w)) for w in words]

def get_synonyms(word):
    """Return the lemma names of every WordNet synset of ``word``.

    Names are collected synset by synset, in the order WordNet yields them.
    Repeated names are not removed. A word with no synsets gives ``[]``.
    """
    lemma_names = []
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            lemma_names.append(lemma.name())
    return lemma_names

def count_simlilarity(word1, word2):
    """Wu-Palmer similarity between the first WordNet synsets of two words.

    Parameters
    ----------
    word1, word2 : str
        Words to look up in WordNet.

    Returns
    -------
    float or int
        The positive ``wup_similarity`` score, or ``0`` when either word has
        no synset or WordNet reports no positive similarity.

    Notes
    -----
    Only the two "no result" cases above are mapped to ``0``. Any other
    error (for example a ``KeyboardInterrupt`` during a long scoring loop,
    or a failure to load WordNet) propagates to the caller instead of being
    silently reported as zero similarity.
    """
    synsets1 = wordnet.synsets(word1)
    synsets2 = wordnet.synsets(word2)

    # A word unknown to WordNet has no synsets, hence nothing to compare.
    if not synsets1 or not synsets2:
        return 0

    # Only the first synset returned for each word is compared.
    score = synsets1[0].wup_similarity(synsets2[0])

    # wup_similarity can return None when the synsets share no ancestor.
    if score is None or score <= 0:
        return 0
    return score

In [None]:
classes = ['Diabetes', 'Cardiovascular Disease', 'Chronic Obstructive Pulmonary Disease', 'Cancer']

# One list of unique preprocessed tokens per class, in the same order as
# `classes`.
docList = []
for cls in classes:
    # Each class's reference text is read from "<class name>.txt", resolved
    # against the current working directory. The context manager closes the
    # file handle as soon as the text has been read.
    with open(cls + ".txt", "r") as doc_file:
        text = doc_file.read()

    # Preprocess the text and drop duplicate tokens. Sorting gives the same
    # token order on every run (plain set iteration order is not stable
    # between interpreter sessions).
    docList.append(sorted(set(preprocessing(text))))

# Each document's tokens together with the WordNet synonyms of those tokens,
# de-duplicated.
enhDocList = [
    sorted(set(doc).union(syn for word in doc for syn in get_synonyms(word)))
    for doc in docList
]


In [None]:
def wordSim_classify(Query):
    """Score a free-text query against every document in ``docList``.

    The score of document ``i`` is the sum of two parts:

    * the total ``count_simlilarity`` between every query token and every
      token of the document, divided by the number of query tokens;
    * the number of exact matches between the document's tokens and the
      query tokens expanded with their synonyms (``get_synonyms``).

    Parameters
    ----------
    Query : str
        Raw query text; it is passed through ``preprocessing``.

    Returns
    -------
    dict
        Maps each document index (position in ``docList``) to its score.
        Every score is 0 when no query token survives preprocessing.
    """
    # Unique query tokens, sorted so the summation order is the same on
    # every run.
    query_tokens = sorted(set(preprocessing(Query)))

    # Nothing to compare: report a zero score per document rather than
    # dividing by zero below.
    if not query_tokens:
        return {i: 0 for i in range(len(docList))}

    # Query tokens plus their synonyms as one de-duplicated set, so a token
    # that appears among its own synonyms is matched once, not twice.
    expanded_query = set(query_tokens)
    for token in query_tokens:
        expanded_query.update(get_synonyms(token))

    doc_score_dic = {}
    # Iterate over however many documents exist instead of assuming four.
    for i, doc in enumerate(docList):
        similarity = sum(
            count_simlilarity(token, doc_word)
            for token in query_tokens
            for doc_word in doc
        ) / len(query_tokens)

        # Exact-match count between expanded query terms and document tokens.
        match_count = sum(doc.count(term) for term in expanded_query)

        doc_score_dic[i] = similarity + match_count

    return doc_score_dic



In [None]:
# Example query, scored against every class document.
Query = "high blood pressure and heart pain"
d = wordSim_classify(Query)

# Predicted class: the label whose document index has the highest score.
classes[max(d.keys(), key=d.__getitem__)]

In [None]:
# Display the raw scores: keys are document indices (positions in `classes`),
# values are the scores returned by wordSim_classify.
d