In [1]:
import os
import math
import math as m
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

import matplotlib.pyplot as plt
from string import punctuation as puncs
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
def preprocessing(text):
    """Turn raw text into a list of normalized word tokens.

    Steps, in order: lower-case the text, split it on whitespace, drop English
    stop words, strip punctuation, then Porter-stem and WordNet-lemmatize each
    remaining token.

    Parameters
    ----------
    text : str
        Raw document or query text.

    Returns
    -------
    list of str
        Cleaned tokens in their original order (duplicates are kept). Tokens
        that are empty once punctuation has been removed are dropped.
    """
    # Set of English stop words from the NLTK stopwords corpus.
    stop_words = set(stopwords.words("english"))
    # Translation table that deletes every ASCII punctuation character.
    trans = str.maketrans("", "", puncs)
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer()

    clean_sent = []
    # split() without an argument splits on any run of whitespace, so newlines
    # and tabs separate words too and repeated spaces yield no empty tokens.
    for raw in text.lower().split():
        w = raw.translate(trans)

        # Tokens made only of punctuation (e.g. "--") are empty now; skip them.
        if not w:
            continue

        # Check the stop-word list both before and after removing inner
        # punctuation: the list holds contractions such as "don't", which
        # would no longer match once the apostrophe has been deleted.
        if raw.strip(puncs) in stop_words or w in stop_words:
            continue

        # Stemming first, then lemmatization of the stem.
        w = ps.stem(w)
        w = lemmatizer.lemmatize(w)
        clean_sent.append(w)

    return clean_sent

### function to calculate term frequency in the doc
def termFrequencyInDoc(wordList):
    """Count how often each word occurs in a document.

    Parameters
    ----------
    wordList : iterable
        The words of one document, with repetitions.

    Returns
    -------
    dict
        Maps every distinct word to the number of times it appears in
        ``wordList``; keys follow first-occurrence order.
    """
    occurrences = {}
    for token in wordList:
        # A word seen for the first time starts from 0.
        occurrences[token] = occurrences.get(token, 0) + 1
    return occurrences



# termFrequencyInDoc(removePuncs(wordList(txtFile)))
def txtfToDictionary(txtFile, encoding=None):
    """Read a text file and return the term-frequency dictionary of its content.

    Parameters
    ----------
    txtFile : str or path-like
        Path of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform default encoding.

    Returns
    -------
    dict
        Result of ``termFrequencyInDoc(preprocessing(text))`` for the file's text.
    """
    # The context manager closes the file handle even if reading fails.
    with open(txtFile, "r", encoding=encoding) as handle:
        text = handle.read()
    return termFrequencyInDoc(preprocessing(text))


## function to calculate word Document frequency
def wordDocFre(dicList):
    """Compute the document frequency of every word.

    Parameters
    ----------
    dicList : iterable of dict
        One dictionary per document; only the keys (the words) are used.

    Returns
    -------
    dict
        Maps each word to the number of dictionaries in ``dicList`` that
        contain it as a key.
    """
    doc_counts = {}
    for doc_terms in dicList:
        for term in doc_terms.keys():
            # Each document contributes at most 1 per word, since keys are unique.
            doc_counts[term] = doc_counts.get(term, 0) + 1
    return doc_counts


## invrDocFre() takes the document-frequency dictionary returned by wordDocFre() above,
## the number of documents M and the logarithm base, and returns the inverse document frequency of each word.
def invrDocFre(dic, M, Base):
    """Compute the inverse document frequency (IDF) of every word.

    The IDF of a word is ``log((M + 1) / df)`` taken in base ``Base``, where
    ``df`` is the word's document frequency.

    Parameters
    ----------
    dic : dict
        Maps each word to its document frequency.
    M : int
        Number of documents.
    Base : int or float
        Base of the logarithm.

    Returns
    -------
    dict
        Maps each word with a non-zero document frequency to its IDF. Words
        whose frequency is 0 are printed and left out of the result.
    """
    idf_by_word = {}
    for word, doc_count in dic.items():
        if doc_count == 0:
            # Division by zero is impossible to score; report the word instead.
            print(word+"  "+str(doc_count))
            continue
        idf_by_word[word] = math.log(float(M+1)/doc_count, Base)
    return idf_by_word



### This function calculates the tf-idf weight of every word in each document.
## It combines the term-frequency dictionaries with the IDF dictionary produced by the functions above.
def tfidf(list_of_doc_dic, idf_dic):
    """Compute tf-idf weights for every word of every document.

    Parameters
    ----------
    list_of_doc_dic : iterable of dict
        One term-frequency dictionary per document.
    idf_dic : dict
        Maps each word to its inverse document frequency. Every word that
        occurs in a document must be present, otherwise ``KeyError`` is raised.

    Returns
    -------
    list of dict
        One dictionary per input document, in the same order, mapping each
        word to ``term frequency * idf``.
    """
    return [
        {word: count * idf_dic[word] for word, count in doc_counts.items()}
        for doc_counts in list_of_doc_dic
    ]

In [None]:
# Each class is described by one text file named "<class name>.txt",
# looked up relative to the current working directory.
classes = ['Diabetes', 'Cardiovascular Disease', 'Chronic Obstructive Pulmonary Disease', 'Cancer']

# Term-frequency dictionary of every class document, in the order of `classes`.
dicList = []
for cls in classes:
    dicList.append(txtfToDictionary(cls+".txt"))

WDF_dict = wordDocFre(dicList)                      # word -> number of documents containing it
IDF_dict = invrDocFre(WDF_dict,len(classes),10)     # word -> base-10 inverse document frequency
TFIDF_dic_List = tfidf(dicList,IDF_dict)            # per document: word -> tf-idf weight

# The vocabulary is the set of words that have an IDF value, in dictionary order.
Vocabulary = np.array(list(IDF_dict), dtype=object)
# Word -> column lookup, so each weight is placed with one dictionary access
# instead of scanning the whole vocabulary for every word.
word_to_column = {word: column for column, word in enumerate(Vocabulary)}

# One dense tf-idf vector per class document, aligned with `Vocabulary`.
Doc_V = []
for docdic in TFIDF_dic_List:
    Document_V = np.zeros(len(Vocabulary))
    for keys in docdic.keys():
        Document_V[word_to_column[keys]] = docdic[keys]
    Doc_V.append(Document_V)

In [None]:
def tfidf_classify(Query):
    """Score a query against every class document.

    The query is preprocessed and turned into a binary vector over the
    vocabulary of ``IDF_dict`` (1 where the word occurs in the query, however
    often; 0 elsewhere). The score of a document is the dot product of that
    vector with the document's vector in ``Doc_V``.

    Parameters
    ----------
    Query : str
        Free-text query.

    Returns
    -------
    dict
        Maps the position of each vector in ``Doc_V`` (0, 1, 2, ...) to its score.
    """
    vocab_terms = np.array(pd.Series(IDF_dict).index)
    query_terms = set(preprocessing(Query))

    # Binary presence vector; query words missing from the vocabulary are ignored.
    Query_V = np.array(
        [1.0 if term in query_terms else 0.0 for term in vocab_terms],
        dtype=float,
    )

    return {position: np.dot(Query_V, vector) for position, vector in enumerate(Doc_V)}


In [None]:
# Example query: score it against every class document.
Query = "high blood pressure and heart pain"
d = tfidf_classify(Query)

# Cell output: the name of the class whose document received the highest score.
best_index = max(d, key=lambda position: d[position])
classes[best_index]



In [None]:
# Show the raw score of every document for the query above (keys are positions in `classes`).
d