In [1]:
import os
import math
import math as m
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

import matplotlib.pyplot as plt
from string import punctuation as puncs
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Dataset Cleaning

In [2]:
#cleaning the dataset
df = pd.read_csv('Chronic_Disease_dataset.csv')
df = df[['Topic', 'Question']]



  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
#printing value counts
df.Topic.value_counts()

Diabetes                                           110251
Cardiovascular Disease                             108803
Chronic Obstructive Pulmonary Disease              106860
Cancer                                             104580
Asthma                                              54884
Arthritis                                           54810
Overarching Conditions                              54531
Nutrition, Physical Activity, and Weight Status     53619
Alcohol                                             46998
Tobacco                                             41652
Older Adults                                        19536
Chronic Kidney Disease                              18024
Oral Health                                         15075
Mental Health                                        9615
Immunization                                         6960
Reproductive Health                                  5347
Disability                                           3392
Name: Topic, d

In [4]:
classes = df.Topic.value_counts().head(4).index.tolist()
#printing most common classes
print(classes)

['Diabetes', 'Cardiovascular Disease', 'Chronic Obstructive Pulmonary Disease', 'Cancer']


In [5]:
df = df.loc[df['Topic'].isin(classes)]
print(df.shape)
df.to_csv('dataset.csv', index=False)


(430494, 2)


# Splitting the dataset

In [6]:
# spliting the dataset
df = pd.read_csv('dataset.csv')
train, test = train_test_split(df, test_size=0.5, random_state=42, stratify=df[['Topic']])
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)


for cls in classes:
    out = '. '.join(df[df.Topic==cls]["Question"])
    with open(cls+".txt", "w") as text_file:
        text_file.write(out)

# Helper Functions

In [7]:
#helper functions

def preprocessing(text):
    
    text = text.lower()
    # word tokenization: spliting the snetence into words
    words = [w for w in text.split(" ")]
    
    # making a set of all stop worked in english using nltk stopwords coupus
    stop_words = set(stopwords.words("english")) 
    
    trans = str.maketrans("", "", puncs)
    words = [w.translate(trans) for w in words]
            
    # removing stop words 
    words = [w for w in words if w not in stop_words]
            
    
    # creating a list to get all the words after preprocessing
    clean_sent = []
    # making a WordNet lemmatizer's object using nltk
    lemmatizer = WordNetLemmatizer()  
    # creating the object of porter stemmer 
    ps = PorterStemmer()  
    
    #stemming and lemmatization
    for w in words:
        w = ps.stem(w)
        w = lemmatizer.lemmatize(w)

        clean_sent.append(w)
            
    return clean_sent

### function to calculate term frequency in the doc
def termFrequencyInDoc(wordList):
#     """
#     This function should take a list of words as input argument, and output a dictionary of words such that
#     each word that appears in the document is key in the dictionary and it's value is term frequency
#     """
    termFrequency_dic={}
    for w in wordList:
        if w in termFrequency_dic.keys():
            termFrequency_dic[w]+=1
        else:
            termFrequency_dic[w]=1
    return termFrequency_dic



# termFrequencyInDoc(removePuncs(wordList(txtFile)))
def txtfToDictionary(txtFile):
    text = open(txtFile, "r").read()
    return termFrequencyInDoc(preprocessing(text))


## function to calculate word Document frequency
def wordDocFre(dicList):
    vocan = {}
    for docDic in dicList:
        for keys in docDic.keys():
            if keys in vocan.keys():
                ## add some code here
                vocan[keys]+=1
            else:
                ## add some code here
                vocan[keys]=1
    return vocan


## construct a function named inverseDocFre() that takes dictionary returned from wordDocFre functions above
## and outputs inverse document frequency of each word. You can do it!
def invrDocFre(dic,M,Base):
    dictidf = {}
    for keys in dic.keys():
        if(dic[keys]!=0):
            dictidf[keys]= math.log((float(M+1)/dic[keys]),Base)
        else:
            print(keys+"  "+str(dic[keys]))
    return dictidf



### this function will calculate tf-idf for everyword in doc
## this is the main function which calls the above functions
def tfidf(list_of_doc_dic,idf_dic): 
    #first input is the list of all disctionaries after Punctuations have been removed
    
    list_of_tfidf_dic=[] #this contains tfidf dictionaries for each document
    for dic in list_of_doc_dic:
        tfidf_dic = {}
        for keys in dic.keys():
            tfidf_dic[keys] = dic[keys] * idf_dic[keys]
        list_of_tfidf_dic.append(tfidf_dic)
        
    return list_of_tfidf_dic


def get_synonyms(word):
    return [s.name() for synonym in wordnet.synsets(word) for s in synonym.lemmas()]

def count_simlilarity(word1, word2):
    
    try:
        word1 = wordnet.synsets(word1)[0].name()
        word2 = wordnet.synsets(word2)[0].name()
        w1 = wordnet.synset(word1)
        w2 = wordnet.synset(word2)
        s = w1.wup_similarity(w2)
        
        if(s>0):
            return w1.wup_similarity(w2)
        else:
            return 0

    except:
        return 0

# TF-IDF 

In [8]:
classes = ['Diabetes', 'Cardiovascular Disease', 'Chronic Obstructive Pulmonary Disease', 'Cancer']

dicList = []
for cls in classes:
    dicList.append(txtfToDictionary(cls+".txt"))
    
WDF_dict = wordDocFre(dicList)
IDF_dict = invrDocFre(WDF_dict,len(classes),10)
TFIDF_dic_List = tfidf(dicList,IDF_dict)

Doc_V = []
Vocabulary = np.transpose(np.array(pd.Series(IDF_dict).index))
for docdic in TFIDF_dic_List:
    Document_V = np.zeros(len(Vocabulary))
    for keys in docdic.keys():
        Document_V[np.where(Vocabulary == keys)] = docdic[keys]
    Doc_V.append(Document_V)
    
    
    
    
def tfidf_classify(Query):

    doc_score_dic = {}
    Vocabulary = np.transpose(np.array(pd.Series(IDF_dict).index))
    Query_V = np.zeros(len(Vocabulary))

    words_q = preprocessing(Query)    

    for i in words_q:
        Query_V[np.where(Vocabulary == i)] = 1

    count = 0
    for V in Doc_V:
        doc_score_dic[count] = np.dot(Query_V, V)
        count = count + 1
    
    return doc_score_dic


#### Predictions using TF-IDF

In [9]:
test = pd.read_csv('test.csv')
y_true = test['Topic'].tolist()

y_pre = []
for q in test['Question']:
    d = tfidf_classify(q)
    y_pre.append(classes[max(d, key=d.get)])
    

acc = accuracy_score(y_pre, y_true)
print("Accuracy", acc )
print(classification_report(y_true, y_pre, target_names=classes))

Accuracy 0.918017338861666
                                       precision    recall  f1-score   support

                             Diabetes       1.00      0.89      0.94      1220
               Cardiovascular Disease       1.00      0.87      0.93      2289
Chronic Obstructive Pulmonary Disease       1.00      1.00      1.00      1797
                               Cancer       0.00      0.00      0.00         0

                             accuracy                           0.92      5306
                            macro avg       0.75      0.69      0.72      5306
                         weighted avg       1.00      0.92      0.96      5306



  _warn_prf(average, modifier, msg_start, len(result))


# Word Similarity Method using WordNet

In [10]:
classes = ['Diabetes', 'Cardiovascular Disease', 'Chronic Obstructive Pulmonary Disease', 'Cancer']

docList = []
for cls in classes:
    #reading the documents one by one
    text = open(cls+".txt", "r").read()
    
    #preprocessing the text in each of the files
    docList.append(preprocessing(text))    

#removing all the duplicate tokens from the data
for i in range(len(docList)):
    docList[i] = list(set(docList[i]))
    
enhDocList = []
for doc in docList:
    enhDocList.append(list(set(doc + [w for word in doc for w in get_synonyms(word)])))

    
    
def wordSim_classify(Query):

    queryList = list(set(preprocessing(Query)))
    enhQuery = queryList + list(set([w for word in queryList for w in get_synonyms(word)]))

    similarity = [0, 0, 0, 0]
    for word in queryList:
        for i in range(len(docList)):
            for w in docList[i]:
                similarity[i] = similarity[i] + count_simlilarity(word, w)
    similarity = [n/len(queryList) for n in similarity]

    match_count= [0, 0, 0, 0]
    for word in enhQuery:
        for i in range(len(docList)):
            for w in docList[i]:
                if(word==w):
                    match_count[i] = match_count[i] + 1
    similarity = [x + y for x, y in zip(similarity, match_count)]
    
    doc_score_dic = {}
    for i in range(len(similarity)):
        doc_score_dic[i] = similarity[i]
        
    return doc_score_dic

In [11]:
test = pd.read_csv('test.csv')
y_true = test['Topic'].tolist()

y_pre = []
for q in test['Question']:
    d = wordSim_classify(q)
    y_pre.append(classes[max(d, key=d.get)])
    

acc = accuracy_score(y_pre, y_true)
print("Accuracy", acc )
print(classification_report(y_true, y_pre, target_names=classes))

Accuracy 0.9858650584244252
                                       precision    recall  f1-score   support

                             Diabetes       1.00      1.00      1.00      1220
               Cardiovascular Disease       1.00      0.97      0.98      2289
Chronic Obstructive Pulmonary Disease       1.00      1.00      1.00      1797
                               Cancer       0.00      0.00      0.00         0

                             accuracy                           0.99      5306
                            macro avg       0.75      0.74      0.75      5306
                         weighted avg       1.00      0.99      0.99      5306



In [12]:
Query = "high blood pressure and heart pain"
d = wordSim_classify(Query)

classes[max(d, key=d.get)]

'Cardiovascular Disease'

In [13]:
d


{0: 10.786472102029375,
 1: 11.159881576971358,
 2: 2.5339977016447612,
 3: 6.0582517668090405}