In [1]:
# LIBRARY FOR DATA LOADING
import xml.etree.ElementTree as ET

# LIBRARY FOR PREPROCESSING
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# LIBRARY FOR RANKED RETRIEVAL
import math
from collections import OrderedDict

In [2]:
# PREPROCESSING FUNCTION

def loadData(location):
    data = ET.parse(location)
    return data.getroot()

def docNumber(location):
    docNo = []
    data = loadData(location)
    for node in data.iter("number"):
        docNo.append(node.text)
    return docNo

def docDisease(location):
    docHead = []
    data = loadData(location)
    for node in data.iter("disease"):
        docHead.append(node.text)
    return docHead
    
def docSymptom(location):
    docText = []
    data = loadData(location)
    for node in data.iter("symptom"):
        docText.append(node.text)
    return docText
    
def removePunctuation(textList):
    for i in range(len(textList)):
        for punct in string.punctuation:
            textList[i] = textList[i].replace(punct, " ")
        textList[i] = re.sub(r'^https?:\/\/.*[\r\n]*', '', textList[i], flags=re.MULTILINE)
    return textList

def caseFolding(textList):
    text = []
    for i in range(len(textList)):
        text.append(textList[i].lower())
    return text

def token(sentence):
    token = []
    for word in CountVectorizer().build_tokenizer()(sentence):
        token.append(word)
    return token

def tokenize(textList):
    tokens = []
    for i in range(len(textList)):
        tokens.append(token(textList[i]))
    return tokens

def checkStopword(sentence, stop_words):
    sentence = [w for w in sentence if not w in stop_words]
    return sentence
    
def stopwordRemove(textList):
    stop_words = set(stopwords.words('english'))
    text = []
    for i in range(len(textList)):
        text.append(checkStopword(textList[i], stop_words))
    return text

def numberRemove(textList):
    text = []
    for i in range(len(textList)):
        text.append([w for w in textList[i] if not any(j.isdigit() for j in w)])
    return text

def stemming(textList):
    stemmer = PorterStemmer()
    text = textList
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            text[i][j] = stemmer.stem(text[i][j])
    return text

def sorting(textList):
    for i in range(len(textList)):
        textList[i] = sorted(textList[i])
    return textList

def getAllTerms(textList):
    terms = []
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            terms.append(textList[i][j])
    return sorted(set(terms))

def create_Inverted_index(all_unique_documents):
    inverted_index = {}
    for doc_id in range(len(all_unique_documents)):
        for term in all_unique_documents[doc_id]:
            if term not in inverted_index:
                inverted_index[term] = []
            inverted_index[term].append(doc_id)

In [3]:
location = 'data.xml'

In [4]:

# LOAD DATA

documentNumber   = docNumber(location)
documentDisease = docDisease(location)
documentSymptom = docSymptom(location)
documentTotal    = len(documentNumber)
text             = []


for i in range(documentTotal):
    text.append(documentHeadline[i] + documentText[i])

In [5]:
docNumber

<function __main__.docNumber(location)>

In [6]:
documentNumber  = docNumber(location)
documentDisease = docDisease(location)
documentSymptom = docSymptom(location)
documentTotal   = len(documentNumber)
text            = []

for i in range(documentTotal):
    text.append(documentHeadline[i] + documentText[i])

# PREPROCESSING
text = removePunctuation(text)
text = caseFolding(text)
text = tokenize(text)
text = stopwordRemove(text)
text = numberRemove(text)
text = stemming(text)



In [7]:
# GET ALL TERMS IN COLLECTION

terms = getAllTerms(text)



In [12]:
print(terms)

[]


In [8]:
# INDEXING

# index = createIndex(text,documentNumber, terms)
index = create_Inverted_index(text)

In [10]:
print(index)


None
