In [1]:
import os
import xml.etree.ElementTree as ET 
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords 
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn
import operator
import itertools

## Extracting Query Text from XML

In [2]:
def get_queries_from_XML(xmlfile): 
    queries = []
    query_num = []
    tree = ET.parse(xmlfile) 
    root = tree.getroot() 
    for child in root:
        for subchild in child:
            if subchild.tag == 'desc':
                queries.append(subchild.text)
            elif subchild.tag == 'num':
                query_num.append(subchild.text)
    return queries, query_num

In [3]:
queries, query_num = get_queries_from_XML("en.topics.76-125.2010.txt")

## Common Text Pre-processing

In [4]:
def preprocessing(text: list):
    tokenizer = nltk.RegexpTokenizer(r'[a-zA-Z]+')
    for i in range(len(text)):
        text[i] = tokenizer.tokenize(text[i])
    
    stop_words = set(stopwords.words('english')) 
    lemmatizer = WordNetLemmatizer()
    
    for i in range(len(text)):
        for j in range(len(text[i])):
            text[i][j] = text[i][j].lower()
            if text[i][j] in stop_words:
                text[i][j] = ""
            else:
                text[i][j] = lemmatizer.lemmatize(text[i][j]) 
    for i in range(len(text)):
        text[i] = " ".join(text[i])
    
    return text

In [5]:
query_text = preprocessing(queries)

In [6]:
query_text[:5]

['reason behind  protest  meena leader   inclusion  gurjars   scheduled tribe',
 'attack  hezbollah guerrilla  indian  israeli force',
 'conflict  ashok singhal  president  vishwa hindu parishad  l k advani  bjp leader   ram mandir issue',
 'plan  build  road  china  mount everest',
 'initiation  legal proceeding  advani   involvement   demolition  babri masjid']

In [7]:
print(len(query_num), len(query_text))

50 50


## Accessing all files of the Dataset

In [8]:
base = 'TELEGRAPH_UTF8'
all_files_path = []
for folder in os.listdir(base):
    for subfolder in os.listdir(base+"/"+folder):
        for file in os.listdir(base+"/"+folder+"/"+subfolder):
            path = base+"/"+folder+"/"+subfolder+"/"+file
            if "~" not in path:
                all_files_path.append(path)
print(len(all_files_path))

125585


## Extracting Document Text from XML

In [9]:
def get_docs_from_XML(xmlstring): 
    tree = ET.parse(xmlstring) 
    root = tree.getroot() 
    doc_text = ""
    doc_num = ""
    for child in root:
            if child.tag == 'TEXT':
                doc_text = child.text
            elif child.tag == 'DOCNO':
                doc_num= child.text
    return doc_text, doc_num

### Now onwards only experimenting on 100 documents from the dataset

In [10]:
all_doc_text = []
all_doc_num = []
for i in range(100):
    try:
        t, n = get_docs_from_XML(all_files_path[i])
        all_doc_text.append(t)
        all_doc_num.append(n)
    except:
        continue

In [21]:
print(len(all_doc_num), len(all_doc_text))
print(all_doc_num[:5])

100 100
['1041207_atleisure_index.utf8', '1041208_atleisure_index.utf8', '1041210_atleisure_index.utf8', '1041211_atleisure_index.utf8', '1041213_atleisure_index.utf8']


In [12]:
all_doc_text[:5]

['\n\n\nThe Telegraph - Calcutta : At Leisure\n\n Tuesday, December 07, 2004 \n\n For Leonardo DiCaprio, it was one of those moments when he didn?t know whether to laugh or cry. There he was, in a South American rainforest, studying the effects of mercury poisoning in the Amazon, when he was confronted by a group of naked Ind... \n\n Six Oscars and big profits for Chicago have sparked a new frenzy for filming musicals, with Andrew Lloyd Webber?s ... \n\n Couples who live together in Britain without getting married were warned today that they risk losing their homes and pos ... \n\n It may come down with a crunch ? or a squelch. Its creators are hoping for a splash. ... \n\n French fashion legend Pierre Cardin is putting much of his empire up for sale, seeking $1 billion for his couture and licens ... \n\nTHE FAMILY\n\nAstonishingly, even today, many parents \n\nbelieve in bringing up their daughters in the security of a gilded cage, insulated \n\nMy first cousin, 27 years old, wants \n

## Common Text Pre--processing for document text

In [13]:
processed_doc_text = preprocessing(all_doc_text[:100])

## Vectorization

In [15]:
def convertTextToVectorization(data):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(data)
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    retX = pd.DataFrame(denselist, columns=feature_names)
    return retX

In [18]:
vector = convertTextToVectorization(processed_doc_text + query_text)

In [19]:
vector

Unnamed: 0,ab,abadanga,abandoned,abdomen,abdominal,abdul,abetting,abhay,abhijit,ability,...,zahrili,zealand,zeppelin,zero,zilla,zoe,zonal,zone,zoo,zookeepers
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Output: Relevant documents using cosine similarity

In [28]:
numpy_array_of_vector = vector.to_numpy()
print("Top 5 similar documents for each query: document_num : cosine similarity score \n")
for i in range(100,150):
    similarity = {}
    for j in range(100):
        x = sklearn.metrics.pairwise.cosine_similarity([numpy_array_of_vector[i]], [numpy_array_of_vector[j]])
#         tup = (j, x[0][0])
        similarity[all_doc_num[j]] = x[0][0]
#     similarity.sort(reverse=True)
    sorted_d = dict(sorted(similarity.items(), key=operator.itemgetter(1),reverse=True))
    print("\nfor query ",query_num[i-100]," : ")
    d = dict(itertools.islice(sorted_d.items(), 5))
    for k in d.keys():
        print(k, end=" ")
    print()

Top 5 similar documents for each query: document_num : cosine similarity score 


for query  76  : 
1040901_bengal_index.utf8 1040902_bengal_story_3707328.utf8 1040901_bengal_story_3702361.utf8 1040901_bengal_story_3702363.utf8 1041227_atleisure_story_4179435.utf8 

for query  77  : 
1040902_bengal_story_3707327.utf8 1040902_bengal_index.utf8 1041224_atleisure_index.utf8 1041215_atleisure_story_4127714.utf8 1041214_atleisure_story_4125739.utf8 

for query  78  : 
1040901_bengal_index.utf8 1040901_bengal_story_3702361.utf8 1040901_bengal_story_3702363.utf8 1040902_bengal_story_3707852.utf8 1040902_bengal_story_3707328.utf8 

for query  79  : 
1040902_bengal_story_3707328.utf8 1040901_bengal_story_3702359.utf8 1041211_atleisure_index.utf8 1041217_atleisure_story_4139383.utf8 1041213_atleisure_index.utf8 

for query  80  : 
1041229_atleisure_story_4188371.utf8 1041214_atleisure_story_4123870.utf8 1041227_atleisure_story_4179432.utf8 1041215_atleisure_story_4131410.utf8 1041222_atleisure_s


for query  119  : 
1041214_atleisure_story_4104626.utf8 1041228_atleisure_story_4174742.utf8 1041214_atleisure_story_4125739.utf8 1040902_bengal_story_3707328.utf8 1041213_atleisure_story_4122608.utf8 

for query  120  : 
1041222_atleisure_story_4159405.utf8 1041227_atleisure_story_4179432.utf8 1040902_bengal_story_3706193.utf8 1041215_atleisure_story_4131410.utf8 1041222_atleisure_story_4157661.utf8 

for query  121  : 
1041221_atleisure_index.utf8 1041227_atleisure_story_4179435.utf8 1041231_atleisure_story_4196058.utf8 1041221_atleisure_story_4155154.utf8 1041207_atleisure_index.utf8 

for query  122  : 
1041222_atleisure_story_4157661.utf8 1041231_atleisure_story_4181188.utf8 1041217_atleisure_story_4139384.utf8 1041229_atleisure_story_4167555.utf8 1041231_atleisure_index.utf8 

for query  123  : 
1040901_bengal_index.utf8 1040901_bengal_story_3702361.utf8 1040901_bengal_story_3702363.utf8 1040902_bengal_story_3707331.utf8 1041225_atleisure_story_4172862.utf8 

for query  124  : 
