# ICS2205 Information Retrieval Task

## Libraries

In [1]:
import ssl
import re
import os
import nltk
import math
import random
from collections import Counter 
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


In [5]:
nltk.download('punkt')
nltk.download ("stopwords");
nltk.download ("wordnet");

[nltk_data] Downloading package punkt to /Users/sara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/sara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
stopwords = nltk.corpus.stopwords.words ("english")
stemmer = nltk.stem.PorterStemmer()

## Functions

### XML File Parser:
This function parses xml files (document files and queries) and extracts the text in the raw tag using regular expression

In [7]:
def parser(file_path):
    # Open XML file
    xml_file = open(file_path)
    # Read file content
    xml_text = xml_file.read()
    # regex that matches all text preceded by '<raw><!\[CDATA\[' and followed by '\]\]></raw>' and including new lines to get the file content
    pattern = '(?<=<raw><!\[CDATA\[).*(?=\]\]></raw>)'
    match = re.search(pattern, xml_text, re.DOTALL)
    # regex that matches all text preceded by '<fileDesc\ title=\"' and followed by '\"\ \/>' and including new lines to get the file title
    title_pattern = '(?<=<fileDesc\ title=\").*?(?=\" \/>)'
    title = re.search(title_pattern, xml_text, re.DOTALL)
    if match:
        return (match.group(0), title.group(0)) if title else (match.group(0), None)
    else:
        return "Tag not found"

### Preprocessing:
This function takes a list of tokens and performs case-folding, stopwords filtering, and stemming

In [8]:
def preprocess(text): 
    lowered = [token.lower() for token in text if token.isalpha()]
    sw_filtered = [token for token in lowered if token not in stopwords]
    stemmed = [stemmer.stem(token) for token in sw_filtered]
    return lowered, sw_filtered, stemmed
    

### Term Frequency:
This function takes a document and returns a dictionary of terms and their normalized frequencies in the document

In [9]:
def doc_term_freq(doc):
    terms_freq = Counter(doc)
    for i in terms_freq:
        terms_freq[i] = terms_freq[i] / len(doc)
    max_freq = max(terms_freq.values()) # get the maximum term frequency to normalize the term frequency
    for i in terms_freq:
        terms_freq[i] = terms_freq[i] / max_freq # normalize term frequency
    return terms_freq

### Term IDF:
This function takes a term and a list of documents and returns the IDF value of the term

In [10]:
def term_idf(term, docs):
    docs_len = len(docs)
    doc_freq = sum([1 for doc in docs if term in doc])
    return math.log(docs_len / doc_freq)

### Cosine Similarity:
This function takes a document vector and a query vector and returns the cosine similarity between them

In [11]:
def cos_sim(doc_vec, query_vec):
    # Calculate the dot product of the doc vector and the query vector
    dot_prod = sum([term1 * term2 for term1, term2 in zip(doc_vec, query_vec)])
    # Calculate the magnitude of the doc vector and the query vector
    mag_doc = math.sqrt(sum([term ** 2 for term in doc_vec]))
    mag_query = math.sqrt(sum([term ** 2 for term in query_vec]))
    
    # Calculate cosine similarity
    ## Check for division by zero
    if mag_doc * mag_query == 0:
        return 0
    else:
        return dot_prod / (mag_doc * mag_query)

## Document Indexing

### Load documents dataset

In [12]:
doc_dataset_dir = 'WES-Dataset/docs/'
doc_files = os.listdir(doc_dataset_dir)

### Parse documents and store their contents and titles in a dictionary

In [13]:
doc_dict = dict()
for doc in doc_files:
    if not doc.startswith('.'):
        xml_parsed, xml_title = parser(f'{doc_dataset_dir}{doc}')
        doc_dict[doc]=[xml_parsed, xml_title]

### Preprocess documents: tokenisation, case-folding, Stopwords filtering, and stemming

In [14]:
doc_dict_tokenised = {doc:nltk.tokenize.word_tokenize(text[0]) for doc,text in doc_dict.items()}

In [15]:
doc_dict_lowered = dict()
doc_dict_filtered = dict()
doc_dict_stemmed = dict()
for doc in doc_dict_tokenised.keys():
    lowered, sw_filtered, stemmed = preprocess(doc_dict_tokenised[doc])
    doc_dict_lowered[doc] = lowered
    doc_dict_filtered[doc] = sw_filtered
    doc_dict_stemmed[doc] = stemmed

### Creating the TF-IDF matrix

In [16]:
docs = doc_dict_stemmed.values()
term_set = set(term for doc in docs for term in doc)

# Initialize TF-IDF matrix with zeros
tfidf_matrix = pd.DataFrame(0, index=range(len(docs)), columns=list(term_set))

# Compute IDF for each term
idf_values = {}
for term in term_set:
    idf_values[term] = term_idf(term, docs)

# Compute TF and then TF-IDF for each term in each document
for doc_idx, document in enumerate(docs):
    tf_values = doc_term_freq(document)
    for term in term_set:
        tf = tf_values[term]
        tfidf_matrix.loc[doc_idx, term] = tf * idf_values[term]


In [17]:
tfidf_matrix

Unnamed: 0,cyclop,herpetologist,cheyenn,hamilton,bute,hornblow,eighteenth,cimbri,australia,rudi,...,kill,heinz,orteliu,byssh,hebrid,dodonaea,universel,saxoni,trampl,multitask
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Querying

### Load queries dataset

In [18]:
query_dataset_dir = 'WES-Dataset/queries/'
queries_files = os.listdir(query_dataset_dir) # load list of query files names

### Select a random query from the queries dataset

In [19]:
idx = random.randint(0,len(queries_files)-1) # Generate a random number from the range [0, # of queries files] 
query = queries_files[idx] # Load the query

### Preprocess the query: parsing, tokenisation, case-folding, Stopwords filtering, and stemming

In [20]:
query_parsed = parser(f'{query_dataset_dir}{query}') # Parsing
query_tokenised = nltk.tokenize.word_tokenize(query_parsed[0]) # Tokenising
query_lowered, query_sw_filtered, query_stemmed = preprocess(query_tokenised) # Case-folding, Stopwords filtering, and stemming

In [21]:
print(f'Selected query: {query}')
print(f'Query after parsing: {query_parsed[0]}')
print(f'Query after tokenisation: {query_tokenised}')
print(f'Query after case-folding: {query_lowered}')
print(f'Query after stopwords filtering: {query_sw_filtered}')
print(f'Query after stemming: {query_stemmed}')

Selected query: wes2015.q02.naf
Query after parsing: famous German poetry
Query after tokenisation: ['famous', 'German', 'poetry']
Query after case-folding: ['famous', 'german', 'poetry']
Query after stopwords filtering: ['famous', 'german', 'poetry']
Query after stemming: ['famou', 'german', 'poetri']


### Creating the query TF-IDF vector

In [22]:
# getting the query term frequencies
query_tf = doc_term_freq(query_stemmed) 
# creating a dictionary of terms - tfidf values using docs idf values
query_tfidf = {term: query_tf.get(term, 0) * idf_values.get(term, 0) for term in idf_values.keys()} 
# getting the tf-idf values in a vector
query_vec = query_tfidf.values()

### Calculating the similarity between the query and the documents

In [23]:
sim_matrix = dict() # doc - query similarity dictionary
for i in range(len(docs)):
    sim_matrix[i] = cos_sim(tfidf_matrix.loc[i], query_vec) # storing cos similarity scores in the dictionary

### Results

In [27]:
# sorting the dictionary based on the sim score values
sim_matrix_sorted = dict(sorted(sim_matrix.items(), key=lambda x:x[1], reverse=True))
# printing results
for doc, sim_score in sim_matrix_sorted.items():
    if doc_files[doc].startswith('.'):
        continue
    else:
        print(f'File index: {doc}, File name: {doc_files[doc]}, Title: {doc_dict[doc_files[doc]][1]}, Similarit score: {sim_score}')

File index: 287, File name: wes2015.d153.naf, Title: Niccoló Paganini – the Devil’s Violinist, Similarit score: 0.12519047646188491
File index: 247, File name: wes2015.d283.naf, Title: Georg Friedrich Philipp von Hardenberg aka Novalis, Similarit score: 0.07911908341142271
File index: 55, File name: wes2015.d149.naf, Title: The Poetry of Walt Whitman, Similarit score: 0.06417806044588133
File index: 195, File name: wes2015.d291.naf, Title: August Wilhelm Schlegel and his Shakespeare Translations, Similarit score: 0.05282726698187367
File index: 214, File name: wes2015.d134.naf, Title: Niemand hat die Absicht eine Mauer zu bauen!, Similarit score: 0.04830334140833313
File index: 157, File name: wes2015.d293.naf, Title: The Works of Heinrich Mann, Similarit score: 0.04099612191871252
File index: 294, File name: wes2015.d037.naf, Title: Albrecht von Haller – Father of Modern Physiology, Similarit score: 0.03892203667084831
File index: 243, File name: wes2015.d136.naf, Title: William Butle