## Semantic Search using Topic Modelling
### [CISI Dataset](https://www.kaggle.com/datasets/dmaso01dsta/cisi-a-dataset-for-information-retrieval)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import collections
#Libraries for NLP
import re #for regex removal
import nltk
nltk.download('punkt') #for tokenization
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords #for removing stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
from nltk.stem.wordnet import WordNetLemmatizer #for lemmatization
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from functools import reduce #concatenate words into sentence using reduce
#Libraries for LDA
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.models.ldamodel import LdaModel
from gensim.similarities import MatrixSimilarity
#Libraries for cosine similarity
import numpy as np
from numpy.linalg import norm
#Libraries for visualization
import seaborn as sns
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### 1.Load Dataset Files  <a id="load_data">

In [None]:
def load_data(path):
    import os

    #_____________ Read data from CISI.ALL file and store in dictinary ________________

    with open(os.path.join(path, 'CISI.ALL')) as f:
        lines = ""
        for l in f.readlines():
            # add new line char for lines starting with (.) and add space for other lines
            lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip()
        #split a string into list based on newline(\n) separator.
        lines = lines.lstrip("\n").split("\n")

    # Put each DOCUMENT into a dictionary doc_set
    doc_set = {}
    doc_id = ""
    doc_text = ""

    for l in lines:
        if l.startswith(".I"):
            doc_id = l.split(" ")[1].strip() # l='.I 1' -> doc_id = 1
        elif l.startswith(".X"):
            # indicate document ending
            doc_set[doc_id] = doc_text.lstrip(" ")
            doc_id = ""
            doc_text = ""
        else:
            # ignore first 3 characters of a line. l='.T 18 Editions..' -> doc_text = 19 Editions..
            doc_text += l.strip()[3:] + " "

    print(f"Number of documents = {len(doc_set)}")
    print(doc_set["1"]) # note that the dictionary indexes are strings, not numbers.


    #_____________ Read data from CISI.QRY file and store in dictinary ________________

    with open(os.path.join(path, 'CISI.QRY')) as f:
        lines = ""
        for l in f.readlines():
            lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip()
        lines = lines.lstrip("\n").split("\n")

    # Put each QUERY into a dictionary doc_set.
    qry_set = {}
    qry_id = ""
    for l in lines:
        if l.startswith(".I"):
            qry_id = l.split(" ")[1].strip() # l='.I 1' -> doc_id = 1
        elif l.startswith(".W"):
            # ignore first 3 characters of a line. l='.T 18 Editions..' -> doc_text = 19 Editions..
            qry_set[qry_id] = l.strip()[3:]
            qry_id = ""

    print(f"\n\nNumber of queries = {len(qry_set)}")
    print(qry_set["1"]) # note that the dictionary indexes are strings, not numbers.


    #_____________ Read data from CISI.REL file and store in dictinary ________________

    rel_set = {}
    with open(os.path.join(path, 'CISI.REL')) as f:
        for l in f.readlines():
            qry_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0] #split("\t")[0] -> '1     28'
            doc_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1] #                 [0]   [-1]

            if qry_id in rel_set:
                rel_set[qry_id].append(doc_id)
            else:
                rel_set[qry_id] = []
                rel_set[qry_id].append(doc_id)

    print(f"\n\nNumber of mappings = {len(rel_set)}")
    print(rel_set["1"]) # note that the dictionary indexes are strings, not numbers.

    return doc_set, qry_set, rel_set

In [None]:
# returns dictionary with key->doc_id/query_id and value->data
doc_set, qry_set, rel_set = load_data('/content/drive/MyDrive/Internship Material/Wolters Kluwer/data/Information Retrieval')

Number of documents = 1460
18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad. 


Number of queries = 112
What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?


Number of mappings = 76
['28', '35', '38', '42', '43', '52', '65', '76', '86', '150', '189', '192', '193', '1

#### 1.1 Length of documents and Queries

In [None]:
def get_length(doc_set, qry_set):
    # Length of documents
    min_d = 9999
    max_d = 0
    for i in doc_set:
        min_d = min(min_d, len(doc_set[i]))
        max_d = max(max_d, len(doc_set[i]))
    print(f"Doc length\n no. of characters min: {min_d}, max: {max_d}")

    # Length of queries
    min_q = 9999
    max_q = 0
    for i in qry_set:
        min_d = min(min_q, len(qry_set[i]))
        max_d = max(max_q, len(qry_set[i]))
    print(f"\nQuery length\n no. of characters min: {min_d}, max: {max_d}")

In [None]:
get_length(doc_set, qry_set)

Doc length
 no. of characters min: 99, max: 3903

Query length
 no. of characters min: 432, max: 432


#### 1.2. LDA Model hyperparameter configurations

In [None]:
config = {'use_noun_only': False, 'num_topics': 15, 'passes': 10, 'iterations': 50,
          'chunksize': 50, 'stem_lemma': 1}

### 2.Data Wrangling <a id="preprocess_data">

#### 2.1.Preprocess Documents and Queries Data


In [None]:
#Customize stopwords list
for i in ["not", "no"]:
    stop_words.remove(i)
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess(data_dict):
    docs = {}

    for key in data_dict:

        line = data_dict[key]
        #1. Removing upper brackets to keep words like [user\'s]
        line = line.replace("\'", "")

        #2. Removing symbols and numbers
        line = re.sub(r"[^a-zA-Z]+", ' ', line).strip()

        #3. Convert string to tokens, lowercase it & remove stop words
        tokens = word_tokenize(line)  #to convert a string to tokens
        tokens = [w.lower() for w in tokens if len(w)>1] #convert to lowercase
        tokens = [w for w in tokens if w not in stop_words] #remove stop words

        #4. Convert word to its base form
        lemmatized_tokens = []
        for word, tag in pos_tag(tokens):
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'
            lemmatized_tokens.append(lemmatizer.lemmatize(word, pos))

        docs[key] = lemmatized_tokens

    return docs

In [None]:
# preprocessing documents
preprocessed_doc_dict = preprocess(doc_set)
print(preprocessed_doc_dict['1'])

['edition', 'dewey', 'decimal', 'classification', 'comaromi', 'present', 'study', 'history', 'dewey', 'decimal', 'classification', 'first', 'edition', 'ddc', 'publish', 'eighteenth', 'edition', 'future', 'edition', 'continue', 'appear', 'need', 'spite', 'ddc', 'long', 'healthy', 'life', 'however', 'full', 'story', 'never', 'tell', 'biography', 'dewey', 'briefly', 'describe', 'system', 'first', 'attempt', 'provide', 'detailed', 'history', 'work', 'spur', 'growth', 'librarianship', 'country', 'abroad']


In [None]:
# preprocessing queries
preprocessed_qry_dict = preprocess(qry_set)
print(preprocessed_qry_dict['1'])

['problem', 'concern', 'make', 'descriptive', 'title', 'difficulty', 'involve', 'automatically', 'retrieve', 'article', 'approximate', 'title', 'usual', 'relevance', 'content', 'article', 'title']


### 3.Data Modelling <a id='train_nn_system'>

#### 3.1. Training LDA Model

In [None]:
# Create dictionary of unique words in corpus
dictionary = Dictionary(preprocessed_doc_dict.values())
dictionary.filter_extremes(no_below=5, no_above=0.5)
print(dictionary.token2id)

{'abroad': 0, 'appear': 1, 'attempt': 2, 'briefly': 3, 'classification': 4, 'continue': 5, 'country': 6, 'ddc': 7, 'decimal': 8, 'describe': 9, 'detailed': 10, 'dewey': 11, 'edition': 12, 'first': 13, 'full': 14, 'future': 15, 'growth': 16, 'history': 17, 'however': 18, 'librarianship': 19, 'life': 20, 'long': 21, 'need': 22, 'never': 23, 'present': 24, 'provide': 25, 'publish': 26, 'spite': 27, 'study': 28, 'system': 29, 'tell': 30, 'work': 31, 'account': 32, 'act': 33, 'also': 34, 'analysis': 35, 'aspect': 36, 'channel': 37, 'colleague': 38, 'collection': 39, 'contact': 40, 'current': 41, 'document': 42, 'doubt': 43, 'even': 44, 'information': 45, 'kingdom': 46, 'less': 47, 'library': 48, 'major': 49, 'make': 50, 'no': 51, 'not': 52, 'one': 53, 'organization': 54, 'outside': 55, 'particularly': 56, 'pattern': 57, 'people': 58, 'periodical': 59, 'person': 60, 'personal': 61, 'proportion': 62, 'rarely': 63, 'rather': 64, 'receive': 65, 'regular': 66, 'rely': 67, 'report': 68, 'restrict

In [None]:
train_corpus_doc2bow = [dictionary.doc2bow(text) for text in preprocessed_doc_dict.values()]
print(train_corpus_doc2bow[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 2), (8, 2), (9, 1), (10, 1), (11, 3), (12, 4), (13, 2), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)]


In [None]:
lda_model = LdaModel(train_corpus_doc2bow,
                     num_topics = config['num_topics'],
                     id2word=dictionary,
                     passes = config['passes'],
                     iterations=config['iterations'],
                     chunksize = config['chunksize'],
                     random_state = 0)

In [None]:
index = MatrixSimilarity(lda_model[train_corpus_doc2bow])



In [None]:
topics = lda_model.show_topics()
topics

[(13,
  '0.047*"serial" + 0.042*"view" + 0.036*"survey" + 0.028*"health" + 0.027*"acquisition" + 0.026*"conference" + 0.024*"use" + 0.024*"medical" + 0.023*"ever" + 0.023*"group"'),
 (5,
  '0.023*"work" + 0.022*"not" + 0.019*"one" + 0.016*"new" + 0.015*"science" + 0.014*"world" + 0.014*"book" + 0.013*"make" + 0.013*"problem" + 0.013*"research"'),
 (8,
  '0.041*"rule" + 0.032*"country" + 0.028*"language" + 0.027*"theory" + 0.024*"show" + 0.020*"rate" + 0.017*"world" + 0.017*"linguistic" + 0.017*"propose" + 0.017*"within"'),
 (12,
  '0.028*"research" + 0.026*"study" + 0.025*"title" + 0.022*"program" + 0.016*"percent" + 0.015*"policy" + 0.015*"collection" + 0.013*"university" + 0.013*"user" + 0.012*"large"'),
 (10,
  '0.047*"scientific" + 0.035*"journal" + 0.030*"literature" + 0.023*"number" + 0.022*"year" + 0.019*"citation" + 0.019*"article" + 0.018*"publication" + 0.017*"subject" + 0.017*"publish"'),
 (6,
  '0.050*"line" + 0.034*"international" + 0.032*"catalogue" + 0.026*"formula" + 0.

In [None]:
test_corpus_doc2bow = [dictionary.doc2bow(doc) for doc in preprocessed_qry_dict.values()]
print(test_corpus_doc2bow[1])

[(45, 1), (352, 1), (386, 1), (412, 1), (437, 1), (555, 1), (951, 1), (1133, 1), (1205, 1), (1241, 1), (1808, 1)]


In [None]:
vec_lda = lda_model[test_corpus_doc2bow]
sims = index[vec_lda]

In [None]:
sims.shape

(112, 1460)

#### 3.2. Sort documents as per similarity for each query

In [None]:
def get_sorted_docid_per_query(doc_scores, doc_set):
    docid_sortedBy_score = {} #key-> query_id, value-> list of doc_id in descending order of doc scores

    for qid, query_similarity_score in enumerate(sims):
        # dictionary, key->doc_id and value->doc score
        doc_score_dict = {}
        for score, doc_id in zip(query_similarity_score, doc_set):
          doc_score_dict[doc_id] = round(score, 4)

        # sort doc_id based on scores, high score first
        # sorted(..,key=function) function transforms each element before sorting, it takes the value and returns 1 value which is then used within sort instead of the original value.
        docid_sortedBy_score[str(qid)] = sorted(doc_score_dict, key=doc_score_dict.get, reverse=True)
        #print(f"query_id: {query_id}")

    return docid_sortedBy_score

In [None]:
doc_scores1 = get_sorted_docid_per_query(sims, doc_set)
print(doc_scores1['1'])

['1366', '507', '512', '809', '705', '734', '1396', '980', '698', '696', '970', '495', '648', '560', '80', '1362', '637', '799', '529', '677', '190', '1013', '732', '1160', '672', '645', '721', '1109', '450', '579', '150', '1089', '1368', '467', '1375', '692', '1364', '360', '703', '875', '725', '465', '642', '517', '148', '724', '914', '842', '153', '690', '728', '730', '687', '1058', '520', '1012', '1114', '670', '506', '695', '421', '1104', '508', '1207', '474', '1377', '180', '805', '490', '1078', '1367', '870', '83', '897', '321', '993', '330', '1197', '1059', '54', '175', '674', '511', '76', '454', '1121', '737', '1096', '1248', '158', '871', '700', '1100', '341', '790', '738', '1295', '1356', '1431', '145', '583', '866', '706', '604', '129', '701', '252', '116', '1415', '872', '411', '318', '731', '127', '1359', '704', '378', '850', '1193', '347', '1312', '482', '717', '830', '611', '673', '179', '1353', '682', '66', '429', '671', '890', '77', '1080', '502', '135', '733', '601',

### 4.Performance Metrics <a id='performance'>


#### 4.1 Recall@K [order-unaware]

In [None]:
# Recall@K = TP/(TP+FN)
def recall_k(ground_truth, predictions, k):
  avg_recall = 0
  for query_id in ground_truth:
    truth_set = set(ground_truth[query_id]) #set doesn't maintain order of id's
    pred_set = set(predictions[query_id][:k])
    result = round(len(truth_set & pred_set) / float(len(truth_set)), 2) # len(truth_set & pred_set)-> len(intersection of 2 sets)
    avg_recall += result
    #print(f"act_set: {len(truth_set)}, pred_set: {len(pred_set)}, &: {len(truth_set & pred_set)}")
  avg_recall /= len(ground_truth)

  return round(avg_recall, 3)

In [None]:
print(f"Recall using LDA\n")
print(f"Recall@5 = {recall_k(rel_set, doc_scores1, 5)}") #Top-5 results
print(f"Recall@10 = {recall_k(rel_set, doc_scores1, 10)}") #Top-10 results

Recall using LDA

Recall@5 = 0.007
Recall@10 = 0.012


#### 4.2 Precision@K [order-unaware]

In [None]:
# Precision@K = TP/(TP+FP)
def precision_k(ground_truth, predictions, k):
  avg_precision = 0
  for query_id in ground_truth:
    truth_set = set(ground_truth[query_id]) #set doesn't maintain order of id's
    pred_set = set(predictions[query_id][:k])
    result = round(len(truth_set & pred_set) / float(len(pred_set)), 2) # len(truth_set & pred_set)-> len(intersection of 2 sets)
    avg_precision += result
    #print(f"act_set: {len(truth_set)}, pred_set: {len(pred_set)}, &: {len(truth_set & pred_set)}")
  avg_precision /= len(ground_truth)

  return round(avg_precision, 3)

In [None]:
print(f"Precision using LDA\n")
print(f"Precision@5 = {precision_k(rel_set, doc_scores1, 5)}") #Top-5 results
print(f"Precision@10 = {precision_k(rel_set, doc_scores1, 10)}") #Top-10 results

Precision using LDA

Precision@5 = 0.05
Precision@10 = 0.046


#### 4.3 Mean Reciprocal Rank (MRR) [order-aware]

In [None]:
# first occurance of true positive i.e. index of first actual relevant docid predicted.
def get_first_relevent_docid(predictions, truth):
    for doc_id in predictions:
        is_exist = doc_id in truth #truth.count(doc_id)
        #print(f"predictions: {doc_id}, is_exist: {is_exist}")
        if is_exist:
            return predictions.index(doc_id)+1
    else:
        return -1

In [None]:
def mrr(doc_scores, rel_set):
    Q = len(rel_set) # number of queries with known ground-truth
    cumulative_reciprocal = 0  # summation of reciprocals of the first actual relevant ranks

    for query_id in rel_set:
        # index of first actual relevant docid predicted
        first_result = get_first_relevent_docid(doc_scores[query_id], rel_set[query_id])
        # check if function returned index or -1 (for -1 use no. of docs+1 = 1461)
        first_result_rank = len(doc_scores['1'])+1 if first_result<1 else first_result
        reciprocal = 1 / first_result_rank
        cumulative_reciprocal += reciprocal
        #print(f"query #{query_id} = 1/{first_result} = {round(reciprocal,2)}")

    mrr = 1/Q * cumulative_reciprocal # mean of summation of cumulative_reciprocal
    return round(mrr,3)

In [None]:
mrr = mrr(doc_scores1, rel_set)
print(f"Mean Reciprocal Rank (MRR) using LDA: {mrr}")

Mean Reciprocal Rank (MRR) using LDA: 0.121


#### 4.4 Mean Average Precision (MAP) [order-aware]

In [None]:
def map_k(rel_set, doc_scores, K):
    Q = len(rel_set) # number of queries with known ground-truth
    avg_precision = [] #AP of all queries

    for query_id in rel_set:
        precision_relevance_summation = 0

        for k in range(0,K):
            # calculate precision@k
            truth_set = set(rel_set[query_id])
            pred_set = set(doc_scores[query_id][:k+1])
            precision_at_k = round(len(truth_set & pred_set) / float(len(pred_set)), 2) # len(truth_set & pred_set)-> len(intersection of 2 sets)
            # check relevance of predicted docid at k
            rel_k = 1 if doc_scores[query_id][k] in rel_set[query_id] else 0
            precision_relevance_summation += precision_at_k * rel_k # summation of precision*relevance
            #print(f"qid: {query_id}, k: {k}, precision_at_k : {precision_at_k}, rel_k : {rel_k}, ap_num: {ap_num}, docid_sortedBy_score[query_id][k]: {docid_sortedBy_score[query_id][k]}")

        # AP value of query qid
        avg_precision_q = precision_relevance_summation / len(rel_set[query_id])
        #print(f"len(rel_set[q]): {len(rel_set[query_id])}, AP@{K}_{int(query_id)} = {round(avg_precision_q,2)}")
        avg_precision.append(avg_precision_q)

    map_k = sum(avg_precision) / Q # mean of all AP values of a query
    return round(map_k, 3)

In [None]:
print(f"MAP using LDA")
map_5 = map_k(rel_set, doc_scores1, K=5)
print(f"MAP@5 = {map_5}")
map_10 = map_k(rel_set, doc_scores1, K=10)
print(f"MAP@10 = {map_10}")

MAP using LDA
MAP@5 = 0.004
MAP@10 = 0.006


### 4.6 Composite Score

In [None]:
# first occurance of true positive i.e. index of first actual relevant docid predicted.
def get_first_relevent_docid(predictions, truth):
    for doc_id in predictions:
        is_exist = doc_id in truth #truth.count(doc_id)
        #print(f"predictions: {doc_id}, is_exist: {is_exist}")
        if is_exist:
            return predictions.index(doc_id)+1
    else:
        return -1

In [None]:
def get_composite_score(predictions, truth):
    total_scores = []
    succeed = 0
    failed = 0
    for query_id in rel_set:
        # index of first actual relevant docid predicted
        score = get_first_relevent_docid(predictions[query_id], truth[query_id])
        if score >= 0:
            total_scores.append(score)
            succeed += 1
        else:
            failed += 1

    return 100*failed + (sum(total_scores)/succeed)

In [None]:
composite_score = get_composite_score(doc_scores1, rel_set)
print(f"composite score: {composite_score}")
wandb.log({"composite_score": composite_score})

#### LDA Model performs worst as compared to BM-25 and Transformer based approaches