<a href="https://colab.research.google.com/github/reeveboy/Simple-Information-Retrieval/blob/main/Simple_Information_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset files
# read later in this notebook are located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Fetch the NLTK data this notebook relies on: the tokenizer models used by
# `word_tokenize` and the English stop-word list.
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt',
# so both are requested. A resource id unknown to the installed NLTK version is
# expected to be reported by `download` (returning False) rather than raising.
import nltk
for resource in ('punkt', 'punkt_tab', 'stopwords'):
  nltk.download(resource, quiet=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize
import re

In [None]:
# Both Cranfield files live in the same Drive folder; edit DATA_DIR to relocate them.
DATA_DIR = "/content/drive/MyDrive/datasets/Mechanics of Search"

# Documents: rows without body text cannot be indexed, so they are dropped here.
df = pd.read_xml(f"{DATA_DIR}/cran.all.1400.xml").dropna(subset=['text'])

# Queries: the evaluation loop reads the `num` / `title` fields of each record.
queries = pd.read_xml(f"{DATA_DIR}/cran.qry.xml")

In [None]:
from nltk.corpus import stopwords as nltk_en_stopwords
from spacy.lang.en import stop_words as spacy_en_stopwords
# Stop-word vocabulary = NLTK's English list merged with spaCy's English list.
NLTK_EN = set(nltk_en_stopwords.words('english'))
SPACY_EN = spacy_en_stopwords.STOP_WORDS
stop_words = set().union(NLTK_EN, SPACY_EN)

In [None]:
def preprocess(doc_text):
  """Turn raw text into a list of stemmed, stop-word-free tokens.

  Pipeline: lower-case the text, replace every character that is not an ASCII
  letter, digit or whitespace with a space, tokenise with NLTK's
  `word_tokenize`, drop tokens found in the notebook-level `stop_words` set,
  then Porter-stem what is left.

  Args:
    doc_text: Raw document or query text. Any value that is not a `str`
      (for example a missing field) is treated as empty text.

  Returns:
    A list of stemmed tokens; empty when there is nothing to index.
  """
  # A missing field would otherwise fail on `.lower()`; treat it as empty text.
  if not isinstance(doc_text, str):
    return []

  cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', doc_text.lower())  # strip punctuation / symbols

  stemmer = nltk.stem.PorterStemmer()
  # Stop words are matched on the surface form, i.e. before stemming.
  return [stemmer.stem(token) for token in word_tokenize(cleaned) if token not in stop_words]

In [None]:
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
from collections import Counter

# Vector Space Model

In [None]:
# Show which fields the document frame exposes.
df.columns

Index(['docno', 'title', 'author', 'bib', 'text'], dtype='object')

In [None]:
# One string per document: "<title> <text>". `fillna('')` is used instead of
# `value or ''` because a missing value stored as NaN is truthy and would reach
# the string concatenation unchanged.
corpus = (df['title'].fillna('') + ' ' + df['text'].fillna('')).tolist()

# Token lists, row-aligned with `df` (position i here is row i of `df`).
processed_corpus = [preprocess(doc) for doc in corpus]

In [None]:
class VSM:
  """Vector space model: TF-IDF document vectors ranked by cosine similarity.

  Args:
    corpus: Optional list of token lists, one per document. Defaults to the
      notebook-level `processed_corpus`. Document ids are looked up by position
      in the notebook-level `df`, so a custom corpus must be row-aligned with it.
  """

  def __init__(self, corpus=None):
    tokenised_docs = processed_corpus if corpus is None else corpus
    # TfidfVectorizer expects strings, so the token lists are re-joined.
    self.corpus = [' '.join(doc) for doc in tokenised_docs]
    self.vectorizer = TfidfVectorizer()
    self.documents_vector = self.vectorizer.fit_transform(self.corpus)

  def rank(self, query, top_n=100):
    """Return the `top_n` best documents for `query`.

    Args:
      query: Raw query text; it goes through the same `preprocess` as the documents.
      top_n: Maximum number of results.

    Returns:
      A list of `(docno, cosine_similarity)` tuples, best first. Documents with
      zero similarity are included when fewer than `top_n` documents match.
    """
    q = ' '.join(preprocess(query))
    query_vector = self.vectorizer.transform([q])

    similarities = cosine_similarity(self.documents_vector, query_vector).flatten()
    top_indices = similarities.argsort()[::-1][:top_n]  # positions of the highest scores

    # Select the id column once instead of materialising a full row per result.
    doc_ids = df['docno']
    return [(doc_ids.iloc[i], similarities[i]) for i in top_indices]

# Fit the TF-IDF index once; this instance is reused for every query in the evaluation loop.
vsm_model = VSM()

# BM25 Model



In [None]:
class BM25:
  """Okapi BM25 ranker over a tokenised corpus.

  Args:
    k1: Term-frequency saturation parameter.
    b: Document-length normalisation strength.
    corpus: Optional list of token lists, one per document. Defaults to the
      notebook-level `processed_corpus`. `rank` looks document ids up by
      position in the notebook-level `df`, so a custom corpus must be
      row-aligned with it.

  Per-document term counts and lengths are cached at construction time, so the
  corpus must not be modified afterwards.
  """

  def __init__(self, k1=1.2, b=0.75, corpus=None):
    self.corpus = processed_corpus if corpus is None else corpus
    self.k1 = k1
    self.b = b
    doc_count = len(self.corpus)
    # An empty corpus gets avgdl = 0 instead of raising ZeroDivisionError.
    self.avgdl = sum(len(doc) for doc in self.corpus) / doc_count if doc_count else 0.0
    self.idf = self.calculate_idf()
    # Cached so that `rank` does not rebuild a Counter for every document on every query.
    self._term_freqs = [Counter(doc) for doc in self.corpus]
    self._doc_lens = [len(doc) for doc in self.corpus]

  def calculate_idf(self):
    """Return `{term: idf}` with idf = ln((N - n + 0.5) / (n + 0.5) + 1).

    N is the number of documents and n the number of documents containing the term.
    """
    doc_count = len(self.corpus)
    doc_freq = Counter()
    for doc in self.corpus:
      doc_freq.update(set(doc))  # count each term at most once per document

    return {
      word: math.log((doc_count - count + 0.5) / (count + 0.5) + 1)
      for word, count in doc_freq.items()
    }

  def _score(self, query, term_freqs, doc_len):
    """Score one document, given its term counts and length, against query tokens."""
    score = 0
    length_norm = None  # computed lazily: a document without matching terms never needs it
    for word in query:
      f = term_freqs.get(word)
      if not f:
        continue
      if length_norm is None:
        length_norm = self.k1 * (1 - self.b + self.b * (doc_len / self.avgdl))
      idf = self.idf.get(word, 0)  # terms unseen in the corpus contribute nothing
      score += idf * ((f * (self.k1 + 1)) / (f + length_norm))
    return score

  def get_bm25_score(self, query: list[str], doc: list[str]) -> float:
    """Return the BM25 score of the token list `doc` for the token list `query`.

    A term that occurs several times in `query` is counted each time.
    """
    return self._score(query, Counter(doc), len(doc))

  def rank(self, query: str, top_n: int = 100) -> list[tuple[str, float]]:
    """Return the `top_n` best documents for the raw `query` text.

    The query is run through the notebook-level `preprocess`; the result is a
    list of `(docno, score)` tuples, best first. Equal scores keep corpus order.
    """
    q = preprocess(query)

    scores = [
      (i, self._score(q, term_freqs, doc_len))
      for i, (term_freqs, doc_len) in enumerate(zip(self._term_freqs, self._doc_lens))
    ]
    scores.sort(key=lambda x: x[1], reverse=True)

    # Select the id column once instead of materialising a full row per result.
    doc_ids = df['docno']
    return [(doc_ids.iloc[idx], score) for idx, score in scores[:top_n]]

# Build the BM25 statistics (idf table, average length) once for all queries.
bm25_model = BM25()

# Relevance Model

In [None]:
class RelevanceModel:
  """Pseudo-relevance feedback on top of TF-IDF cosine ranking.

  A query is ranked once, the best-matching documents are treated as relevant,
  a term distribution is estimated from them and mixed with the original query,
  and the strongest terms of that mixture form the query for a second ranking.

  Args:
    mu: Length damping; each feedback document contributes its TF-IDF row
      divided by (row sum + mu). Assumed to be non-negative.
    lambda_: Weight of the feedback distribution in the mixture; the original
      query gets (1 - lambda_).
    top_n: Number of top-ranked documents used as feedback.
    top_terms: Maximum number of terms in the expanded query.
    corpus: Optional list of token lists, one per document. Defaults to the
      notebook-level `processed_corpus`. Document ids are looked up by position
      in the notebook-level `df`, so a custom corpus must be row-aligned with it.
  """

  def __init__(self, mu=1000, lambda_=0.6, top_n=10, top_terms=15, corpus=None):
    tokenised_docs = processed_corpus if corpus is None else corpus
    self.corpus = [' '.join(doc) for doc in tokenised_docs]
    self.mu = mu
    self.lambda_ = lambda_
    self.top_n = top_n
    self.top_terms = top_terms
    self.vectorizer = TfidfVectorizer()
    self.documents_tfidf = self.vectorizer.fit_transform(self.corpus)
    self.vocab = np.array(self.vectorizer.get_feature_names_out())

  def rank(self, query, n=100):
    """Return up to `n` `(docno, score)` tuples for the raw `query`, best first.

    When no document matches the query, or the expansion is empty, the initial
    ranking is returned unchanged.
    """
    query = ' '.join(preprocess(query))

    # Initial ranking
    initial_scores = self.rank_documents(query)
    # Only documents that actually matched may act as feedback; with an
    # all-zero ranking the "top" documents would be arbitrary.
    top_docs = [idx for idx, score in initial_scores[:self.top_n] if score > 0]

    final_scores = initial_scores
    if top_docs:
      # Feedback query expansion using the top documents, then re-rank.
      new_query = self.expand_query(query, self.documents_tfidf[top_docs])
      if new_query:
        final_scores = self.rank_documents(new_query)

    # Select the id column once instead of materialising a full row per result.
    doc_ids = df['docno']
    return [(doc_ids.iloc[idx], score) for idx, score in final_scores[:n]]

  def rank_documents(self, query):
    """Return `(position, cosine_similarity)` for every document, best first.

    `query` is an already pre-processed, space-joined string. Equal scores keep
    corpus order.
    """
    query_tfidf = self.vectorizer.transform([query])
    similarities = cosine_similarity(query_tfidf, self.documents_tfidf).flatten()
    return sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)

  @staticmethod
  def _as_distribution(weights):
    """Scale a non-negative weight vector to sum to 1 (left unchanged if all zero)."""
    total = weights.sum()
    return weights / total if total > 0 else weights

  def expand_query(self, original_query, relevant_docs_tfidf):
    """Build the expanded query string from the feedback documents.

    Args:
      original_query: Pre-processed, space-joined query string.
      relevant_docs_tfidf: Sparse TF-IDF rows of the feedback documents.

    Returns:
      Up to `top_terms` vocabulary terms joined by spaces, strongest first.
      Only the selected terms are returned; their mixture weights are not
      carried into the second ranking.
    """
    # Feedback distribution: one weight per vocabulary term. Each document's row
    # is damped by 1 / (row sum + mu) and the rows are added up, which yields a
    # vector of vocabulary length rather than a documents x terms matrix.
    doc_mass = np.asarray(relevant_docs_tfidf.sum(axis=1), dtype=float).ravel()
    doc_weights = 1.0 / (doc_mass + self.mu)
    feedback = np.asarray(relevant_docs_tfidf.T.dot(doc_weights)).ravel()
    feedback = self._as_distribution(feedback)

    # Original query as a distribution over the same vocabulary.
    query_weights = self.vectorizer.transform([original_query]).toarray().ravel()
    query_weights = self._as_distribution(query_weights)

    # Interpolate, then keep the strongest terms that carry any weight at all.
    mixed = self.lambda_ * feedback + (1 - self.lambda_) * query_weights
    strongest = np.argsort(mixed)[::-1][:self.top_terms]
    strongest = strongest[mixed[strongest] > 0]

    return " ".join(self.vocab[strongest])

# Fit the feedback model's TF-IDF index once with the default parameters.
relevance_model = RelevanceModel()

In [None]:
# NOTE(review): `rank_bm25` is not imported anywhere in the visible notebook
# (BM25 is implemented by hand in the BM25 class) — confirm this install is still needed.
!pip install rank-bm25



In [None]:
def output_results_to_trec_format(results, model_name, output_file):
    """Write ranked results as a whitespace-separated run file.

    Args:
        results: Mapping of query id -> ordered list of `(doc_id, score)` pairs.
        model_name: Run tag written as the last column of every line.
        output_file: Destination path; an existing file is overwritten.

    Each line reads `<query_id> 0 <doc_id> <rank> <score> <model_name>`, with
    ranks starting at 1 within each query.
    """
    def _run_lines():
        for query_id, ranked_docs in results.items():
            for position, (doc_id, score) in enumerate(ranked_docs, start=1):
                yield f"{query_id} 0 {doc_id} {position} {score} {model_name}\n"

    with open(output_file, 'w') as handle:
        handle.writelines(_run_lines())

In [None]:
# Ranked results per query, one dictionary per model.
vsm_results = {}
bm25_results = {}
relevance_results = {}

# NOTE(review): the trec_eval scores recorded further down are close to zero,
# which is what a query-id mismatch between run and qrels files looks like.
# Results are therefore keyed by 1-based query position, on the assumption that
# the qrels file numbers queries consecutively while the query file's `num`
# field may not — verify against both data files.
for query_num, query_info in enumerate(queries.to_dict(orient='records'), start=1):
  query_title = query_info["title"]

  vsm_results[query_num] = vsm_model.rank(query_title, top_n=100)
  bm25_results[query_num] = bm25_model.rank(query_title, top_n=100)
  relevance_results[query_num] = relevance_model.rank(query_title, n=100)

# Write one run file per model. Earlier files need no explicit removal:
# the writer opens its target in 'w' mode, which truncates existing content.
output_results_to_trec_format(vsm_results, "vsm", "vsm_results.txt")
output_results_to_trec_format(bm25_results, "bm25", "bm25_results.txt")
output_results_to_trec_format(relevance_results, "relevance", "relevance_results.txt")

In [None]:
!git clone https://github.com/usnistgov/trec_eval.git
!make -C trec_eval

fatal: destination path 'trec_eval' already exists and is not an empty directory.
make: Entering directory '/content/trec_eval'
make: 'trec_eval' is up to date.
make: Leaving directory '/content/trec_eval'


In [None]:
# Download cranqrel.trec.txt file
!wget https://raw.githubusercontent.com/oussbenk/cranfield-trec-dataset/main/cranqrel.trec.txt

--2024-03-15 15:20:00--  https://raw.githubusercontent.com/oussbenk/cranfield-trec-dataset/main/cranqrel.trec.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23217 (23K) [text/plain]
Saving to: ‘cranqrel.trec.txt.2’


2024-03-15 15:20:01 (12.9 MB/s) - ‘cranqrel.trec.txt.2’ saved [23217/23217]



In [None]:
# Absolute paths to the qrels file and to the three run files.
# NOTE(review): the run files are written with relative names, so these paths
# assume the working directory is /content — confirm when running outside Colab.
path_to_cranfield_qrel = "/content/cranqrel.trec.txt"
path_to_vsm_results = "/content/vsm_results.txt"
path_to_bm25_results = "/content/bm25_results.txt"
path_to_relevance_results = "/content/relevance_results.txt"

# Evaluate the VSM run with three measures: P.5, ndcg and map.
!./trec_eval/trec_eval -m P.5 -m ndcg -m map {path_to_cranfield_qrel} {path_to_vsm_results}

map                   	all	0.0105
P_5                   	all	0.0158
ndcg                  	all	0.0408


In [None]:
# Evaluate the BM25 run (P.5, ndcg, map); the path variables come from an earlier cell.
!./trec_eval/trec_eval -m P.5 -m ndcg -m map {path_to_cranfield_qrel} {path_to_bm25_results}

map                   	all	0.0086
P_5                   	all	0.0132
ndcg                  	all	0.0365


In [None]:
# Evaluate the relevance-feedback run (P.5, ndcg, map); the path variables come from an earlier cell.
!./trec_eval/trec_eval -m P.5 -m ndcg -m map {path_to_cranfield_qrel} {path_to_relevance_results}

map                   	all	0.0116
P_5                   	all	0.0145
ndcg                  	all	0.0420
