# **Importing the libraries**

In [None]:
# importing the libraries
import pickle
import math
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate
from numpy.linalg import norm
import pandas as pd
import numpy as np
import warnings
import nltk
import re

# download the NLTK 'punkt' and 'stopwords' packages (see the download log printed below this cell)
nltk.download('punkt')
nltk.download('stopwords')
# English stop words as a set; process_query uses it to drop stop words from the query
stop_words = set(stopwords.words('english'))
# NOTE(review): this silences every warning for the rest of the notebook, including any
# pandas indexing/deprecation warnings raised while the TF-IDF matrices are filled
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
!pip install tabulate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Utility Functions**

In [None]:
# loading the indexes
def load_indexes(inverted_index_path='/content/inverted_index.pickle',
                 file_content_path='/content/file_content.pkl'):
  """Load the pickled inverted index and the pickled per-document content.

  Args:
    inverted_index_path: path of the pickled inverted index. Defaults to the
      location the notebook originally hard-coded.
    file_content_path: path of the pickled document content. Defaults to the
      location the notebook originally hard-coded.

  Returns:
    A tuple ``(inverted_index, file_content)``. Every value of
    ``inverted_index`` has its element at position 1 (the posting list read by
    the matrix builders) converted to a ``list``.

  Raises:
    OSError: if either file cannot be opened.
  """
  # NOTE: pickle.load can execute arbitrary code - only load files you trust.
  with open(inverted_index_path, 'rb') as file:
    inverted_index = pickle.load(file)

  # The conversion does not need the file handle, so it runs after the file is closed.
  for val in inverted_index.values():
    val[1] = list(val[1])

  with open(file_content_path, 'rb') as file:
    file_content = pickle.load(file)

  return inverted_index, file_content

In [None]:
def create_query_vectors(query_list, vocab, idf_mapper):
  """Build the five TF-IDF weighted query vectors over the vocabulary.

  Args:
    query_list: list of query terms (duplicates allowed).
    vocab: ordered list of vocabulary terms; position i of every returned
      vector corresponds to ``vocab[i]``.
    idf_mapper: dict mapping a term to its IDF value.

  Returns:
    A list of five lists, each of length ``len(vocab)``, in this order:
    binary, raw count, term frequency, log normalization and double
    normalization. Positions of terms that are not in the query stay at 0,
    except in the double normalization vector where they stay at 0.5.

  Query terms that are missing from ``vocab`` or ``idf_mapper`` are ignored
  (previously they raised ValueError/KeyError); the frequencies used for the
  weights are computed over the remaining terms only.
  """
  # term -> first position in vocab, built once instead of a linear
  # vocab.index() scan for every query term in every weighting scheme
  position = {}
  for slot, term in enumerate(vocab):
    position.setdefault(term, slot)

  known_terms = [term for term in query_list
                 if term in position and term in idf_mapper]
  freq_dict = Counter(known_terms)
  total_terms = sum(freq_dict.values())
  max_freq = max(freq_dict.values(), default=0)

  size = len(vocab)
  binary_query_vector = [0] * size
  raw_query_vector = [0] * size
  tf_query_vector = [0] * size
  ln_query_vector = [0] * size
  # baseline 0.5 mirrors the fill value used by double_norm_tfidf_matrix
  dn_query_vector = [0.5] * size

  # one pass per distinct term: repeated occurrences would write the same values
  for term, freq in freq_dict.items():
    slot = position[term]
    idf = idf_mapper[term]

    binary_query_vector[slot] = 1 * idf
    raw_query_vector[slot] = freq * idf
    tf_query_vector[slot] = (freq / total_terms) * idf
    ln_query_vector[slot] = math.log10(1 + freq) * idf
    dn_query_vector[slot] = (0.5 + ((0.5 * freq) / max_freq)) * idf

  return [
      binary_query_vector,
      raw_query_vector,
      tf_query_vector,
      ln_query_vector,
      dn_query_vector
  ]

In [None]:
# utility method to process input
def process_query(query, vocab, idf_mapper):
  """Normalise a raw query string and turn it into TF-IDF query vectors.

  The query is lower-cased, every character that is neither a word character
  nor whitespace is removed, the text is split on whitespace and the tokens
  found in ``stop_words`` are dropped. The surviving tokens are passed to
  ``create_query_vectors`` together with ``vocab`` and ``idf_mapper``, and
  its result is returned unchanged.
  """
  tokens = re.sub(r'[^\w\s]', '', query.lower()).split()
  content_terms = [token for token in tokens if token not in stop_words]
  return create_query_vectors(content_terms, vocab, idf_mapper)

In [None]:
# utility function to create a dictionary that maps term to an index: {'term':'index'}
def create_termId_mapper(inverted_index):
  """Map every term of the inverted index to its position in key order.

  Returns a dict ``{term: index}`` where ``index`` counts from 0 in the
  iteration order of ``inverted_index.keys()``.
  """
  return {term: position for position, term in enumerate(inverted_index.keys())}

In [None]:
# utility function to create a dictionary: {'term':'IDF-value'} 
# number of documents used as the numerator of the IDF formula
DOC_COUNT = 1400
def create_termIDF_mapper(inverted_index, doc_count=None):
  """Compute the IDF value of every term in the inverted index.

  Args:
    inverted_index: dict mapping a term to a sequence whose element 0 is the
      term's document frequency.
    doc_count: total number of documents. Defaults to the module-level
      ``DOC_COUNT`` (looked up at call time), which keeps existing
      one-argument calls unchanged.

  Returns:
    dict ``{term: log10(doc_count / (1 + document_frequency))}``.
  """
  if doc_count is None:
    doc_count = DOC_COUNT

  idf_mapper = {}
  for term, posting_list in inverted_index.items():
    doc_freq = posting_list[0]
    # the +1 in the denominator is kept exactly as in the original formula
    idf_mapper[term] = math.log10(doc_count / (1 + doc_freq))

  return idf_mapper

In [None]:
# utility function to create term frequency dictionary
def create_tf_mapper(file_content, doc_count=1400):
  """Count how often each term occurs in every document.

  Args:
    file_content: container indexed by document id (1-based) whose items are
      the iterable of terms of that document.
    doc_count: number of documents to process; ids ``1..doc_count`` are read.
      Defaults to 1400, the value that was previously hard-coded.

  Returns:
    A dictionary of dictionaries ``{doc_id: {term: frequency}}``.
  """
  return {
      doc_id: dict(Counter(file_content[doc_id]))
      for doc_id in range(1, doc_count + 1)
  }

In [None]:
# utility function to create binary TF-IDF Matrix
def binary_tfidf_matrix(inverted_index, term_id_mapper, idf_mapper, doc_count=1400):
  """Create the binary TF-IDF matrix (documents x terms).

  Args:
    inverted_index: dict ``{term: [doc_freq, posting_list]}``; the posting
      list holds 1-based document ids.
    term_id_mapper: dict whose keys, in order, become the matrix columns.
    idf_mapper: dict ``{term: idf}``.
    doc_count: number of rows (documents). Defaults to 1400, the value that
      was previously hard-coded.

  Returns:
    A DataFrame with ``doc_count`` rows (row ``i`` is document ``i + 1``) in
    which a cell holds ``1 * idf`` when the term occurs in the document and
    0 otherwise.

  Raises:
    KeyError: if a posting refers to a document id outside ``1..doc_count``.
  """
  # Fill a plain numpy array and wrap it once: the former chained assignment
  # ``df.loc[row][term] = value`` writes through a temporary row object and is
  # not guaranteed to reach the DataFrame.
  weights = np.zeros((doc_count, len(inverted_index)))

  for column, term in enumerate(term_id_mapper):
    idf = idf_mapper[term]
    for doc_no in inverted_index[term][1]:
      if not 1 <= doc_no <= doc_count:
        # guard against numpy's negative-index wrap-around
        raise KeyError(f"document id {doc_no} is outside 1..{doc_count}")
      weights[doc_no - 1, column] = 1 * idf

  tf_idf_matrix_binary = pd.DataFrame(weights, columns=list(term_id_mapper.keys()))

  print("Binary TF-IDF Matrix Created !!")
  return tf_idf_matrix_binary

In [None]:
# utility function to create raw count TF-IDF Matrix
def raw_tfidf_matrix(inverted_index, term_id_mapper, tf_counter, idf_mapper, doc_count=1400):
  """Create the raw count TF-IDF matrix (documents x terms).

  Args:
    inverted_index: inverted index; only its length (number of terms) is used.
    term_id_mapper: dict ``{term: column position}``; its keys, in order, are
      the column labels.
    tf_counter: dict ``{doc_id: {term: frequency}}`` with 1-based doc ids.
    idf_mapper: dict ``{term: idf}``.
    doc_count: number of rows (documents). Defaults to 1400, the value that
      was previously hard-coded.

  Returns:
    A DataFrame where cell (doc_id - 1, term) is ``frequency * idf`` and 0
    for terms that do not occur in the document.

  Raises:
    KeyError: if a document id is outside ``1..doc_count``.
  """
  # Written positionally into a numpy array: the former
  # ``df.loc[row][term_id] = value`` was a chained assignment and used an
  # integer key on a string-labelled row.
  weights = np.zeros((doc_count, len(inverted_index)))

  for doc_id, terms in tf_counter.items():
    if not 1 <= doc_id <= doc_count:
      raise KeyError(f"document id {doc_id} is outside 1..{doc_count}")
    for term, freq in terms.items():
      weights[doc_id - 1, term_id_mapper[term]] = freq * idf_mapper[term]

  tf_idf_matrix_raw = pd.DataFrame(weights, columns=list(term_id_mapper.keys()))

  print("Raw Count TF-IDF Matrix Created !!")
  return tf_idf_matrix_raw

In [None]:
# utility function to create term frequency TF-IDF Matrix
def term_frequency_tfidf_matrix(inverted_index, term_id_mapper, tf_counter, idf_mapper, doc_count=1400):
  """Create the term frequency TF-IDF matrix (documents x terms).

  Args:
    inverted_index: inverted index; only its length (number of terms) is used.
    term_id_mapper: dict ``{term: column position}``; its keys, in order, are
      the column labels.
    tf_counter: dict ``{doc_id: {term: frequency}}`` with 1-based doc ids.
    idf_mapper: dict ``{term: idf}``.
    doc_count: number of rows (documents). Defaults to 1400, the value that
      was previously hard-coded.

  Returns:
    A DataFrame where cell (doc_id - 1, term) is
    ``(frequency / total term count of the document) * idf`` and 0 for terms
    that do not occur in the document.

  Raises:
    KeyError: if a document id is outside ``1..doc_count``.
  """
  # Written positionally into a numpy array: the former
  # ``df.loc[row][term_id] = value`` was a chained assignment and used an
  # integer key on a string-labelled row.
  weights = np.zeros((doc_count, len(inverted_index)))

  for doc_id, terms in tf_counter.items():
    if not 1 <= doc_id <= doc_count:
      raise KeyError(f"document id {doc_id} is outside 1..{doc_count}")
    # document length is the same for every term, so compute it once per document
    doc_length = sum(terms.values())
    for term, freq in terms.items():
      tf = freq / doc_length
      weights[doc_id - 1, term_id_mapper[term]] = tf * idf_mapper[term]

  tf_idf_matrix_term_freq = pd.DataFrame(weights, columns=list(term_id_mapper.keys()))

  print("Term Frequency TF-IDF Matrix Created !!")
  return tf_idf_matrix_term_freq

In [None]:
# utility function to create log normalisation TF-IDF Matrix
def log_norm_tfidf_matrix(inverted_index, term_id_mapper, tf_counter, idf_mapper, doc_count=1400):
  """Create the log normalization TF-IDF matrix (documents x terms).

  Args:
    inverted_index: inverted index; only its length (number of terms) is used.
    term_id_mapper: dict ``{term: column position}``; its keys, in order, are
      the column labels.
    tf_counter: dict ``{doc_id: {term: frequency}}`` with 1-based doc ids.
    idf_mapper: dict ``{term: idf}``.
    doc_count: number of rows (documents). Defaults to 1400, the value that
      was previously hard-coded.

  Returns:
    A DataFrame where cell (doc_id - 1, term) is
    ``log10(1 + frequency) * idf`` and 0 for terms that do not occur in the
    document.

  Raises:
    KeyError: if a document id is outside ``1..doc_count``.
  """
  # Written positionally into a numpy array: the former
  # ``df.loc[row][term_id] = value`` was a chained assignment and used an
  # integer key on a string-labelled row.
  weights = np.zeros((doc_count, len(inverted_index)))

  for doc_id, terms in tf_counter.items():
    if not 1 <= doc_id <= doc_count:
      raise KeyError(f"document id {doc_id} is outside 1..{doc_count}")
    for term, freq in terms.items():
      tf = math.log10(1 + freq)
      weights[doc_id - 1, term_id_mapper[term]] = tf * idf_mapper[term]

  tf_idf_matrix_log_norm = pd.DataFrame(weights, columns=list(term_id_mapper.keys()))

  print("Log Normalization TF-IDF Matrix Created !!")
  return tf_idf_matrix_log_norm

In [None]:
# utility function to create double normalization TF-IDF Matrix
def double_norm_tfidf_matrix(inverted_index, term_id_mapper, tf_counter, idf_mapper, doc_count=1400):
  """Create the double normalization TF-IDF matrix (documents x terms).

  Args:
    inverted_index: inverted index; only its length (number of terms) is used.
    term_id_mapper: dict keyed by term; its keys, in order, are the column
      labels. Every term of ``tf_counter`` must be present.
    tf_counter: dict ``{doc_id: {term: frequency}}`` with 1-based doc ids.
    idf_mapper: dict ``{term: idf}``.
    doc_count: number of rows (documents). Defaults to 1400, the value that
      was previously hard-coded.

  Returns:
    A DataFrame where cell (doc_id - 1, term) is
    ``(0.5 + 0.5 * frequency / max frequency in the document) * idf`` for
    terms that occur in the document and 0.5 for all other cells.

  Raises:
    KeyError: if a document id is outside ``1..doc_count`` or a term is
      missing from ``term_id_mapper``.
  """
  # NOTE(review): cells of absent terms keep the bare 0.5 baseline (not
  # multiplied by the IDF); create_query_vectors does the same - confirm this
  # is the intended weighting.
  weights = np.full((doc_count, len(inverted_index)), 0.5)

  # columns are addressed by label order, exactly like the former .at[row, term]
  column_of = {term: column for column, term in enumerate(term_id_mapper)}

  for doc_id, terms in tf_counter.items():
    if not 1 <= doc_id <= doc_count:
      raise KeyError(f"document id {doc_id} is outside 1..{doc_count}")
    max_freq = max(terms.values()) if terms else 0
    for term, freq in terms.items():
      tf = 0.5 + ((0.5 * freq) / max_freq)
      weights[doc_id - 1, column_of[term]] = tf * idf_mapper[term]

  tf_idf_matrix_double_norm = pd.DataFrame(weights, columns=list(term_id_mapper.keys()))

  print("Double Normalization TF-IDF Matrix Created !!")
  return tf_idf_matrix_double_norm

# **Creating Tables**

In [None]:
# load the pickled inverted index and document content, then derive the lookup tables
inverted_index, file_content = load_indexes()
term_id_mapper = create_termId_mapper(inverted_index)    # {term: column index}
idf_mapper = create_termIDF_mapper(inverted_index)       # {term: IDF value}
tf_counter = create_tf_mapper(file_content)              # {doc_id: {term: frequency}}

# one TF-IDF matrix per weighting scheme; each builder prints a message when it finishes
binary_tfidf = binary_tfidf_matrix(inverted_index, term_id_mapper, idf_mapper)
raw_freq_tfidf = raw_tfidf_matrix(inverted_index, term_id_mapper, tf_counter, idf_mapper)
term_freq_tfidf = term_frequency_tfidf_matrix(inverted_index, term_id_mapper, tf_counter, idf_mapper)
log_norm_tfidf = log_norm_tfidf_matrix(inverted_index, term_id_mapper, tf_counter, idf_mapper)
double_norm_tfidf = double_norm_tfidf_matrix(inverted_index, term_id_mapper, tf_counter, idf_mapper)

Binary TF-IDF Matrix Created !!
Raw Count TF-IDF Matrix Created !!
Term Frequency TF-IDF Matrix Created !!
Log Normalization TF-IDF Matrix Created !!
Double Normalization TF-IDF Matrix Created !!


# **All TF-IDF Matrices**



###### **Binary TF-IDF Matrix**

In [None]:
binary_tfidf

Unnamed: 0,experimental,investigation,aerodynamics,wing,slipstream,study,propeller,made,order,determine,...,hoshizaki,recoverable,thermometer,incipientmergedlayer,simplysupported,fralich,prevented,thirds,ing,ob
0,0.645069,0.809668,1.765917,0.915679,2.032185,0.996909,1.845098,0.598353,0.987766,1.133291,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.996909,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.598353,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,0.645069,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.987766,0.000000,...,0.0,0.0,0.0,0.0,2.669007,2.845098,2.845098,0.000000,0.000000,0.000000
1396,0.645069,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1397,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.598353,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,2.845098,0.000000,0.000000
1398,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,2.845098,0.000000


###### **Raw Count TF-IDF Matrix**

In [None]:
raw_freq_tfidf

Unnamed: 0,experimental,investigation,aerodynamics,wing,slipstream,study,propeller,made,order,determine,...,hoshizaki,recoverable,thermometer,incipientmergedlayer,simplysupported,fralich,prevented,thirds,ing,ob
0,1.290138,0.809668,1.765917,2.747037,10.160923,0.996909,1.845098,1.196707,0.987766,1.133291,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,1.993818,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.598353,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,0.645069,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.987766,0.000000,...,0.0,0.0,0.0,0.0,5.338014,2.845098,2.845098,0.000000,0.000000,0.000000
1396,0.645069,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1397,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.598353,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,2.845098,0.000000,0.000000
1398,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,2.845098,0.000000


###### **Term Frequency TF-IDF Matrix**

In [None]:
term_freq_tfidf

Unnamed: 0,experimental,investigation,aerodynamics,wing,slipstream,study,propeller,made,order,determine,...,hoshizaki,recoverable,thermometer,incipientmergedlayer,simplysupported,fralich,prevented,thirds,ing,ob
0,0.016755,0.010515,0.022934,0.035676,0.13196,0.012947,0.023962,0.015542,0.012828,0.014718,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.00000,0.018126,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.013915,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,0.010933,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.016742,0.000000,...,0.0,0.0,0.0,0.0,0.090475,0.048222,0.048222,0.00000,0.000000,0.000000
1396,0.013439,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
1397,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.005203,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.02474,0.000000,0.000000
1398,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.056902,0.000000


###### **Log Normalization TF-IDF Matrix**

In [None]:
log_norm_tfidf

Unnamed: 0,experimental,investigation,aerodynamics,wing,slipstream,study,propeller,made,order,determine,...,hoshizaki,recoverable,thermometer,incipientmergedlayer,simplysupported,fralich,prevented,thirds,ing,ob
0,0.307776,0.243734,0.531594,0.551294,1.581347,0.300099,0.55543,0.285487,0.297347,0.341155,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.475646,0.00000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.180122,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,0.194185,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.297347,0.000000,...,0.0,0.0,0.0,0.0,1.27344,0.85646,0.85646,0.00000,0.00000,0.00000
1396,0.194185,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
1397,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.180122,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.85646,0.00000,0.00000
1398,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.85646,0.00000


###### **Double Normalization TF-IDF Matrix**

In [None]:
double_norm_tfidf

Unnamed: 0,experimental,investigation,aerodynamics,wing,slipstream,study,propeller,made,order,determine,...,hoshizaki,recoverable,thermometer,incipientmergedlayer,simplysupported,fralich,prevented,thirds,ing,ob
0,0.451548,0.485801,1.05955,0.732543,2.032185,0.598145,1.107059,0.418847,0.592659,0.679974,...,0.5,0.5,0.5,0.5,0.500000,0.500000,0.500000,0.50000,0.500000,0.500000
1,0.500000,0.500000,0.50000,0.500000,0.500000,0.664606,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.500000,0.500000,0.500000,0.50000,0.500000,0.500000
2,0.500000,0.500000,0.50000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.500000,0.500000,0.500000,0.50000,0.500000,0.500000
3,0.500000,0.500000,0.50000,0.500000,0.500000,0.500000,0.500000,0.398902,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.500000,0.500000,0.500000,0.50000,0.500000,0.500000
4,0.500000,0.500000,0.50000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.500000,0.500000,0.500000,0.50000,0.500000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,0.430046,0.500000,0.50000,0.500000,0.500000,0.500000,0.500000,0.500000,0.658510,0.500000,...,0.5,0.5,0.5,0.5,2.224172,1.896732,1.896732,0.50000,0.500000,0.500000
1396,0.430046,0.500000,0.50000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.500000,0.500000,0.500000,0.50000,0.500000,0.500000
1397,0.500000,0.500000,0.50000,0.500000,0.500000,0.500000,0.500000,0.332419,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.500000,0.500000,0.500000,1.58061,0.500000,0.500000
1398,0.500000,0.500000,0.50000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.500000,0.500000,0.500000,0.50000,2.133824,0.500000


# **Driver Section**

In [None]:
# vocabulary as a list of the keys of term_id_mapper; the driver cell passes it to process_query
vocab = list(term_id_mapper.keys())
# main function to return top 5 documents - calculated using the tf-idf score
def return_top5_relevant_docs_using_sum(query, tfidf_matrix):
  """Return the ids of the five documents with the highest TF-IDF score.

  The score of a document is the dot product of its TF-IDF row with the
  query vector, i.e. the sum of the document's weights for the query terms,
  each scaled by the query weight. The previous implementation added the
  query vector to every row before summing, which adds the same constant to
  every document and therefore ranked documents independently of the query.

  Args:
    query: query vector with one weight per column of ``tfidf_matrix``.
    tfidf_matrix: DataFrame with one row per document; row label ``i``
      belongs to document id ``i + 1``.

  Returns:
    A list with up to five document ids, best match first. Ties keep the
    row order of ``tfidf_matrix``.
  """
  query_vector = np.asarray(query, dtype=float)
  scores = tfidf_matrix.to_numpy(dtype=float) @ query_vector

  # stable sort on the negated scores gives a deterministic descending order
  top_rows = np.argsort(-scores, kind='stable')[:5]
  return (tfidf_matrix.index[top_rows] + 1).tolist()

In [None]:
# NOTE(review): looks redundant - the same assignment already appears in the cell that
# defines return_top5_relevant_docs_using_sum; confirm before removing
vocab = list(term_id_mapper.keys())
# main function to return top 5 documents - calculated using cosine similarity
def return_top5_relevant_docs_using_similarity(query, tfidf_matrix):
  """Return the ids of the five documents most similar to the query.

  Similarity is the cosine of the angle between the query vector and each
  document row. All rows are scored in one vectorised numpy computation
  instead of one library call per row, and the matrix is no longer copied
  just to be sorted.

  Args:
    query: query vector with one weight per column of ``tfidf_matrix``.
    tfidf_matrix: DataFrame with one row per document; row label ``i``
      belongs to document id ``i + 1``.

  Returns:
    A list with up to five document ids, most similar first. Ties keep the
    row order of ``tfidf_matrix``.
  """
  doc_matrix = tfidf_matrix.to_numpy(dtype=float)
  query_vector = np.asarray(query, dtype=float)

  dot_products = doc_matrix @ query_vector
  norm_products = norm(doc_matrix, axis=1) * norm(query_vector)

  # an all-zero row or query has no direction: its similarity is defined as 0
  cosine_similarities = np.divide(dot_products, norm_products,
                                  out=np.zeros_like(dot_products),
                                  where=norm_products != 0)

  # stable sort on the negated scores gives a deterministic descending order
  top_rows = np.argsort(-cosine_similarities, kind='stable')[:5]
  return (tfidf_matrix.index[top_rows] + 1).tolist()

In [None]:
# Driver: read a query, rank the documents under every weighting scheme with both
# ranking functions, and print the two result tables.
if __name__ == '__main__':
  print('\n')
  query = input('Enter your query: ')
  print('\n')

  # generate query vectors
  # order of the returned list: binary, raw count, term frequency,
  # log normalization, double normalization
  query_vectors = process_query(query, vocab, idf_mapper)

  # using tf-idf score
  # each query vector is paired with the document matrix of the same weighting scheme
  binary_doc_ids_score = return_top5_relevant_docs_using_sum(query_vectors[0], binary_tfidf)
  raw_freq_doc_ids_score = return_top5_relevant_docs_using_sum(query_vectors[1], raw_freq_tfidf)
  term_freq_doc_ids_score = return_top5_relevant_docs_using_sum(query_vectors[2], term_freq_tfidf)
  log_norm_doc_ids_score = return_top5_relevant_docs_using_sum(query_vectors[3], log_norm_tfidf)
  double_norm_doc_ids_score = return_top5_relevant_docs_using_sum(query_vectors[4], double_norm_tfidf)

  # using cosine similarity
  binary_doc_ids_similarity = return_top5_relevant_docs_using_similarity(query_vectors[0], binary_tfidf)
  raw_freq_doc_ids_similarity = return_top5_relevant_docs_using_similarity(query_vectors[1], raw_freq_tfidf)
  term_freq_doc_ids_similarity = return_top5_relevant_docs_using_similarity(query_vectors[2], term_freq_tfidf)
  log_norm_doc_ids_similarity = return_top5_relevant_docs_using_similarity(query_vectors[3], log_norm_tfidf)
  double_norm_doc_ids_similarity = return_top5_relevant_docs_using_similarity(query_vectors[4], double_norm_tfidf)

  # rows for the similarity table: [weighting scheme label, top document ids]
  result_similarity = [
      ['Binary', binary_doc_ids_similarity],
      ['Raw Count', raw_freq_doc_ids_similarity],
      ['Term Frequency', term_freq_doc_ids_similarity],
      ['Log Normalization', log_norm_doc_ids_similarity],
      ['Double Normalization', double_norm_doc_ids_similarity]
  ]

  # rows for the score table: [weighting scheme label, top document ids]
  result_score = [
      ['Binary', binary_doc_ids_score],
      ['Raw Count', raw_freq_doc_ids_score],
      ['Term Frequency', term_freq_doc_ids_score],
      ['Log Normalization', log_norm_doc_ids_score],
      ['Double Normalization', double_norm_doc_ids_score]
  ]

  # render both tables in GitHub-flavoured markdown format
  print('Relevant Documents based on Tf-idf Score: ')
  print(tabulate(result_score, ['Weighting Scheme', 'Document Ids'], tablefmt="github"))
  print('\n')

  print('Relevant Documents based on Similarity: ')
  print(tabulate(result_similarity, ['Weighting Scheme', 'Document Ids'], tablefmt="github"))
  print('\n')



Enter your query: experimental investigation slipstream


Relevant Documents based on Tf-idf Score: 
| Weighting Scheme     | Document Ids               |
|----------------------|----------------------------|
| Binary               | [244, 1313, 344, 798, 792] |
| Raw Count            | [1313, 244, 329, 798, 721] |
| Term Frequency       | [471, 995, 718, 1168, 83]  |
| Log Normalization    | [244, 1313, 798, 792, 344] |
| Double Normalization | [344, 244, 792, 262, 163]  |


Relevant Documents based on Similarity: 
| Weighting Scheme     | Document Ids               |
|----------------------|----------------------------|
| Binary               | [409, 1, 1090, 1091, 1094] |
| Raw Count            | [1, 453, 484, 1144, 1064]  |
| Term Frequency       | [1, 453, 484, 1144, 1064]  |
| Log Normalization    | [1, 484, 453, 1144, 1064]  |
| Double Normalization | [3, 389, 4, 382, 774]      |


