#Information Retrieval System Using TF-IDF & BM-25

In [1]:
#Importing the required libraries
import re 
import string 
import requests 
import numpy as np 
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#Function to retireve the document from the url 
def retrieve_docs_and_clean():

  r = requests.get('https://sports.ndtv.com/fifa-world-cup-2022/news')#loading the url in the variable 'r'
  soup = BeautifulSoup(r.content, 'html.parser')

  #code to extract required details from the url
  link = []
  for i in soup.find('div', {'class':'lst-pg_hd'}).find_all('a',{'class':'lst-pg_ttl'}):
      i['href'] ='https://sports.ndtv.com/'+ i['href'] + '?page=all'
      link.append(i['href'])
  

  # Retrieve Paragraphs
  documents = []
  for i in link:
      r = requests.get(i)
      soup = BeautifulSoup(r.content, 'html.parser')

      sen = []
      for i in soup.find('div', {'class':'sp-cn pg-str-com js-ad-section'}).find_all('p'):
          sen.append(i.text)
      documents.append(' '.join(sen))

  # Clean Paragraphs
  documents_clean = []
  for d in documents:
      document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)
      document_test = re.sub(r'@\w+', '', document_test)
      document_test = document_test.lower()
      document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
      document_test = re.sub(r'[0-9]', '', document_test)
      document_test = re.sub(r'\s{2,}', ' ', document_test)
      documents_clean.append(document_test)

  return documents_clean

In [3]:
docs = retrieve_docs_and_clean()
# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
able,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aboubakar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101531,0.0,0.050543,0.0,0.0,0.034235
about,0.0,0.0,0.0,0.0,0.0,0.0,0.070699,0.0,0.0,0.0,0.0,0.051342,0.0,0.0,0.0,0.0,0.0,0.0
above,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004664,0.0,0.0
accepting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df.shape

(1773, 18)

In [5]:
def get_similar_articles(q, df):
  print("query:", q)
  print("The following are articles with the highest cosine similarity values: ")
  print()
  q = [q]
  q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
  sim = {}
  for i in range(10):
    sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
  
  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  
  for k, v in sim_sorted:
    if v != 0.0:
      print("Similarity Values:", v)
      print(docs[k])
      print()


q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'

get_similar_articles(q1, df)
print('-'*100)
get_similar_articles(q2, df)
print('-'*100)
get_similar_articles(q3, df)

query: barcelona
The following are articles with the highest cosine similarity values: 

----------------------------------------------------------------------------------------------------
query: spain
The following are articles with the highest cosine similarity values: 

Similarity Values: 0.1633129978424704
japan stunned spain in their group e match of the fifa world cup to storm into the round of stage as the table toppers despite losing the match spain still managed to qualify for the knockouts due to a better goal difference in the process however the champions germany were knocked out however japan s encounter with spain had a controversial moment that triggered a storm on social media ao tanaka s goal which gave japan a lead in the st minute wasn t given right away as the officials checked whether the ball went past the touchline after a var check the decision was given in the favor of japan but many images from different angles surfaced on social media which suggested that th

In [6]:
from gensim.summarization.bm25 import BM25

def simple_tok(sent:str):
    return sent.split()

def bm25_similar_articles(query):
  print("query:", query)
  print("The following are articles with the highest BM25 scores: ")
  print()
  tok_corpus = [simple_tok(s) for s in docs]
  query = simple_tok(query)
  bm25 = BM25(tok_corpus)
  scores = bm25.get_scores(query, average_idf = 100)
  best_docs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:3]
  for i, b in enumerate(best_docs):
      print(f"rank {i+1}: {docs[b]}")
      print()


q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'


bm25_similar_articles(q1)
print('-'*100)
bm25_similar_articles(q2)
print('-'*100)
bm25_similar_articles(q3)
print('-'*100)

query: barcelona
The following are articles with the highest BM25 scores: 

rank 1: dani alves became the oldest player to represent brazil at the men s world cup as the year old captained the tournament favourites in their final group game against cameroon at lusail stadium on friday alves who turned in may and has since left barcelona to play in mexico broke the record that had been set by current teammate thiago silva who is playing in this world cup at the age of window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai async true ai src v location protocol o d head appendchild ai window document a vdo ai core v ndtv vdo ai js however alves fell short of the overall brazilian record of formiga who played at the women s world cup aged he is also a few months younger than canada s atiba hutchinson and portugal defender pepe two year olds who have already played at this world cup coach tite makes nine changes in total to the brazil t