In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.2


In [2]:
!pip install wikipedia

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11696 sha256=4a139d74728c3a2d25e85e5795c039cbfaabaf5d93445936af8d6b865fa1814c
  Stored in directory: /root/.cache/pip/wheels/c2/46/f4/caa1bee71096d7b0cdca2f2a2af45cacf35c5760bee8f00948
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [3]:
import wikipedia
import torch
from transformers import AutoTokenizer, AutoModel
from scipy.spatial import distance

In [4]:
# Load the pre-trained 'bert-base-uncased' tokenizer and encoder.
# `tokenizer`, `model` and `device` are module-level names that
# get_bert_embeddings() reads directly.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model's weights to the chosen device. As the last expression of the
# cell, the returned module is also displayed by the notebook.
model.to(device)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [5]:
# Returns the BERT embeddings for the given text
def get_bert_embeddings(text):
    """Embed ``text`` as the mean of BERT's last-layer token vectors.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The string to embed. Inputs longer than 512 tokens (special
            tokens included) are truncated to 512.

    Returns:
        A NumPy array holding the mean-pooled last hidden state with all
        size-1 dimensions squeezed out.
    """
    # Ask for truncation explicitly: passing max_length alone only truncates
    # through a deprecated fallback that logs a warning.
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
    )

    # Add a batch dimension of 1 and put the ids on the same device as the model.
    input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)

    # Pure inference: skip autograd so no graph is built and kept alive.
    with torch.no_grad():
        outputs = model(input_ids)

    # Average over the token axis, then drop the batch dimension.
    embeddings = torch.mean(outputs.last_hidden_state, dim=1).squeeze()
    return embeddings.detach().cpu().numpy()

In [6]:
# Returns the top 'n_results' search results from Wikipedia for the given query
def search_wikipedia(query, n_results=5, similarity_threshold=0.5):
    """Search Wikipedia and rank the hits by BERT similarity to the query.

    Args:
        query: Free-text search string.
        n_results: Number of titles requested from ``wikipedia.search``; also
            the maximum number of entries returned.
        similarity_threshold: Hits whose summary has a cosine similarity to
            the query below this value are dropped.

    Returns:
        A list of dicts with keys ``'title'``, ``'url'``, ``'summary'`` and
        ``'similarity'``, sorted by similarity in descending order. The list
        is empty when the search returns nothing or every hit is filtered out.

    Disambiguation pages are skipped silently. A title that cannot be loaded
    prints "Page Not Found" and is skipped; the remaining hits are still
    processed. Any other exception propagates to the caller.
    """
    # get search results from Wikipedia API
    search_results = wikipedia.search(query, results=n_results)
    if not search_results:
        return []

    # The query embedding is identical for every hit, so compute it once.
    query_embeddings = get_bert_embeddings(query)

    results = []
    for result in search_results:
        try:
            # The title comes straight from wikipedia.search, so look it up
            # verbatim; auto-suggest may replace it with a different page.
            page = wikipedia.page(result, auto_suggest=False)
            summary = page.summary
        except wikipedia.exceptions.DisambiguationError:
            # if page is disambiguation page, skip it
            continue
        except wikipedia.exceptions.PageError:
            # One missing page must not discard the hits that follow it.
            print("Page Not Found")
            continue

        summary_embeddings = get_bert_embeddings(summary)

        # scipy returns the cosine *distance*; convert it to a similarity
        similarity = 1 - distance.cosine(query_embeddings, summary_embeddings)

        # filter out results with similarity score below threshold
        if similarity < similarity_threshold:
            continue

        results.append({
            'title': page.title,
            'url': page.url,
            'summary': summary,
            'similarity': similarity,
        })

    # sort results by similarity in descending order
    results.sort(key=lambda entry: entry['similarity'], reverse=True)

    # return top n_results results
    return results[:n_results]


In [12]:
# get user input (surrounding whitespace is dropped)
query = input("Enter your query: ").strip()

# search wikipedia for query; a blank query is not sent to the API
results = search_wikipedia(query) if query else []

# Say so explicitly when nothing came back, instead of printing nothing at all.
if not results:
    print("No results found.")

# print search results
for i, result in enumerate(results):
    print(f"\nResult {i+1}:")
    print(f"Title: {result['title']}")
    print(f"URL: {result['url']}")
    print(f"Summary: {result['summary']}")

Enter your query: Indraprastha Institute of Information Technology

Result 1:
Title: Indraprastha Institute of Information Technology, Delhi
URL: https://en.wikipedia.org/wiki/Indraprastha_Institute_of_Information_Technology,_Delhi
Summary: Indraprastha Institute of Information Technology, Delhi (Iṃdraprastha Sūcanā Praudyōgikī Saṃsthān Dillī, IIIT-Delhi or IIIT-D) is an autonomous State University located in Delhi, India. It is a research-oriented institute with a focus on Computer Science and allied areas. IIIT delhi offers B.Tech,M.Tech and Ph.D degrees.

Result 2:
Title: Guru Gobind Singh Indraprastha University
URL: https://en.wikipedia.org/wiki/Guru_Gobind_Singh_Indraprastha_University
Summary: Guru Gobind Singh Indraprastha University, formerly Indraprastha University (IP or IPU), is a state university located in Dwarka, Delhi, India. The campus occupies 78 acres (31.56 hectares), among the largest in Delhi, and enrolls over 8,000 students.Indraprastha University was established

In [13]:
import pandas as pd

# Labelled evaluation set: position k of 'relevant_pages' lists the page
# titles counted as relevant for position k of 'query'.
test_data = {
    'query': [
        'Narendra Modi',
        'Indraprastha Institute of Information Technology',
    ],
    'relevant_pages': [
        ['Narendra Modi', 'PM Narendra Modi'],
        ['Indraprastha Institute of Information Technology, Delhi', 'Higher Education'],
    ],
}
test_df = pd.DataFrame(test_data)

# evaluate search engine on test dataset
def evaluate_search_engine(test_df, n_results=5):
    """Score ``search_wikipedia`` against a labelled set of queries.

    Args:
        test_df: DataFrame with a ``'query'`` column (search string) and a
            ``'relevant_pages'`` column (list of relevant page titles).
        n_results: Number of results requested per query. It is also the
            denominator of precision, so precision is measured against the
            requested count even when fewer results come back.

    Returns:
        A DataFrame with one row per query and the columns ``'query'``,
        ``'precision'``, ``'recall'`` and ``'f1_score'``. A metric whose
        denominator would be zero is reported as 0.0.

    Each query's precision and recall are also printed as they are computed.
    """
    results = []
    for _, row in test_df.iterrows():
        query = row['query']
        relevant_pages = row['relevant_pages']

        search_results = search_wikipedia(query, n_results=n_results)
        retrieved_pages = {result['title'] for result in search_results}

        # Number of distinct retrieved titles that are labelled relevant.
        num_hits = len(retrieved_pages.intersection(relevant_pages))

        precision = num_hits / n_results if n_results else 0.0
        print(precision)
        recall = num_hits / len(relevant_pages) if len(relevant_pages) else 0.0
        print(recall)

        # With no hits both terms are 0; define F1 as 0.0 rather than divide by zero.
        if precision + recall > 0:
            f1_score = 2 * precision * recall / (precision + recall)
        else:
            f1_score = 0.0

        results.append({'query': query, 'precision': precision, 'recall': recall, 'f1_score': f1_score})
    return pd.DataFrame(results)

# Score the search engine on test_df using the default n_results (5).
# evaluate_search_engine also prints each query's precision and recall while
# it runs, so those bare numbers appear before the table below.
evaluation_results = evaluate_search_engine(test_df)

# Show the per-query table (query, precision, recall, f1_score).
print(evaluation_results)


0.4
1.0
0.2
0.5
                                              query  precision  recall  \
0                                     Narendra Modi        0.4     1.0   
1  Indraprastha Institute of Information Technology        0.2     0.5   

   f1_score  
0  0.571429  
1  0.285714  


In [14]:
# Test queries, each paired by position with the titles we expect back
test_queries = [
    "Narendra Modi",
    'Indraprastha Institute of Information Technology',
]
expected_results = [
    ["Narendra Modi", "PM Narendra Modi"],
    ['Indraprastha Institute of Information Technology, Delhi', 'Higher Education'],
]

# Run each query and report the share of its expected titles that was retrieved
for i, (query, expected_titles) in enumerate(zip(test_queries, expected_results)):
    print(f"Test query {i+1}: {query}")

    # titles of the hits, in the order the search engine ranked them
    search_results = search_wikipedia(query)
    search_titles = [hit["title"] for hit in search_results]

    # expected titles found anywhere in the results, over all expected titles
    matched_titles = set(expected_titles).intersection(search_titles)
    accuracy = len(matched_titles) / len(expected_titles)

    # report for this query
    print(f"Expected titles: {expected_titles}")
    print(f"Search titles: {search_titles}")
    print(f"Accuracy: {accuracy}\n")

Test query 1: Narendra Modi
Expected titles: ['Narendra Modi', 'PM Narendra Modi']
Search titles: ['PM Narendra Modi', 'Jashodaben Modi', 'Narendra Modi', 'Narendra Modi Stadium', 'Premiership of Narendra Modi']
Accuracy: 1.0

Test query 2: Indraprastha Institute of Information Technology
Expected titles: ['Indraprastha Institute of Information Technology, Delhi', 'Higher Education']
Search titles: ['Indraprastha Institute of Information Technology, Delhi', 'Guru Gobind Singh Indraprastha University', 'Education in Delhi', 'List of colleges affiliated with Guru Gobind Singh Indraprastha University', 'List of institutions of higher education in Delhi']
Accuracy: 0.5



In [15]:
# Test queries, each paired by position with the titles we expect back
test_queries = [
    "Narendra Modi",
    'Indraprastha Institute of Information Technology',
]
expected_results = [
    ["Narendra Modi", "PM Narendra Modi"],
    ['Indraprastha Institute of Information Technology, Delhi', 'Higher Education'],
]

# Running total of the per-query reciprocal ranks
mrr_sum = 0.0
for i, (query, expected_titles) in enumerate(zip(test_queries, expected_results)):
    print(f"Test query {i+1}: {query}")

    # titles of the hits, in the order the search engine ranked them
    search_results = search_wikipedia(query)
    search_titles = [hit["title"] for hit in search_results]

    # 1-based rank of the first expected title, or None if none was retrieved
    first_hit_rank = next(
        (rank for rank, title in enumerate(search_titles, start=1) if title in expected_titles),
        None,
    )
    mrr = 0.0 if first_hit_rank is None else 1.0 / first_hit_rank

    mrr_sum += mrr

    # report for this query
    print(f"Expected titles: {expected_titles}")
    print(f"Search titles: {search_titles}")
    print(f"MRR: {mrr}\n")

# Mean reciprocal rank over all test queries
num_queries = len(test_queries)
avg_mrr = mrr_sum / num_queries
print(f"Average MRR: {avg_mrr}")


Test query 1: Narendra Modi
Expected titles: ['Narendra Modi', 'PM Narendra Modi']
Search titles: ['PM Narendra Modi', 'Jashodaben Modi', 'Narendra Modi', 'Narendra Modi Stadium', 'Premiership of Narendra Modi']
MRR: 1.0

Test query 2: Indraprastha Institute of Information Technology
Expected titles: ['Indraprastha Institute of Information Technology, Delhi', 'Higher Education']
Search titles: ['Indraprastha Institute of Information Technology, Delhi', 'Guru Gobind Singh Indraprastha University', 'Education in Delhi', 'List of colleges affiliated with Guru Gobind Singh Indraprastha University', 'List of institutions of higher education in Delhi']
MRR: 1.0

Average MRR: 1.0


In [16]:
# Test queries, each paired by position with the titles we expect back
test_queries = [
    "Narendra Modi",
    'Indraprastha Institute of Information Technology',
]
expected_results = [
    ["Narendra Modi", "PM Narendra Modi"],
    ['Indraprastha Institute of Information Technology, Delhi', 'Higher Education'],
]

# Running total of the per-query average precision
map_sum = 0.0
for i, (query, expected_titles) in enumerate(zip(test_queries, expected_results)):
    print(f"Test query {i+1}: {query}")

    # titles of the hits, in the order the search engine ranked them
    search_results = search_wikipedia(query)
    search_titles = [hit["title"] for hit in search_results]

    # Walk down the ranking; at each expected title add the precision at that rank
    num_correct = 0.0
    precision_sum = 0.0
    for rank, title in enumerate(search_titles, start=1):
        if title not in expected_titles:
            continue
        num_correct += 1.0
        precision_sum += num_correct / rank

    # normalise by the number of expected titles for this query
    avg_precision = precision_sum / len(expected_titles)
    map_sum += avg_precision

    # report for this query
    print(f"Expected titles: {expected_titles}")
    print(f"Search titles: {search_titles}")
    print(f"Average precision: {avg_precision}\n")

# Mean of the per-query average precision values
num_queries = len(test_queries)
avg_map = map_sum / num_queries
print(f"Average MAP: {avg_map}")


Test query 1: Narendra Modi
Expected titles: ['Narendra Modi', 'PM Narendra Modi']
Search titles: ['PM Narendra Modi', 'Jashodaben Modi', 'Narendra Modi', 'Narendra Modi Stadium', 'Premiership of Narendra Modi']
Average precision: 0.8333333333333333

Test query 2: Indraprastha Institute of Information Technology
Expected titles: ['Indraprastha Institute of Information Technology, Delhi', 'Higher Education']
Search titles: ['Indraprastha Institute of Information Technology, Delhi', 'Guru Gobind Singh Indraprastha University', 'Education in Delhi', 'List of colleges affiliated with Guru Gobind Singh Indraprastha University', 'List of institutions of higher education in Delhi']
Average precision: 0.5

Average MAP: 0.6666666666666666
