In [279]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pandas as pd
import numpy as np


# Initialize tokenizer and model for DistilBERT
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
model = TFAutoModel.from_pretrained("distilbert-base-multilingual-cased")

def compute_embedding(text):
    encoded_input = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=512)
    outputs = model(**encoded_input)
    embeddings = tf.reduce_mean(outputs.last_hidden_state, axis=1)
    return embeddings.numpy()

# Load a subset of the wikipedia dataset (assuming structure and availability)
dataset = load_dataset("Cohere/wikipedia-22-12-en-embeddings",split="train", streaming=True)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Resolving data files:   0%|          | 0/253 [00:00<?, ?it/s]

In [309]:

#========Exercise 3.1 =========== 
# Fill in the following code
# ===============================
def find_most_relevant_article(query_embedding, dataset,max_num_of_articles=None):
    data_list = []
    if max_num_of_articles:
        # print(f"using {max_num_of_articles} articles")
        for i, example in enumerate(dataset):
            if i >= max_num_of_articles:
                break
            data_list.append(example)
        print(f"using {i} articles")

        # Convert to DataFrame
    else: # get all the data fronm the iterator 
        print("using all articles")
        for example in tqdm(dataset):
            data_list.append(example)

        # Convert to DataFrame
    df = pd.DataFrame(data_list)


    min_similarity = np.inf

    most_relevant_article = ''
    
    query_embedding = np.array(query_embedding[0][:])
    query_embedding = query_embedding.reshape(1,-1)
    # print(query_embedding.shape)

    for row in df.iterrows():
        article_embedding = row[:]

        article_embedding = np.array(article_embedding[1]['emb'])
        article_embedding = article_embedding.reshape(1,-1)
        # print('using the following article:\n', row[1]['text'])
        # print(article_embedding.shape)

        similarity_vec = cosine_similarity(query_embedding, article_embedding)
        # print(similarity_vec)
        if similarity_vec < min_similarity:
            min_similarity = similarity_vec
            most_relevant_article = row[1]['text']



    return most_relevant_article, min_similarity
 

In [314]:

# Example input string
input_text = "Russia and Ukraine"

# # Compute the embedding for the input text
input_embedding = compute_embedding(input_text)

# print(input_embedding)

# Find the most relevant article
# To reduce the runtime, look at only the first N articles
article, similarity = find_most_relevant_article(input_embedding, dataset,max_num_of_articles=2000)
print("Most Relevant Article:", article)
print("Similarity Score:", similarity)

using 2000 articles
Most Relevant Article: Cleopatra was depicted in various ancient works of art, in the Egyptian as well as Hellenistic-Greek and Roman styles. Surviving works include statues, busts, reliefs, and minted coins, as well as ancient carved cameos, such as one depicting Cleopatra and Antony in Hellenistic style, now in the Altes Museum, Berlin. Contemporary images of Cleopatra were produced both in and outside of Ptolemaic Egypt. For instance, a large gilded bronze statue of Cleopatra once existed inside the Temple of Venus Genetrix in Rome, the first time that a living person had their statue placed next to that of a deity in a Roman temple. It was erected there by Caesar and remained in the temple at least until the 3rd century AD, its preservation perhaps owing to Caesar's patronage, although Augustus did not remove or destroy artworks in Alexandria depicting Cleopatra.
Similarity Score: [[-0.06628241]]


In [311]:
input = ['Leonardo DiCaprio',
         'France',
         'Python',
         'Deep Learning']

for i in input:
    input_embedding = compute_embedding(i)
    article, similarity = find_most_relevant_article(input_embedding, dataset,max_num_of_articles=1000)
    print("Input embdding of:", i)
    print("Most Relevant Article:", article)
    print("Similarity Score:", similarity)
    print("\n")


using 1000 articles
Input embdding of: Leonardo DiCaprio
Most Relevant Article: On 3 March 2013, Elizabeth stayed overnight at King Edward VII's Hospital as a precaution after developing symptoms of gastroenteritis. A week later, she signed the new Charter of the Commonwealth. Because of her age and the need for her to limit travelling, in 2013 she chose not to attend the biennial Commonwealth Heads of Government Meeting for the first time in 40 years. She was represented at the summit in Sri Lanka by Prince Charles. On 20 April 2018, the Commonwealth heads of government announced that she would be succeeded by Charles as Head of the Commonwealth, which she stated was her "sincere wish". She underwent cataract surgery in May 2018. In March 2019, she gave up driving on public roads, largely as a consequence of a car crash involving her husband two months earlier.
Similarity Score: [[-0.0569665]]


using 1000 articles
Input embdding of: France
Most Relevant Article: The final was played 

In [240]:
data_list = []
for i, example in enumerate(dataset):
    if i >= 2000:
        break
    data_list.append(example)


df = pd.DataFrame(data_list)


In [241]:
df.tail(400)

Unnamed: 0,id,title,text,url,wiki_id,views,paragraph_id,langs,emb
1600,1600,Facebook,"In 2018, Special Counsel Robert Mueller indict...",https://en.wikipedia.org/wiki?curid=7529378,7529378,4679.586914,170,194,"[-0.02236877754330635, -0.021932709962129593, ..."
1601,1601,Facebook,Mueller contacted Facebook subsequently to the...,https://en.wikipedia.org/wiki?curid=7529378,7529378,4679.586914,171,194,"[-0.024141181260347366, -0.10890068113803864, ..."
1602,1602,Facebook,The company pledged full cooperation in Muelle...,https://en.wikipedia.org/wiki?curid=7529378,7529378,4679.586914,172,194,"[-0.26512399315834045, -0.18133266270160675, -..."
1603,1603,Facebook,"Russian-American billionaire Yuri Milner, who ...",https://en.wikipedia.org/wiki?curid=7529378,7529378,4679.586914,173,194,"[-0.07663505524396896, 0.16205301880836487, -0..."
1604,1604,Facebook,"In January 2019, Facebook removed 289 pages an...",https://en.wikipedia.org/wiki?curid=7529378,7529378,4679.586914,174,194,"[-0.18163779377937317, -0.029511980712413788, ..."
...,...,...,...,...,...,...,...,...,...
1995,1995,Russo-Ukrainian War,"In early September 2014, Russian state-owned t...",https://en.wikipedia.org/wiki?curid=42085878,42085878,4608.533691,29,86,"[0.42205536365509033, -0.30303043127059937, 0...."
1996,1996,Russo-Ukrainian War,"On 3 September, Poroshenko said he and Putin h...",https://en.wikipedia.org/wiki?curid=42085878,42085878,4608.533691,30,86,"[0.5118040442466736, 0.07615064084529877, 0.54..."
1997,1997,Russo-Ukrainian War,"On 5 September 2014, the Minsk Protocol ceasef...",https://en.wikipedia.org/wiki?curid=42085878,42085878,4608.533691,31,86,"[0.4486839473247528, 0.22937458753585815, 0.10..."
1998,1998,Russo-Ukrainian War,"On 7 and 12 November, NATO officials reconfirm...",https://en.wikipedia.org/wiki?curid=42085878,42085878,4608.533691,32,86,"[0.31511324644088745, -0.11626601964235306, 0...."
