In [5]:
from itertools import islice
from textwrap import fill

import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import AutoTokenizer, TFAutoModel



# DistilBERT tokenizer and encoder, both loaded from the same checkpoint name
# so the two can never point at different models.
MODEL_CHECKPOINT = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = TFAutoModel.from_pretrained(MODEL_CHECKPOINT)

def compute_embedding(text):
    """Embed text with DistilBERT by mean-pooling the final hidden states.

    Args:
        text: A string, or a list of strings, passed to the tokenizer as-is.
            Inputs longer than 512 tokens are truncated.

    Returns:
        NumPy array holding one pooled vector per input text (presumably of
        shape (batch, hidden_size) -- confirm against the model config).
    """
    encoded_input = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=512)
    outputs = model(**encoded_input)
    token_states = outputs.last_hidden_state

    # padding=True pads the shorter texts of a batch. A plain mean over the
    # token axis would average those padding positions into the embedding, so
    # weight every position by the attention mask instead. For a single text
    # the mask is all ones and this is the ordinary mean.
    mask = tf.cast(tf.expand_dims(encoded_input["attention_mask"], axis=-1), token_states.dtype)
    summed = tf.reduce_sum(token_states * mask, axis=1)
    # Guard against division by zero should a row ever have no real tokens.
    token_counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
    embeddings = summed / token_counts
    return embeddings.numpy()

# Open the train split of the Cohere Wikipedia embeddings corpus in streaming
# mode. No subset is selected here; callers decide how many records to take
# when they iterate over the stream.
dataset = load_dataset(
    "Cohere/wikipedia-22-12-en-embeddings",
    split="train",
    streaming=True,
)
d0 = dataset  # second name bound to the same dataset object

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Resolving data files:   0%|          | 0/253 [00:00<?, ?it/s]

In [76]:

#========Exercise 3.1 =========== 
# Fill in the following code
# ===============================

def make_dataset(dataset, max_num_of_articles=None):
    """Collect records from an iterable dataset into a pandas DataFrame.

    Args:
        dataset: Any iterable of records (e.g. a streaming dataset yielding
            dicts).
        max_num_of_articles: Maximum number of records to take from the start
            of ``dataset``. ``None`` (the default) consumes the whole
            iterable. ``0`` or a negative value yields an empty DataFrame.

    Returns:
        A DataFrame with one row per collected record.
    """
    if max_num_of_articles is not None:
        # islice stops right after the last wanted record, so no extra record
        # is pulled from a (possibly remote) stream just to detect the end.
        limit = max(max_num_of_articles, 0)
        data_list = list(islice(dataset, limit))
        # Report what was actually collected: the stream may hold fewer
        # records than requested, or none at all.
        print(f"using {len(data_list)} articles")
    else:  # get all the data from the iterator
        print("using all articles")
        data_list = list(tqdm(dataset))

    # Convert to DataFrame
    return pd.DataFrame(data_list)

def find_most_relevant_article(query_embedding, df, max_num_of_articles=None):
    """Return the article whose embedding is most cosine-similar to the query.

    Args:
        query_embedding: 2-D array-like; only its first row is used as the
            query vector.
        df: DataFrame with an 'emb' column (one embedding per row) and a
            'text' column (the article text).
        max_num_of_articles: If given, only the first that many rows of ``df``
            are searched. ``None`` (the default) searches every row.

    Returns:
        Tuple ``(text, similarity)``. ``similarity`` is the (1, 1) array
        sliced from the ``cosine_similarity`` result for the winning row. If
        there are no rows to search the result is ``('', -1)``.
    """
    candidates = df
    if max_num_of_articles is not None:
        # Positional slice so the cap works regardless of the index labels.
        candidates = df.iloc[:max(max_num_of_articles, 0)]

    if len(candidates) == 0:
        return '', -1

    query_vector = np.asarray(query_embedding[0]).reshape(1, -1)

    # Stack all article embeddings into one (n_articles, dim) matrix and score
    # them in a single call instead of one call per DataFrame row.
    article_matrix = np.asarray(candidates['emb'].tolist())
    similarities = cosine_similarity(query_vector, article_matrix)  # (1, n_articles)

    # argmax returns the first maximum, matching a strict ">" scan from the top.
    best = int(np.argmax(similarities[0]))
    most_relevant_article = candidates['text'].iloc[best]
    max_similarity = similarities[:, best:best + 1]
    return most_relevant_article, max_similarity

# def make_dataset_and_find_most_relevant_article(query_embedding, dataset,max_num_of_articles=None):
#     df = make_dataset(dataset,max_num_of_articles)
#     most_relevant_article, similarity_amount = find_most_relevant_article(query_embedding, df)
#     return most_relevant_article, similarity_amount

# Number of articles to take from the stream when building the DataFrame.
NUM = 20_000
# df = make_dataset(dataset,max_num_of_articles=NUM)


In [7]:
# Collect the first NUM articles of the stream into a DataFrame.
df = make_dataset(dataset, max_num_of_articles=NUM)

# Sample query and its embedding.
input_text = "Elon Musk"
input_embedding = compute_embedding(input_text)



'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 007ff12f-c119-4916-8852-c2348ec0c5fb)')' thrown while requesting GET https://huggingface.co/datasets/Cohere/wikipedia-22-12-en-embeddings/resolve/85c2eca83d4b9dcecc043c23748cb8c1047f683f/data/train-00000-of-00253-8d3dffb4e6ef0304.parquet
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 5f0b46c7-7932-4d5a-8fc7-4798436c0999)')' thrown while requesting GET https://huggingface.co/datasets/Cohere/wikipedia-22-12-en-embeddings/resolve/85c2eca83d4b9dcecc043c23748cb8c1047f683f/data/train-00000-of-00253-8d3dffb4e6ef0304.parquet
Retrying in 2s [Retry 2/5].


using 20000 articles


In [77]:
# Example input string
input_text = "Elon Musk"

# Compute the embedding for the input text
input_embedding = compute_embedding(input_text)

# Find the most relevant article; NUM is passed as the article cap.
# NOTE: despite its name, article_idx receives the article *text*, not an
# index. The name is kept in case other cells refer to it.
article_idx, similarity = find_most_relevant_article(input_embedding, df, max_num_of_articles=NUM)

# textwrap.fill() turns an embedded "\n" into a space, so the header is
# printed on its own line rather than concatenated into the wrapped text.
print("Most Relevant Article:")
print(fill(article_idx, width=100))
print("Similarity Score:", similarity)

Most Relevant Article:   Unlike previous "Batman" films, "The Batman" focuses on Batman's detective
skills, with Reeves describing it as an "almost-noir driven, detective version of Batman"
emphasizing the character's heart and mind. He said the film blended the detective, action, horror,
and psychological thriller genres, which he felt hewed closer to the comics than previous
adaptations had. He also felt this approach made it the most frightening "Batman" film. Reeves
looked to films and filmmakers from the New Hollywood era for inspiration, including "The French
Connection" (1971), "Klute" (1971), "Chinatown" (1974), "All the President's Men" (1976), and "Taxi
Driver" (1976), as well as the works of Alfred Hitchcock and Wong Kar-wai's short film "The Hand"
(2004). "Chinatown" and "All the President's Men" influenced "The Batman" depiction of a corrupt,
decaying Gotham, while the relationship between Donald Sutherland and Jane Fonda's characters in
"Klute" inspired the dynamic betwee

In [78]:
# Queries to try. The list is called `queries` rather than `input` so it does
# not shadow the builtin input() for the rest of the kernel session.
queries = ['Leonardo DiCaprio',
           'France',
           'Python',
           'Deep Learning']

for query in queries:
    input_embedding = compute_embedding(query)
    article, similarity = find_most_relevant_article(input_embedding, df, max_num_of_articles=1000)
    print("Input embdding of:", query)
    # textwrap.fill() turns an embedded "\n" into a space, so the header is
    # printed on its own line rather than concatenated into the wrapped text.
    print("Most Relevant Article:")
    print(fill(article, width=100))
    print("Similarity Score:", similarity)
    print("\n")


Input embdding of: Leonardo DiCaprio
Most Relevant Article:   Sessions for West's sixth solo effort begin to take shape in early 2013 in
his own personal loft's living room at a Paris hotel. Determined to "undermine the commercial", he
once again brought together close collaborators and attempted to incorporate Chicago drill,
dancehall, acid house, and industrial music. Primarily inspired by architecture, West's
perfectionist tendencies led him to contact producer Rick Rubin fifteen days shy of its due date to
strip down the record's sound in favor of a more minimalist approach. Initial promotion of his sixth
album included worldwide video projections of the album's music and live television performances.
"Yeezus", West's sixth album, was released June 18, 2013, to rave reviews from critics. It became
his sixth consecutive number one debut, but also marked his lowest solo opening week sales.
Similarity Score: [[0.07458331]]


Input embdding of: France
Most Relevant Article:   There are