In [11]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pandas as pd
import numpy as np
from textwrap import fill



# Load the DistilBERT tokenizer and its TensorFlow encoder from one shared
# pretrained checkpoint name.
checkpoint_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = TFAutoModel.from_pretrained(checkpoint_name)

def compute_embedding(text):
    """Embed text by mean-pooling the model's last hidden state.

    Args:
        text: A string, or a list of strings, handed to the module-level
            ``tokenizer``. Inputs are truncated to 512 tokens and padded to
            the longest item in the batch.

    Returns:
        A NumPy array with one row per input text: the average of that
        text's token vectors taken from ``model``'s last hidden state.
    """
    encoded_input = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=512)
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

    # Average over real tokens only. A plain mean over axis 1 would also count
    # padding positions, diluting the embedding of every text shorter than the
    # longest one in the batch. For a single string the mask is all ones, so
    # the result equals the plain mean.
    token_mask = tf.cast(encoded_input["attention_mask"], hidden_states.dtype)
    token_mask = tf.expand_dims(token_mask, axis=-1)
    summed = tf.reduce_sum(hidden_states * token_mask, axis=1)
    # Guard against a zero count so the division can never produce NaN.
    token_counts = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)
    embeddings = summed / token_counts
    return embeddings.numpy()

# Open the "train" split of the Cohere Wikipedia embeddings corpus in
# streaming mode; ``d0`` is a second name bound to the same dataset object.
dataset = load_dataset(
    "Cohere/wikipedia-22-12-en-embeddings",
    split="train",
    streaming=True,
)
d0 = dataset

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Resolving data files:   0%|          | 0/253 [00:00<?, ?it/s]

In [12]:
def compute_embedding_for_df(df):
    """Attach an ``'embedding'`` column computed from each row's ``'text'``.

    ``compute_embedding`` is called once per row, in row order, and a progress
    line is printed on every tenth row. The column is assigned on ``df``
    itself, and that same frame is returned.
    """
    total_rows = len(df)
    # Pre-sized holder, filled position by position below.
    vectors = [None] * total_rows
    for position in range(total_rows):
        vectors[position] = compute_embedding(df['text'].iloc[position])
        if position % 10 == 0:
            print(f"computed {position} embeddings")
    # One column assignment at the end instead of writing cell by cell.
    df['embedding'] = vectors
    return df

In [13]:

#========Exercise 3.1 =========== 
# Fill in the following code
# ===============================

def make_dataset(dataset, max_num_of_articles=None):
    """Collect examples from an iterable dataset into a DataFrame.

    Args:
        dataset: Any iterable of examples (for instance a streaming dataset);
            each example becomes one row of the result.
        max_num_of_articles: Upper bound on how many examples are read.
            ``None`` (the default) reads the whole dataset. ``0`` or a
            negative number yields an empty DataFrame.

    Returns:
        A ``pandas.DataFrame`` built from the collected examples.
    """
    # Local import so the function does not depend on a separate import cell.
    from itertools import islice

    # Compare against None explicitly: a limit of 0 must not fall through to
    # the "read everything" branch.
    if max_num_of_articles is not None:
        limit = max(max_num_of_articles, 0)
        # islice stops after `limit` items without pulling an extra example
        # from the stream, and copes with datasets shorter than the limit.
        data_list = list(islice(dataset, limit))
        # Report what was actually collected rather than the last loop index.
        print(f"using {len(data_list)} articles")
    else:  # get all the data from the iterator
        print("using all articles")
        data_list = list(tqdm(dataset))

    # Convert to DataFrame
    return pd.DataFrame(data_list)

def find_most_relevant_article(query_embedding, df, max_num_of_articles=None):
    """Return the article text whose embedding is most similar to the query.

    Args:
        query_embedding: Array-like whose first row is the query vector, e.g.
            the output of ``compute_embedding`` for a single string.
        df: DataFrame with a ``'text'`` column and an ``'embedding'`` column.
            Each stored embedding may be a flat vector or a ``(1, dim)`` array.
        max_num_of_articles: Unused; accepted so existing calls keep working.

    Returns:
        A tuple ``(most_relevant_article, max_similarity)``. ``max_similarity``
        is a ``(1, 1)`` array holding the best cosine similarity. When ``df``
        has no rows the result is ``('', -1)``.
    """
    if len(df) == 0:
        return '', -1

    query_vector = np.asarray(query_embedding[0]).reshape(1, -1)

    # Stack every article vector into one (n_articles, dim) matrix. The
    # reshape accepts both flat vectors and (1, dim) arrays.
    article_matrix = np.vstack(
        [np.asarray(vector).reshape(1, -1) for vector in df['embedding']]
    )

    # One vectorised call instead of one sklearn call per row.
    similarities = cosine_similarity(query_vector, article_matrix)

    # argmax returns the first maximum, so ties go to the earliest row.
    best_row = int(np.argmax(similarities[0]))
    most_relevant_article = df['text'].iloc[best_row]
    # Slice rather than index so the score stays a (1, 1) array.
    max_similarity = similarities[:, best_row:best_row + 1]

    return most_relevant_article, max_similarity


# Article limit passed to make_dataset() by the cell that builds `df`.
NUM = 1000


In [14]:
# Build the article table, then embed every article (one model call per row).
df = make_dataset(dataset,NUM)
df = compute_embedding_for_df(df)

# Named `queries` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
queries = ['Leonardo DiCaprio',
           'France',
           'Python',
           'Deep Learning']

for query in queries:
    input_embedding = compute_embedding(query)
    # Reuse NUM instead of repeating the literal article count.
    article, similarity = find_most_relevant_article(input_embedding, df, max_num_of_articles=NUM)
    print("Input embdding of:", query)
    print(fill("Most Relevant Article: \n "+ article,width=100))
    print("Similarity Score:", similarity)
    print("\n")


using 1000 articles


KeyboardInterrupt: 

In [None]:
# Look up the most relevant article for one query string.
s = 'France'
input_embedding = compute_embedding(s)
article, similarity = find_most_relevant_article(
    input_embedding,
    df,
    max_num_of_articles=1000,
)

In [None]:
# Wrap the article text to 100 columns, then show it with its score.
wrapped_article = fill("Most Relevant Article: \n " + article, width=100)
print(wrapped_article)
print("Similarity Score:", similarity)
print("\n")


In [None]:
# Print the text of the first three rows of the article table.
for row in range(3):
    current_row = df.iloc[row]
    print(current_row['text'])

In [None]:
# Recompute the embedding for the second row and store it back in `df`.
# `df.iloc[1]['embedding'] = ...` would assign into a temporary row Series and
# leave `df` unchanged, so write to the cell directly with a single .at access.
df.at[df.index[1], 'embedding'] = compute_embedding(df.iloc[1]['text'])