In [None]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pandas as pd
import numpy as np
from textwrap import fill



# Initialize tokenizer and model for DistilBERT.
# The checkpoint name is defined once so the tokenizer and the model are
# guaranteed to be loaded from the same pretrained weights/vocabulary.
MODEL_NAME = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = TFAutoModel.from_pretrained(MODEL_NAME)

def compute_embedding(text):
    """Embed text by mean-pooling the model's last hidden state.

    Args:
        text: Input passed straight to the module-level ``tokenizer``
            (presumably a string or a list of strings -- the tokenizer decides
            what is accepted). Inputs are truncated to 512 tokens.

    Returns:
        numpy.ndarray: the pooled embeddings, one row per input sequence
        (the sequence axis, axis 1, is averaged away).
    """
    encoded_input = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=512)
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

    # Weight every position by the attention mask so that padding positions
    # (added by ``padding=True`` when a batch has texts of different lengths)
    # do not dilute the average. For a single text the mask is all ones and
    # this reduces to a plain mean over the sequence axis.
    mask = tf.cast(encoded_input["attention_mask"], hidden_states.dtype)
    mask = tf.expand_dims(mask, axis=-1)
    summed = tf.reduce_sum(hidden_states * mask, axis=1)
    # Guard against division by zero for a (theoretical) all-padding row.
    token_counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
    embeddings = summed / token_counts
    return embeddings.numpy()

# Open the "train" split of the Cohere Wikipedia embeddings dataset in
# streaming mode (records are iterated rather than loaded up front).
# The code below reads the 'emb' and 'text' fields of each record; the rest of
# the record schema is not checked here -- TODO confirm against the dataset card.
dataset = load_dataset("Cohere/wikipedia-22-12-en-embeddings",split="train", streaming=True)
# Second name bound to the same dataset object (an alias, not a copy).
d0 = dataset

In [88]:

#========Exercise 3.1 =========== 
# Fill in the following code
# ===============================

def make_dataset(dataset, max_num_of_articles=None):
    """Collect records from an iterable dataset into a pandas DataFrame.

    Args:
        dataset: Any iterable yielding one record per article (e.g. a dict).
        max_num_of_articles: Upper bound on the number of records to read.
            ``None`` -- or any other falsy value such as ``0`` -- means
            "read the iterable to exhaustion".

    Returns:
        pd.DataFrame: one row per record read, in iteration order. The frame
        is empty when the iterable yields nothing.
    """
    data_list = []
    if max_num_of_articles:
        for i, example in enumerate(dataset):
            if i >= max_num_of_articles:
                break
            data_list.append(example)
        # Report the number of records actually collected. Using the loop
        # index here would raise NameError on an empty iterable and be off by
        # one whenever the source runs out before the limit is reached.
        print(f"using {len(data_list)} articles")
    else:  # no limit given: drain the whole iterator
        print("using all articles")
        for example in tqdm(dataset):
            data_list.append(example)

    # Build the frame once from the list of records.
    return pd.DataFrame(data_list)

def find_most_relevant_article(query_embedding, df, max_num_of_articles=None):
    """Return the article whose embedding is most cosine-similar to the query.

    Args:
        query_embedding: Query vector. Either a 2-D array-like, in which case
            only its first row is used, or a flat 1-D vector.
        df: DataFrame with an 'emb' column (one embedding per row, all of the
            same length as the query) and a 'text' column.
        max_num_of_articles: Currently ignored -- every row of ``df`` is
            searched. Kept in the signature so existing calls keep working.

    Returns:
        tuple: ``(text, similarity)`` where ``text`` is the 'text' value of the
        best-matching row and ``similarity`` is a ``(1, 1)`` numpy array holding
        its cosine similarity. For an empty ``df`` the result is ``('', -1)``.
        On ties the first row with the maximal similarity wins.
    """
    # Normalise the query to shape (1, d): take the first row of a 2-D input,
    # or use a flat vector as is.
    query_vector = np.atleast_2d(np.asarray(query_embedding))[0].reshape(1, -1)

    if len(df) == 0:
        return '', -1

    # Stack all article embeddings into one (n, d) matrix and score them in a
    # single call instead of building a Series per row with df.iloc.
    article_matrix = np.asarray(df['emb'].tolist()).reshape(len(df), -1)
    similarities = cosine_similarity(query_vector, article_matrix)  # shape (1, n)

    best_row = int(np.argmax(similarities[0]))
    # Slice (rather than index) to keep the (1, 1) shape callers already receive.
    max_similarity = similarities[:, best_row:best_row + 1]
    most_relevant_article = df['text'].iloc[best_row]

    return most_relevant_article, max_similarity


# Number of articles to read from the dataset; passed to make_dataset below
# as its max_num_of_articles argument.
NUM = 10000


In [89]:
# Read the first NUM articles from the dataset into a DataFrame.
df = make_dataset(dataset,NUM)
# Query strings to look up.
# NOTE(review): this name shadows Python's builtin input() for the rest of the
# session; it is left unchanged here because other cells may refer to it.
input = ['Leonardo DiCaprio',
         'France',
         'Python',
         'Deep Learning']

# For each query: embed it with DistilBERT, find the closest article in df by
# cosine similarity, and print the article text and the score.
for i in input:
    input_embedding = compute_embedding(i)
    # NOTE(review): max_num_of_articles=1000 has no effect -- the function as
    # defined above does not read that parameter, so all rows of df are searched.
    article, similarity =find_most_relevant_article(input_embedding, df,max_num_of_articles=1000)
    print("Input embdding of:", i)
    # textwrap.fill re-flows whitespace by default, so the "\n" in the prefix
    # does not produce a line break in the printed output.
    print(fill("Most Relevant Article: \n "+ article,width=100))
    # similarity is printed as a nested array (e.g. [[0.07...]]).
    print("Similarity Score:", similarity)
    print("\n")


using 10000 articles
Input embdding of: Leonardo DiCaprio
Most Relevant Article:   Sessions for West's sixth solo effort begin to take shape in early 2013 in
his own personal loft's living room at a Paris hotel. Determined to "undermine the commercial", he
once again brought together close collaborators and attempted to incorporate Chicago drill,
dancehall, acid house, and industrial music. Primarily inspired by architecture, West's
perfectionist tendencies led him to contact producer Rick Rubin fifteen days shy of its due date to
strip down the record's sound in favor of a more minimalist approach. Initial promotion of his sixth
album included worldwide video projections of the album's music and live television performances.
"Yeezus", West's sixth album, was released June 18, 2013, to rave reviews from critics. It became
his sixth consecutive number one debut, but also marked his lowest solo opening week sales.
Similarity Score: [[0.07458331]]


Input embdding of: France
Most Relevant