In [None]:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pandas as pd
import numpy as np


# Initialize tokenizer and model for DistilBERT
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
model = TFAutoModel.from_pretrained("distilbert-base-multilingual-cased")

def compute_embedding(text):
    encoded_input = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=512)
    outputs = model(**encoded_input)
    embeddings = tf.reduce_mean(outputs.last_hidden_state, axis=1)
    return embeddings.numpy()

# Load a subset of the wikipedia dataset (assuming structure and availability)
dataset = load_dataset("Cohere/wikipedia-22-12-en-embeddings",split="train", streaming=True)


In [227]:

#========Exercise 3.1 =========== 
# Fill in the following code
# ===============================
def find_most_relevant_article(query_embedding, dataset,max_num_of_articles=None):
    data_list = []
    if max_num_of_articles:
        # print(f"using {max_num_of_articles} articles")
        for i, example in enumerate(dataset):
            if i >= max_num_of_articles:
                break
            data_list.append(example)
        print(f"using {i} articles")

        # Convert to DataFrame
    else: # get all the data fronm the iterator 
        print("using all articles")
        for example in tqdm(dataset):
            data_list.append(example)

        # Convert to DataFrame
    df = pd.DataFrame(data_list)


    min_similarity = np.inf

    most_relevant_article = ''
    for row in df.iterrows():
        
        query_embedding = np.array(query_embedding[0][:])
        r = row[:]

        r=np.array(r[1]['emb'])
        r=r.reshape(-1,1)
        query_embedding = query_embedding.reshape(-1,1)
        # print('using the following article:\n', row[1]['text'])
        # print(query_embedding.shape)
        # print(r.shape)

        similarity_vec = cosine_similarity(query_embedding, r)
        if np.linalg.norm(similarity_vec) < min_similarity:
            min_similarity = np.linalg.norm(similarity_vec)
            most_relevant_article = row[1]['text']



    return most_relevant_article, min_similarity
 

In [228]:

# Example input string
input_text = "Women's rights"

# # Compute the embedding for the input text
input_embedding = compute_embedding(input_text)

# print(input_embedding)

# Find the most relevant article
# To reduce the runtime, look at only the first N articles
article, similarity = find_most_relevant_article(input_embedding, dataset,max_num_of_articles=1000)
print("Most Relevant Article:", article)
print("Similarity Score:", similarity)

using 1000 articles
Most Relevant Article: YouTube is a global online video sharing and social media platform headquartered in San Bruno, California. It was launched on February 14, 2005, by Steve Chen, Chad Hurley, and Jawed Karim. It is owned by Google, and is the second most visited website, after Google Search. YouTube has more than 2.5 billion monthly users who collectively watch more than one billion hours of videos each day. , videos were being uploaded at a rate of more than 500 hours of content per minute.
Similarity Score: 27.712812921102035


In [223]:
input = ['Leonardo DiCaprio',
         'France',
         'Python',
         'Deep Learning']

for i in input:
    input_embedding = compute_embedding(i)
    article, similarity = find_most_relevant_article(input_embedding, dataset,max_num_of_articles=1000)
    print("Input embdding of:", i)
    print("Most Relevant Article:", article)
    print("Similarity Score:", similarity)
    print("\n")


Input embdding of: Leonardo DiCaprio
Most Relevant Article: YouTube is a global online video sharing and social media platform headquartered in San Bruno, California. It was launched on February 14, 2005, by Steve Chen, Chad Hurley, and Jawed Karim. It is owned by Google, and is the second most visited website, after Google Search. YouTube has more than 2.5 billion monthly users who collectively watch more than one billion hours of videos each day. , videos were being uploaded at a rate of more than 500 hours of content per minute.
Similarity Score: 27.712812921102035


Input embdding of: France
Most Relevant Article: YouTube is a global online video sharing and social media platform headquartered in San Bruno, California. It was launched on February 14, 2005, by Steve Chen, Chad Hurley, and Jawed Karim. It is owned by Google, and is the second most visited website, after Google Search. YouTube has more than 2.5 billion monthly users who collectively watch more than one billion hours o

In [None]:
# data_list = []
# for i, example in enumerate(dataset):
#     if i >= 50:
#         break
#     data_list.append(example)


# df = pd.DataFrame(data_list)


In [225]:
# df.head(100)

Unnamed: 0,id,title,text,url,wiki_id,views,paragraph_id,langs,emb
0,0,Deaths in 2022,The following notable deaths occurred in 2022....,https://en.wikipedia.org/wiki?curid=69407798,69407798,5674.449219,0,38,"[0.2865696847438812, -0.03181683272123337, 0.0..."
1,1,YouTube,YouTube is a global online video sharing and s...,https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,0,184,"[-0.09689381718635559, 0.1619211882352829, -0...."
2,2,YouTube,"In October 2006, YouTube was bought by Google ...",https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,1,184,"[0.1302049309015274, 0.265736848115921, 0.4018..."
3,3,YouTube,"Since its purchase by Google, YouTube has expa...",https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,2,184,"[-0.09791257232427597, 0.13586106896400452, -0..."
4,4,YouTube,YouTube has had an unprecedented social impact...,https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,3,184,"[-0.2641527056694031, 0.06968216598033905, -0...."
5,5,YouTube,"YouTube was founded by Steve Chen, Chad Hurley...",https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,4,184,"[0.24761267006397247, 0.513164758682251, -0.08..."
6,6,YouTube,According to a story that has often been repea...,https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,5,184,"[0.10319796204566956, 0.3075602352619171, -0.3..."
7,7,YouTube,Karim said the inspiration for YouTube first c...,https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,6,184,"[0.12825779616832733, 0.7862905859947205, -0.0..."
8,8,YouTube,YouTube began as a venture capital–funded tech...,https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,7,184,"[0.003740631742402911, 0.49030137062072754, 0...."
9,9,YouTube,YouTube was not the first video-sharing site o...,https://en.wikipedia.org/wiki?curid=3524766,3524766,5409.561035,8,184,"[-0.28407734632492065, 0.2493772953748703, -0...."
