In [3]:
import random
import wikipedia
from transformers import pipeline
import numpy as np

european_cities = [
    "Porto",
    "Krakow",
    "Edinburgh",
    "Dubrovnik",
    "Reykjavik",
    "Bruges",
    "Salzburg",
    "Tallinn",
    "Seville",
    "Bologna",
    "Ljubljana",
    "Zurich",
    "Copenhagen",
    "Bratislava",
    "Gothenburg",
    "Marseille",
    "Bucharest",
    "Cologne",
    "Riga",
    "Thessaloniki"
]

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
random_cities =random.sample(european_cities, 3)
results = wikipedia.search(random_cities[0])
wiki_pagename1 = results[0]
page_content1 = wikipedia.summary(wiki_pagename1)
page_content1

"Ljubljana (also known by other historical names) is the capital and largest city of Slovenia, located along a trade route between the northern Adriatic Sea and the Danube region, north of the country's largest marsh, inhabited since prehistoric times. It is the country's cultural, educational, economic, political and administrative center.\nDuring antiquity, a Roman city called Emona stood in the area. The city was first mentioned in the first half of the 12th century. It was the historical capital of Carniola, one of the Slovene-inhabited parts of the Habsburg monarchy. It was under Habsburg rule from the Middle Ages until the dissolution of the Austro-Hungarian Empire in 1918. After World War II, Ljubljana became the capital of the Socialist Republic of Slovenia, part of the Socialist Federal Republic of Yugoslavia. The city retained this status until Slovenia became independent in 1991 and Ljubljana became the capital of the newly formed state."

In [5]:
random_cities = random.sample(european_cities, 3)
results = wikipedia.search(random_cities[0])
wiki_pagename2 = results[0]
page_content2 = wikipedia.summary(wiki_pagename2)
page_content2

"Zurich (German: Zürich; Alemannic German: Züri) is the largest city in Switzerland and the capital of the canton of Zurich. It is located in north-central Switzerland, at the northwestern tip of Lake Zurich. As of January 2023, the municipality had 443,037 inhabitants, the urban area 1.315 million (2009), and the Zurich metropolitan area 1.83 million (2011). Zurich is a hub for railways, roads, and air traffic. Both Zurich Airport and Zurich's main railway station are the largest and busiest in the country.\nPermanently settled for over 2,000 years, Zurich was founded by the Romans, who called it Turicum. However, early settlements have been found dating back more than 6,400 years (although this only indicates human presence in the area and not the presence of a town that early). During the Middle Ages, Zurich gained the independent and privileged status of imperial immediacy and, in 1519, became a primary centre of the Protestant Reformation in Europe under the leadership of Huldrych

In [6]:
random_cities =random.sample(european_cities, 3)
results = wikipedia.search(random_cities[0])
wiki_pagename3 = results[0]
page_content3 = wikipedia.summary(wiki_pagename3)
page_content3

'Tallinn (, Estonian: [ˈtɑlʲːinː] ) is the capital and most populous city of Estonia. Situated on a bay in north Estonia, on the shore of the Gulf of Finland of the Baltic Sea, Tallinn has a population of about 461,000 (as of 2024) and administratively lies in the Harju maakond (county). Tallinn is the main governmental, financial, industrial, and cultural centre of Estonia. It is located 187 km (116 mi) northwest of the country\'s second largest city, Tartu; however, only 80 km (50 mi) south of Helsinki, Finland, also 320 km (200 mi) west of Saint Petersburg, Russia, 300 km (190 mi) north of Riga, Latvia, and 380 km (240 mi) east of Stockholm, Sweden. From the 13th century until the first half of the 20th century, Tallinn was known in most of the world by variants of its other historical name Reval.\nTallinn received Lübeck city rights in 1248; however, the earliest evidence of human population in the area dates back nearly 5,000 years. The medieval indigenous population of what is no

## Embeddings

In [8]:
# Import the required libraries from Hugging Face Transformers
from transformers import AutoTokenizer, AutoModel

# There are a lot of BERT based models available on HuggingFace,
# and you have to pick one that is suitable for you.
BERT_Model = "bert-base-uncased"

# Initialise the BERT Transformer model
tokenizer = AutoTokenizer.from_pretrained(BERT_Model)
model = AutoModel.from_pretrained(BERT_Model)

# Function to compute the sentence embedding using BERT
def sent_embedding(sent):
    
    # Tokenize the sentence
    # This basically converts the sentence into a sequence of tokens
    # Each token is either a complete word or a sub-word
    tokens = tokenizer.encode_plus(sent, max_length=128, truncation=True,
                                    padding='max_length', return_tensors='pt')
    
    # Now feed the tokens into the model and get the embeddings as the output
    outputs = model(**tokens)

    # Create an empty list to store two different kinds of embeddings
    embedding_list = []

    # last_hidden_state contains the output at the last hidden layer of all the sentence tokens
    # pooler_output contains the embedding corresponding to only the [CLS] token, which in a way represents the whole sentence. 
    # This pooler_output is, however, different from the embeddings corresponding to the 1st token of last_hidden_state
    # Although both represent the CLS token, the pooler_output is after some more processing, 
    # and more suitable for use in sentence classification tasks.

    # This stores the embedding corresponding to the CLS token
    embedding_list.append(outputs.last_hidden_state[0][0].detach().numpy().reshape(1,-1))

    # This stores the embedding corresponding to the pooler_output
    embedding_list.append(outputs.pooler_output.detach().numpy())

    return embedding_list

sent1 = page_content1
sent2 = page_content2
sent3 = page_content3

from sklearn.metrics.pairwise import cosine_similarity

# Sentence similarity using CLS token embedding
print(f"{wiki_pagename1} vs {wiki_pagename2}: {cosine_similarity(sent_embedding(sent1)[0],sent_embedding(sent2)[0])}")

# Sentence similarity using pooler_output 
print(f"{wiki_pagename1} vs {wiki_pagename2}: {cosine_similarity(sent_embedding(sent1)[1],sent_embedding(sent2)[1])}")


# Sentence similarity using CLS token embedding
print(f"{wiki_pagename1} vs {wiki_pagename3}: {cosine_similarity(sent_embedding(sent1)[0],sent_embedding(sent3)[0])}")

# Sentence similarity using pooler_output 
print(f"{wiki_pagename1} vs {wiki_pagename3}: {cosine_similarity(sent_embedding(sent3)[1],sent_embedding(sent3)[1])}")

# Sentence similarity using CLS token embedding
print(f"{wiki_pagename2} vs {wiki_pagename3}: {cosine_similarity(sent_embedding(sent2)[0],sent_embedding(sent3)[0])}")

# Sentence similarity using pooler_output 
print(f"{wiki_pagename2} vs {wiki_pagename3}: {cosine_similarity(sent_embedding(sent2)[1],sent_embedding(sent3)[1])}")



Ljubljana vs Zurich: [[0.84674156]]
Ljubljana vs Zurich: [[0.9255438]]
Ljubljana vs Tallinn: [[0.82952684]]
Ljubljana vs Tallinn: [[1.]]
Zurich vs Tallinn: [[0.8399194]]
Zurich vs Tallinn: [[0.97567177]]
