In [61]:
import random
import wikipedia
from transformers import pipeline
import numpy as np

european_cities = [
    "Porto",
    "Krakow",
    "Edinburgh",
    "Dubrovnik",
    "Reykjavik",
    "Bruges",
    "Salzburg",
    "Tallinn",
    "Seville",
    "Bologna",
    "Ljubljana",
    "Zurich",
    "Copenhagen",
    "Bratislava",
    "Gothenburg",
    "Marseille",
    "Bucharest",
    "Cologne",
    "Riga",
    "Thessaloniki"
]

In [68]:
random_cities =random.sample(european_cities, 3)
results = wikipedia.search(random_cities[0])
wiki_pagename1 = results[0]
page_content1 = wikipedia.summary(wiki_pagename1)
page_content1

"Edinburgh (  ED-in-bər-ə, Scots: [ˈɛdɪnbʌrə]; Scottish Gaelic: Dùn Èideann [t̪un ˈeːtʲən̪ˠ]) is the capital city of Scotland and one of its 32 council areas. The city is located in southeast Scotland and is bounded to the north by the Firth of Forth estuary and to the south by the Pentland Hills. Edinburgh had a population of\n506,520 in mid-2020, making it the second-most populous city in Scotland and the seventh-most populous in the United Kingdom. The wider metropolitan area has a population of 912,490.\nRecognised as the capital of Scotland since at least the 15th century, Edinburgh is the seat of the Scottish Government, the Scottish Parliament, the highest courts in Scotland, and the Palace of Holyroodhouse, the official residence of the British monarch in Scotland. It is also the annual venue of the General Assembly of the Church of Scotland. The city has long been a centre of education, particularly in the fields of medicine, Scottish law, literature, philosophy, the sciences 

In [69]:
random_cities = random.sample(european_cities, 3)
results = wikipedia.search(random_cities[0])
wiki_pagename2 = results[0]
page_content2 = wikipedia.summary(wiki_pagename2)
page_content2

"Copenhagen (Danish: København [kʰøpm̩ˈhɑwˀn] ) is the capital and most populous city of Denmark, with a population of 1.4 million in the urban area. The city is situated on the islands of Zealand and Amager, separated from Malmö, Sweden, by the Øresund strait. The Øresund Bridge connects the two cities by rail and road.\nOriginally a Viking fishing village established in the 10th century in the vicinity of what is now Gammel Strand, Copenhagen became the capital of Denmark in the early 15th century. During the 16th century, the city served as the de facto capital of the Kalmar Union and the seat of the Union's monarchy, which governed most of the modern-day Nordic region as part of a Danish confederation with Sweden and Norway. The city flourished as the cultural and economic centre of Scandinavia during the Renaissance, and by the 17th century, it had become a regional centre of power, serving as the heart of the Danish government and military. During the 18th century, Copenhagen suf

In [70]:
random_cities =random.sample(european_cities, 3)
results = wikipedia.search(random_cities[0])
wiki_pagename3 = results[0]
page_content3 = wikipedia.summary(wiki_pagename3)
page_content3

"Gothenburg ( ; abbreviated Gbg; Swedish: Göteborg [jœtɛˈbɔrj] ) is the capital of Västra Götaland County in Sweden. It is the second-largest city in Sweden, after the capital Stockholm, and the fifth-largest in the Nordic countries. It is situated by the Kattegat on the west coast of Sweden, with a population of approximately 600,000 in the city proper and about 1.1 million inhabitants in the metropolitan area.\nKing Gustavus Adolphus founded Gothenburg by royal charter in 1621 as a heavily fortified, primarily Dutch, trading colony. In addition to the generous privileges given to his Dutch allies during the ongoing Thirty Years' War, e.g. tax relaxation, he also attracted significant numbers of his German and Scottish allies to populate his only town on the western coast; this trading status was furthered by the founding of the Swedish East India Company. At a key strategic location at the mouth of the Göta älv, where Scandinavia's largest drainage basin enters the sea, the Port of G

## Embeddings

In [72]:
# Import the required libraries from Hugging Face Transformers
from transformers import AutoTokenizer, AutoModel

# There are a lot of BERT based models available on HuggingFace,
# and you have to pick one that is suitable for you.
BERT_Model = "bert-base-uncased"

# Initialise the BERT Transformer model
tokenizer = AutoTokenizer.from_pretrained(BERT_Model)
model = AutoModel.from_pretrained(BERT_Model)

# Function to compute the sentence embedding using BERT
def sent_embedding(sent):
    
    # Tokenize the sentence
    # This basically converts the sentence into a sequence of tokens
    # Each token is either a complete word or a sub-word
    tokens = tokenizer.encode_plus(sent, max_length=128, truncation=True,
                                    padding='max_length', return_tensors='pt')
    
    # Now feed the tokens into the model and get the embeddings as the output
    outputs = model(**tokens)

    # Create an empty list to store two different kinds of embeddings
    embedding_list = []

    # last_hidden_state contains the output at the last hidden layer of all the sentence tokens
    # pooler_output contains the embedding corresponding to only the [CLS] token, which in a way represents the whole sentence. 
    # This pooler_output is, however, different from the embeddings corresponding to the 1st token of last_hidden_state
    # Although both represent the CLS token, the pooler_output is after some more processing, 
    # and more suitable for use in sentence classification tasks.

    # This stores the embedding corresponding to the CLS token
    embedding_list.append(outputs.last_hidden_state[0][0].detach().numpy().reshape(1,-1))

    # This stores the embedding corresponding to the pooler_output
    embedding_list.append(outputs.pooler_output.detach().numpy())

    return embedding_list

sent1 = page_content1
sent2 = page_content2
sent3 = page_content3

from sklearn.metrics.pairwise import cosine_similarity

# Sentence similarity using CLS token embedding
print(f"{wiki_pagename1} vs {wiki_pagename2}: {cosine_similarity(sent_embedding(sent1)[0],sent_embedding(sent2)[0])}")

# Sentence similarity using pooler_output 
print(f"{wiki_pagename1} vs {wiki_pagename2}: {cosine_similarity(sent_embedding(sent1)[1],sent_embedding(sent2)[1])}")


# Sentence similarity using CLS token embedding
print(f"{wiki_pagename1} vs {wiki_pagename3}: {cosine_similarity(sent_embedding(sent1)[0],sent_embedding(sent3)[0])}")

# Sentence similarity using pooler_output 
print(f"{wiki_pagename1} vs {wiki_pagename3}: {cosine_similarity(sent_embedding(sent3)[1],sent_embedding(sent3)[1])}")

# Sentence similarity using CLS token embedding
print(f"{wiki_pagename2} vs {wiki_pagename3}: {cosine_similarity(sent_embedding(sent2)[0],sent_embedding(sent3)[0])}")

# Sentence similarity using pooler_output 
print(f"{wiki_pagename2} vs {wiki_pagename3}: {cosine_similarity(sent_embedding(sent2)[1],sent_embedding(sent3)[1])}")



Edinburgh vs Copenhagen: [[0.6982373]]
Edinburgh vs Copenhagen: [[-0.22247049]]
Edinburgh vs Gothenburg: [[0.8446304]]
Edinburgh vs Gothenburg: [[0.9999999]]
Copenhagen vs Gothenburg: [[0.81458306]]
Copenhagen vs Gothenburg: [[0.2378582]]
