# Lesson 2: Embeddings

Note: The numeric values of embeddings you see in your notebook may vary slightly from those filmed.

### Setup
Load the needed API keys and relevant Python libraries.

In [1]:
# !pip install cohere umap-learn altair datasets

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

In [3]:
import cohere
# Create the Cohere API client. The key is read from the COHERE_API_KEY
# environment variable; a KeyError is raised here if it is not set.
co = cohere.Client(os.environ['COHERE_API_KEY'])

In [4]:
import pandas as pd

## Word Embeddings

Consider a very small dataset of three words.

In [5]:
# A toy dataset: one 'text' column holding three words.
three_words = pd.DataFrame(
    ['joy', 'happiness', 'potato'],
    columns=['text'],
)

three_words

Unnamed: 0,text
0,joy
1,happiness
2,potato


Let's create the embeddings for the three words:
You may see an 'unknown field' warning which can be ignored.

In [6]:
# Send the three words to Cohere's embed endpoint and keep only the
# `.embeddings` attribute of the response.
three_words_emb = co.embed(
    texts=three_words['text'].tolist(),
    model='embed-english-v2.0',
).embeddings

In [7]:
# Give each of the first three embeddings its own name for convenience.
word_1, word_2, word_3 = (three_words_emb[i] for i in range(3))

In [8]:
# Peek at the first 10 values of the embedding for the first word ('joy').
word_1[:10]

[2.3203125,
 -0.18334961,
 -0.578125,
 -0.7314453,
 -2.2050781,
 -2.59375,
 0.35205078,
 -1.6220703,
 0.27954102,
 0.3083496]

## Sentence Embeddings

Consider a very small dataset of eight sentences.

In [20]:
# A toy dataset: one 'text' column holding eight short sentences.
sentences = pd.DataFrame(
    [
        'Where should we study?',
        'We can study in the park near Cafe',
        'What color is the sky?',
        'I like the colour blue',
        'Where do you stay?',
        'Monkeys live on trees',
        'Are you a doctor?',
        'I am a doctor',
    ],
    columns=['text'],
)

sentences

Unnamed: 0,text
0,Where should we study?
1,We can study in the park near Cafe
2,What color is the sky?
3,I like the colour blue
4,Where do you stay?
5,Monkeys live on trees
6,Are you a doctor?
7,I am a doctor


Let's create the embeddings for the eight sentences:

In [21]:
# Embed every sentence in the dataset and keep only the `.embeddings`
# attribute of the response.
emb = co.embed(
    texts=sentences['text'].tolist(),
    model='embed-english-v2.0',
).embeddings

# Print the first 3 values of each sentence's embedding.
for embedding in emb:
    print(embedding[:3])

[-4.3320312, -0.35595703, 1.4267578]
[-4.1445312, -0.25732422, 1.671875]
[-0.23400879, -0.9375, 0.9604492]
[-1.8203125, 0.3466797, 0.56884766]
[0.33032227, 0.22070312, -0.022949219]
[2.6621094, 0.0053710938, 0.7080078]
[0.4711914, -0.86083984, 1.1953125]
[0.50097656, -0.20715332, 1.9394531]


In [22]:
# Number of values in the first sentence's embedding vector.
len(emb[0])

4096

In [23]:
#import umap
#import altair as alt

The next code cell hides some warnings that appear when importing the UMAP library (used by the `umap_plot` helper).

In [24]:
# hide the warnings that would appear when importing the UMAP library
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
import warnings
# Silence both Numba deprecation categories, registering the filters in the
# same order as before (deprecation first, then pending-deprecation).
for _numba_category in (NumbaDeprecationWarning, NumbaPendingDeprecationWarning):
    warnings.simplefilter('ignore', category=_numba_category)

In [25]:
from utils import umap_plot

In [26]:
# Build a chart from the sentences and their embeddings with the `umap_plot`
# helper from the local `utils` module.
# NOTE(review): presumably a 2-D UMAP projection of the embeddings — confirm in utils.
chart = umap_plot(sentences, emb)

In [27]:
# Render the chart as the cell output.
# NOTE(review): `.interactive()` presumably enables pan/zoom (Altair-style) — confirm the chart type returned by umap_plot.
chart.interactive()

## Articles Embeddings

In [28]:
import pandas as pd
# Load the pre-computed article data (including an 'emb' column) from a local pickle.
# NOTE(review): unpickling can execute arbitrary code — only load 'wikipedia.pkl'
# if it comes from a trusted source.
wiki_articles = pd.read_pickle('wikipedia.pkl')
# Display the loaded table as the cell output.
wiki_articles

Unnamed: 0,id,title,text,url,wiki_id,views,paragraph_id,langs,emb
0,0,24-hour clock,The 24-hour clock is a way of telling the time...,https://simple.wikipedia.org/wiki?curid=9985,9985,2450.625488,0,30,"[0.07711287587881088, 0.3197174072265625, -0.2..."
1,14,The Dark Knight Trilogy,"The ""Dark Knight"" Series is a set of three Chr...",https://simple.wikipedia.org/wiki?curid=377304,377304,1897.982666,0,13,"[0.2566547989845276, -0.17023412883281708, 0.1..."
2,19,Abella Danger,"Abella Danger (born November 19, 1995) is an A...",https://simple.wikipedia.org/wiki?curid=797944,797944,1748.024170,0,30,"[-0.20083625614643097, -0.14190533757209778, -..."
3,24,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro (born 5 Fe...,https://simple.wikipedia.org/wiki?curid=65655,65655,1564.210449,0,156,"[-0.4638298451900482, -0.0351627953350544, 0.7..."
4,61,Carles Puigdemont,Carles Puigdemont i Casamajó (born 29 December...,https://simple.wikipedia.org/wiki?curid=540154,540154,1542.519531,0,81,"[-0.12089978903532028, 0.06191902980208397, 0...."
...,...,...,...,...,...,...,...,...,...
1995,34633,1st century,"During this period Europe, North Africa and th...",https://simple.wikipedia.org/wiki?curid=24589,24589,122.295425,0,133,"[-0.33043625950813293, -0.234648197889328, -0...."
1996,34653,Operation Restore Hope,The Operation Restore Hope was an operation of...,https://simple.wikipedia.org/wiki?curid=427200,427200,122.192032,0,12,"[0.5195494294166565, -0.06794795393943787, 0.2..."
1997,34655,Rumi,Jalal ad-Din Muhammad Rumi (30 September 1207 ...,https://simple.wikipedia.org/wiki?curid=64995,64995,122.192032,0,101,"[-0.254226416349411, 0.6597043871879578, -0.00..."
1998,34669,Korean War,"The Korean War (Korean: 한국전잴, Russian: Корейск...",https://simple.wikipedia.org/wiki?curid=7537,7537,122.175140,0,120,"[0.22879508137702942, -0.1242295652627945, -0...."


In [29]:
import numpy as np
from utils import umap_plot_big

In [30]:
# Keep only the human-readable columns; they are passed to the chart helper
# alongside the embedding vectors.
articles = wiki_articles[['title', 'text']]

# Collect the per-row embeddings from the 'emb' column into one NumPy array.
embeds = np.array(list(wiki_articles['emb']))

# Build the chart with the `umap_plot_big` helper and render it as the cell output.
chart = umap_plot_big(articles, embeds)
chart.interactive()

In [None]:
# Articles on similar topics end up near each other in the plot