<a href="https://colab.research.google.com/github/sandeepjunaghare/llm/blob/main/cohere_ex3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! pip install cohere altair umap-learn > /dev/null

In [3]:
import os
import textwrap as tr
from getpass import getpass

import altair as alt
import cohere
import numpy as np
import pandas as pd

# Never hardcode credentials in a notebook: read the Cohere API key from the
# CO_API_KEY environment variable, and fall back to a no-echo prompt when it
# is not set (e.g. in a fresh Colab session).
# NOTE: the key that used to be committed on this line should be treated as
# compromised and revoked.
co = cohere.Client(os.environ.get("CO_API_KEY") or getpass("Cohere API key: "))


In [4]:
# Load the example search terms from the Cohere notebooks repository into a
# dataframe with a single named column, then preview the first rows
df = pd.read_csv(
    filepath_or_buffer="https://github.com/cohere-ai/notebooks/raw/main/notebooks/data/hello-world-kw.csv",
    names=["search_term"],
)
df.head(5)


Unnamed: 0,search_term
0,how to print hello world in python
1,what is hello world
2,how do you write hello world in an alert box
3,how to print hello world in java
4,how to write hello world in eclipse


In [5]:
# a function that embeds a list of texts with the Cohere Embed endpoint
def embed_text(texts, model="embed-english-v2.0"):
  """
  Turn a list of texts into embeddings.

  Arguments:
    texts(list[str]): the texts to be turned into embeddings; a single
      string is treated as a one-element list
    model(str): name of the Cohere embedding model to use; defaults to the
      model this notebook was written against
  Returns:
    embedding(list): the embeddings returned by the Embed endpoint, one per
      input text (an empty list when `texts` is empty)
  """
  # A bare string would otherwise be sent as-is instead of as a batch of one
  if isinstance(texts, str):
    texts = [texts]

  # Nothing to embed: skip the API call entirely
  if len(texts) == 0:
    return []

  # embed the texts by calling the Embed endpoint
  output = co.embed(
      model=model,
      texts=texts)
  embedding = output.embeddings

  return embedding

In [6]:
# Embed every search term in one call and store each vector next to its text
df["search_term_embeds"] = embed_text(list(df["search_term"]))

# Collect the per-row vectors into a single NumPy array for the later steps
embeds = np.asarray(df["search_term_embeds"].to_list())


In [7]:
# The query we want to match against the existing search terms
new_query = "what is the history of hello world"

# embed_text works on a list, so send a batch of one and take its only result
new_query_embeds = embed_text(texts=[new_query])[0]


In [8]:

# Calculate cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

def get_similarity(target, candidates):
  """
  Compute the cosine similarity between a target embedding and a list of
  candidate embeddings.

  Arguments:
    target(list[float]): embedding of the target text
    candidates(list[list[float]]): embeddings of the other texts, or candidates
  Returns:
    sim(list[tuple]): (candidate index, similarity score) pairs sorted by
      descending similarity; an empty list when there are no candidates

  """

  # Turn list into array
  candidate_array = np.array(candidates)

  # No candidates: nothing to rank (and sklearn would reject an empty array)
  if candidate_array.size == 0:
    return []

  target = np.expand_dims(np.array(target), axis=0)

  # Calculate cosine similarity.
  # Only the two matrices are passed: the third positional parameter of
  # cosine_similarity is the boolean `dense_output` flag, not an array.
  sim = cosine_similarity(target, candidate_array)

  # The result has one row (the target) and one column per candidate. Take
  # that row instead of np.squeeze so a single candidate still yields a list
  # rather than a bare float.
  sim = sim[0].tolist()

  # Sort by descending order in similarity, keeping each candidate's index
  sim = sorted(enumerate(sim), key=lambda x: x[1], reverse=True)

  # Return similarity scores
  return sim



In [16]:
# Rank every stored search term by its similarity to the new query
similarity = get_similarity(new_query_embeds, embeds)

print("new query: ")
print(new_query, '\n')

# Show the five best-scoring search terms with their scores
print("Top faqs are", '\n')
for row_pos, sim_score in similarity[:5]:
  matched_term = df["search_term"].iloc[row_pos]
  print(f"Similarity: {sim_score:.2f};", matched_term)



new query: 
what is the history of hello world 

Top faqs are 

Similarity: 0.91; how did hello world originate
Similarity: 0.88; where did hello world come from
Similarity: 0.86; what is hello world
Similarity: 0.77; why is hello world so famous
Similarity: 0.70; why hello world


In [18]:
# reduce embeddings to 2 dimensions
import umap
# UMAP is stochastic: fix the seed so the 2-D layout (and therefore the chart
# below) is the same on every run
reducer = umap.UMAP(n_neighbors=49, random_state=42)
umap_embeds = reducer.fit_transform(embeds)

# add the 2 dimensions to the dataframe as plot coordinates
df['x'] =  umap_embeds[:,0]
df['y'] =  umap_embeds[:,1]



In [19]:
# Plot the 2-dimension embeddings on a chart.
# Both axes are purely positional, so their labels, ticks and domain line are
# hidden and the scales are not forced to include zero.
bare_axis = alt.Axis(labels=False, ticks=False, domain=False)
free_scale = alt.Scale(zero=False)

chart = (
  alt.Chart(df)
  .mark_circle(size=500)
  .encode(
    x=alt.X('x', scale=free_scale, axis=bare_axis),
    y=alt.Y('y', scale=free_scale, axis=bare_axis),
    tooltip=['search_term'],
  )
)

# Label each point with its search term, shifted to the right of the circle
text = chart.mark_text(align='left', dx=15, size=12, color='black')
text = text.encode(text='search_term', color=alt.value('black'))

# Layer circles and labels, then apply the background, size and title
layered = chart + text
result = layered.configure(background="#FDF7F0").properties(
  width=1000,
  height=700,
  title="2D Embeddings",
)

result.interactive()