In [1]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

In [None]:
!pip install cohere
import cohere
co = cohere.ClientV2('Secret Key')



**Step 0: Prepare the Dataset**

We'll work with a subset of the Airline Travel Information System (ATIS) intent classification dataset [Source]. The following cells load the dataset and reduce it to a pandas DataFrame `df` with a single column `query` containing 90 inquiries coming to airline travel inquiry systems.

In [10]:
# Read the ATIS training CSV straight from GitHub. The file has no header row,
# so the two columns are named explicitly.
df_orig = pd.read_csv(
    'https://raw.githubusercontent.com/cohere-ai/notebooks/main/notebooks/data/atis_intents_train.csv',
    header=None,
    names=['intent', 'query'],
)
df_orig


Unnamed: 0,intent,query
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...
...,...,...
4829,atis_airfare,what is the airfare for flights from denver t...
4830,atis_flight,do you have any flights from denver to baltim...
4831,atis_airline,which airlines fly into and out of denver
4832,atis_flight,does continental fly from boston to san franc...


In [11]:
# Work on a small, reproducible sample: draw 10% of the rows (fixed seed) and
# keep only the three intents listed below.
sample_classes = ['atis_airfare', 'atis_airline', 'atis_ground_service']
df = df_orig.sample(frac=0.1, random_state=30)
df = df.loc[df['intent'].isin(sample_classes)]

# Remove the sampled rows from df_orig so the two frames do not overlap.
# NOTE: df_orig is reassigned here, so re-running this cell samples from the
# already-reduced frame.
df_orig = df_orig.drop(index=df.index)

# Renumber the sample from 0 so positional and label indexing agree.
df = df.reset_index(drop=True)
df

Unnamed: 0,intent,query
0,atis_airline,which airlines fly from boston to washington ...
1,atis_airline,show me the airlines that fly between toronto...
2,atis_airfare,show me round trip first class tickets from n...
3,atis_airfare,i'd like the lowest fare from denver to pitts...
4,atis_ground_service,show me a list of ground transportation at bo...
...,...,...
86,atis_ground_service,what ground transportation is there in atlanta
87,atis_airline,can i take a single airline from la to charlo...
88,atis_airfare,what is the cost for a one way trip from pitt...
89,atis_ground_service,what ground transportation is available in ba...


In [12]:

# Detach the label column: `pop` removes 'intent' from df in place and hands
# back the removed Series, which is kept in `intents` for later use.
intents = df.pop('intent')
df

Unnamed: 0,query
0,which airlines fly from boston to washington ...
1,show me the airlines that fly between toronto...
2,show me round trip first class tickets from n...
3,i'd like the lowest fare from denver to pitts...
4,show me a list of ground transportation at bo...
...,...
86,what ground transportation is there in atlanta
87,can i take a single airline from la to charlo...
88,what is the cost for a one way trip from pitt...
89,what ground transportation is available in ba...


**Step 1: Turn Text into Embeddings**

Next, we embed each inquiry by calling Cohere’s Embed endpoint with `co.embed()`. It takes texts as input and returns embeddings as output. We supply four parameters:

- **texts:** The list of texts you want to embed
- **model:** The model to use to generate the embedding
- **input_type:** Specifies the type of document to be embedded. At the time of writing, there are four options:
  - **search_document:** For documents against which search is performed
  - **search_query:** For query documents
  - **classification:** For when the embeddings will be used as an input to a text classifier
  - **clustering:** For when you want to cluster the embeddings
- **embedding_types:** The types of embeddings to return; here we request `"float"`


For every piece of text passed to the Embed endpoint, a sequence of numbers (an embedding vector) will be generated — 1536 numbers by default with `embed-v4.0`. Each number represents a piece of information about the meaning contained in that piece of text.

In [20]:
def get_embeddings(texts, model="embed-v4.0", input_type="search_document"):
    """Embed a list of texts with the Cohere Embed endpoint.

    Uses the module-level client ``co``.

    Args:
        texts: List of strings to embed.
        model: Name of the embedding model, forwarded to ``co.embed``.
        input_type: Forwarded to ``co.embed`` as ``input_type``.

    Returns:
        The ``float`` embeddings carried by the response
        (``response.embeddings.float``).
    """
    request = dict(
        texts=texts,
        model=model,
        input_type=input_type,
        embedding_types=["float"],  # only float vectors are requested
    )
    response = co.embed(**request)
    return response.embeddings.float

# Embed every inquiry (default input_type) and store each vector alongside
# its text.
df['query_embeds'] = get_embeddings(texts=df['query'].to_list())
df


Unnamed: 0,query,query_embeds
0,which airlines fly from boston to washington ...,"[0.05444336, -0.021362305, -0.002029419, -0.03..."
1,show me the airlines that fly between toronto...,"[0.022460938, 0.010925293, -0.015136719, -0.01..."
2,show me round trip first class tickets from n...,"[-0.053466797, 0.029296875, -0.0048828125, 0.0..."
3,i'd like the lowest fare from denver to pitts...,"[0.048583984, 0.017456055, -0.020751953, -0.01..."
4,show me a list of ground transportation at bo...,"[0.046875, -0.0038146973, 0.008178711, -0.0532..."
...,...,...
86,what ground transportation is there in atlanta,"[0.026489258, -0.0013275146, -0.020996094, -0...."
87,can i take a single airline from la to charlo...,"[0.044433594, 0.012084961, 0.055664062, 0.0227..."
88,what is the cost for a one way trip from pitt...,"[0.026245117, -0.0010147095, -0.010314941, -0...."
89,what ground transportation is available in ba...,"[0.055419922, -0.0022735596, -0.0059509277, -0..."


In [23]:
def get_pc(arr, n):
    """Return `arr` projected onto `n` principal components.

    A fresh PCA model with ``n_components=n`` is fitted on `arr`, and the
    transformed data from ``fit_transform`` is returned.
    """
    return PCA(n_components=n).fit_transform(arr)

# Stack the per-row embedding lists into one 2-D array, then keep the first
# 10 principal components to aid visualization.
embeds = np.array(list(df['query_embeds']))
embeds_pc = get_pc(arr=embeds, n=10)

**Step 2: Embed the Search Query**

In [21]:
# The free-text search query we want to match against the stored inquiries
new_query = "How can I find a taxi or a bus when the plane lands?"

# Embed it as a search query; get_embeddings returns one vector per input
# text, so take the first (and only) one.
new_query_embeds = get_embeddings(
    texts=[new_query],
    input_type="search_query",
)[0]

In [32]:
def get_similarity(target, candidates):
    """Rank candidate vectors by cosine similarity to a target vector.

    Args:
        target: A single embedding vector (sequence of numbers).
        candidates: A sequence of embedding vectors, one per row.

    Returns:
        A list of ``(index, score)`` tuples sorted from most to least similar,
        where ``index`` is the candidate's row position and ``score`` is its
        cosine similarity to ``target``. A list (not a one-shot iterator) is
        returned so the result can be looped over, sliced, or re-displayed
        any number of times. Empty ``candidates`` yields an empty list.
    """
    candidates = np.asarray(candidates)
    if candidates.size == 0:
        return []

    # cosine_similarity expects 2-D inputs: make the target a single row.
    target = np.asarray(target).reshape(1, -1)

    # Result has shape (1, n_candidates). Take row 0 explicitly rather than
    # squeezing, so a single candidate still gives a 1-element array instead
    # of a bare scalar.
    scores = cosine_similarity(target, candidates)[0]

    # argsort is ascending; reverse it for best-match-first order.
    order = np.argsort(scores)[::-1]
    return [(int(i), float(scores[i])) for i in order]

# Rank the stored inquiry embeddings against the embedded search query
similarity = get_similarity(target=new_query_embeds, candidates=embeds)


In [33]:
# Show the search query, then every ranked inquiry with its similarity score
print('Query:')
print(new_query,'\n')

print('Most Similar Documents:')
for idx, sim in similarity:
    # idx is a row position in df, so look the text up positionally
    matched_query = df['query'].iloc[idx]
    print(f'Similarity: {sim:.2f};', matched_query)

Query:
How can I find a taxi or a bus when the plane lands? 

Most Similar Documents:
Similarity: 0.41;  can you find out about the ground transportation available in atlanta
Similarity: 0.41;  can you help me with ground transportation information i need to get from the airport in philadelphia to downtown philadelphia
Similarity: 0.40;  i need your help with information on ground transportation from the airport in philadelphia to downtown
Similarity: 0.38;  i need to get downtown from the denver airport
Similarity: 0.35;  is there ground transportation in boston from the airport
Similarity: 0.35;  what type of ground transportation is available in washington
Similarity: 0.35;  what ground transportation is available in san francisco
Similarity: 0.34;  what ground transportation is available in boston
Similarity: 0.34;  and i'll need ground transportation for tuesday july sixth to wednesday july seventh in san diego
Similarity: 0.33;  what kind of ground transportation is available in 