# Semantic search
https://docs.cohere.com/v2/docs/semantic-search-with-cohere

In [None]:
import logging
import os
import json
from dotenv import load_dotenv
import cohere
import numpy as np

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Build the Cohere v2 client. os.environ[...] raises KeyError when
# COHERE_API_KEY is not set, so a missing key fails here instead of later.
co = cohere.ClientV2(
    api_key=os.environ["COHERE_API_KEY"],
)

## ドキュメントから埋め込みを生成する

In [None]:
# Define the documents: ten onboarding FAQ entries, each a dict with a single
# "text" field.
# NOTE: the list has to be named `faqs_long` (plural) -- that is the name the
# assignment below and the later cells of this notebook refer to.
faqs_long = [
    {
        "text": "Joining Slack Channels: You will receive an invite via email. Be sure to join relevant channels to stay informed and engaged."
    },
    {
        "text": "Finding Coffee Spots: For your caffeine fix, head to the break room's coffee machine or cross the street to the café for artisan coffee."
    },
    {
        "text": "Team-Building Activities: We foster team spirit with monthly outings and weekly game nights. Feel free to suggest new activity ideas anytime!"
    },
    {
        "text": "Working Hours Flexibility: We prioritize work-life balance. While our core hours are 9 AM to 5 PM, we offer flexibility to adjust as needed."
    },
    {
        "text": "Side Projects Policy: We encourage you to pursue your passions. Just be mindful of any potential conflicts of interest with our business."
    },
    {
        "text": "Reimbursing Travel Expenses: Easily manage your travel expenses by submitting them through our finance tool. Approvals are prompt and straightforward."
    },
    {
        "text": "Working from Abroad: Working remotely from another country is possible. Simply coordinate with your manager and ensure your availability during core hours."
    },
    {
        "text": "Health and Wellness Benefits: We care about your well-being and offer gym memberships, on-site yoga classes, and comprehensive health insurance."
    },
    {
        "text": "Performance Reviews Frequency: We conduct informal check-ins every quarter and formal performance reviews twice a year."
    },
    {
        "text": "Proposing New Ideas: Innovation is welcomed! Share your brilliant ideas at our weekly team meetings or directly with your team lead."
    },
]

# `documents` is the corpus that gets embedded and searched.
documents = faqs_long

# Embed the documents.
# `embedding_types` is spelled out because the search step reads the `.float`
# vectors from this result (`doc_emb.float`); requesting them explicitly keeps
# that access independent of the API's default.
doc_emb = co.embed(
    model="embed-v4.0",
    input_type="search_document",
    embedding_types=["float"],
    texts=[doc["text"] for doc in documents],
).embeddings

## ユーザークエリの埋め込み化


In [None]:
# Add the user query
query = "How do I stay connected to what's happening at the company?"

# Embed the query.
# `embedding_types` is spelled out because the search step reads the `.float`
# vectors from this result (`query_emb.float`); requesting them explicitly
# keeps that access independent of the API's default.
query_emb = co.embed(
    model="embed-v4.0",
    input_type="search_query",
    embedding_types=["float"],
    texts=[query],
).embeddings


## セマンティックサーチの実行

NOTE: `np.dot(query_emb, np.transpose(doc_emb))` しようとすると以下のエラー。
```
TypeError: unsupported operand type(s) for *: 'EmbedByTypeResponseEmbeddings' and 'EmbedByTypeResponseEmbeddings'
```
そのため、`return_results` には `EmbedByTypeResponseEmbeddings` オブジェクトそのものではなく、その `float` 属性（float 型の埋め込みのリスト）を渡す。

In [None]:
# Compute dot product similarity and display results
def return_results(query_emb, doc_emb, documents, n=2):
    """Print the ``n`` documents whose embeddings score highest for the query.

    Args:
        query_emb: Sequence of query vectors; only the first one is scored.
        doc_emb: Sequence of document vectors, one per entry of ``documents``
            and in the same order.
        documents: The documents that the vectors in ``doc_emb`` belong to.
        n: Maximum number of results to print. Defaults to 2.

    Returns:
        None. The rank, score and document of each hit are printed.
    """
    # Row 0 of the (queries x documents) dot-product matrix: one score per
    # document for the first query.
    scores = np.dot(query_emb, np.transpose(doc_emb))[0]

    # Highest score first. sorted() is stable, so documents with equal scores
    # keep their original relative order.
    top_hits = sorted(
        enumerate(scores), key=lambda pair: pair[1], reverse=True
    )[:n]

    for rank, (doc_idx, score) in enumerate(top_hits, start=1):
        print(f"Rank: {rank}")
        print(f"Score: {score}")
        print(f"Document: {documents[doc_idx]}\n")


# Pass the `.float` vectors rather than the embeddings response objects.
return_results(
    query_emb=query_emb.float,
    doc_emb=doc_emb.float,
    documents=documents,
)

## 多言語でのセマンティックサーチ

In [None]:
# Japanese FAQ entries; each one is wrapped in a dict with a single "text"
# field, the same shape the embedding call reads.
faqs_short_ja = [
    {"text": entry}
    for entry in (
        "旅費精算：当社の財務ツールから申請することで、旅費を簡単に管理できます。承認は迅速かつ簡単です。",
        "海外勤務：海外からリモートワークすることも可能です。上司と調整し、勤務時間中に連絡が取れるようにしてください。",
        "健康とウェルネスの特典: 私たちはあなたの健康を重視し、ジムの会員権、施設内でのヨガ クラス、包括的な健康保険を提供しています。",
        "業績レビューの頻度: 非公式レビューは四半期ごとに、公式レビューは年に 2 回実施します。",
    )
]

# Use the Japanese list as the corpus to embed and search.
documents = faqs_short_ja

# Embed the documents.
# `embedding_types` is spelled out because the search call below reads the
# `.float` vectors from the result.
doc_emb = co.embed(
    model="embed-v4.0",
    input_type="search_document",
    embedding_types=["float"],
    texts=[doc["text"] for doc in documents],
).embeddings

# Add the user query (English, searched against the Japanese documents)
query = "What's your remote-working policy?"

# Embed the query, again requesting the float vectors explicitly
query_emb = co.embed(
    model="embed-v4.0",
    input_type="search_query",
    embedding_types=["float"],
    texts=[query],
).embeddings

# Compute dot product similarity and display results
return_results(query_emb.float, doc_emb.float, documents)

## 埋め込みの圧縮を変更する

In [None]:
# Use the long English FAQ list as the corpus
documents = faqs_long

# Request both the float and the int8 embeddings for every document
doc_emb = co.embed(
    texts=[entry["text"] for entry in documents],
    model="embed-v4.0",
    input_type="search_document",
    embedding_types=["float", "int8"],
).embeddings

# The user query to search with
query = "How do I stay connected to what's happening at the company?"

# Embed the query with the same two embedding types
query_emb = co.embed(
    texts=[query],
    model="embed-v4.0",
    input_type="search_query",
    embedding_types=["float", "int8"],
).embeddings

In [None]:
# Rank the documents using the float embeddings
return_results(
    query_emb=query_emb.float,
    doc_emb=doc_emb.float,
    documents=faqs_long,
)

In [None]:
# Rank the documents using the int8 embeddings
return_results(
    query_emb=query_emb.int8,
    doc_emb=doc_emb.int8,
    documents=documents,
)