## Install Requirements

In [None]:
!pip install -q datasets qdrant_client=="0.11.0" cohere

In [48]:
# Credentials and endpoint for the two services used in this notebook.
# Values are read from the environment so that real secrets never have to be
# typed into (and committed with) the notebook. When a variable is unset the
# original placeholder string is kept, which is not a usable credential.
import os

COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "COHERE_API_KEY")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "QDRANT_API_KEY")
QDRANT_HOST = os.environ.get("QDRANT_HOST", "QDRANT_HOST")

## Imports

In [45]:
import os
import cohere
from qdrant_client import QdrantClient
from qdrant_client import models
from qdrant_client.http import models as rest

In [46]:
# Load credentials from environment variables (uncomment to use; this overrides any values assigned earlier)
# QDRANT_HOST = os.environ.get("QDRANT_HOST")
# QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
# COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

## Create Cohere client and check embedding size

In [49]:
# Cohere SDK client authenticated with the API key; used for all embed calls.
cohere_client = cohere.Client(api_key=COHERE_API_KEY)

- Embed a sample input and check the size of the embedding returned by the model

In [21]:
# Embed a single throwaway sentence just to find out how many dimensions the
# model produces; that number is displayed as the cell output.
embeddings = cohere_client.embed(model="multilingual-22-12", texts=["A test sentence"])
sample_vector = embeddings.embeddings[0]
vector_size = len(sample_vector)
vector_size

768

## Create Qdrant client & create a collection to store the legal document embeddings

In [53]:
# Connect to the Qdrant instance at QDRANT_HOST, preferring the gRPC transport
# and authenticating with the API key.
qdrant_client = QdrantClient(host=QDRANT_HOST, api_key=QDRANT_API_KEY, prefer_grpc=True)

In [54]:
# Vector layout for the collection: dimension taken from the sample embedding,
# compared with cosine distance.
legal_qa_vector_params = models.VectorParams(distance=rest.Distance.COSINE, size=vector_size)

# NOTE(review): per the qdrant-client docs, recreate_collection drops an existing
# collection with the same name before creating it — confirm before re-running
# against a collection that holds data you want to keep.
qdrant_client.recreate_collection(collection_name="legal_qa", vectors_config=legal_qa_vector_params)

True

## Load Terms of Service Dataset

In [50]:
from datasets import load_dataset

# Fetch the train split of the plain-English contracts summarization dataset.
dataset = load_dataset(
    "joelito/plain_english_contracts_summarization",
    split="train",
)

Found cached dataset json (/home/shivalika/.cache/huggingface/datasets/joelito___json/joelito--plain_english_contracts_summarization-25f5156f2d2d542c/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [33]:
# Number of rows in the loaded split.
dataset.num_rows

446

In [34]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['doc', 'id', 'original_text', 'reference_summary', 'title', 'uid', 'case_code', 'case_text', 'note', 'title_code', 'title_text', 'urls', 'tldr_code', 'tldr_text'],
    num_rows: 446
})

In [35]:
# Peek at the text of the first record (select the row first, then the column).
dataset[0]['original_text']

'welcome to the pokémon go video game services which are accessible via the niantic inc niantic mobile device application the app. to make these pokémon go terms of service the terms easier to read our video game services the app and our websites located at http pokemongo nianticlabs com and http www pokemongolive com the site are collectively called the services. please read carefully these terms our trainer guidelines and our privacy policy because they govern your use of our services.'

## Prepare documents and add to Qdrant DB

In [60]:
# Embedding model used for both the documents and, later, the query.
MLLM_MODEL = "multilingual-22-12"

# Materialise the text column as a plain list. `datasets` is not version-pinned
# and newer releases may return a lazy column object here instead of a list.
legal_docs = list(dataset['original_text'])

# Embed every document with the Cohere client.
legal_doc_response = cohere_client.embed(
    texts=legal_docs,
    model=MLLM_MODEL,
)

# Coerce every component to a built-in float before handing the vectors to Qdrant.
vectors = [[float(component) for component in vector] for vector in legal_doc_response.embeddings]

# Point ids are simply the positional index of each document (0..n-1).
ids = list(range(len(legal_docs)))

# Fail early if the API returned a different number of embeddings than documents;
# otherwise ids, vectors and payloads would be misaligned in the upsert.
assert len(vectors) == len(legal_docs), "embedding count does not match document count"

In [61]:
# Bundle ids, vectors and the full dataset rows (one dict per row, used as the
# payload) into a single batch, then write it to the collection.
legal_points = rest.Batch(ids=ids, vectors=vectors, payloads=list(dataset))
qdrant_client.upsert(collection_name="legal_qa", points=legal_points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

## Prepare Query Embedding and perform search on Qdrant Collection

In [55]:
# Embed the user question with the same model that embedded the documents.
query_embeddings = cohere_client.embed(
    model=MLLM_MODEL,
    texts=["I am 12 years old, can I play Pokemon go?"],
)

In [62]:
# Restrict candidates to points whose "doc" payload field matches the
# Pokemon GO terms of service.
pokemon_tos_condition = models.FieldCondition(
    key="doc",
    match=models.MatchValue(value="Pokemon GO Terms of Service"),
)

# Non-exact search with hnsw_ef=128, returning at most three hits for the
# embedded question.
result = qdrant_client.search(
    collection_name="legal_qa",
    query_vector=query_embeddings.embeddings[0],
    query_filter=models.Filter(must=[pokemon_tos_condition]),
    search_params=models.SearchParams(hnsw_ef=128, exact=False),
    limit=3,
)

In [68]:
# Show the stored text of the first hit returned by the search.
result[0].payload['original_text']

'welcome to the pokémon go video game services which are accessible via the niantic inc niantic mobile device application the app. to make these pokémon go terms of service the terms easier to read our video game services the app and our websites located at http pokemongo nianticlabs com and http www pokemongolive com the site are collectively called the services. please read carefully these terms our trainer guidelines and our privacy policy because they govern your use of our services.'

In [69]:
# Number of hits returned (the search above was capped with limit=3).
len(result)

3