In [None]:
# Packages this notebook imports; uncomment to install into the current environment.
# !pip install langchain
# !pip install langchain-huggingface
# !pip install langchain-groq
# !pip install qdrant-client
# !pip install langchain-qdrant
# !pip install langchain-community
# !pip install lark

In [59]:
import json
# Load the raw case records. The context manager closes the file handle as soon
# as parsing finishes, and the explicit UTF-8 encoding keeps non-ASCII characters
# in the summaries from depending on the platform's default codec.
with open("dataset.json", encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)['cases_list']

In [88]:
dataset[0]

{'page_content': "The Supreme Court upheld the Centre's 2016 demonetisation scheme in a 4:1 majority, ruling that demonetisation was proportionate to the Union’s objectives and implemented reasonably.",
 'metadata': {'year': 2023,
  'court': 'Supreme Court of India',
  'judges': ['S. Abdul Nazeer',
   'B.R. Gavai',
   'A.S. Bopanna',
   'V. Ramasubramanian',
   'B.V. Nagarathna'],
  'legal_topics': ['Constitutional Law', 'Economic Policy'],
  'relevant_laws': ['Article 370 of the Indian Constitution']}}

In [89]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by every vector store and query in this notebook.
model_name = "BAAI/bge-small-en-v1.5"
# Run the model on the CPU.
model_kwargs = dict(device='cpu')
# Passed through to the encoder on every embedding call.
encode_kwargs = dict(normalize_embeddings=True)

hf = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)




In [93]:
hf.embed_query("demonitization case")

[-0.08326546847820282,
 0.022219663485884666,
 -0.017243655398488045,
 -0.053167618811130524,
 0.05313488468527794,
 -0.016739359125494957,
 0.045978739857673645,
 -0.013614344410598278,
 0.014157246798276901,
 -0.002596378792077303,
 -0.007901007309556007,
 -0.038068678230047226,
 -0.018489299342036247,
 0.014195187948644161,
 0.04925695061683655,
 0.030631814152002335,
 -0.0024389438331127167,
 0.03351893648505211,
 -0.027505427598953247,
 0.013234520331025124,
 0.05578543618321419,
 -0.09169687330722809,
 0.008015230298042297,
 0.002464858815073967,
 0.0008859741501510143,
 -0.0645192414522171,
 0.04931394010782242,
 0.02818586304783821,
 -0.02374214492738247,
 -0.15073446929454803,
 0.04553466662764549,
 -0.04948839917778969,
 -0.010095835663378239,
 -0.017972297966480255,
 0.04311444237828255,
 -0.0001938668719958514,
 0.009973517619073391,
 0.05950810760259628,
 -0.045189421623945236,
 0.00553807383403182,
 -0.0160306878387928,
 0.05328197032213211,
 -0.01825469173491001,
 -0.040

In [94]:
# Load a second, independent copy of the case records: the cells below normalise
# this copy in place, while `dataset` keeps the original values.
# The context manager closes the file handle; UTF-8 is stated explicitly so that
# non-ASCII characters do not depend on the platform's default codec.
with open("dataset.json", encoding="utf-8") as langchain_dataset_file:
    langchain_dataset = json.load(langchain_dataset_file)['cases_list']

In [95]:
import re

def remove_punctuation(text):
    """Return `text` lowercased, with every character removed that is not a
    word character, whitespace, or a comma."""
    disallowed = re.compile(r'[^\w\s,]')
    cleaned = disallowed.sub('', text)
    return cleaned.lower()

In [96]:
# Normalise every record in place: lowercase, with punctuation (except commas) removed.
for x in langchain_dataset:
    metadata = x['metadata']
    for key, val in metadata.items():
        # str() flattens list values (judges, topics, laws) to their repr; the
        # brackets and quotes are then stripped by remove_punctuation, leaving a
        # comma-separated string.
        metadata[key] = remove_punctuation(str(val))
    # Normalise the summary exactly once. The previous version looped over every
    # character of the summary and re-normalised the whole string each time,
    # which produced the same text at quadratic cost.
    x['page_content'] = remove_punctuation(str(x['page_content']))

In [97]:
langchain_dataset[0]

{'page_content': 'the supreme court upheld the centres 2016 demonetisation scheme in a 41 majority, ruling that demonetisation was proportionate to the unions objectives and implemented reasonably',
 'metadata': {'year': '2023',
  'court': 'supreme court of india',
  'judges': 's abdul nazeer, br gavai, as bopanna, v ramasubramanian, bv nagarathna',
  'legal_topics': 'constitutional law, economic policy',
  'relevant_laws': 'article 370 of the indian constitution'}}

In [70]:
from langchain_core.documents import Document
from langchain_community.vectorstores.qdrant import Qdrant
# Convert each normalised case dict into a LangChain Document, updating the list
# in place. Entries that are already Documents are kept untouched so this cell
# can be re-run safely; previously a second run tried to subscript a Document
# and failed.
langchain_dataset[:] = [
    case if isinstance(case, Document)
    else Document(page_content=case['page_content'], metadata=case['metadata'])
    for case in langchain_dataset
]

# Embed the documents with `hf` and index them in an in-memory Qdrant collection.
vectorstore = Qdrant.from_documents(
    langchain_dataset,
    hf,
    location=":memory:",
    collection_name="langchain_legal",
)

## LangChain self-query filter

In [None]:
from langchain.chains.query_constructor.base import AttributeInfo

# These descriptions are handed to the self-query LLM so it can build metadata
# filters. They spell out how values are actually stored in the vector store
# (lowercased, punctuation stripped, multi-valued fields flattened into one
# comma-separated string) so that generated filter values can match the stored
# text instead of the original casing and punctuation.
metadata_field_info = [
    AttributeInfo(
        name="court",
        description=(
            "The name of the court that decided the case, stored in lowercase "
            "with punctuation removed, e.g. 'supreme court of india'."
        ),
        type="string",
    ),
    AttributeInfo(
        name="year",
        description=(
            "The year the court decision was given, stored as a string, "
            "e.g. '2023'."
        ),
        type="string",
    ),
    AttributeInfo(
        name="judges",
        description=(
            "The names of the case judges, stored as a single lowercase "
            "comma-separated string with punctuation removed, "
            "e.g. 'lord reed, lord hodge, lady arden'. One judge's name is "
            "only a part of this value."
        ),
        type="string",
    ),
    AttributeInfo(
        name="legal_topics",
        description=(
            "The topic names of the case, stored as a single lowercase "
            "comma-separated string, e.g. 'constitutional law, economic policy'. "
            "One topic is only a part of this value."
        ),
        type="string",
    ),
    AttributeInfo(
        name="relevant_laws",
        description=(
            "The laws that applied to the case decision, stored as a single "
            "lowercase comma-separated string with punctuation removed, "
            "e.g. 'employment rights act 1996'."
        ),
        type="string",
    )
]
document_content_description = "Brief summary of a court case"

In [105]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_groq import ChatGroq
import os

# Read the Groq API key from the environment instead of embedding it in the
# notebook, so the key is never saved or shared with the file. A missing
# GROQ_API_KEY fails here with a KeyError rather than later with an
# authentication error.
llm = ChatGroq(temperature=0, api_key=os.environ["GROQ_API_KEY"])

# The retriever asks the LLM to split a question into a semantic query plus a
# metadata filter, using the attribute descriptions defined above.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
)

In [104]:
# NOTE(review): the recorded run of this query returned [] — presumably the
# generated filter used the original casing ("Supreme Court of India") while the
# stored metadata is lowercased; confirm by inspecting the structured query.
retriever.invoke("demonetization case of Supreme court of India")

[]

In [103]:
# NOTE: in the recorded run this call raised OutputParserException. The LLM
# produced the nested filter eq(contains(judges, "Sylvia Steiner"), true), and
# the filter parser rejected it at the comma after `judges`.
retriever.invoke("cases where Judge Sylvia Steiner was the judge")

OutputParserException: Parsing text
```json
{
    "query": "",
    "filter": "eq(contains(judges, \"Sylvia Steiner\"), true)"
}
```
Explanation:
- The query is an empty string because the user is not looking for any specific text in the content of the cases.
- The filter uses the `eq` comparison statement to check if the `judges` attribute contains the name "Sylvia Steiner".
- The `contains` function is used to check if the `judges` attribute is a string that includes the name "Sylvia Steiner".
- The `true` value is used as the comparison value for the `eq` statement to check if the `contains` function returns `true`.
 raised following error:
Unexpected token Token('COMMA', ',') at line 1, column 19.
Expected one of: 
	* LPAR
Previous tokens: [Token('CNAME', 'judges')]


In [101]:
retriever.invoke("demonetization case summary")

[Document(page_content="The Supreme Court upheld the Centre's 2016 demonetisation scheme in a 4:1 majority, ruling that demonetisation was proportionate to the Union’s objectives and implemented reasonably.", metadata={'year': '2023', 'court': 'supreme court of india', 'judges': 's abdul nazeer, br gavai, as bopanna, v ramasubramanian, bv nagarathna', 'legal_topics': 'constitutional law, economic policy', 'relevant_laws': 'article 370 of the indian constitution', '_id': 'aaf5d793903e4a6aa2bcd68ee11ccfad', '_collection_name': 'langchain_legal'}),
 Document(page_content='The Supreme Court of the United Kingdom ruled that Uber drivers are workers and entitled to employment rights such as minimum wage and holiday pay.', metadata={'year': '2021', 'court': 'supreme court of the united kingdom', 'judges': 'lord reed, lord hodge, lady arden', 'legal_topics': 'employment law, gig economy', 'relevant_laws': 'employment rights act 1996', '_id': '07dc3ad5dbf243cfa6c67e3bea2589a0', '_collection_nam

In [102]:
# This example only specifies a filter
retriever.invoke("cases in year 2020")

[Document(page_content='The International Court of Justice ruled that Myanmar must take measures to prevent the genocide of the Rohingya people.', metadata={'year': '2020', 'court': 'international court of justice', 'judges': 'president abdulqawi yusuf', 'legal_topics': 'international law, human rights', 'relevant_laws': 'convention on the prevention and punishment of the crime of genocide', '_id': '7829c80e30eb4d31948623f6873ff46f', '_collection_name': 'langchain_legal'}),
 Document(page_content='The Constitutional Court of South Africa ruled that domestic workers are entitled to the same compensation benefits as other workers.', metadata={'year': '2020', 'court': 'constitutional court of south africa', 'judges': 'chief justice mogoeng mogoeng', 'legal_topics': 'labor law, equality', 'relevant_laws': 'compensation for occupational injuries and diseases act', '_id': 'bb34eeb9ce5a49f39da01e4ed3764367', '_collection_name': 'langchain_legal'})]

In [84]:
# This example only specifies a filter
retriever.invoke("cases with topic Constitutional Law")

[Document(page_content='The U.S. Supreme Court decided that same-sex marriage is a constitutional right under the Fourteenth Amendment.', metadata={'year': '2015', 'court': 'supreme court of the united states', 'judges': 'majority opinion by justice anthony kennedy', 'legal_topics': 'constitutional law, civil rights', 'relevant_laws': 'fourteenth amendment of the us constitution', '_id': '6480fdbd1a8143b383b8fa03b8748cab', '_collection_name': 'langchain_legal'}),
 Document(page_content="The U.S. Supreme Court ruled that state laws banning abortion are unconstitutional, recognizing a woman's right to privacy in making medical decisions.", metadata={'year': '1973', 'court': 'supreme court of the united states', 'judges': 'majority opinion by justice harry blackmun', 'legal_topics': 'constitutional law, reproductive rights', 'relevant_laws': 'fourteenth amendment of the us constitution', '_id': '0b193c8b40794f6dac578dfd046fbbd1', '_collection_name': 'langchain_legal'}),
 Document(page_con

## Qdrant Payload filter

In [None]:
from qdrant_client.models import PointStruct
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams


# Fresh in-memory Qdrant instance holding the original (un-normalised) cases.
client = QdrantClient(":memory:")

# The client above is brand new, so the collection cannot already exist;
# create_collection replaces the deprecated recreate_collection call.
# The vector size must match the dimension of the embeddings produced by `hf`.
client.create_collection(
    collection_name="law_docs",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

# Embed every case summary in one batched call instead of one model call per case.
summary_vectors = hf.embed_documents(
    [element['page_content'] for element in dataset]
)

# Point ids are the positions in `dataset`, so a hit id can be used directly as
# an index back into `dataset`. The payload carries the original metadata.
# NOTE: for large datasets consider `upload_points`, which handles chunking.
client.upsert(
    collection_name="law_docs",
    points=[
        PointStruct(
            id=idx,
            vector=vector,
            payload=element['metadata']
        )
        for idx, (element, vector) in enumerate(zip(dataset, summary_vectors))
    ]
)

In [106]:
from qdrant_client import models
# Payload filter: keep only points whose `year` equals the integer 2023
# (this collection stores the original, un-normalised metadata).
year_2023_filter = models.Filter(
    must=[
        models.FieldCondition(key="year", match=models.MatchValue(value=2023)),
    ]
)

# Scroll applies the filter without any vector similarity search.
client.scroll(collection_name="law_docs", scroll_filter=year_2023_filter)

([Record(id=0, payload={'year': 2023, 'court': 'Supreme Court of India', 'judges': ['S. Abdul Nazeer', 'B.R. Gavai', 'A.S. Bopanna', 'V. Ramasubramanian', 'B.V. Nagarathna'], 'legal_topics': ['Constitutional Law', 'Economic Policy'], 'relevant_laws': ['Article 370 of the Indian Constitution']}, vector=None, shard_key=None),
  Record(id=1, payload={'year': 2023, 'court': 'Supreme Court of the United States', 'judges': ['K.M. Joseph', 'Ajay Rastogi', 'Aniruddha Bose', 'Hrishikesh Roy', 'C.T. Ravikumar'], 'legal_topics': ['Election Law', 'Constitutional Law'], 'relevant_laws': ['Election Commission Act']}, vector=None, shard_key=None),
  Record(id=4, payload={'year': 2023, 'court': 'Supreme Court of India', 'judges': ['K.M. Joseph', 'Ajay Rastogi', 'Aniruddha Bose', 'Hrishikesh Roy', 'C.T. Ravikumar'], 'legal_topics': ['Health Law', 'Right to Die'], 'relevant_laws': ['Guidelines for Passive Euthanasia']}, vector=None, shard_key=None)],
 None)

## Semantic filtering

In [113]:
# Render each case's metadata as a block of "key : value" lines, then lowercase
# it and drop periods, apostrophes/single quotes and list brackets so the text
# can be embedded.
metadata_fields = [case['metadata'] for case in dataset]
metadata_list = []
for fields in metadata_fields:
    rendered = "".join(f"{key} : {fields[key]}\n" for key in fields)
    normalized = rendered.strip().lower()
    for unwanted in ('.', "'", '[', ']'):
        normalized = normalized.replace(unwanted, '')
    metadata_list.append(normalized)

# Separate in-memory Qdrant instance that indexes the metadata text of each case.
metadata_client = QdrantClient(":memory:")

# The client above is brand new, so the collection cannot already exist;
# create_collection replaces the deprecated recreate_collection call.
# The vector size must match the dimension of the embeddings produced by `hf`.
metadata_client.create_collection(
    collection_name="metadata",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

# Embed all metadata texts in one batched call instead of one model call per case.
metadata_vectors = hf.embed_documents(metadata_list)

# Point ids are positions in `metadata_list`, which follows the order of `dataset`,
# so ids line up with the ids used in the "law_docs" collection.
metadata_client.upsert(
    collection_name="metadata",
    points=[
        PointStruct(
            id=idx,
            vector=vector,
        )
        for idx, vector in enumerate(metadata_vectors)
    ]
)

  metadata_client.recreate_collection(


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [118]:
def find_hit_details(hit_list, hit):
    """Return the position in `hit_list` of the first item whose `.id` equals
    `hit`, or -1 when no item matches."""
    matching_positions = (
        position
        for position, candidate in enumerate(hit_list)
        if hit == candidate.id
    )
    return next(matching_positions, -1)

def semantic_filtering(text, limit=5, score_threshold=0.65):
    """Search case summaries and keep the hits that the metadata index also supports.

    The query is embedded once with `hf` and searched against both the
    "law_docs" collection (case summaries, via `client`) and the "metadata"
    collection (metadata text, via `metadata_client`). Point ids in both
    collections are indices into `dataset`.

    Args:
        text: The natural-language query.
        limit: Maximum number of hits requested from each collection.
        score_threshold: A case found by both searches is kept only when its
            metadata-search score is strictly greater than this value.

    Returns:
        A list of `dataset` entries. If any case passes the metadata check, only
        those cases are returned, ordered by metadata score (highest first).
        Otherwise all summary-search hits are returned in search-rank order.
    """
    query_vector = hf.embed_query(text)

    content_hits = client.search("law_docs", query_vector, limit=limit)
    # Keep the ranked order for the fallback, plus a set for membership tests.
    content_ids = [content_hit.id for content_hit in content_hits]
    content_id_set = set(content_ids)

    filter_hits = metadata_client.search("metadata", query_vector, limit=limit)

    # Collect (score, id) pairs rather than a dict keyed by score, so two cases
    # with identical scores are both kept instead of one overwriting the other.
    matching_hits = [
        (filter_hit.score, filter_hit.id)
        for filter_hit in filter_hits
        if filter_hit.id in content_id_set and filter_hit.score > score_threshold
    ]
    # Stable sort: cases with equal scores keep the metadata-search order.
    matching_hits.sort(key=lambda scored_hit: scored_hit[0], reverse=True)

    if matching_hits:
        print("semantic_filtering")
        return [dataset[hit_id] for _, hit_id in matching_hits]

    print("No filter found")
    # Iterate the ranked list, not the set, so the best summary match comes first.
    return [dataset[hit_id] for hit_id in content_ids]


In [119]:
# Example usage
text = "gay marriage cases"
results = semantic_filtering(text)
results

semantic_filtering


[{'page_content': 'The U.S. Supreme Court held that the Defense of Marriage Act (DOMA) was unconstitutional as it violated the Fifth Amendment by denying federal recognition of same-sex marriages.',
  'metadata': {'year': 2013,
   'court': 'Supreme Court of the United States',
   'judges': ['Majority opinion by Justice Anthony Kennedy'],
   'legal_topics': ['Constitutional Law', 'LGBTQ+ Rights'],
   'relevant_laws': ['Fifth Amendment of the U.S. Constitution']}},
 {'page_content': 'The Supreme Court of Japan upheld a law requiring married couples to share a surname, ruling it did not violate constitutional guarantees of equality.',
  'metadata': {'year': 2015,
   'court': 'Supreme Court of Japan',
   'judges': ['Chief Justice Itsuro Terada'],
   'legal_topics': ['Family Law', 'Equality'],
   'relevant_laws': ['Japanese Civil Code']}}]

In [143]:
semantic_filtering("cases about money".lower())

No filter found


[{'page_content': "The Supreme Court upheld the Centre's 2016 demonetisation scheme in a 4:1 majority, ruling that demonetisation was proportionate to the Union’s objectives and implemented reasonably.",
  'metadata': {'year': 2023,
   'court': 'Supreme Court of India',
   'judges': ['S. Abdul Nazeer',
    'B.R. Gavai',
    'A.S. Bopanna',
    'V. Ramasubramanian',
    'B.V. Nagarathna'],
   'legal_topics': ['Constitutional Law', 'Economic Policy'],
   'relevant_laws': ['Article 370 of the Indian Constitution']}},
 {'page_content': 'In a case concerning the rights of students with disabilities, the United States Department of Justice concluded that Alabama’s foster care system discriminates against students with emotional and behavioral disabilities.',
  'metadata': {'year': 2022,
   'court': 'United States Department of Justice',
   'judges': ['Investigation by the Civil Rights Division'],
   'legal_topics': ['Disability Law', 'Education Law'],
   'relevant_laws': ['Title II of the Am