In [None]:
! pip install -q pypdf litellm openai scikit-learn numpy upstash_vector cohere tqdm sentence-transformers

#### Extracting Text from PDF

In [None]:
from pypdf import PdfReader
import uuid
from typing import List, Tuple
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re


reader = PdfReader("judgment.pdf")
# Join the pages with a newline so the last word of one page is not fused with
# the first word of the next; `or ""` guards pages that yield no text.
text = "\n".join(page.extract_text() or "" for page in reader.pages)

#### Class for Chunking, Embedding and Processing Extracted text

In [None]:
class SemanticChunker:
    """Split text into semantically coherent chunks using sentence embeddings.

    The text is split into sentences, every sentence is embedded together with
    ``buffer_size`` neighbours on each side, and a chunk boundary is placed
    wherever the cosine distance between two consecutive embeddings exceeds the
    ``breakpoint_percentile_threshold``-th percentile of all distances.

    Args:
        buffer_size: Number of neighbouring sentences (on each side) that are
            merged with a sentence before it is embedded.
        embedding_client: Object called as
            ``embedding_client.embeddings.create(input=..., model=...)`` whose
            response exposes the vector at ``response.data[0].embedding``.
        breakpoint_percentile_threshold: Percentile (0-100) of the distance
            distribution above which a chunk boundary is inserted.
        embedding_model: Model name forwarded to the embedding client.
    """

    def __init__(self, buffer_size: int = 1, embedding_client=None, breakpoint_percentile_threshold=90,
                 embedding_model: str = "text-embedding-ada-002"):
        self.buffer_size = buffer_size
        self.client = embedding_client
        self.breakpoint_percentile_threshold = breakpoint_percentile_threshold
        self.embedding_model = embedding_model
        # Abbreviations whose periods must not be treated as sentence endings.
        self.abbreviations = {'E.g.', 'e.g.', 'i.e.', 'etc.'}

    def combine_sentences(self, sentences: List[dict]) -> List[dict]:
        """Attach a ``combined_sentence`` (sentence plus its neighbours) to every entry.

        The window covers ``buffer_size`` sentences before and after the current
        one, clipped to the list bounds. The pieces are joined with single
        spaces so that adjacent sentences are not glued together.

        Args:
            sentences: Dicts that each hold the sentence text under ``"sentence"``.

        Returns:
            The same list, mutated in place, with ``"combined_sentence"`` set.
        """
        total = len(sentences)
        for position, entry in enumerate(sentences):
            first = max(0, position - self.buffer_size)
            last = min(total, position + 1 + self.buffer_size)
            window = (neighbour["sentence"].strip() for neighbour in sentences[first:last])
            entry["combined_sentence"] = " ".join(window).strip()
        return sentences

    def get_embedding(self, text: str) -> np.ndarray:
        """Embed ``text`` with the configured client and model and return it as an array."""
        response = self.client.embeddings.create(
            input=text,
            model=self.embedding_model
        )
        return np.array(response.data[0].embedding)

    @staticmethod
    def _cosine_distance(first, second) -> float:
        """Return ``1 - cosine_similarity`` of two 1-D vectors (1.0 if either is all zeros)."""
        first = np.asarray(first, dtype=float)
        second = np.asarray(second, dtype=float)
        scale = np.linalg.norm(first) * np.linalg.norm(second)
        if scale == 0:
            # A zero vector has no direction; treat it as similarity 0.
            return 1.0
        return 1.0 - float(np.dot(first, second) / scale)

    def calculate_cosine_distances(self, sentences: List[dict]) -> Tuple[List[float], List[dict]]:
        """Compute the cosine distance between each sentence and the next one.

        Args:
            sentences: Dicts carrying a ``"combined_sentence_embedding"`` vector.

        Returns:
            ``(distances, sentences)`` where ``distances[i]`` is the distance
            between entries ``i`` and ``i + 1``; the same value is also stored
            on entry ``i`` under ``"distance_to_next"``.
        """
        distances = []
        for current, following in zip(sentences, sentences[1:]):
            distance = self._cosine_distance(
                current["combined_sentence_embedding"],
                following["combined_sentence_embedding"],
            )
            distances.append(distance)
            current["distance_to_next"] = distance
        return distances, sentences

    def preprocess_text(self, text: str) -> str:
        """Mark up ``text`` so the sentence splitter handles abbreviations and missing spaces.

        * Periods inside known abbreviations are replaced by ``<prd>`` so they
          are not mistaken for sentence endings.
        * Where sentence-ending punctuation is immediately followed by a
          capitalised word (``"ended.Next"``), `` <nospace>`` is inserted so the
          whitespace-based splitter sees a boundary. Requiring a capitalised
          word keeps decimals ("9.1") and initialisms ("A.I.R.") intact.
        """
        for abbrev in self.abbreviations:
            text = text.replace(abbrev, abbrev.replace('.', '<prd>'))
        return re.sub(r'(?<=[.?!])(?=[A-Z][a-z])', ' <nospace>', text)

    def post_process_chunks(self, chunks: List[str]) -> List[str]:
        """Drop chunks that are too short (< 3 words) or start with a leftover ``<nospace>`` marker."""
        return [
            chunk for chunk in chunks
            if len(chunk.split()) >= 3 and not chunk.startswith('<nospace>')
        ]

    def split_content(self, text: str) -> List[str]:
        """Split ``text`` into semantic chunks.

        Args:
            text: Raw document text.

        Returns:
            The list of chunk strings, after ``post_process_chunks`` filtering.
            Empty or whitespace-only input yields ``[]``.
        """
        if not text.strip():  # Nothing to chunk
            return []

        # Split the *preprocessed* text so abbreviation and missing-space
        # handling actually takes effect, then undo the temporary markers.
        preprocessed_text = self.preprocess_text(text.strip())
        sentences = []
        for raw_sentence in re.split(r'(?<=[.?!])\s+', preprocessed_text):
            restored = raw_sentence.replace('<prd>', '.').replace('<nospace>', '').strip()
            if restored:  # never send empty strings to the embedding API
                sentences.append({"sentence": restored, "index": len(sentences)})

        if len(sentences) < 2:
            # With fewer than two sentences there are no distances to compare,
            # so the text is a single chunk and no embedding call is needed.
            return self.post_process_chunks([entry["sentence"] for entry in sentences])

        sentences = self.combine_sentences(sentences)
        for entry in sentences:
            entry["combined_sentence_embedding"] = self.get_embedding(entry["combined_sentence"])

        distances, _ = self.calculate_cosine_distances(sentences)
        breakpoint_distance_threshold = np.percentile(distances, self.breakpoint_percentile_threshold)
        indices_above_thresh = [i for i, distance in enumerate(distances) if distance > breakpoint_distance_threshold]

        chunks, start_index = [], 0
        for index in indices_above_thresh:
            # The breakpoint sits *after* sentence `index`, so include it.
            group = sentences[start_index:index + 1]
            chunks.append(" ".join(entry["sentence"] for entry in group))
            start_index = index + 1

        if start_index < len(sentences):
            chunks.append(" ".join(entry["sentence"] for entry in sentences[start_index:]))

        return self.post_process_chunks(chunks)

#### Initializing OpenAI Client

In [None]:
import os

from openai import OpenAI

# Prefer the key exported in the environment so it never has to be pasted into
# the notebook; the literal is only a placeholder fallback.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "*******"))
# Shared chunker instance; also used below to embed queries.
Chunker = SemanticChunker(embedding_client=client, breakpoint_percentile_threshold=60)

#### Indexing the embedded chunks using Upstash Vector DB
##### Uncomment to chunk and Index the data

In [None]:
import os

from upstash_vector import Index
# chunks = Chunker.split_content(text)
# Connection details are read from the environment when available; the literals
# are only placeholder fallbacks.
index = Index(
    url=os.environ.get("UPSTASH_VECTOR_REST_URL", "********"),
    token=os.environ.get("UPSTASH_VECTOR_REST_TOKEN", "**************"),
)
# for chunk in chunks:
#     index.upsert(vectors=[(str(uuid.uuid4()), Chunker.get_embedding(chunk), {"judgment": "XvsY","text":chunk}),]
# )

#### Initialize Cohere Client
##### Query Vectors from Upstash and Rerank

In [None]:
from typing import List
import cohere

import os

# Prefer the key exported in the environment; the literal is only a placeholder fallback.
co = cohere.Client(os.environ.get("CO_API_KEY", "*******"))

def provide_chunks(query: str, top_k: int = 25, top_n: int = 15) -> List[str]:
    """Retrieve the paragraphs most relevant to ``query``.

    The query is embedded with ``Chunker``, the ``top_k`` nearest vectors are
    fetched from ``index``, and their stored ``text`` metadata is reranked with
    Cohere.

    Args:
        query: The research question.
        top_k: Number of candidates to fetch from the vector index.
        top_n: Maximum number of reranked paragraphs to return.

    Returns:
        The reranked paragraph texts, most relevant first, or ``[]`` when the
        index returns no usable candidates.
    """
    results = index.query(
        vector=Chunker.get_embedding(query),
        top_k=top_k,
        include_vectors=False,  # only the metadata text is used below
        include_metadata=True
    )
    candidates = [
        result.metadata['text']
        for result in results
        if result.metadata and 'text' in result.metadata
    ]
    if not candidates:
        # Nothing to rerank; avoid calling the reranker with an empty document list.
        return []

    response = co.rerank(
        model='rerank-english-v2.0',
        query=query,
        documents=candidates,
        top_n=min(top_n, len(candidates)),
    )
    # Each rerank result carries the index of the document it refers to; map it
    # back to the text so callers get plain paragraphs instead of the raw
    # response object.
    return [candidates[item.index] for item in response.results]


#### Setting up OpenAI API call

In [None]:
from litellm import completion
import os
# Keep a key that is already exported in the environment; the literal is only a
# placeholder used when none is set.
os.environ.setdefault("OPENAI_API_KEY", "*********")

In [None]:
# System prompt sent with every question (see Provide_Answer).
# NOTE(review): the second quoted phrase in the prompt has no closing quote and the
# first sentence reads as garbled ("accessible and to the detailed Response") —
# confirm the intended wording before changing it, since edits alter model output.
SYSTEM_MESSAGE = """
As an expert law professional, you are required to deliver an accessible and to the detailed Response to the research question posed by your client. 
If you don't know the answer, you can say "I don't know" or "I don't have enough information to answer this question. 
"""

In [None]:
def Provide_Answer(question, chunks, model="gpt-3.5-turbo-0125"):
    """Ask the chat model to answer ``question`` from the supplied paragraphs.

    Args:
        question: The research question.
        chunks: Context for the answer. A list/tuple is rendered as numbered
            paragraphs; any other value is inserted into the prompt as-is.
        model: Chat model name passed to ``completion``.

    Returns:
        The text content of the first choice in the completion response.
    """
    if isinstance(chunks, (list, tuple)):
        # Numbered paragraphs read better in the prompt than a Python list repr.
        paragraphs = "\n\n".join(f"[{number}] {chunk}" for number, chunk in enumerate(chunks, start=1))
    else:
        paragraphs = chunks

    # The template text (including its leading whitespace) is kept verbatim.
    user_prompt = f"""
   Research Question: {question}
   The List of Paragraphs are as follows:
   {paragraphs}
   Think step by step and provide the answer to the research question based on the provided information from Paragraph
   """

    messages = [
        {"content": SYSTEM_MESSAGE, "role": "system"},
        {"content": user_prompt, "role": "user"},
    ]
    response = completion(model=model, messages=messages)
    return response.choices[0].message.content
   

In [None]:
# Test questions about the judgment; each one is answered by the pipeline and
# scored against the reference answer at the same position.
questions = [
    "Who are the main parties involved in this case and the nature of their dispute?",
    "What specific agreement was at the center of the Arcelor Mittal Nippon Steel India Ltd. vs. Essar Bulk Terminal Ltd. case?",
    "How did the dispute in the Arcelor Mittal Nippon Steel India Ltd. vs. Essar Bulk Terminal Ltd. case reach the court?",
    "What was the primary legal issue in the Arcelor Mittal Nippon Steel India Ltd. vs. Essar Bulk Terminal Ltd. case?",
    "Did the case address the arbitrability of disputes in the context of a commercial agreement?",
    "What role did the efficacy of Section 17 remedies play in the legal proceedings?",
    "What was the petitioner's main argument regarding court intervention under Section 9 of the Arbitration Act?",
    "How did the petitioner view the role of the Arbitral Tribunal in granting interim relief?",
    "What legal authorities or precedents did the petitioner cite to support their arguments?",
    "What stance did the respondent take regarding the arbitrability of the dispute?",
    "On what grounds did the respondent seek interim measures from the court?",
    "How did the respondent justify the court's power to entertain Section 9 applications after the Arbitral Tribunal's constitution?",
    "Which legal precedents were analyzed regarding court's power under Section 9 after Arbitral Tribunal constitution?",
    "How did precedents influence the court's view on the interaction between Sections 9 and 17 of the Arbitration Act?",
    "What precedent did the court consider significant for the principle of minimal court intervention in arbitration?",
    "What legal framework governs the arbitration process as discussed in the case?",
    "How does the court interpret the role of Section 9 in the arbitration process?",
    "What is the significance of the court's analysis on the arbitrability of disputes in commercial agreements?",
    "How did the court justify its authority to grant interim measures under Section 9 after the constitution of an arbitral tribunal?",
    "In what way did the court address the concerns regarding the overlap between Sections 9 and 17 of the Arbitration Act?",
    "What impact does the court's decision have on the principle of minimal court intervention in arbitration?",
    "Describe the nature and scope of the Cargo Handling Agreement that led to the dispute between Arcelor Mittal Nippon Steel India Ltd. and Essar Bulk Terminal Ltd.",
    "What were the specific amendments to the Cargo Handling Agreement disputed by the parties?",
    "How did the contractual obligations under the Cargo Handling Agreement become a point of contention?",
    "What complex legal principles were at stake in this arbitration case?",
    "How did the court interpret the application of Section 9 vs. Section 17 of the Arbitration and Conciliation Act in this context?",
    "What was the significance of the court's interpretation of the Arbitration Act in the context of commercial arbitration?",
    "How did the court justify its decision regarding the entertainability of Section 9 applications post-Arbitral Tribunal constitution?",
    "What legal precedents did the court rely on to support its reasoning?",
    "How did the court's reasoning address the balance between arbitration autonomy and the need for judicial intervention?",
    "In what way did the court apply the principles of contract law to the dispute?",
    "How did the court view the role of arbitration in resolving commercial disputes in this case?",
    "What implications did the court's decisions have for the future of arbitration in commercial agreements?"
]


#### Running the Test Question set

In [None]:
from tqdm import tqdm

# Answer every test question: retrieve and rerank context, then query the LLM.
Answers = []
for question in tqdm(questions, desc="Processing Questions"):
    Answers.append(Provide_Answer(question, provide_chunks(question)))

#### Initializing Eval Pipeline

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used to score generated answers against the references.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
def calculate_semantic_similarity(answer, ref_answer, model):
    """
    Score how close a generated answer is to its reference answer.

    Both texts are printed to stdout, encoded with ``model`` and compared by
    cosine similarity.

    Args:
    - answer (str): The generated (predicted) answer.
    - ref_answer (str): The reference answer.
    - model: Encoder exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
    - float: Cosine similarity between the two encodings.
    """
    print("Actual Answer: ", ref_answer)
    print("Predicted Answer: ", answer)

    # Encode the prediction first, then the reference.
    answer_vec, reference_vec = (
        model.encode(text, convert_to_tensor=True) for text in (answer, ref_answer)
    )

    # cos_sim returns a single-element tensor here; unwrap it to a plain number.
    return util.cos_sim(answer_vec, reference_vec).item()


#### Loading Human Evaluated DataSet

In [None]:
import json
# Read the human-evaluated reference Q&A file into `data`.
with open("Human_eval.json", mode="r", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
# Pull the list of Q&A records out of the loaded JSON.
Ref_answers = data["qaArray"]

In [None]:
# Build the answers from `data` rather than from the previous value of
# `Ref_answers`, so re-running this cell gives the same result instead of
# failing on the already-flattened list of strings.
Ref_answers = [ans["answer"] for ans in data["qaArray"]]

##### Performing Evaluation of Datasets

In [None]:
# Score each generated answer against the reference answer at the same position.
scores = []
for i, question in enumerate(questions):
    print(f"Question:{i}:{question}")
    score = calculate_semantic_similarity(Answers[i], Ref_answers[i], model)
    print(score)
    scores.append(score)
    print("\n========================\n")

#### Calculate Average Eval Score

In [None]:
# Mean similarity over all scored questions; NaN (rather than a
# ZeroDivisionError) when nothing was scored.
total = sum(scores)
average = total / len(scores) if scores else float("nan")
print("Average Score: ", average)