In [None]:
import os
from typing import List
import csv
from tqdm import tqdm      

from datetime import datetime
from datasets import load_dataset
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.prompts.example_selector import (
    MaxMarginalRelevanceExampleSelector,
    SemanticSimilarityExampleSelector,
)
from langchain_community.vectorstores import ( 
    Chroma,
    FAISS,
)
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_core.output_parsers import (
    StrOutputParser,
    JsonOutputParser,
    PydanticOutputParser
)
from langchain_core.utils.function_calling import convert_to_openai_function

In [None]:
# OpenAI credentials are handed over through environment variables.
# Replace both placeholders locally before running, and never commit real values.
os.environ.update(
    {
        "OPENAI_API_KEY": "<Your API key here>",
        "OPENAI_ORGANIZATION": "<Your organisation here>",
    }
)

In [None]:
# Schema for one generated question/answer pair; GenQList uses it as the
# element type of its "data" field.
# NOTE(review): the class docstring and the Field descriptions below are
# presumably included in the JSON schema that the output parser embeds in its
# format instructions - treat their wording as prompt text and confirm before
# rephrasing them.
class GenQ(BaseModel):
    """Individual QnA pair"""
    # Text of the generated question.
    question: str= Field(description="individual question generated")
    # Passage the question was generated from (the description calls it the "sub-context").
    answer: str= Field(description="corresponding sub-context")

# Top-level response schema: an object whose "data" field holds the list of
# GenQ pairs. QuestionGeneration passes this class to JsonOutputParser as its
# pydantic_object.
# NOTE(review): the docstring below presumably ends up in the generated JSON
# schema as well - confirm before rewording it.
class GenQList(BaseModel):
    """List of QnA pairs"""
    data: List[GenQ]  # every question/answer pair produced for one context

class QuestionGeneration:
    """Few-shot question generator driven by a chat model.

    Example question/answer pairs are read from the ``"train"`` split of a
    dataset, the most relevant ones are picked per request by a
    max-marginal-relevance example selector, and the model is asked to produce
    new pairs for a supplied context.
    """

    def __init__(self, dataset, xot):
        """Load the example dataset and set up the LLM and output parser.

        Args:
            dataset: Identifier passed to ``load_dataset``. Its ``"train"``
                split must provide ``"question"`` and ``"answer"`` fields.
            xot: Instruction text substituted for ``{instruction}`` in the
                prompt on every call to :meth:`invoke`.
        """
        self.dataset = load_dataset(dataset)
        self.data_dict_list_qna_pairs = [
            {"question": row["question"], "answer": row["answer"]}
            for row in self.dataset["train"]
        ]
        self.prompt = None
        self.llm = ChatOpenAI(temperature=0.2)
        self.xot = xot
        self.output_parser = JsonOutputParser(pydantic_object=GenQList)
        # Created lazily by _get_example_selector() and then reused, so the
        # example set is indexed once per instance instead of once per call.
        self._example_selector = None

    def _get_example_selector(self):
        """Return the example selector, building it on first use only."""
        if self._example_selector is None:
            self._example_selector = MaxMarginalRelevanceExampleSelector.from_examples(
                self.data_dict_list_qna_pairs,  # examples
                OpenAIEmbeddings(),  # HuggingFaceEmbeddings(),
                FAISS,  # Chroma,
                k=10,
            )
        return self._example_selector

    def similarity_prompt(self, context):
        """Build the few-shot prompt template.

        Args:
            context: Accepted for backward compatibility; it is not used while
                building the template. The actual context is supplied when the
                chain is invoked.

        Returns:
            A ``FewShotPromptTemplate`` expecting ``context`` and
            ``instruction`` inputs, with ``format_instructions`` pre-filled
            from the output parser.
        """
        example_prompt = PromptTemplate(
            input_variables=["question", "answer"],
            template="Context:{answer}\nQuestion: {question}",
        )

        return FewShotPromptTemplate(
            # Reuse the cached selector: rebuilding it for every prompt would
            # re-index all examples on each call.
            example_selector=self._get_example_selector(),
            example_prompt=example_prompt,
            prefix="These are the few examples of questions pertaining to the Indian Constitution, judiciary, legislative, and various socio-political issues in India. \n<Examples>",
            suffix="</Examples>\n<Provided context>\n{context}\n</Provided context>\n{format_instructions}\n{instruction}",
            input_variables=["context", "instruction"],
            partial_variables={"format_instructions": self.output_parser.get_format_instructions()},
        )

    def invoke(self, context, raw_output=False):
        """Generate question/answer pairs for ``context``.

        Args:
            context: Text the questions should be generated from.
            raw_output: When true, return the model's response as-is;
                otherwise pipe it through ``self.output_parser`` first.

        Returns:
            The chain's result: the unparsed model response if ``raw_output``
            is true, else whatever the output parser produces.
        """
        self.prompt = self.similarity_prompt(context)
        chain = self.prompt | self.llm
        if not raw_output:
            chain = chain | self.output_parser
        return chain.invoke({"context": context, "instruction": self.xot})

In [None]:
import json

def save_to_json(data, file_path):
    """Append ``data`` to the JSON array stored at ``file_path``.

    The file is created when it does not exist yet. A file that exists but is
    empty (or whitespace only) is treated as holding no records.

    Args:
        data: Iterable of JSON-serializable items to append.
        file_path: Path of the JSON file holding a top-level array.

    Raises:
        json.JSONDecodeError: If the existing file holds invalid JSON.
        TypeError: If an item cannot be serialized; in that case the file on
            disk is left untouched.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as json_file:
            raw_text = json_file.read()
    except FileNotFoundError:
        raw_text = ""

    existing_data = json.loads(raw_text) if raw_text.strip() else []
    existing_data.extend(data)

    # Serialize before opening for writing: mode 'w' truncates the file at
    # once, so a failure in the middle of dumping would wipe out every record
    # saved by earlier calls.
    serialized = json.dumps(existing_data, indent=2)

    with open(file_path, 'w', encoding='utf-8') as json_file:
        json_file.write(serialized)

def add_to_dict(list_of_dicts, doc_id, para_no):
    """Tag every dictionary in ``list_of_dicts`` with its source location.

    The dictionaries are modified in place: each one gets a ``"doc_id"`` and a
    ``"paragraph_no"`` key.

    Args:
        list_of_dicts: Dictionaries to tag, or ``None`` (e.g. when a response
            has no ``"data"`` entry).
        doc_id: Value stored under ``"doc_id"``.
        para_no: Value stored under ``"paragraph_no"``.

    Returns:
        The same list object that was passed in, or an empty list when
        ``list_of_dicts`` is ``None``.
    """
    if list_of_dicts is None:
        # Nothing to tag; return a list so callers can still extend() with it.
        return []
    for dictionary in list_of_dicts:
        dictionary["doc_id"] = doc_id
        dictionary["paragraph_no"] = para_no
    return list_of_dicts

In [None]:
# Input: CSV with one row per paragraph and the columns doc_id, paragraph_num
# and content.
csv_file_path = "paragraphs_dataset.csv"

# 'utf-8-sig' drops a leading byte-order mark so the first header is read as "doc_id".
with open(csv_file_path, 'r', newline='', encoding='utf-8-sig') as csvfile:
    reader = csv.DictReader(csvfile)
    paragraphs = [{"doc_id": row["doc_id"], "paragraph_num": row["paragraph_num"], 'content': row["content"]} for row in reader]

# Half-open range [start_paragraph, end_paragraph) of 0-based row indices
# handled by this run.
start_paragraph = 1680
end_paragraph = 2000

# Fail before any model call is made if the range does not fit the loaded CSV,
# rather than hitting an IndexError part-way through the run.
if not 0 <= start_paragraph <= end_paragraph <= len(paragraphs):
    raise ValueError(
        f"Paragraph range [{start_paragraph}, {end_paragraph}) is outside the "
        f"{len(paragraphs)} rows loaded from {csv_file_path}"
    )

# Output file name carries the range and a timestamp, so every run writes to a new file.
json_file_path = f"qna_{start_paragraph}_{end_paragraph}_{datetime.now().strftime('%Y%m%d%H%M%S')}_json.json"

# Instruction sent with every request: the step-by-step walkthrough sentence
# followed by the question-generation task.
tot = "Walk me through this context in manageable parts, step by step, summarizing and analysing as we go"
inl = "Generate the multiple questions from the given context. The generated questions should be similar to question as in examples. The only thing generated should be the questions and corresponding context( from original text) from which the question generated."
inl = tot+" and "+inl

total_paragraphs = end_paragraph - start_paragraph

progress_bar = tqdm(total=total_paragraphs, desc="Processing")

# Results collected since the last write to disk.
tmp = []
try:
    qg = QuestionGeneration("nisaar/Lawyer_GPT_India", inl)
    for index in range(start_paragraph, end_paragraph):
        paragraph = paragraphs[index].get("content")
        # Collapse all runs of whitespace (including newlines) into single spaces.
        text = " ".join(paragraph.split())
        doc_id = paragraphs[index].get("doc_id")
        para_no = paragraphs[index].get("paragraph_num")
        response = qg.invoke(context=text)
        # Only dict responses carrying a non-empty "data" entry contribute
        # results; anything else is skipped for this paragraph.
        qna_pairs = response.get("data") if isinstance(response, dict) else None
        if qna_pairs:
            modified_response = add_to_dict(qna_pairs, doc_id=doc_id, para_no=para_no)
            tmp.extend(modified_response)
        # Write to disk whenever the index ends in 9 and after the last paragraph.
        if (index % 10 == 9) or (index == end_paragraph - 1):
            save_to_json(tmp, json_file_path)
            tmp = []
        progress_bar.update(1)
finally:
    try:
        # If the loop was interrupted by an error, persist what was gathered
        # since the last write instead of losing it.
        if tmp:
            save_to_json(tmp, json_file_path)
            tmp = []
    finally:
        progress_bar.close()
        # NOTE(review): touches a private tqdm attribute, presumably to drop
        # stale bars between notebook re-runs - confirm it is still needed.
        tqdm._instances.clear()

print(f"List of JSON objects saved to {json_file_path}")
