Load Libraries

In [1]:
import os
import json
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from tqdm import tqdm
import math
import numpy as np

Sentence Splitting & Chunking

In [2]:
def split_paragraphs(rawText):
    text_splitter = RecursiveCharacterTextSplitter(
        separators=[r'[0-9]{1}?[0-9]{1}[.][\s][\D][\bCHAPTER \s\b]','.'],
        chunk_size=400,
        chunk_overlap=250,
        length_function=len,
        is_separator_regex=True,
    )

    return  text_splitter.split_text(rawText)

PDF Extraction

In [3]:
def extract_pdfs(pdfs):
    l = []
    for pdf in pdfs:
        title = pdf[7:]
        if(title != "The Sexual Harassment of Women at Workplace.pdf"):
            continue
        print(title)
        reader = PdfReader(pdf)

        for i, page in enumerate(reader.pages):
            if(i < 2):
                continue
            else :
                raw = page.extract_text()
                chunks = split_paragraphs(raw)
                for text in chunks:
                    dict_temp = {"title" : title, "convo" : text, "link":"", "domain": "Corporate Law"}
                    l.append(dict_temp)
    return l

Create Question Prompt

In [4]:
def create_question(doc):
    model = ChatOllama(model="llama3")
    question_schema = ResponseSchema(
        name="question",
        description="a question about the context"
    )
    question_response_schema = [question_schema]

    question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schema)
    format_instructions = question_output_parser.get_format_instructions()

    bare_prompt_template = "{content}"
    bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)
    qa_template = """\
        You are an indian citizen facing a legal query or require legal advice from a very skilled
        laywer. For each context and title, create a question that is specific to the context and title. The question can be simple or
        scenario based too. Avoid creating generic or general questions.

        
        Format the output as JSON with following keys:-
            question
            title
        
            
        context:{context}
        title:{title}
    """

    prompt_template = ChatPromptTemplate.from_template(template = qa_template)
    messages = prompt_template.format_messages(
        context = doc["convo"],
        title = doc["title"],
        format_instructions=format_instructions
    )
    question_generation_chain = bare_template | model
    response = question_generation_chain.invoke({"content" : messages})
    output_dict = None
    try:
        output_dict = question_output_parser.parse(response.content)
    except Exception as e:
        pass
    
    return output_dict

Create Answer Prompt

In [7]:
def generate_answers(doc):
    model = ChatOllama(model="llama3")
    answer_schema = ResponseSchema(
        name="answer",
        description="an answer to the question"
    )
    question_response_schema = [answer_schema]

    question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schema)
    format_instructions = question_output_parser.get_format_instructions()

    bare_prompt_template = "{content}"
    bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)
    qa_template = """\
        You are an experienced and skilled indian corporate laywer who answers the queries and provide explaination
        to the indian citizens queries in simple language and be comprehensive in your answer. For each given question, context and title, answer the
        question given to you based solely on the  information provided in the context. Do not provide any external information or links.
        
        Format the output as JSON with following keys:-
            answer
            question
            title
        
        question:{question}
        context:{context}
        title: {title}
    """

    prompt_template = ChatPromptTemplate.from_template(template = qa_template)
    messages = prompt_template.format_messages(
        question = doc["question"],
        context = doc["context"],
        title = doc["title"],
        format_instructions=format_instructions
    )
    question_generation_chain = bare_template | model
    response = question_generation_chain.invoke({"content" : messages})
    output_dict = None
    try:
        output_dict = question_output_parser.parse(response.content)
    except Exception as e:
        pass
    
    return output_dict

In [8]:
pdfs = []
for _,_,files in os.walk('./Acts', topdown=True):
    for file in files:
        path = './Acts/' + file
        pdfs.append(path)

docs = extract_pdfs(pdfs)
print(docs[0])

The Sexual Harassment of Women at Workplace.pdf
{'title': 'The Sexual Harassment of Women at Workplace.pdf', 'convo': '3 \n THE SEXUAL HARASSMENT OF WOMEN AT WORKPLACE  \n(PREVENTION, PROHIBITION AND REDRESSAL) ACT, 2013  \nACT NO. 14 OF 2013  \n[22nd April, 2013 ] \nAn Act to provide protection against sexual harassment of women at workplace and for the \nprevention and redressal of complaints of sexual harassment and for matters connected \ntherewith or incidental thereto.  \nWHEREAS  sexual harassment results in violat', 'link': '', 'domain': 'Corporate Law'}


In [20]:
for i, doc in enumerate(docs):
    print("Doc number: " + str(i))
    print(doc['convo'])

Doc number: 0
3 
 THE SEXUAL HARASSMENT OF WOMEN AT WORKPLACE  
(PREVENTION, PROHIBITION AND REDRESSAL) ACT, 2013  
ACT NO. 14 OF 2013  
[22nd April, 2013 ] 
An Act to provide protection against sexual harassment of women at workplace and for the 
prevention and redressal of complaints of sexual harassment and for matters connected 
therewith or incidental thereto.  
WHEREAS  sexual harassment results in violat
Doc number: 1
to provide protection against sexual harassment of women at workplace and for the 
prevention and redressal of complaints of sexual harassment and for matters connected 
therewith or incidental thereto.  
WHEREAS  sexual harassment results in violation of the fundamental rights of a woman to equality 
under articles 14 and 15 of the Constitution of India and her right to life and to live with dig
Doc number: 2
r matters connected 
therewith or incidental thereto.  
WHEREAS  sexual harassment results in violation of the fundamental rights of a woman to equality 
und

In [11]:
qac_triplet = []

for doc in tqdm(docs):
    res = create_question(doc)
    if(res) :
        res['context'] = doc['convo']
        print(res['question'])
        qac_triplet.append(res)
        break


  0%|          | 0/238 [00:15<?, ?it/s]

What are my rights as a woman employee if I experience sexual harassment at my workplace, and how can I file a complaint under the Sexual Harassment of Women at Workplace (Prevention, Prohibition and Redressal) Act, 2013?





In [12]:
(qac_triplet[0])

{'question': 'What are my rights as a woman employee if I experience sexual harassment at my workplace, and how can I file a complaint under the Sexual Harassment of Women at Workplace (Prevention, Prohibition and Redressal) Act, 2013?',
 'title': 'The Sexual Harassment of Women at Workplace.pdf',
 'context': '3 \n THE SEXUAL HARASSMENT OF WOMEN AT WORKPLACE  \n(PREVENTION, PROHIBITION AND REDRESSAL) ACT, 2013  \nACT NO. 14 OF 2013  \n[22nd April, 2013 ] \nAn Act to provide protection against sexual harassment of women at workplace and for the \nprevention and redressal of complaints of sexual harassment and for matters connected \ntherewith or incidental thereto.  \nWHEREAS  sexual harassment results in violat'}

Reflection Prompt & Evaluator Prompt

In [9]:
def check_answer(answer, question):
    model = ChatOllama(model = 'llama3')
    eval_prompt = """You are an experienced and skilled indian corporate laywer who answers the queries and provide explaination
        to the indian citizens queries in simple language. You will be given a question and an answer, verify if the given answer answers 
        the given question correctly. If it does answer the question correctly, return True otherwise False.

        <question>
        {question}
        </question>
        <answer>
        {answer}
        </answer>
        """

    eval_chat = ChatPromptTemplate.from_messages(
        [
            ("system", eval_prompt),
        ]
    )
    ref_prompt = """You are an experienced and skilled indian corporate laywer. You will be given a question rewrite an improved version of question.

        <question>
        {question}
        </question>
        """

    ref_chat = ChatPromptTemplate.from_messages(
        [
            ("system", ref_prompt),
        ]
    )
    check = eval_chat | model
    c = check.invoke({'question' : question, 'answer' : answer})
    if(c == "False") :
        rewrite = ref_chat | model
        c = rewrite.invoke({'question' : question})
        return c
    
    return "False"

In [15]:
dataset = []
for doc in tqdm(qac_triplet):
    if(doc):
        res = generate_answers(doc)
        check = check_answer(answer=res["answer"], question=doc['question'])
        if check != "False":
            doc['question'] = check
            res = generate_answers(doc)
        if(res):
            dict_temp = {
                "question" : doc["question"],
                "ground_truth" : res["answer"],
                "context" : doc["context"],
                "title" : doc["title"]
            }
            print(dict_temp["question"])
            print(dict_temp["ground_truth"])

            print()
            dataset.append(dict_temp)


100%|██████████| 1/1 [00:37<00:00, 37.77s/it]

What are my rights as a woman employee if I experience sexual harassment at my workplace, and how can I file a complaint under the Sexual Harassment of Women at Workplace (Prevention, Prohibition and Redressal) Act, 2013?
As a woman employee, if you experience sexual harassment at your workplace, you have the right to file a complaint under the Sexual Harassment of Women at Workplace (Prevention, Prohibition and Redressal) Act, 2013. You can file a complaint with the Internal Complaint Committee (ICC) or the Local Complaint Committee (LCC), as applicable.

To file a complaint, you need to provide a written complaint to the ICC/LCC within 90 days of the alleged incident. The complaint should include your name and contact information, a description of the incident, and any evidence you may have (if any).

The ICC/LCC will investigate the complaint and take appropriate action if the allegations are substantiated. You can also request confidentiality during the investigation process.

Addi




In [18]:
print(dataset[0]['ground_truth'])
print(dataset[0]['context'])

As a woman employee, if you experience sexual harassment at your workplace, you have the right to file a complaint under the Sexual Harassment of Women at Workplace (Prevention, Prohibition and Redressal) Act, 2013. You can file a complaint with the Internal Complaint Committee (ICC) or the Local Complaint Committee (LCC), as applicable.

To file a complaint, you need to provide a written complaint to the ICC/LCC within 90 days of the alleged incident. The complaint should include your name and contact information, a description of the incident, and any evidence you may have (if any).

The ICC/LCC will investigate the complaint and take appropriate action if the allegations are substantiated. You can also request confidentiality during the investigation process.

Additionally, under the Act, you have the right to:

* Report incidents of sexual harassment to your supervisor or HR department
* Seek a safe working environment free from sexual harassment
* Get support and assistance in fil

In [None]:
print("--------Storing---------")
with open("final_batch4.json", "r+") as final:
        file_data = json.load(final)
        for doc in dataset:
                    file_data['data'].append(doc)
                    final.seek(0)
                    json.dump(file_data, final)
        final.close()

Generating Embbedings

In [10]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="llama3",
)

In [11]:
f_d = []
for i in range(1, 4):
    with open(f"final_batch{i}.json", "r+") as f:
        data = json.load(f)
        for d in tqdm(data["data"]):
            embd = embeddings.embed_query(d["question"])
            embd = np.array(embd)
            f_d.append((embd,d))

100%|██████████| 205/205 [01:25<00:00,  2.41it/s]
100%|██████████| 53/53 [00:15<00:00,  3.46it/s]
100%|██████████| 246/246 [01:33<00:00,  2.64it/s]


Finding out angle from a refrence vector

In [12]:
def angle_between(u, v) :
    dot_product = sum(i * j for i, j in zip(u, v))
    norm_u = math.sqrt(sum(i*i for i in u))
    norm_v = math.sqrt(sum(i*i for i in v))
    cos_theta = dot_product/(norm_u * norm_v)
    angle_rad = math.acos(cos_theta)
    angle_deg = math.degrees(angle_rad)

    return angle_deg

def cosine_sim(u, v) :
    dot_product = sum(i * j for i, j in zip(u, v))
    norm_u = math.sqrt(sum(i*i for i in u))
    norm_v = math.sqrt(sum(i*i for i in v))
    cos_theta = dot_product/(norm_u * norm_v)

    return cos_theta


In [13]:
refrence_vec = np.ones(4096)
deg_ind_pair = []
for i, u in enumerate(f_d) :
    vec = u[0]
    angle = angle_between(vec, refrence_vec)
    deg_ind_pair.append(tuple([angle, i]))


In [14]:
deg_ind_pair.sort()

In [15]:
window_factor = 3
final_mean_questions = []
i = 0
for i in tqdm(range(0, len(deg_ind_pair) - window_factor + 1, window_factor)) :
    chosen_ques = None
    maxi = 0
    for j in range(i, i + window_factor) :
        ind = deg_ind_pair[j][1]
        ques_vec = f_d[ind][0]
        cont_vec = np.array(embeddings.embed_query(f_d[ind][1]['context']))
        sim_score = cosine_sim(ques_vec, cont_vec)
        if sim_score > maxi :
            maxi = sim_score
            chosen_ques = f_d[ind][1]
    final_mean_questions.append(chosen_ques)
    i += (window_factor)

100%|██████████| 168/168 [05:06<00:00,  1.82s/it]


In [16]:
print(len(final_mean_questions))

168


In [17]:
print("--------Storing---------")
with open("final_mean_questions.json", "r+") as final:
        file_data = json.load(final)
        for doc in final_mean_questions:
                    file_data['data'].append(doc)
                    final.seek(0)
                    json.dump(file_data, final)
        final.close()

--------Storing---------
