In [22]:

import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import pandas as pd
import tiktoken
import openai
import numpy as np
import os
from openai.embeddings_utils import distances_from_embeddings
import time

openai.api_key = os.getenv('Add your API key here') #Add your API key here

COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

In [7]:
tokenizer = tiktoken.get_encoding("cl100k_base")

df = pd.read_csv('data.csv', index_col=None )
df.columns = ['title','heading', 'content']
df.head()

Unnamed: 0,title,heading,content
0,SectionNumber: 1,Title and extent of operation of the Code.,1\nTitle and extent of operation of the Code. ...
1,SectionNumber: 2,Punishment of offences committed within Pakistan.,2\nPunishment of offences committed within Pak...
2,SectionNumber: 3,"Punishment of offences committed beyond, but w...","3\nPunishment of offences committed beyond, bu..."
3,SectionNumber: 4,Extension of Code to extra-territorial offences.,4\nExtension of Code to extra-territorial offe...
4,SectionNumber: 5,Certain laws not to be affected by this Act.,5\nCertain laws not to be affected by this Act...


In [8]:
df['n_tokens'] = df.content.apply(lambda x: len(tokenizer.encode(x)))
df.head()

Unnamed: 0,title,heading,content,n_tokens
0,SectionNumber: 1,Title and extent of operation of the Code.,1\nTitle and extent of operation of the Code. ...,38
1,SectionNumber: 2,Punishment of offences committed within Pakistan.,2\nPunishment of offences committed within Pak...,53
2,SectionNumber: 3,"Punishment of offences committed beyond, but w...","3\nPunishment of offences committed beyond, bu...",87
3,SectionNumber: 4,Extension of Code to extra-territorial offences.,4\nExtension of Code to extra-territorial offe...,533
4,SectionNumber: 5,Certain laws not to be affected by this Act.,5\nCertain laws not to be affected by this Act...,75


In [13]:
max_tokens = 500

# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens):

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
    
    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater 
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of 
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    return chunks
    

shortened = []

# Loop through the dataframe
for row in df.iterrows():

    # If the text is None, go to the next row
    if row[1]['content'] is None:
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row[1]['n_tokens'] > max_tokens:
        shortened += split_into_many(row[1]['content'])
    
    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append( row[1]['content'] )

In [14]:
df['content'] = pd.DataFrame(shortened, columns = ['content'])
df['n_tokens'] = df.content.apply(lambda x: len(tokenizer.encode(x)))
df.head()

Unnamed: 0,title,heading,content,n_tokens
0,SectionNumber: 1,Title and extent of operation of the Code.,1\nTitle and extent of operation of the Code. ...,38
1,SectionNumber: 2,Punishment of offences committed within Pakistan.,2\nPunishment of offences committed within Pak...,53
2,SectionNumber: 3,"Punishment of offences committed beyond, but w...","3\nPunishment of offences committed beyond, bu...",87
3,SectionNumber: 4,Extension of Code to extra-territorial offences.,4\nExtension of Code to extra-territorial offe...,495
4,SectionNumber: 5,Certain laws not to be affected by this Act.,5\nCertain laws not to be affected by this Act...,75


In [17]:
df.to_csv('processed/tokens.csv')

In [18]:
df=pd.read_csv('processed/tokens.csv', index_col=0)

In [20]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    result = openai.Embedding.create(
      model=model,
      input=text
    )
    time.sleep(7)
    return result["data"][0]["embedding"]

In [23]:
df['embeddings'] = df.content.apply(lambda x: get_embedding(x))
df.head()


Unnamed: 0,title,heading,content,n_tokens,embeddings
0,SectionNumber: 1,Title and extent of operation of the Code.,1\nTitle and extent of operation of the Code. ...,38,"[-0.005502332001924515, -0.0008215743000619113..."
1,SectionNumber: 2,Punishment of offences committed within Pakistan.,2\nPunishment of offences committed within Pak...,53,"[-0.004243207164108753, -0.0023920040111988783..."
2,SectionNumber: 3,"Punishment of offences committed beyond, but w...","3\nPunishment of offences committed beyond, bu...",87,"[0.009597471915185452, -0.007315146271139383, ..."
3,SectionNumber: 4,Extension of Code to extra-territorial offences.,4\nExtension of Code to extra-territorial offe...,495,"[0.002917807549238205, -0.01045273244380951, 0..."
4,SectionNumber: 5,Certain laws not to be affected by this Act.,5\nCertain laws not to be affected by this Act...,75,"[-0.038121387362480164, -0.013538314960896969,..."


In [24]:
df.to_csv('processed/embeddings.csv')

In [25]:
df=pd.read_csv('processed/embeddings.csv', index_col=0)
df.head()
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)

In [34]:
df.head()

Unnamed: 0,title,heading,content,n_tokens,embeddings
0,Scope of Practice 1.1. Scope of Practice Intro...,Learning Objectives Discuss nursing scope of p...,Scope of Practice 1.1. Scope of Practice Intro...,295,"[0.011186833493411541, -0.011353340931236744, ..."
1,Scope of Practice 1.2 History and Foundation,Brief History of Nursing Before discussing sco...,Scope of Practice 1.2 History and Foundation B...,463,"[0.013812066987156868, -0.002538236789405346, ..."
2,Scope of Practice 1.2 History and Foundation,American Nurses Association (ANA) The American...,Scope of Practice 1.2 History and Foundation A...,269,"[-0.005683619063347578, -0.01693866029381752, ..."
3,Scope of Practice 1.3 Regulations & Standards,Open Resources for Nursing (Open RN) Standards...,Scope of Practice 1.3 Regulations & Standards ...,70,"[0.013907037675380707, -0.019138362258672714, ..."
4,Scope of Practice 1.3 Regulations & Standards,ANA Scope and Standards of Practice The Americ...,Scope of Practice 1.3 Regulations & Standards ...,478,"[0.006088844034820795, -0.00912543199956417, 0..."


In [26]:

from openai.embeddings_utils import distances_from_embeddings

In [27]:
def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')


    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():
        
        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4
        
        # If the context is too long, break
        if cur_len > max_len:
            break
        
        # Else add it to the text that is being returned
        returns.append(row["content"])

    # Return the context
    return "\n\n###\n\n".join(returns)

def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )
    # If debug, print the raw model response
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completions using the questin and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(e)
        return ""

In [30]:
print(answer_question(df, question="What are the circumstances under which a consent is not considered valid according to the provisions of the Code, including giving consent under fear or misconception, giving consent while under unsoundness of mind or intoxication, or giving consent when the person is under the age of twelve?"))

A consent is not considered valid if it is given under fear or misconception, if the person giving the consent is under unsoundness of mind or intoxication, or if the person giving the consent is under the age of twelve.


In [31]:
print(answer_question(df, question="What is the legal definition of abetment of a thing?"))

Abetment of a thing is defined as a person who instigates any person to do that thing, engages with one or more other person or persons in any conspiracy for the doing of that thing, or intentionally aids, by any act or illegal omission, the doing of that thing.


In [32]:
print(answer_question(df, question="What are the three ways in which a person can be considered to have abetted the doing of a thing?"))

The three ways in which a person can be considered to have abetted the doing of a thing are: 1) Instigating any person to do that thing; 2) Engaging with one or more other person or, persons in any conspiracy for the doing of that thing, if an act or illegal omission takes place in pursuance of that conspiracy, and in order to the doing of that thing; and 3) Intentionally aiding, by any act or illegal omission, the doing of that thing.
