In [1]:
## Uncomment code below if you are running this in the old environment
!pip install -r ../requirements.txt



In [2]:
import warnings
warnings.filterwarnings("ignore")

import openai
import tiktoken
from pprint import pprint
from getpass import getpass
from rich.markdown import Markdown

import os
import glob
import textwrap
import time

import langchain

# loaders
from langchain.document_loaders import DirectoryLoader

# splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

# prompts
from langchain import PromptTemplate, ConversationChain, LLMChain

# vector stores
from langchain.vectorstores import Chroma, FAISS

# retrievers
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

from langchain.embeddings import OpenAIEmbeddings

# dotenv file reader
from dotenv import load_dotenv

print(langchain.__version__)

0.1.6


In [3]:
from langchain_openai import OpenAIEmbeddings

In [5]:
# Restart the kernel every time you change a value in the .env file:
# by default load_dotenv() keeps variables that are already set in the process.
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
EMBEDDING_PERSIST_DIR = os.getenv('EMBEDDING_PERSIST_DIR')
PROCESSED_FILES_DIR = os.getenv('PROCESSED_FILES_DIR')
# MODEL_NAME = os.getenv('MODEL_NAME')
MODEL_NAME = 'gpt-4-1106-preview'

if not OPENAI_API_KEY:
    print('PLEASE UPDATE THE ENV FILE !')
else:
    # Never echo any part of the secret: notebook outputs are saved and shared
    # together with the code, so even a key suffix ends up in version control.
    print(f'OPENAI_API_KEY loaded ({len(OPENAI_API_KEY)} characters)')
    print(EMBEDDING_PERSIST_DIR)
    print(PROCESSED_FILES_DIR)
    print(MODEL_NAME)

sk - ..............[REDACTED]
../embeddings_folder/embeddings_v3/
../dataset/processed/clean_txt_files_22_feb/
gpt-4-1106-preview


In [6]:
vectordb = Chroma(persist_directory=EMBEDDING_PERSIST_DIR, embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY))

# Token count of the user-uploaded documents (sample assumption)

In [7]:
!pip install PyPDF2>3.0

In [8]:
!pip install camelot-py[base]



In [9]:
!pip install PyMuPDF



In [None]:
import os
import PyPDF2

# Directory containing the PDF files to analyse
directory = 'test_data/'

# Text of every page of every PDF, in a deterministic (sorted) file order.
# os.listdir() returns entries in arbitrary order, which would make the
# concatenated text (and everything derived from it) differ between runs.
page_texts = []

for filename in sorted(os.listdir(directory)):
    if not filename.lower().endswith(".pdf"):  # also accept ".PDF"
        continue

    filepath = os.path.join(directory, filename)
    print("Processing:", filepath)

    reader = PyPDF2.PdfReader(filepath)

    # Guard against a page yielding no text (e.g. scanned/image-only pages)
    page_texts.extend((page.extract_text() or "") for page in reader.pages)

# Join once instead of growing a string with += inside the loop
concatenated_text = "".join(page_texts)

# Show a short preview only; dumping the whole corpus bloats the notebook
print(f"Extracted {len(concatenated_text)} characters from {directory}")
print(concatenated_text[:500])

In [11]:
import re

def clean(text):
    """Normalise raw extracted text before token counting / prompting.

    Steps, in order: strip HTML anchor and image/figure tags, drop a known
    watermark string, flatten tabs/newlines/carriage returns and common HTML
    entities to spaces, remove URLs, replace every character that is not a
    word character, whitespace, '.' or ',' with a space, and finally remove
    runs of periods (including spaced dot leaders such as ". . . .").

    Args:
        text: Raw text (str).

    Returns:
        The cleaned text with runs of spaces collapsed to a single space.
        Leading/trailing spaces are not stripped.
    """
    # Remove html <a> tags
    text = re.sub(r"<a href[^>]*>([^<]+)</a>", " ", text)
    text = re.sub(r"<a rel[^>]*>([^<]+)</a>", " ", text)

    # Remove image-related tags. DOTALL lets a <figure> block span several
    # lines (newlines are only flattened further down).
    text = re.sub(r"<img[^>]*>", " ", text)
    text = re.sub(r"<figure[^>]*>.*?</figure>", " ", text, flags=re.DOTALL)
    # Remove any remaining tag that references a .png file (dot escaped so
    # that only a literal ".png" qualifies)
    text = re.sub(r"<[^>]*\.png[^>]*>", " ", text)

    # Replace specific domain
    text = text.replace("WWW. QQGIAT .NET", " ")

    # Replace special characters ("\r" was previously misspelled as "(\r",
    # which left carriage returns in the output)
    text = (
        text.replace("\t", " ")
        .replace("\n", " ")
        .replace("\r", " ")
        .replace("&nbsp;", " ")
        .replace("amp;", " ")
    )

    # Remove url links; the dot after "www" is escaped so ordinary words that
    # merely contain "www" are left alone
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\.\S+", "", text)

    # Keep word characters, whitespace, periods and commas only
    text = re.sub(r"[^\w\s.,]", " ", text)

    # Keep single spaces
    text = re.sub(" +", " ", text)

    # Remove long sequences of periods, but keep single periods and other punctuation
    text = re.sub(r"(?<!\w)\.{3,}(?!\w)", " ", text)  # 3 or more periods not surrounded by word characters
    text = re.sub(r"(?<!\w)\.{2,}(?!\w)", " ", text)  # 2 or more periods not surrounded by word characters

    # Remove sequences of 3 or more periods
    text = re.sub(r"\.{3,}", " ", text)
    # Remove spaced dot leaders (". . . ."). The previous pattern r"\. {3,}"
    # matched one period followed by three spaces, which deleted a genuine
    # sentence-ending period whenever an ellipsis followed it.
    text = re.sub(r"(?:\. ){3,}", " ", text)

    # Keep single spaces
    text = re.sub(" +", " ", text)

    return text
    
def second_clean(text):
    """Second cleaning pass: drop period runs and squeeze repeated spaces.

    Every run of two or more consecutive periods becomes a single space,
    after which any run of spaces is collapsed to one space.
    """
    without_period_runs = re.sub(r"\.{2,}", " ", text)
    return re.sub(" +", " ", without_period_runs)

In [12]:
clean_concatenated_text = clean(concatenated_text)

In [14]:
# clean_concatenated_text

In [15]:
import tiktoken
tokenizer = tiktoken.encoding_for_model(MODEL_NAME)

In [16]:
print(f'Total token of uploaded docs before cleaning: {len(tokenizer.encode(concatenated_text))}')

Total token of uploaded docs before cleaning: 118241


In [17]:
print(f'Total token of uploaded docs after cleaning: {len(tokenizer.encode(clean_concatenated_text))}')

Total token of uploaded docs after cleaning: 106842


In [18]:
# To save time: cache the extracted text on disk so the slow PDF extraction step can be skipped on later runs

In [17]:
# Cache the *cleaned* text: the next cell reads this file back into
# `clean_concatenated_text`, so writing the raw `concatenated_text` here would
# silently replace the cleaned text with the uncleaned one.
with open("output_text.txt", "w", encoding="utf-8") as text_file:
    text_file.write(clean_concatenated_text)

In [19]:
with open("output_text.txt", "r", encoding="utf-8") as input_file:
    # Read the content of the input file
    clean_concatenated_text = input_file.read()

# Token count for outline generation (with the user's uploaded files)

In [21]:
context = """
Australia is a diverse and rapidly growing country that has embraced the opportunities presented by advances in technology and artificial intelligence (AI). With a thriving economy and a focus on innovation and collaboration, Australia has positioned itself as a global leader in the development and adoption of AI and technology, driving economic growth and societal change.
AI and Technology Landscape:
Australia has seen significant advancements in AI and technology, with a growing ecosystem of AI startups, research institutions, and tech companies. The country has a strong emphasis on research and development, with several world-class universities and research organizations leading the way in AI and technology innovation. Additionally, Australia has a well-established tech industry, with companies such as Atlassian, Canva, and Afterpay making a global impact.
In recent years, the Australian government has made significant investments in AI and technology, recognizing the potential for these fields to drive economic growth and improve quality of life. Initiatives such as the Australian AI Action Plan and the National Innovation and Science Agenda provide a roadmap for the development and adoption of AI and technology in Australia.
Economic Impact:
The adoption of AI and technology has had a significant impact on the Australian economy. According to a report by AlphaBeta, AI is projected to add AU$22.17 trillion to the global economy by 2030, with Australia poised to benefit from this economic growth. The use of AI and technology in sectors such as healthcare, finance, and agriculture has increased productivity and efficiency, leading to job creation and improved living standards.
The Australian government has recognized the economic potential of AI and technology and has taken proactive steps to support the development and adoption of these technologies. Initiatives such as the Research and Development Tax Incentive and the AI and Emerging Technologies Initiative provide funding and support for AI and technology research and development, encouraging innovation and economic growth.
Challenges and Opportunities:
While Australia has made significant strides in AI and technology, there are still challenges that need to be addressed. One of the key challenges is the ethical and societal implications of AI, including issues such as privacy, bias, and job displacement. The Australian government has taken steps to address these challenges, with initiatives such as the AI Ethics Framework and the Digital Economy Strategy aiming to ensure that AI and technology are developed and used responsibly.
Australia also faces challenges in terms of talent and skills development in AI and technology. There is a growing demand for workers with AI and technology expertise, and Australia will need to invest in education and training programs to meet this demand. Initiatives such as the Australian Skills and Training Fund and the National Innovation and Science Agenda aim to address this challenge by providing funding and support for education and training in AI and technology.
Despite these challenges, Australia has a wealth of opportunities in AI and technology. The country has a strong foundation in research and development, a thriving tech industry, and a supportive government that is committed to driving innovation and economic growth. Australia also has a strong entrepreneurial culture, with many AI startups and tech companies emerging and making a global impact. With the right support and investment, Australia is well-positioned to continue driving economic growth and societal change through AI and technology.
Conclusion:
Australia has emerged as a global leader in AI and technology, with a thriving ecosystem of startups, research organizations, and tech companies driving innovation and economic growth. The country has made significant investments in AI and technology, recognizing the potential for these fields to improve quality of life and drive economic growth. While there are challenges to be addressed, Australia has a wealth of opportunities in AI and technology, and with the right support and investment, the country is well-positioned to continue driving economic growth and societal change through these fields.
"""
question = 'Policy Outline'

In [25]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import textwrap
import time

# V2
# Outline template text. In this notebook it is measured with the tokenizer and
# interpolated into `query`; it is NOT installed as the chain's prompt (see the
# commented-out QA_CHAIN_PROMPT below).
template = f"""
*****  
"""

# Instantiate the PromptTemplate with the specific context and question for generating the policy outline
# (currently disabled, so RetrievalQA is built without a custom prompt)
# QA_CHAIN_PROMPT = PromptTemplate(
#     # input_variables=["question","context"],
#     template=template
# )

# Upper bound on completion tokens for the outline request. The value is read
# once, when ChatOpenAI is constructed below; reassigning
# `max_tokens_suggested` later does not change `llm_new`.
max_tokens_suggested = 1024

from langchain_openai import ChatOpenAI
llm_new = ChatOpenAI(
    model_name=MODEL_NAME,
    openai_api_key = OPENAI_API_KEY,         
    max_tokens=max_tokens_suggested                
)

# Retrieval-augmented QA chain over the persisted Chroma store; requests the
# 7 nearest chunks per query and returns the source documents with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_new,
    retriever=vectordb.as_retriever(search_kwargs={'k': 7}),
    return_source_documents=True,
    # chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [26]:
print(f'*Total token for outline template: {len(tokenizer.encode(template))}')

*Total token for outline template: 1151


# POLICY OUTLINE 

In [27]:
# context = clean_concatenated_text
context = context
country_name = 'Australia'
question = 'Policy Outline'

In [29]:
# user_prompt = "Generate outline for AI Strategy for Medical & Healthcare document in Australia"
user_prompt = "Generate AI Policy and Development Guidelines for Australia's Industry"
query = f"""*****, {user_prompt}. 
          ******, 
          ***** :{country_name} ***** {question} *****. 
          *****. 
          *****, 
          *****.
          *****, *****:
          {template}
          """

In [50]:
# context + prompt (template & user's)
total_input_token = len(tokenizer.encode(query))
print(total_input_token)

1250


In [35]:
# Generate the policy outline. On failure, retry once with a smaller completion
# budget.
#
# The retry has to use a *new* LLM/chain: `llm_new` captured its max_tokens when
# it was constructed, so merely reassigning `max_tokens_suggested` (as this cell
# used to do) re-sent exactly the same request.
REDUCED_MAX_TOKENS = 500

start_time = time.time()
try:
    policy_outline = qa_chain(query)['result']
    print(f'Done! Time taken: {time.time() - start_time} seconds')
except Exception as e:
    print(f'An error occurred: {e}')
    fallback_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(
            model_name=MODEL_NAME,
            openai_api_key=OPENAI_API_KEY,
            max_tokens=REDUCED_MAX_TOKENS,
        ),
        retriever=vectordb.as_retriever(search_kwargs={'k': 7}),
        return_source_documents=True,
    )
    start_time = time.time()
    # A second failure is deliberately not caught: it should surface here
    # rather than leave `policy_outline` undefined for the cells below.
    policy_outline = fallback_chain(query)['result']
    print(f'Done with reduced tokens! Time taken: {time.time() - start_time} seconds')

print(policy_outline)
print('____________________________________________________________________________________')

Done! Time taken: 68.02548217773438 seconds
1. Introduction and Context
1.1 Rationale for the Policy
        Australia's dynamic growth and adoption of artificial intelligence (AI) technologies offer transformative potential for the economy and society. Recognizing AI's capacity to augment productivity and innovation across various sectors, this policy aims to provide a strategic framework to harness AI opportunities while addressing ethical, security, and societal challenges.

2. Stakeholder Analysis
2.1 Identification of Stakeholders
        Key stakeholders include the government, private sector, academia, civil society, and international partners. Their roles range from policy formulation and enforcement, technological development, academic research, advocacy, to collaboration on global AI standards and practices.

3. Policy Goals and Objectives
3.1 Overarching Goals
        The policy seeks to establish Australia as a leader in developing and deploying AI that is secure, ethical, 

In [40]:
# Split the text into sections and subsections
import json
import re


def parse_policy_outline(outline_text):
    """Parse a numbered outline into ``{section: {subsection: body}}``.

    A line starting with ``"<n>. "`` opens a section, a line starting with
    ``"<n>.<m> "`` opens a subsection of the current section, and every other
    non-empty line is appended to the current subsection's body. Headings are
    recognised only at column 0, so indented body text such as a numbered list
    is never mistaken for a heading.

    Lines that appear before the first section, or between a section heading
    and its first subsection, belong to no subsection and are ignored.

    Args:
        outline_text: The outline as a single newline-separated string.

    Returns:
        dict mapping each section heading to a dict of subsection heading ->
        body text (body lines joined with single spaces, no trailing space).
    """
    section_heading = re.compile(r"^\d+\.\s")
    subsection_heading = re.compile(r"^\d+\.\d+\s")

    body_lines = {}  # section -> subsection -> list of stripped body lines
    current_section = None
    current_subsection = None

    for line in outline_text.split("\n"):
        if section_heading.match(line):
            current_section = line.strip()
            # Reset the subsection: otherwise body text following this header
            # would be filed under the previous section's subsection key and
            # raise a KeyError.
            current_subsection = None
            body_lines[current_section] = {}
        elif subsection_heading.match(line):
            if current_section is None:
                # Subsection before any section: treated as preamble
                continue
            current_subsection = line.strip()
            body_lines[current_section][current_subsection] = []
        elif current_section and current_subsection and line.strip():
            body_lines[current_section][current_subsection].append(line.strip())

    return {
        section: {title: " ".join(lines) for title, lines in subsections.items()}
        for section, subsections in body_lines.items()
    }


new_sections = parse_policy_outline(policy_outline)

# Convert to JSON
policy_json = json.dumps(new_sections, indent=4)

# Print JSON
print(policy_json)

{
    "1. Introduction and Context": {
        "1.1 Rationale for the Policy": "Australia's dynamic growth and adoption of artificial intelligence (AI) technologies offer transformative potential for the economy and society. Recognizing AI's capacity to augment productivity and innovation across various sectors, this policy aims to provide a strategic framework to harness AI opportunities while addressing ethical, security, and societal challenges.  "
    },
    "2. Stakeholder Analysis": {
        "2.1 Identification of Stakeholders": "Key stakeholders include the government, private sector, academia, civil society, and international partners. Their roles range from policy formulation and enforcement, technological development, academic research, advocacy, to collaboration on global AI standards and practices.  "
    },
    "3. Policy Goals and Objectives": {
        "3.1 Overarching Goals": "The policy seeks to establish Australia as a leader in developing and deploying AI that is se

In [41]:
import tiktoken
tokenizer = tiktoken.encoding_for_model(MODEL_NAME)
tokenizer

<Encoding 'cl100k_base'>

In [43]:
policy_title = "AI Policy and Development Guidelines for Australia's Industry"

In [44]:
policy_outline

"1. Introduction and Context\n1.1 Rationale for the Policy\n        Australia's dynamic growth and adoption of artificial intelligence (AI) technologies offer transformative potential for the economy and society. Recognizing AI's capacity to augment productivity and innovation across various sectors, this policy aims to provide a strategic framework to harness AI opportunities while addressing ethical, security, and societal challenges.\n\n2. Stakeholder Analysis\n2.1 Identification of Stakeholders\n        Key stakeholders include the government, private sector, academia, civil society, and international partners. Their roles range from policy formulation and enforcement, technological development, academic research, advocacy, to collaboration on global AI standards and practices.\n\n3. Policy Goals and Objectives\n3.1 Overarching Goals\n        The policy seeks to establish Australia as a leader in developing and deploying AI that is secure, ethical, and contributes to economic prosp

# DETAILING THE SECTIONS 

In [45]:
question = user_prompt

In [51]:
context[:100]

'\nAustralia is a diverse and rapidly growing country that has embraced the opportunities presented by'

In [54]:
# Completion budget for each section-detailing request
max_tokens_suggested = 4000

# `llm_new` was constructed with the outline budget, so reusing it here would
# silently ignore the value above. Build a dedicated LLM so that the detailing
# chain really runs with `max_tokens_suggested`.
llm_details = ChatOpenAI(
    model_name=MODEL_NAME,
    openai_api_key=OPENAI_API_KEY,
    max_tokens=max_tokens_suggested,
)

details_chain = RetrievalQA.from_chain_type(
    llm=llm_details,
    retriever=vectordb.as_retriever(search_kwargs={'k': 7}),
    return_source_documents=True,
)

In [59]:
# Top-level section headings to expand, one details_chain request per entry.
# NOTE(review): this list is hard-coded rather than derived from the parsed
# outline; confirm it matches the headings actually present in `policy_outline`.
sections = [
    "1. Introduction and Context",
    "2. Stakeholder Analysis",
    "3. Policy Goals and Objectives",
    "4. Key Focus Areas",
    "5. Regulatory Framework",
    "6. Implementation Plan",
    "7. Monitoring and Evaluation",
    "8. Stakeholder Engagement and Communication",
    "9. Risk Management and Sustainability",
    "10. Conclusion and Next Steps"
]
# Maps each section heading to the 'result' text returned by details_chain.
sections_detailing = {}
for section in sections:
    # The prompt embeds the full outline, the policy title and `context`, so
    # every request carries all three in addition to the retrieved documents.
    # `section` stays bound to the last heading after the loop finishes.
    value = details_chain(
            f"""***** '{section}' : 
                -----------
                ***** {section}'s **********.
                *****.

                *****:
                *****:
                {policy_outline}
                
                *****: 
                {policy_title}

                *****:
                {context}

                ------------
                *****.
                *****.
                *****. 
                
                ***** {country_name} *****.
                *****
                ------------
                *****.
                *****.
                """
        )['result']
    sections_detailing[section] = value
    

In [60]:
sections_detailing

{'1. Introduction and Context': "1. Introduction and Context\n\n1.1 Rationale for the Policy\nAustralia's growing engagement with artificial intelligence (AI) is seen as a key driver for future economic and societal growth. Recognizing the transformative potential of AI, this policy aims to strategically position Australia to capitalize on AI opportunities and address the associated challenges. It is designed to foster innovation and productivity across various sectors while ensuring ethical considerations, security, and societal impacts are addressed, thereby promoting a balanced progression into an AI-augmented future.\n\n2. Stakeholder Analysis\n\n2.1 Identification of Stakeholders\nThe policy identifies a broad range of stakeholders integral to the AI ecosystem in Australia, including government entities, the private sector, academic institutions, civil society, and international partners. These stakeholders play various roles, from shaping and enforcing policies to driving technol

In [61]:
import re
def parse_sections_corrected(input_dict):
    """Split each section's text into its numbered subsections.

    A subsection heading is a line that starts (after optional indentation or
    markdown ``#``/``*`` markers) with ``<n>.<m>`` followed by a title. The
    body of a subsection is everything between its heading line and the next
    subsection heading (or the end of the text), stripped of surrounding
    whitespace.

    Headings are matched at the start of a line only, so decimal numbers inside
    running prose (e.g. "AU$22.17 trillion") are not mistaken for subsection
    numbers, and bodies are sliced by match position rather than by searching
    for the heading text again.

    Args:
        input_dict: Mapping of section title -> raw section text.

    Returns:
        dict mapping each section title to a dict of
        ``"<n>.<m> <title>"`` -> subsection body text.
    """
    subsection_heading = re.compile(r"^[ \t#*]*(\d+\.\d+)[ \t]+(.*)", re.MULTILINE)
    parsed_dict = {}

    for section, content in input_dict.items():
        headings = list(subsection_heading.finditer(content))
        section_dict = {}

        for index, heading in enumerate(headings):
            number, title = heading.group(1), heading.group(2)
            # The body starts right after the heading line ...
            body_start = heading.end()
            # ... and stops where the next heading line begins, if any
            if index + 1 < len(headings):
                body_end = headings[index + 1].start()
            else:
                body_end = len(content)
            section_dict[f"{number} {title}"] = content[body_start:body_end].strip()

        # Keep the full section title as the key, even when nothing was found
        parsed_dict[section] = section_dict

    return parsed_dict



print(parse_sections_corrected(sections_detailing))

{'1. Introduction and Context': {'1.1 Rationale for the Policy': "Australia's growing engagement with artificial intelligence (AI) is seen as a key driver for future economic and societal growth. Recognizing the transformative potential of AI, this policy aims to strategically position Australia to capitalize on AI opportunities and address the associated challenges. It is designed to foster innovation and productivity across various sectors while ensuring ethical considerations, security, and societal impacts are addressed, thereby promoting a balanced progression into an AI-augmented future.\n\n2. Stakeholder Analysis", '2.1 Identification of Stakeholders': 'The policy identifies a broad range of stakeholders integral to the AI ecosystem in Australia, including government entities, the private sector, academic institutions, civil society, and international partners. These stakeholders play various roles, from shaping and enforcing policies to driving technological advancements and re

# JSON PARSER LANGCHAIN

In [84]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

llm = OpenAI()
chat_model = ChatOpenAI()


  warn_deprecated(


In [93]:
# `create_structured_output_chain` was never imported in this notebook, so the
# cell only ran when the name happened to be left over in the kernel.
from langchain.chains.openai_functions import create_structured_output_chain

# JSON schema describing the expected output: section headings ("1. Title")
# mapping to objects whose keys are subsection headings ("1.1 Title").
# The regexes are raw strings: "\d", "\." and "\s" are invalid escapes in a
# normal string literal and trigger warnings on recent Python versions.
policy_outline_schema = {
    "title": "Policy Outline",
    "description": "Outline of a policy with sections and subsections.",
    "type": "object",
    "properties": {
        "sections": {
            "title": "Sections",
            "description": "Sections of the policy outline",
            "type": "object",
            "patternProperties": {
                r"^\d+\.\s.*$": {
                    "type": "object",
                    "patternProperties": {
                        r"^\d+\.\d+\s.*$": {"type": "string"}
                    }
                }
            }
        }
    },
    "required": ["sections"]
}

llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)

# Pass the outline as a template variable instead of pre-formatting it with an
# f-string: the prompt template treats "{...}" in its text as placeholders, so
# an outline containing braces would otherwise break prompt formatting.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are extracting information in structured formats."),
        ("human", "Parse the given policy outline : {policy_outline}")
    ]
)

chain = create_structured_output_chain(policy_outline_schema, llm, prompt, verbose=True)
output = chain.run(policy_outline=policy_outline)

print(output)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are extracting information in structured formats.
Human: Parse the given policy outline : 1. Introduction and Context
1.1 Rationale for the Policy
        Australia's dynamic growth and adoption of artificial intelligence (AI) technologies offer transformative potential for the economy and society. Recognizing AI's capacity to augment productivity and innovation across various sectors, this policy aims to provide a strategic framework to harness AI opportunities while addressing ethical, security, and societal challenges.

2. Stakeholder Analysis
2.1 Identification of Stakeholders
        Key stakeholders include the government, private sector, academia, civil society, and international partners. Their roles range from policy formulation and enforcement, technological development, academic research, advocacy, to collaboration on global AI standards and practices.

3. Policy Goals and Objectives

# COUNT OF DETAILING SECTION

In [67]:
# Representative detailing prompt, built only so its token count can be measured.
# NOTE(review): `section` is not assigned in this cell; the only visible
# assignment is the loop variable of the section-detailing loop, so this cell
# depends on that loop having run and uses whatever heading it ended on.
query_detailing = f"""***** '{section}' : 
                -----------
                ***** {section}'s *****.
                *****.

                *****:
                *****:
                {policy_outline}
                
                *****: 
                {policy_title}

                *****:
                {context}

                ------------
                *****
                *****
                ***** 
                
                ***** {country_name}. *****.
                *****.
                ------------
                *****.
                *****.
                """

In [69]:
import tiktoken
tokenizer = tiktoken.encoding_for_model(MODEL_NAME)
len(tokenizer.encode(query_detailing))

1692

In [73]:
# Input estimate: one outline request plus one detailing request per section.
# The section count comes from `sections` instead of a hard-coded 10, so the
# estimate stays correct if the list changes.
total_inp_token = len(tokenizer.encode(query)) + len(tokenizer.encode(query_detailing)) * len(sections)

In [74]:
total_inp_token

18170

In [75]:
print(f'*Total token input for generate outline + detailing an outline: {total_inp_token}')

*Total token input for generate outline + detailing an outline: 18170


In [76]:
# Output estimate (upper bound): 1024 is the completion budget of the outline
# request; each of the len(sections) detailing requests may emit up to
# `max_tokens_suggested` tokens.
print(f'*Total token output for generate outline + detailing an outline: {1024 + max_tokens_suggested * len(sections)}')

*Total token output for generate outline + detailng an outline: 41024


# REGENERATE A SUBSECTION

In [88]:
old_context = context

In [90]:
old_context

"IN BRIEF \nSummary EditionCOUNTRYAcknowledgement of\nThe Department of Foreign Affairs and Trade acknowledges the traditional custodians throughout Australia and pays respect to elders past, present and emerging.\nAboriginal and Torres Strait Islander readers  are warned that the following pages may contain images of deceased persons.2\nWELCOME TO OUR GREA T SOUTHERN LAND\n23Australia is one of the most multicultural countries in the world,\xa0and home to the world’s oldest continuing culture. \nWe have\xa0a\xa0highly skilled workforce and\xa0a proud history of democracy and\xa0stable\xa0government.Australia is a land like no other and is one of 17\xa0countries that together account for almost 70\xa0per cent of the world’s species.\n1 We are home to 10 per cent of the world’s biodiversity from tropical rainforests in the north to the red deserts of the centre and the snowfields of the south-east. Three\xa0Australian cities — Melbourne, Sydney and Adelaide — were listed in the top ten 

In [89]:
context = policy_outline

In [99]:
# Subsection to regenerate. These must be defined BEFORE the f-string below is
# evaluated: an f-string is formatted at assignment time, so defining them
# afterwards raises NameError on a fresh kernel.
subsection_title_ = '1.1 Background Information'
subsection_explanation_ = "The technological landscape in Australia has been significantly transformed by advancements in Artificial Intelligence (AI) and data analytics, leading to considerable growth in the collection, usage, and international transfer of personal data by various entities. This policy document is crafted in response to the imperative need for robust data protection measures that can prevent potential misuse of personal data and ensure the privacy of individuals. It seeks to strike a balance between protecting citizens' personal information and enabling technological innovation and economic expansion, which are vital for the country's competitiveness in a global digital economy."

# Free-text instruction supplied by the user for the regeneration
user_prompt_for_regenerate = f"""
***** 
*****
"""

# Full prompt sent to the chain: subsection title + current text + user request
backend_prompt_for_regenerate = f"""
***** 
{subsection_title_}
*****
{subsection_explanation_}
-------------------
*****, 
*****:
{user_prompt_for_regenerate}

***** 
*****
*****
*****
"""

context = 'Paraphrase / Regenerate'

# NOTE: this value is informational only here; `llm_new` keeps the max_tokens
# it was constructed with.
max_tokens_suggested = 2000

# FIXME: `regenerate_template` is not defined anywhere in this notebook and the
# resulting prompt was never passed to the chain, so the construction below
# only raised NameError on a fresh kernel. Restore it once the template exists.
# REGENERATE_CHAIN_PROMPT = PromptTemplate(
#     input_variables=["context"],
#     template=regenerate_template
# )

regenerate_chain = RetrievalQA.from_chain_type(
    llm=llm_new,
    retriever=vectordb.as_retriever(search_kwargs={'k': 7}),
    return_source_documents=True
)
print(regenerate_chain(backend_prompt_for_regenerate)['result'])

In Australia, the advent of Artificial Intelligence (AI) and the subsequent rise in data analytics have revolutionarily altered the digital domain. There is a marked increase in the collection, utilization, and global exchange of personal data by a myriad of organizations. This policy document is conceived in recognition of the urgent necessity for stringent data protection protocols, which are essential to thwart the potential mishandling of personal data and to safeguard the privacy rights of individuals. The document aims to delicately calibrate the protection of personal information with the imperative to foster technological progress and economic growth, both of which are crucial for Australia's standing in the international digital marketplace.

Due to the absence of comprehensive data policies, Australian citizens have faced online crimes such as identity theft, where personal information is illicitly accessed and exploited. For instance, a notable online crime involved fraudste