#Installation of Required Python Packages


In [None]:
!pip install \
PyPDF2 \
langchain_community \
langchain_experimental \
langchain_openai \
langchain_pinecone \
pinecone-client[grpc] \
nest_asyncio\
openai -qq

# SETTING API KEYS


In [None]:


from google.colab import drive
drive.mount('/content/drive')
import os

with open('/content/drive/MyDrive/LLM_Projects/.bashrc') as file:
    for line in file:
        if line.startswith('export '):
            var, value = line[len('export '):].strip().split('=')
            os.environ[var] = value



Mounted at /content/drive



# Loading Text from a PDF File




In [None]:
import PyPDF2
def load_pdf(file_path):
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text




# Preprocess and Divide the Extracted Text


In [None]:
import re
def preprocess_and_divide_text(text):
    # Define the separator patterns
    separator_pattern = r'\n-{7,73}\n'
    # Split text by the separator patterns
    sections = re.split(separator_pattern, text)
    # Filter out the separators themselves
    sections = [section.strip() for section in sections if section.strip()]
    return sections

#Apply RecursiveCharacterTextSplitter to Each Section


In [None]:
import os
import openai
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableMap
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from sklearn.metrics.pairwise import cosine_similarity
import nest_asyncio
import asyncio
import re
from operator import itemgetter




In [None]:
def split_text_sections(sections):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=850, chunk_overlap=200)
    split_sections = []
    for section in sections:
        split_sections.extend(text_splitter.split_text(section))
    return split_sections


In [None]:
# Path to the PDF file
pdf_file_path = "Business-news.pdf"

# Load content from the PDF file
business_news_text = load_pdf(pdf_file_path)
# Preprocess and divide the extracted text
sections = preprocess_and_divide_text(business_news_text)

# Apply RecursiveCharacterTextSplitter to each section
split_texts = split_text_sections(sections)

# Print the length of the preprocessed text
print(f"Length of the preprocessed text: {len(business_news_text)}")

# Print the number of split sections
print(f"Number of split sections: {len(split_texts)}")




Length of the preprocessed text: 418824
Number of split sections: 672


# Intializing and creating Pinecone vectorstore

In [None]:
# Initialize connection to Pinecone
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

# Configure client
pinecone_client = Pinecone(api_key=PINECONE_API_KEY)
cloud_provider = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

serverless_spec = ServerlessSpec(cloud=cloud_provider, region=region)
index_name = 'businesssrag'
if index_name in pinecone_client.list_indexes().names():
    pinecone_client.delete_index(index_name)

# Create a new index
pinecone_client.create_index(
    index_name,
    dimension=1536,
    metric='dotproduct',
    spec=serverless_spec
)

index = pinecone_client.Index(index_name)


In [None]:
documents=split_texts
print(len(documents))

672


In [None]:
# Embed the chunks using OpenAI embeddings
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)
embedded_vectors = embeddings_model.embed_documents(documents)

# Prepare metadata with the text_key
text_key = 'text'
metadatas = [{text_key: doc} for doc in documents]

# Create a Pinecone vector store from the chunks and embeddings
vector_store = LangchainPinecone(
    index=index,
    embedding=embeddings_model,
    text_key=text_key
)
vector_store.add_texts(texts=documents, metadatas=metadatas, embeddings=embedded_vectors)


  warn_deprecated(


['b452d1f9-8ea0-4544-98c2-24bffe2fd98d',
 '15a49ca8-4992-45a7-9a01-7cfa8d7eb5ae',
 'd6cc76fc-6f65-4463-b88f-ced623dc7e86',
 '46b432d4-ca52-4891-a738-64afc3d53dac',
 'ed06ef41-2bb8-4c08-b297-82bc7bba82bc',
 'aabbf44e-f4b2-4483-87c2-e7beac50b6a3',
 '989d7132-8866-42d4-820f-cec6ad1701cf',
 '0ece0f3a-fd03-44d9-a4d2-61433d8cca70',
 '9f78dc22-b15e-41ee-937d-1f9a22f8dbcf',
 '1dd5358d-734c-4278-9403-a981f1ce475a',
 '0470de44-391f-4a23-9435-608e0785de4b',
 'b2254af3-f6ee-4732-903a-4e3de6dccb33',
 '0dd68738-1d63-499e-9f2e-d17bd96a17a9',
 'de6acf8f-a861-425a-a63d-1eb6ffe5c62b',
 'dc2f82b0-40d2-447e-aa62-8bd992e7466d',
 'e8671b1c-cf58-44f6-8cb7-07620c6c3593',
 '3367128b-f64d-4cfa-b4fc-731f5f09c106',
 '2c43cdae-ca29-4dee-93ee-6a02b58bee29',
 '67544f6f-901a-4dab-b022-e5189f0de1dd',
 'b9e458a0-2a9d-4ec2-8502-a30768f32ac7',
 '97c7a020-c8e1-4450-a9cb-73216aa528f9',
 '9d7d62ed-3174-42b0-852a-3fdf9cbde986',
 'd720eb80-48d2-4d6f-a20b-3799ec1e74f3',
 '6fe85f56-8ef2-42de-97f0-de1362523b05',
 'ba25b574-cff4-

# Execute Guardrails and Generate Response for User Question


In [439]:


# Apply nest_asyncio to allow nested event loops


# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"score_threshold": 0.7, "k": 5}
)
# Define OpenAI API key
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Define business news evaluation criteria
domain = "business news evaluation"

business_news_criteria = """
Assess the presence of harmful information  about specific companies, financial advice, or economic forecasts in the content.
The content should contain factual information about business news without any direct advice or speculative opinions."""

business_news_steps = """
1. Read the content and the criteria carefully.
2. Assess how much explicit recommendation or opinion about specific companies, financial advice, or economic forecasts is contained in the content.
3. Assign a business news advice score from 1 to 5, with 1 being purely factual business news with no recommendations or opinions, and 5 being multiple explicit recommendations or opinions.
"""

moderation_system_prompt = """
You are a moderation assistant. Your role is to detect content about {domain} in the text provided, and mark the severity of that content.

{domain}

 Criteria

{scoring_criteria}

 Instructions

{scoring_steps}

 Content

{content}

 Evaluation (score only!)
"""
# Function to call OpenAI API asynchronously using asyncio.to_thread
async def async_openai_chat_completion(messages, model="gpt-3.5-turbo"):
    response = await asyncio.to_thread(
        openai.chat.completions.create,
        model=model,
        messages=messages,
        temperature=0
    )
    return response
# Function to check if the question is appropriate
async def topical_guardrail(question):
    messages = [
        {
            "role": "system",
            "content": f"""You are an assistant tasked with determining if a user's question is permissible for discussion. The allowed topics are related to business news, specifically:

1. Financial Performance of Companies
2. Banking and Monetary Policy
3. Automotive Industry Updates
4. Retail and Consumer Markets
5. Global Economic Trends
6. Energy and Oil Markets
7. Telecommunications and Technology

User Question:
{question}

Instruction:
If the user's question pertains to any of the topics listed above, respond with 'allowed.' If it does not relate to these topics, respond with 'not_allowed.'"""
        },
        {"role": "user", "content": question}
    ]
    response = await async_openai_chat_completion(messages)
    return response.choices[0].message.content

# Function to moderate the LLM response
async def moderation_guardrail(chat_response):
    mod_messages = [
        {"role": "user", "content": moderation_system_prompt.format(
            domain=domain,
            scoring_criteria=business_news_criteria,
            scoring_steps=business_news_steps,
            content=chat_response
        )},
    ]
    response = await asyncio.to_thread(
        openai.chat.completions.create,
        model="gpt-3.5-turbo",
        messages=mod_messages,
        temperature=0
    )
    return response.choices[0].message.content



async def context_relevancy_guardrail(question, context):
    relevancy_prompt = f"""You are a context relevancy assistant tasked with evaluating how well the provided context answers the given question.

Instructions
1. Review the question and context.
2. Determine if the context includes any information that answers the question, even minimally.
3. Provide a relevancy score from 1 to 10:
   - **1**: No relevant information.
   - **2-3**: Minimal relevant information,  for a detailed answer.
   - **4-5**: Some relevant details, enough for a basic answer.
   - **6-7**: Good amount of relevant information, providing a detailed answer.
   - **8-9**: Highly relevant information, almost perfectly answering the question.
   - **10**: Perfectly relevant and comprehensive information answering the question in full.

Evaluation
Question: {question}
Context: {context}
Relevancy Score:


"""
    messages = [{"role": "user", "content": relevancy_prompt}]
    response = await asyncio.to_thread(
        openai.chat.completions.create,
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0
    )
    return response.choices[0].message.content


async def improve_question(question):
    improve_prompt = f"""
You are an assistant that helps improve questions for better information retrieval. Please rephrase or clarify the following question to make it more precise and clear:

Original Question: {question}

Improved Question:
"""
    messages = [{"role": "user", "content": improve_prompt}]
    response = await asyncio.to_thread(
        openai.chat.completions.create,
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0
    )
    return response.choices[0].message.content.strip()
# Define the prompt for the RAG model
template = """You are an expert in question-answering based on the context. Your task is to answer the following question with the highest level of accuracy and detail, utilizing only the context provided below. Your response should be factual, concise, and based strictly on the information from the context.

Here are the guidelines for generating your response:
1. Begin by restating the question to ensure clarity.
2. Provide a comprehensive answer using the context provided.
3. If multiple pieces of relevant information are available, synthesize them to form a coherent answer.
4. Where applicable, quantify your response with specific figures, dates, or names mentioned in the context.
5. Avoid making any assumptions or inferences beyond the provided context.
6. Maintain a formal and objective tone throughout your response.
7. Do not include any personal opinions or speculative statements.
8. If the context is insufficient to answer the question, explicitly state this.

Context: {context}
"""

prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)

final_rag_chain = (
    {"context": RunnablePassthrough(), "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)
async def run_chat_completion(prompt, model):
    # Use the retriever to get relevant documents
    retrieved_docs = await asyncio.to_thread(retriever.get_relevant_documents, prompt)

    # print("\nRetrieved Documents:\n" + "="*80)
    # for idx, doc in enumerate(retrieved_docs):
    #     print(f"\nDocument {idx+1}:\n" + "-"*80 + f"\n{doc.page_content}\n" + "-"*80)

    # Prepare the context by concatenating the retrieved documents
    context = " ".join([doc.page_content for doc in retrieved_docs])

    # Use the final RAG chain to generate the answer
    answer = final_rag_chain.invoke({"context": context, "question": prompt})

    return answer, context
async def execute_all_guardrails(user_request):
    # Improve the question for better retrieval
    improved_question_task = asyncio.create_task(improve_question(user_request))
    improved_question = await improved_question_task

    # Check if the question is on-topic
    topical_guardrail_task = asyncio.create_task(topical_guardrail(improved_question))
    guardrail_response = await topical_guardrail_task
    if guardrail_response == "not_allowed":
        return "The question is off-topic."

    # Run the chat completion with the improved question
    chat_task = asyncio.create_task(run_chat_completion(improved_question, "gpt-3.5-turbo-1106"))
    chat_response, context = await chat_task

    # Check if the context is relevant
    relevancy_response = await context_relevancy_guardrail(improved_question, context)
    relevancy_score_match = re.search(r'\d+', relevancy_response)
    relevancy_score = int(relevancy_score_match.group()) if relevancy_score_match else 0

    if relevancy_score<=3:
        return "The retrieved context is not sufficiently relevant to the question."

    # Moderate the LLM response
    moderation_response = await moderation_guardrail(chat_response)
    moderation_score_match = re.search(r'\d+', moderation_response)
    moderation_score = int(moderation_score_match.group()) if moderation_score_match else 0

    if moderation_score >=4:
        return "The response contains inappropriate content."

    return chat_response

async def main():
    question = input("Enter a Question:")
    response = await execute_all_guardrails(question)
    print(f"Response: {response}\n")
# Run the main function
asyncio.run(main())



Enter a Question:what is the reasons for shrinking GDP in Japan?
Response: Restated question: What are the factors contributing to the decline in Japan's GDP?

Answer: The decline in Japan's GDP can be attributed to several factors. In the three months to September, growth in Japan stagnated with output only growing by 0.1%, an annual rate of 0.3%. This was primarily due to faltering exports, subdued domestic demand, and a shortfall in corporate investment. The economy had experienced a decade-long trough, leading to deflation and cautious consumer spending. Additionally, Japan's aging population and low birth rates pose challenges for future economic growth, as the cost of running the welfare state is predicted to double, while the workforce shrinks. Without radical reforms and addressing issues like a declining workforce and budget pressures, Japan risks facing a vicious aging recession and long-term economic stagnation.



#Validation


In [448]:
# Hardcoded questions
questions = [
    "What year did Lufthansa post a net profit?",
    "What unique feature will the new Cadillac built in Sweden have?",
    "why is sky blue?",
    "What is the boiling point of water in Celsius?",
    "What are the reasons behind the increase in US job creation in October, and how did it impact the stock market and Federal Reserve's actions?",
    "How many jobs were added to the US economy in October, and which sector did not see job growth?",





]

In [449]:
async def process_questions(questions):
    results = {}
    for question in questions:
        response = await execute_all_guardrails(question)
        results[question] = response
        print(f"Question: {question}\n {response}\n")

    return results

# Run the function to process all questions
results = asyncio.run(process_questions(questions))

Question: What year did Lufthansa post a net profit?
 Restated Question: In which year did Lufthansa report a net profit?

Answer: Lufthansa reported a net profit in the year 2004 after experiencing significant losses in 2003. The airline announced net profits of 400 million euros, compared to a loss of 984 million euros in the previous year. Operating profits in 2004 were at 380 million euros, which was ten times higher than in 2003.

Question: What unique feature will the new Cadillac built in Sweden have?
 Restated Question: What specific feature sets the new Cadillac built in Sweden apart from other models?

Answer: The new Cadillac built in Sweden will be the first Cadillac model to have a diesel engine, distinguishing it from other models in the Cadillac lineup. This feature is part of General Motors' efforts to make the American marque appeal to European drivers in the medium-sized luxury car market.

Question: why is sky blue?
 The question is off-topic.

Question: What is the 