<a href="https://colab.research.google.com/github/sarth-04/Applying-ML-in-Dataplane/blob/main/LLMCompre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install fpdf

!pip install pdfplumber

!pip install openai

!pip install ragas

!pip install nvidia-nemo

!pip install numpy

!pip install pandas

!pip install guardrails==2.0.0 --force-reinstall --no-deps

!pip install langchain

!pip install chromadb

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=9188d9113d2b60694c04218266db7a624d2c386ae4c3214b21308ee3702ef16a
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_1

In [2]:

import os
import json
from pathlib import Path
from typing import Dict, List
import pdfplumber
from fpdf import FPDF
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from chromadb import PersistentClient
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.schema import HumanMessage

In [None]:
# Set OpenAI API key
def _set_env(var: str) -> None:
    """Make sure the environment variable ``var`` has a value.

    If ``os.environ`` already holds a non-empty value for ``var`` nothing
    happens. Otherwise the user is prompted (prompt text is ``"<var>: "``)
    and the answer is stored in ``os.environ[var]``.

    The prompt goes through ``getpass`` rather than ``input`` so that a secret
    such as an API key is not echoed into the cell output, where it would be
    saved together with the notebook.

    Args:
        var: Name of the environment variable to populate.
    """
    import getpass  # local import so this cell does not depend on another import cell

    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

# Prompt for the key only when the environment does not already provide one.
_set_env("OPENAI_API_KEY")

# Configure OpenAI
# Read the key back from the environment so the value is the same whether it was pre-set or just entered.
openai_api_key = os.environ["OPENAI_API_KEY"]

# Initialize the embedding function and ChromaDB client
# Both are module-level objects; create_pdf_collection() uses them when it builds the "pdf_rag" collection.
embedding_function = OpenAIEmbeddingFunction(api_key=openai_api_key, model_name="text-embedding-ada-002")
# NOTE(review): no path is passed, so chromadb's default storage location is used — confirm that is intended.
client = PersistentClient()

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

# Initialize the OpenAI chat model
chat_model = ChatOpenAI(
    model_name="gpt-4o-mini",
    openai_api_key=os.environ["OPENAI_API_KEY"]
)

# `chat` used to be a second, identically configured client; it is kept as an
# alias so any cell that refers to either name keeps working.
chat = chat_model

# Smoke test: one round trip to confirm the key and model name are valid.
# `invoke` is the supported entry point; calling the model object directly is deprecated.
response = chat.invoke([HumanMessage(content="What is recursion?")])
print(response.content)

In [None]:
# --- 1. Generate Dataset ---
def generate_dataset() -> Dict:
    """Generate datasets for Socratic Questioning and Knowledge Construction evaluation.

    Returns:
        A dict with two keys, ``"socratic_questioning"`` and
        ``"knowledge_construction"``, each mapping to a list of two ragas
        ``SingleTurnSample`` objects (question + model answer).
    """
    # Imported here so the notebook can be loaded without ragas until this cell is used.
    from ragas.dataset_schema import SingleTurnSample

    # SingleTurnSample names the question field `user_input`; there is no
    # `query` field, so passing `query=` does not populate the question.
    return {
        "socratic_questioning": [
            SingleTurnSample(
                user_input="What is recursion in programming?",
                response="Recursion is a method where a function calls itself to solve smaller instances of a problem."
            ),
            SingleTurnSample(
                user_input="Explain the concept of Big-O notation.",
                response="Big-O notation describes the efficiency of algorithms in terms of time or space complexity."
            )
        ],
        "knowledge_construction": [
            SingleTurnSample(
                user_input="How does binary search work?",
                response="Binary search repeatedly divides the search interval in half. It requires a sorted list to work efficiently."
            ),
            SingleTurnSample(
                user_input="What are the benefits of modular programming?",
                response="Modular programming improves code reusability, maintainability, and debugging by dividing the codebase into smaller, manageable parts."
            )
        ]
    }


In [None]:

# --- 2. PDF Text Extraction ---
def extract_text_from_pdfs_advanced(pdf_dir: str) -> List[Dict]:
    """Extract per-page text and metadata from PDFs.

    Args:
        pdf_dir: Either a directory (every ``*.pdf`` directly inside it is
            read) or the path of a single PDF file.

    Returns:
        One dict per page that yielded text, shaped as
        ``{"text": <page text>, "metadata": {"source": <file name>, "page": <0-based index>}}``.
        Pages without extractable text are skipped. An empty list is returned
        when nothing could be read.

    A file that fails to open or parse is reported with ``print`` and skipped,
    so one bad PDF does not abort the whole run.
    """
    root = Path(pdf_dir)
    if root.is_file():
        # A caller handed us one PDF instead of a folder; globbing a file
        # would silently match nothing.
        pdf_paths = [root]
    else:
        # Sorted so the page order (and any ids derived from it) is stable across runs.
        pdf_paths = sorted(root.glob("*.pdf"))

    documents: List[Dict] = []
    for pdf_path in pdf_paths:
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    text = page.extract_text()
                    if text:
                        documents.append({"text": text, "metadata": {"source": pdf_path.name, "page": page_num}})
        except Exception as e:
            print(f"Error processing {pdf_path.name}: {e}")
    return documents

In [None]:

# --- 3. Create ChromaDB Collection ---
def create_pdf_collection(pdf_dir: str, batch_size: int = 100):
    """Create (or refresh) the ChromaDB collection used for PDF-based RAG.

    Args:
        pdf_dir: Location handed to ``extract_text_from_pdfs_advanced``.
        batch_size: How many pages are sent to the collection per call.

    Returns:
        The ``"pdf_rag"`` collection obtained from the module-level ``client``.

    Raises:
        ValueError: If no page text was extracted, or ``batch_size`` is not positive.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    documents = extract_text_from_pdfs_advanced(pdf_dir)
    if not documents:
        raise ValueError("No documents found in the PDF directory.")

    collection = client.get_or_create_collection(name="pdf_rag", embedding_function=embedding_function)

    # Pages are written in batches instead of one call (and one embedding
    # request) per page. `upsert` is used so that re-running the cell replaces
    # the entries stored under the same ids rather than leaving stale text.
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        collection.upsert(
            ids=[str(start + offset) for offset in range(len(batch))],
            documents=[doc['text'] for doc in batch],
            metadatas=[doc['metadata'] for doc in batch],
        )
    return collection

In [None]:

import asyncio
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from chromadb import PersistentClient
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Constants
# Model name passed to every ChatOpenAI instance created in this cell.
GPT_MODEL = "gpt-4o-mini"

# System prompt text describing the assistant's role.
system_prompt = (
    "You are an academic instructor assistant. Your role is to provide accurate and helpful academic explanations."
)

# Subject areas listed in the topical guardrail prompt as allowed.
academic_topics = [
    "programming",
    "data science",
    "mathematics",
    "physics",
]

# Input Guardrail: Topical Validation
async def topical_guardrail(user_request):
    """Ask the model whether ``user_request`` falls within ``academic_topics``.

    The classification instructions are sent as a system message and the
    user's question as a separate human message, so the model actually sees
    the text it is asked to judge.

    Args:
        user_request: The raw question typed by the user.

    Returns:
        The model's reply with surrounding whitespace removed. The prompt asks
        for ``allowed`` or ``not_allowed``, but the reply is returned verbatim.
    """
    from langchain.schema import SystemMessage  # not part of the notebook's import cell

    print("Checking topical guardrail")
    guardrail_prompt = f"""
    Your role is to assess whether the user question is allowed or not. Allowed topics are: {', '.join(academic_topics)}.
    If the topic is allowed, respond with 'allowed'. Otherwise, respond with 'not_allowed'.
    """
    llm = ChatOpenAI(model_name=GPT_MODEL, temperature=0)
    # `ainvoke` keeps the event loop free, so the guardrail and the answer
    # generation can overlap when they are scheduled as separate tasks.
    response = await llm.ainvoke([
        SystemMessage(content=guardrail_prompt),
        HumanMessage(content=user_request),
    ])
    return response.content.strip()

# Output Guardrail: Moderation
async def moderation_guardrail(chat_response):
    """Have the model grade ``chat_response`` for academic tone and appropriateness.

    Args:
        chat_response: The generated answer to be graded.

    Returns:
        The score as an ``int`` from 1 (completely inappropriate) to 5
        (excellent), taken from the first standalone digit 1-5 in the reply.

    Raises:
        ValueError: If the reply contains no standalone digit between 1 and 5.
    """
    import re  # local import so this cell does not depend on another import cell

    print("Checking moderation guardrail")
    moderation_prompt = f"""
    You are a moderation assistant. Evaluate the following response for academic appropriateness and tone.

    ## Criteria
    1. The content must be accurate and relevant to the query.
    2. The content must not include offensive or inappropriate information.
    3. The content must be expressed in an academic tone.

    Assign a score from 1 to 5:
    - 1: Completely inappropriate.
    - 5: Excellent academic tone and relevance.

    Content:
    {chat_response}

    Evaluation (score only):
    """
    llm = ChatOpenAI(model_name=GPT_MODEL, temperature=0)
    response = await llm.ainvoke([HumanMessage(content=moderation_prompt)])

    # The model does not always answer with a bare digit ("Score: 4", "4/5"),
    # so pick out the score instead of feeding the whole reply to int().
    verdict = response.content.strip()
    found = re.search(r"\b[1-5]\b", verdict)
    if found is None:
        raise ValueError(f"Moderation model did not return a score between 1 and 5: {verdict!r}")
    return int(found.group())

# Generate Chat Response
async def get_chat_response(user_request, context):
    """Answer ``user_request`` with the academic-assistant persona and the given context.

    The system prompt and the context are sent as one system message and the
    user's question as a human message, so both actually reach the model.

    Args:
        user_request: The question to answer.
        context: Background text to ground the answer in.

    Returns:
        The text content of the model's reply.
    """
    from langchain.schema import SystemMessage  # not part of the notebook's import cell

    print("Generating chat response")
    messages = [
        SystemMessage(content=f"{system_prompt}\n\nContext: {context}"),
        HumanMessage(content=user_request),
    ]
    # `ainvoke` keeps the event loop free while the request is in flight.
    response = await ChatOpenAI(model_name=GPT_MODEL, temperature=0).ainvoke(messages)
    return response.content

# Main Function
async def execute_with_guardrails(user_request, context):
    """Answer ``user_request`` while enforcing the topical and moderation guardrails.

    The topical check and the answer generation are started together. The
    answer is only used once the topical check has passed; if the check
    rejects the request the answer task is cancelled. A passing answer is
    then graded by the moderation guardrail.

    Args:
        user_request: The question typed by the user.
        context: Background text forwarded to ``get_chat_response``.

    Returns:
        The generated answer, or a fixed refusal message when the topical
        guardrail rejects the request or the moderation score is below 3.
    """
    topical_guardrail_task = asyncio.create_task(topical_guardrail(user_request))
    chat_task = asyncio.create_task(get_chat_response(user_request, context))

    try:
        # Awaiting the tasks directly replaces the former polling loop.
        guardrail_response = await topical_guardrail_task
        # Substring + case-insensitive match: the model sometimes wraps the
        # label in quotes or adds punctuation.
        if "not_allowed" in str(guardrail_response).lower():
            print("Topical guardrail triggered")
            return "I can only answer questions related to academic topics like programming, data science, or mathematics."
        chat_response = await chat_task
    finally:
        # On refusal or error the answer must not keep running in the
        # background; cancel() is a no-op when the task already finished.
        chat_task.cancel()
        await asyncio.gather(chat_task, return_exceptions=True)

    moderation_score = await moderation_guardrail(chat_response)
    if moderation_score < 3:
        print(f"Moderation guardrail triggered with score: {moderation_score}")
        return "The generated response failed to meet academic standards. Please rephrase your query or try again."

    print("Passed moderation")
    return chat_response

# Example usage
# Fixed context string handed to every request below; no retrieval happens in this cell.
context = "Academic topics like recursion, algorithms, and mathematics."
# Two questions on listed topics and one ("horses") that is not among `academic_topics`.
user_requests = [
    "What is recursion in programming?",
    "Can you explain Big-O notation?",
    "Tell me about horses."
]

# Top-level `await` works in a notebook cell; a plain script would need to wrap this loop in a coroutine and run it.
for request in user_requests:
    result = await execute_with_guardrails(request, context)
    print(f"User Query: {request}\nResponse: {result}\n\n")

In [None]:
from fpdf import FPDF
import os

def generate_pdf(output_file: str):
    """Write a short PDF of programming notes to ``output_file``.

    Four titled sections (recursion, Big-O notation, binary search, modular
    programming) are laid out on a single FPDF document: each title in bold,
    followed by its body text. Progress is reported with ``print``. Any
    exception raised while building or saving the document is caught and
    printed as well, so the function never raises and always returns ``None``.
    """
    # (heading, body) pairs, rendered in exactly this order.
    sections = (
        (
            "Recursion in Programming",
            "Recursion is a method in which a function calls itself to solve smaller instances of a problem.\n"
            "Examples include:\n"
            " - Computing factorials.\n"
            " - Traversing tree data structures.\n"
            "Applications:\n"
            " - Divide-and-conquer algorithms like quicksort and merge sort.\n",
        ),
        (
            "Big-O Notation",
            "Big-O notation describes the efficiency of algorithms in terms of time or space complexity.\n"
            "Examples of complexity:\n"
            " - O(1): Constant time.\n"
            " - O(n): Linear time.\n"
            " - O(n^2): Quadratic time, e.g., bubble sort.\n"
            "Applications:\n"
            " - Comparing sorting algorithms like quicksort (O(n log n)) and bubble sort (O(n^2)).\n",
        ),
        (
            "Binary Search",
            "Binary search is an efficient algorithm for finding a target value in a sorted list.\n"
            "Steps:\n"
            " 1. Compare the target value with the middle element.\n"
            " 2. If equal, return the index.\n"
            " 3. Otherwise, narrow the search interval to the left or right half.\n"
            "Efficiency:\n"
            " - Time complexity: O(log n).\n",
        ),
        (
            "Benefits of Modular Programming",
            "Modular programming divides the codebase into smaller, manageable modules or functions.\n"
            "Benefits include:\n"
            " 1. Code reusability.\n"
            " 2. Easier debugging and maintenance.\n"
            " 3. Improved readability and team collaboration.\n"
            "Applications:\n"
            " - Building reusable libraries and APIs.\n",
        ),
    )

    try:
        print(f"Starting PDF generation for: {output_file}")

        # Fresh document with automatic page breaks and one starting page
        doc = FPDF()
        doc.set_auto_page_break(auto=True, margin=15)
        doc.add_page()

        # Bold heading line, then the body in regular weight
        for heading, text in sections:
            doc.set_font('Arial', 'B', 12)
            doc.cell(0, 10, heading, ln=True)
            doc.set_font('Arial', '', 12)
            doc.multi_cell(0, 10, text)

        # Persist to disk
        doc.output(output_file)
        print(f"PDF generated successfully: {output_file}")
    except Exception as err:
        print(f"An error occurred during PDF generation: {err}")

# Ensure output directory exists
# Relative path, so it is resolved against the notebook's current working directory.
output_dir = "./output"
os.makedirs(output_dir, exist_ok=True)  # exist_ok: re-running the cell is harmless
output_file = os.path.join(output_dir, "Programming_Concepts.pdf")

# Generate the PDF
# generate_pdf() reports failures by printing them, so a problem here will not stop the notebook.
generate_pdf(output_file)



In [None]:
import asyncio
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from chromadb import PersistentClient
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Constants
# Model name passed to every ChatOpenAI instance created in this cell.
GPT_MODEL = "gpt-4o-mini"

# System prompt text describing the assistant's role.
system_prompt = (
    "You are an academic instructor assistant. Your role is to provide accurate and helpful academic explanations."
)

# Subject areas listed in the topical guardrail prompt as allowed.
academic_topics = [
    "programming",
    "data science",
    "mathematics",
    "physics",
]

# Input Guardrail: Topical Validation
async def topical_guardrail(user_request):
    """Ask the model whether ``user_request`` falls within ``academic_topics``.

    The classification instructions are sent as a system message and the
    user's question as a separate human message. Without the question in the
    conversation the model has nothing to classify.

    Args:
        user_request: The raw question typed by the user.

    Returns:
        The model's reply with surrounding whitespace removed. The prompt asks
        for ``allowed`` or ``not_allowed``, but the reply is returned verbatim.
    """
    from langchain.schema import SystemMessage  # not part of the notebook's import cell

    print("Checking topical guardrail")
    guardrail_prompt = f"""
    Your role is to assess whether the user question is allowed or not. Allowed topics are: {', '.join(academic_topics)}.
    If the topic is allowed, respond with 'allowed'. Otherwise, respond with 'not_allowed'.
    """
    llm = ChatOpenAI(model_name=GPT_MODEL, temperature=0)
    # `ainvoke` keeps the event loop free, so the guardrail and the answer
    # generation can overlap when they are scheduled as separate tasks.
    response = await llm.ainvoke([
        SystemMessage(content=guardrail_prompt),
        HumanMessage(content=user_request),
    ])
    return response.content.strip()

# Output Guardrail: Moderation
async def moderation_guardrail(chat_response):
    """Have the model grade ``chat_response`` for academic tone and appropriateness.

    Args:
        chat_response: The generated answer to be graded.

    Returns:
        The score as an ``int`` from 1 (completely inappropriate) to 5
        (excellent), taken from the first standalone digit 1-5 in the reply.

    Raises:
        ValueError: If the reply contains no standalone digit between 1 and 5.
    """
    import re  # local import so this cell does not depend on another import cell

    print("Checking moderation guardrail")
    moderation_prompt = f"""
    You are a moderation assistant. Evaluate the following response for academic appropriateness and tone.

    ## Criteria
    1. The content must be accurate and relevant to the query.
    2. The content must not include offensive or inappropriate information.
    3. The content must be expressed in an academic tone.

    Assign a score from 1 to 5:
    - 1: Completely inappropriate.
    - 5: Excellent academic tone and relevance.

    Content:
    {chat_response}

    Evaluation (score only):
    """
    llm = ChatOpenAI(model_name=GPT_MODEL, temperature=0)
    response = await llm.ainvoke([HumanMessage(content=moderation_prompt)])

    # The model does not always answer with a bare digit ("Score: 4", "4/5"),
    # so pick out the score instead of feeding the whole reply to int().
    verdict = response.content.strip()
    found = re.search(r"\b[1-5]\b", verdict)
    if found is None:
        raise ValueError(f"Moderation model did not return a score between 1 and 5: {verdict!r}")
    return int(found.group())

# Generate Academic Response
async def generate_academic_response(user_request, pdf_collection):
    """Answer ``user_request`` from the PDF collection, enforcing both guardrails.

    Context is looked up in ``pdf_collection`` (when one is given), then the
    topical check and the answer generation are started together. The answer
    is only released after the topical check has passed, no matter which of
    the two finishes first. A passing answer is then graded by the moderation
    guardrail.

    Args:
        user_request: The question typed by the user.
        pdf_collection: Collection to retrieve context from; a falsy value
            means "no PDF context".

    Returns:
        The generated answer, or a fixed refusal message when the topical
        guardrail rejects the request or the moderation score is below 3.
    """
    print("Generating academic response")

    # Pre-extract context from PDF collection for relevancy (optional).
    # Done before any task is created so a retrieval error cannot leave a
    # guardrail task running in the background.
    context = extract_relevant_context(user_request, pdf_collection) if pdf_collection else "No specific PDF context available."

    topical_guardrail_task = asyncio.create_task(topical_guardrail(user_request))
    chat_task = asyncio.create_task(get_chat_response(user_request, context))

    try:
        # Always wait for the verdict first: returning the answer as soon as
        # the chat task finished would let a fast answer bypass the guardrail.
        guardrail_response = await topical_guardrail_task
        # Substring + case-insensitive match: the model sometimes wraps the
        # label in quotes or adds punctuation.
        if "not_allowed" in str(guardrail_response).lower():
            print("Topical guardrail triggered")
            return "I can only answer questions related to academic topics like programming, data science, or mathematics."
        chat_response = await chat_task
    finally:
        # On refusal or error the answer must not keep running in the
        # background; cancel() is a no-op when the task already finished.
        chat_task.cancel()
        await asyncio.gather(chat_task, return_exceptions=True)

    moderation_score = await moderation_guardrail(chat_response)
    if moderation_score < 3:
        print(f"Moderation guardrail triggered with score: {moderation_score}")
        return "The generated response failed to meet academic standards. Please rephrase your query or try again."

    print("Passed moderation")
    return chat_response

# Utility: Extract Relevant Context from PDF Collection
def extract_relevant_context(query, pdf_collection, n_results=1):
    """Extract relevant context from the PDF collection using embeddings.

    Args:
        query: Text to search the collection with.
        pdf_collection: Object exposing ``query(query_texts=..., n_results=...)``
            and returning a mapping whose ``"documents"`` entry holds one list
            of document strings per query text.
        n_results: How many of the closest documents to retrieve.

    Returns:
        The retrieved document text (several hits are joined with a blank
        line), or ``"No context available for the query."`` when nothing
        matched.
    """
    results = pdf_collection.query(
        query_texts=[query],
        n_results=n_results
    )
    # "documents" is a list with one inner list per query text; the inner list
    # holds plain strings, so it must be indexed by position, not by "text".
    per_query_docs = results.get("documents") or []
    first_query_docs = per_query_docs[0] if per_query_docs else None
    hits = [doc for doc in (first_query_docs or []) if doc]
    if hits:
        return "\n\n".join(hits)
    return "No context available for the query."


In [None]:

# Example Usage
async def main(pdf_dir: str = "./output"):
    """Build the PDF collection and run three sample queries through the guardrails.

    Args:
        pdf_dir: Directory containing the PDFs to index. The default matches
            the relative folder the PDF-generation cell writes to.
            ``create_pdf_collection`` looks for ``*.pdf`` inside a directory,
            so it must be given the folder rather than the PDF file itself.

    Any exception raised along the way is caught and printed, so the coroutine
    itself never raises.
    """
    try:
        # Create ChromaDB collection
        pdf_collection = create_pdf_collection(pdf_dir)

        # Simulate an academic query
        queries = [
            "Explain recursion with examples.",
            "What is Big-O notation?",
            "Tell me about horses."  # Out-of-scope query
        ]

        for query in queries:
            response = await generate_academic_response(query, pdf_collection)
            print(f"Query: {query}\nResponse: {response}\n")
    except Exception as e:
        print(f"Error during execution: {e}")
