<a href="https://colab.research.google.com/github/shivakumar-ravichandran/doc_generator/blob/main/document_generator_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Document Generation from Code

## Dependencies

In [None]:
!pip uninstall -y openai
!pip install openai==0.28
!pip install python-docx faiss-cpu sentence-transformers

Found existing installation: openai 1.54.4
Uninstalling openai-1.54.4:
  Successfully uninstalled openai-1.54.4
Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
Successfully installed openai-0.28.0
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Libraries

In [121]:
import os
import logging
import time
import hashlib
from pathlib import Path
import faiss
import openai
from docx import Document
from sentence_transformers import SentenceTransformer
from typing import List, Tuple, Any
import pickle
import requests
import numpy as np
from datetime import datetime
from urllib.parse import urlparse

In [122]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set up logging
LOG_DIR = "logs"
Path(LOG_DIR).mkdir(exist_ok=True)
log_file = f"{LOG_DIR}/rag_system_{time.strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

In [123]:
# Load OpenAI API key
API_KEY_FILE = "/content/openai_key.txt"
try:
    with open(API_KEY_FILE, "r") as file:
        openai.api_key = file.read().strip()
    logging.info("Successfully loaded OpenAI API key.")
except FileNotFoundError:
    logging.error(f"API key file '{API_KEY_FILE}' not found.")
    raise FileNotFoundError(
        f"Please ensure '{API_KEY_FILE}' exists and contains your API key."
    )

# Generate document

In [112]:
# Function to read and parse the RPG file
def read_rpg_file(file_path):
    """Reads the IBM RPG code file and returns its content line by line."""
    with open(file_path, "r") as file:
        lines = file.readlines()
    return lines

In [113]:
def fetch_rpg_code_from_github(repo_url):
    """Fetches the RPG code from a GitHub repository URL."""
    try:
        logging.info(f"Fetching RPG code from URL: {repo_url}")
        response = requests.get(repo_url)
        response.raise_for_status()
        logging.info("Successfully fetched RPG code.")
        return response.text
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to fetch RPG code: {e}")
        raise RuntimeError(f"Error fetching RPG code: {e}")


In [114]:
def generate_summary(rpg_code):
    """Generates a detailed summary of the RPG code using OpenAI GPT."""
    try:
        prompt = (
            "You are a technical writer tasked with summarizing legacy IBM RPG code. "
            "Create a high-level summary for the following code, making it understandable "
            "to both technical and non-technical audiences. Ensure the summary includes: "
            "an overview of the code's purpose, key functionalities, and any notable logic.\n\n"
            f"Code:\n{rpg_code}"
        )

        logging.info("Generating summary using OpenAI GPT.")
        response = openai.Completion.create(
            engine="gpt-3.5-turbo-instruct", prompt=prompt, max_tokens=1500, temperature=0.5
        )
        summary = response.choices[0].text.strip()
        logging.info("Summary generation successful.")
        return summary
    except Exception as e:
        logging.error(f"Error generating summary: {e}")
        raise RuntimeError(f"Error generating summary: {e}")

In [115]:
def create_summary_document(summary, output_file="RPG_Summary.docx"):
    """Creates a formatted Word document with the summary."""
    try:
        logging.info(f"Creating Word document: {output_file}")
        doc = Document()
        doc.add_heading("IBM RPG Code Summary", level=1)

        doc.add_paragraph(
            "This document provides a high-level summary of the provided IBM RPG code, "
            "suitable for both technical and non-technical audiences."
        )

        doc.add_heading("Summary", level=2)
        doc.add_paragraph(summary)

        doc.save(output_file)
        logging.info(f"Word document created successfully: {output_file}")
    except Exception as e:
        logging.error(f"Error creating Word document: {e}")
        raise RuntimeError(f"Error creating Word document: {e}")

In [116]:
# Read RPG file from local folder
rpg_file_path = "/content/sample_rpg_code.rpg"
rpg_code = read_rpg_file(rpg_file_path)

In [117]:
# Generate summary
summary = generate_summary(rpg_code)

# Create Word document
create_summary_document(summary)

KeyboardInterrupt: 

# Build RAG System

## Helper functions

In [None]:
def read_document(doc_path: str) -> str:
    """Reads the content of a .docx document."""
    try:
        doc = Document(doc_path)
        full_text = "\n".join(
            [para.text for para in doc.paragraphs if para.text.strip()]
        )
        logging.info(f"Document '{doc_path}' successfully read.")
        return full_text
    except Exception as e:
        logging.error(f"Error reading document '{doc_path}': {e}")
        raise

In [None]:
def chunk_text(text: str, max_chunk_size: int = 500) -> List[str]:
    """Splits text into manageable chunks."""
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        if len(" ".join(current_chunk)) >= max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    logging.info(f"Text successfully chunked into {len(chunks)} chunks.")
    return chunks

In [None]:
def embed_chunks(chunks: List[str], model: Any) -> List[Tuple[int, Any]]:
    """Generates vector embeddings for text chunks."""
    embeddings = model.encode(chunks)
    logging.info(f"Generated embeddings for {len(chunks)} chunks.")
    return [(i, emb) for i, emb in enumerate(embeddings)]

In [None]:
def save_vector_db(embeddings: List[Tuple[int, Any]], db_path: str):
    """Saves vector embeddings to a FAISS index."""
    vectors = [emb[1] for emb in embeddings]
    index = faiss.IndexFlatL2(len(vectors[0]))
    index.add(np.array(vectors))
    with open(db_path, "wb") as f:
        pickle.dump({"index": index, "chunks": embeddings}, f)
    logging.info(f"Vector database saved to '{db_path}'.")

In [None]:
def load_vector_db(db_path: str) -> Tuple[Any, List[str]]:
    """Loads a FAISS vector database."""
    try:
        with open(db_path, "rb") as f:
            data = pickle.load(f)
        logging.info(f"Vector database loaded from '{db_path}'.")
        return data["index"], data["chunks"]
    except Exception as e:
        logging.error(f"Error loading vector database: {e}")
        raise

In [None]:
def query_vector_db(
    query: str, model: Any, index: Any, chunks: List[Tuple[int, Any]], top_k: int = 3
) -> List[str]:
    """Retrieves top-k relevant chunks for a query."""
    query_vector = model.encode([query])[0]
    distances, indices = index.search(np.array([query_vector]), top_k)
    results = [chunks[i][1] for i in indices[0]]
    logging.info(f"Query successfully processed. Retrieved {len(results)} results.")
    return results

In [None]:
def generate_response(query: str, relevant_chunks: List[str]) -> str:
    """Generates a response using OpenAI GPT model based on retrieved chunks."""

    # Limit the number of chunks used in the prompt
    max_chunks = 2  # You might need to adjust this value

    relevant_chunks = relevant_chunks[:max_chunks]

    # Convert NumPy arrays to strings before joining
    relevant_chunks = [str(chunk) for chunk in relevant_chunks]

    context = "\n\n".join([chunk for chunk in relevant_chunks])

    # Limit the total length of the context to avoid exceeding token limit
    max_context_length = 3000  # Adjust this value based on your needs and model's limit

    if len(context) > max_context_length:
        context = context[:max_context_length] + "..." # Truncate the context and add an ellipsis

    prompt = (
        f"The following content is extracted from the document:\n\n"
        f"{context}\n\n"
        f"Using only the above content, answer the question below:\n\n"
        f"Question: {query}\n\n"
        f"Answer (based solely on the document):"
    )

    try:
        response = openai.Completion.create(
            engine="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=300,
            temperature=0.5,
        )
        print(response)
        answer = response.choices[0].text.strip()
        logging.info("Response generated successfully.")
        return answer
    except Exception as e:
        logging.error(f"Error generating response: {e}")
        raise

## Main Workflow

In [None]:
# Paths
doc_path = "RPG_Summary.docx"
vector_db_path = "vector_db.pkl"

### Step 1: Read the document

In [None]:
document_content = read_document(doc_path)

In [None]:
document_content

'IBM RPG Code Summary\nThis document provides a high-level summary of the provided IBM RPG code, suitable for both technical and non-technical audiences.\nSummary\nPurpose:\nThis code is used to demonstrate how to run a curl command and process the results in IBM RPG as JSON. It retrieves the current date and time from a website and displays it in a readable format.\n\nKey Functionalities:\n- Uses QSHCURL to run a curl command and retrieve data from a website as JSON\n- Utilizes YAJL library for parsing the JSON data\n- Displays the date, time, and milliseconds since epoch in a message\n\nNotable Logic:\n- The code uses QSHONI and QSHCURL libraries, which need to be installed for it to work properly.\n- The QSHCURL command writes the results to an IFS file, which is then read and parsed using the YAJLINTO parser.\n- Any errors during the process are handled gracefully.\n- The parsed data is displayed in a message for the user to see.'

### Step 2: Chunk the document content


In [None]:
chunks = chunk_text(document_content)

### Step 3: Embed the chunks


In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")  # Use a lightweight, efficient model
embeddings = embed_chunks(chunks, model)

In [None]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

### Step 4: Save embeddings to vector database

In [None]:
save_vector_db(embeddings, vector_db_path)

### Step 5: Load the vector database

In [None]:
index, chunks = load_vector_db(vector_db_path)


### Step 6: Query the vector database

In [None]:
user_query = "What are the Key Functionalities?"
relevant_chunks = query_vector_db(user_query, model, index, chunks)

### Step 7: Generate a detailed response

In [None]:
response = generate_response(user_query, relevant_chunks)

{
  "id": "cmpl-AYqnTqZjyR7ym8DSxb6JyyUx1UWLv",
  "object": "text_completion",
  "created": 1732869467,
  "model": "gpt-3.5-turbo-instruct",
  "choices": [
    {
      "text": "\n\nThe key functionalities are not explicitly stated in the document, but based on the content provided, it can be inferred that the document is discussing machine learning algorithms, specifically neural networks, and their use in natural language processing tasks. Some of the key functionalities mentioned include feature extraction, dimensionality reduction, and classification.",
      "index": 0,
      "logprobs": null,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 1841,
    "completion_tokens": 62,
    "total_tokens": 1903
  }
}


### Output response

In [None]:
print("Generated Response:\n", response)

Generated Response:
 The key functionalities are not explicitly stated in the document, but based on the content provided, it can be inferred that the document is discussing machine learning algorithms, specifically neural networks, and their use in natural language processing tasks. Some of the key functionalities mentioned include feature extraction, dimensionality reduction, and classification.


# Build RAG with Ranking

## Installation

In [120]:
!pip install chromadb
!pip uninstall -y openai==0.28
!pip install openai

Found existing installation: openai 0.28.0
Uninstalling openai-0.28.0:
  Successfully uninstalled openai-0.28.0
Collecting openai
  Downloading openai-1.55.3-py3-none-any.whl.metadata (24 kB)
Downloading openai-1.55.3-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.6/389.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
Successfully installed openai-1.55.3


In [126]:
!pip install --upgrade openai



## Crhomadb

In [139]:
import chromadb
from sentence_transformers import SentenceTransformer, CrossEncoder
from typing import List, Dict
from openai.embeddings_utils import get_embedding

In [138]:
# Initialize ChromaDB
chroma_client = chromadb.PersistentClient()

In [148]:
# Define the document embedding function
def embed_document(document: str, collection_name: str, embed_model_name: str = "text-embedding-ada-002") -> None:
    """Embed the document and store it in ChromaDB."""
    try:
        # Create or load collection
        collection = chroma_client.get_or_create_collection(name=collection_name)
        collection.add(
            documents=[document],
            metadatas=[{"source": "generated_document.docx"}],
            ids=["doc_1"]
        )
        logging.info(f"Document successfully embedded and stored in collection '{collection_name}'.")
    except Exception as e:
        logging.error(f"Error embedding document: {e}")
        raise

In [141]:
# Semantic search function
def semantic_search(query: str, collection_name: str, top_k: int = 5) -> List[Dict]:
    """Perform semantic search on the stored embeddings."""
    try:
        collection = chroma_client.get_collection(collection_name)
        query_embedding = SentenceTransformer("all-MiniLM-L6-v2").encode(query).tolist()
        results = collection.query(query_embeddings=[query_embedding], n_results=top_k)
        logging.info("Semantic search completed successfully.")
        return results
    except Exception as e:
        logging.error(f"Error during semantic search: {e}")
        raise

In [142]:
# Re-ranking function using CrossEncoder
def rerank_results(query: str, search_results: List[Dict], rerank_model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2") -> List[Dict]:
    """Re-rank search results using a CrossEncoder."""
    try:
        cross_encoder = CrossEncoder(rerank_model_name)
        query_pairs = [(query, result["document"]) for result in search_results]
        scores = cross_encoder.predict(query_pairs)

        # Combine scores with search results
        for i, result in enumerate(search_results):
            result["score"] = scores[i]

        # Sort by scores in descending order
        reranked_results = sorted(search_results, key=lambda x: x["score"], reverse=True)
        logging.info("Re-ranking completed successfully.")
        return reranked_results
    except Exception as e:
        logging.error(f"Error during re-ranking: {e}")
        raise

In [143]:
# Step 1: Embed the generated document
generated_doc_path = "RPG_Summary.docx"
document_content = read_document(generated_doc_path)

In [144]:
document_content

'IBM RPG Code Summary\nThis document provides a high-level summary of the provided IBM RPG code, suitable for both technical and non-technical audiences.\nSummary\nPurpose:\nThis code is used to demonstrate how to run a curl command and process the results in IBM RPG as JSON. It retrieves the current date and time from a website and displays it in a readable format.\n\nKey Functionalities:\n- Uses QSHCURL to run a curl command and retrieve data from a website as JSON\n- Utilizes YAJL library for parsing the JSON data\n- Displays the date, time, and milliseconds since epoch in a message\n\nNotable Logic:\n- The code uses QSHONI and QSHCURL libraries, which need to be installed for it to work properly.\n- The QSHCURL command writes the results to an IFS file, which is then read and parsed using the YAJLINTO parser.\n- Any errors during the process are handled gracefully.\n- The parsed data is displayed in a message for the user to see.'

In [149]:
embed_document(document_content, collection_name="document_summary_collection")

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 91.9MiB/s]


In [159]:
# Step 2: Perform semantic search
query = "What are the Key Functionalities?"
search_results = semantic_search(query, collection_name="document_summary_collection")



In [160]:
search_text = search_results["documents"][0][0]

In [163]:
prompt = (
    f"The following content is extracted from the document:\n\n"
    f"{search_text}\n\n"
    f"Using only the above content, answer the question below in a detailed and user-friendly manner:\n\n"
    f"Question: {query}\n\n"
    f"Answer (based solely on the document):"
)

response = openai.Completion.create(
    engine="gpt-3.5-turbo-instruct",
    prompt=prompt,
    max_tokens=300,
    temperature=0.5,
)
print(response)
answer = response.choices[0].text.strip()

{
  "id": "cmpl-AYs7Q2WMz6Rv9qyp9MFlOLn85Bwwl",
  "object": "text_completion",
  "created": 1732874548,
  "model": "gpt-3.5-turbo-instruct",
  "choices": [
    {
      "text": "\n\nThe key functionalities of the provided IBM RPG code are:\n\n1. Retrieving data from a website as JSON: The code uses the QSHCURL command to run a curl command and retrieve data from a website in the form of JSON. This allows the code to access and process data from external sources.\n\n2. Parsing JSON data using YAJL library: The code utilizes the YAJL library to parse the retrieved JSON data. This library is specifically designed for handling JSON data in RPG programs, making it easier to extract and manipulate the desired information.\n\n3. Displaying date and time information: The code retrieves the current date and time from the website and displays it in a readable format. This includes the date, time, and milliseconds since epoch, providing a comprehensive view of the current time.\n\n4. Error handlin

In [164]:
answer

'The key functionalities of the provided IBM RPG code are:\n\n1. Retrieving data from a website as JSON: The code uses the QSHCURL command to run a curl command and retrieve data from a website in the form of JSON. This allows the code to access and process data from external sources.\n\n2. Parsing JSON data using YAJL library: The code utilizes the YAJL library to parse the retrieved JSON data. This library is specifically designed for handling JSON data in RPG programs, making it easier to extract and manipulate the desired information.\n\n3. Displaying date and time information: The code retrieves the current date and time from the website and displays it in a readable format. This includes the date, time, and milliseconds since epoch, providing a comprehensive view of the current time.\n\n4. Error handling: The code is designed to handle any errors that may occur during the process. This ensures that the program runs smoothly and does not crash in case of unexpected errors.\n\n5. U

# Export the DB

In [171]:
settings = chroma_client.get_settings()

In [176]:
data_dir = settings.persist_directory

In [177]:
import shutil
shutil.copytree(data_dir, "exported_collection")

'exported_collection'

In [178]:
import zipfile
from google.colab import files

def zip_folder(folder_path, zip_file_name):
  with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(folder_path):
      for file in files:
        zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), folder_path))

# Replace 'your_folder_name' with the actual folder name
folder_name = 'exported_collection'
zip_file_name = 'exported_collection.zip'

zip_folder(folder_name, zip_file_name)

files.download(zip_file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>