In [1]:
from data_preprocessing import DocumentConverter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Source documents for the conversion step. Each path is built with the `/`
# operator rather than a hard-coded "\" separator: a backslash is an ordinary
# filename character on POSIX, so a backslash-separated string only resolves
# on Windows.
input_paths = [
    Path("Input_Files") / file_name
    for file_name in (
        "JTI RFP Digital Landscape .pptx",  # the space before ".pptx" is part of the file name
        "RFP Classic template_from GIP Sharepoint.pdf",
        "RFP_Classic_template_upd.docx",
        "SOW- RFP Digital Landscape-v2.2 1.docx",
    )
]

In [3]:
# Convert every input document and keep the returned list of written markdown
# paths (displayed in the next cell). NOTE(review): despite the method name
# `convert_pdf_to_md`, the recorded output shows .pptx and .docx inputs being
# converted as well.
output_path = DocumentConverter(input_paths).convert_pdf_to_md()

Document JTI RFP Digital Landscape .pptx converted.
Saved markdown output to: Scratch
Document RFP Classic template_from GIP Sharepoint.pdf converted.
Saved markdown output to: Scratch
Document RFP_Classic_template_upd.docx converted.
Saved markdown output to: Scratch
Document SOW- RFP Digital Landscape-v2.2 1.docx converted.
Saved markdown output to: Scratch


In [4]:
output_path

[WindowsPath('Scratch/JTI RFP Digital Landscape .md'),
 WindowsPath('Scratch/RFP Classic template_from GIP Sharepoint.md'),
 WindowsPath('Scratch/RFP_Classic_template_upd.md'),
 WindowsPath('Scratch/SOW- RFP Digital Landscape-v2.2 1.md')]

# Testing Document Parser

In [69]:
import logging
from pathlib import Path
import json

In [53]:
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [54]:
# Logger for this notebook; the conversion loop below emits its debug dump through it.
_log = logging.getLogger(__name__)

In [66]:
# Documents handed to the docling converter below. Only one entry is active;
# the commented ones are alternates for quick switching. Paths are joined with
# `/` so they resolve on POSIX as well as Windows, and every entry ends with a
# comma so un-commenting a neighbour cannot produce a syntax error.
input_paths = [
    # Path("Input_Files") / "RFP Classic template_from GIP Sharepoint.pdf",
    # Path("Input_Files") / "RFP_Classic_template_upd.docx",
    Path("Input_Files") / "SOW- RFP Digital Landscape-v2.2 1.docx",
    # Path("Input_Files") / "JTI RFP Digital Landscape .pptx",
    # Path("scratch") / "SOW_RFP_Digital_Landscape.md",
]

In [73]:
# Every argument here is optional -- docling has internal defaults for all of them.
doc_converter = DocumentConverter(
    # Whitelist of formats; files in any other format are ignored.
    allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
        ),
        # No backend is passed for DOCX (an explicit
        # `backend=MsWordDocumentBackend` is deliberately left out).
        InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
    },
)

In [None]:

conv_results = doc_converter.convert_all(input_paths)

# All exports go to one folder. Create it up front so the first write cannot
# fail with FileNotFoundError on a fresh checkout.
out_path = Path("scratch")
out_path.mkdir(parents=True, exist_ok=True)

for res in conv_results:
    print(
        f"Document {res.input.file.name} converted."
        f"\nSaved markdown output to: {out_path!s}"
    )
    # Debug-level dump of the parsed document structure.
    _log.debug(res.document._export_to_indented_text(max_text_len=16))

    stem = res.input.file.stem
    # NOTE(review): the files are written with the platform default encoding;
    # if you standardise on UTF-8, change the code that reads them back as well.

    # Export Docling document format to markdown:
    with (out_path / f"{stem}.md").open("w") as fp:
        fp.write(res.document.export_to_markdown())

    # ... to JSON (the dict form of the document):
    with (out_path / f"{stem}.json").open("w") as fp:
        fp.write(json.dumps(res.document.export_to_dict()))

    # ... and to plain text:
    with (out_path / f"{stem}.txt").open("w") as fp:
        fp.write(res.document.export_to_text())
    

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document SOW- RFP Digital Landscape-v2.2 1.docx
INFO:docling.document_converter:Finished converting document SOW- RFP Digital Landscape-v2.2 1.docx in 0.25 sec.


Document SOW- RFP Digital Landscape-v2.2 1.docx converted.
Saved markdown output to: scratch


In [55]:
def read_markdown_file(file_path, encoding=None):
    """
    Reads a markdown file and returns its content as a string.

    Args:
        file_path (str | os.PathLike): The path to the markdown file.
        encoding (str | None): Text encoding handed to ``open``. ``None``
            (the default) uses the platform default encoding, so existing
            callers behave exactly as before; pass ``"utf-8"`` for files that
            are known to be UTF-8.

    Returns:
        str: The content of the markdown file, or None if an error occurs
        (the error is printed, not raised).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return None
    except Exception as e:
        # Best-effort helper: report any other failure (permissions, decoding,
        # ...) and signal it with None instead of stopping the notebook.
        print(f"An error occurred: {e}")
        return None

# Example usage:

In [58]:
# Forward slashes are accepted on Windows and POSIX alike, whereas a
# backslash-separated path only resolves on Windows.
file_path = './scratch/SOW- RFP Digital Landscape-v2.2 1.md'
content = read_markdown_file(file_path)

In [None]:
# Split on markdown heading levels 1-3; each tuple pairs the heading marker
# ("#", "##", "###") with the metadata key ("Header 1" ... "Header 3").
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in (1, 2, 3)]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(content)
md_header_splits

[Document(metadata={}, page_content='**Request for Proposal (RfP) brief – digital landscape overhaul**  \n**Table of content**  \n- **Background**\n- **Objectives of our corporate website**\n- **General requirements for our digital landscape**\n- **Scope and deliverables of the digital landscape overhaul**\n- **JTI.com (Q3-4 2023)**\n- **Careers.jti.com (Q1 2024)**\n- **JT-Science.com (Q1-2 2024)**\n- **Website hosting (Q3-Q4 2023)**\n- **Social media community management support (Q3 2023)**\n- **Technical requirements**\n- **References**  \n1.Background  \nThe eventual scope of this RfP is overhauling JTI’s digital landscape, which is comprised of our current corporate website at the core, complemented by microsites such as the career platform, science communication website and other smaller websites (see 4.d.). Over the years, add-ons and ad hoc updates have been pushed to serve various agendas. This resulted in creating a disjointed digital ecosystem, which is no longer in line with

In [24]:
# Inspect the text of a single chunk. NOTE(review): index 7 is presumably a
# hand-picked position; it raises IndexError if the split yields fewer chunks.
print(md_header_splits[7].page_content)

Do not submit generic marketing materials, broadly descriptive attachments, or other general literature.  
Indicative timetables for evaluation and award of this RFP are as follows:  
| EVENT                                                | DEADLINE   |
|------------------------------------------------------|------------|
| Distribution of RFP                                  |            |
| Intent to participate email                          |            |
| Submission of clarification  questions by email only |            |
| Distribution of answers to  clarification questions  |            |
| Receipt of RFP responses                             |            |
| Submission of samples /  Presentations (if required) |            |
| Selection decision                                   |            |
| Estimated start date  (subject to contract)          |            |  
Presentations will be taking place at our location office. You will be given a 2-hour time slot in which you must 

In [None]:
from prompts.questions_for_sections import intro

In [4]:
# Render the imported prompt template for the "Introduction" section.
intro.format(section="Introduction")

'Create an Introduction section for the RFP document, giving a brief overview on JTI (Japan Tobacco International).'

In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from prompts.classification_prompt import classification_prompt
from agents.classification_agent import classify_rfx

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import fitz
from llm_calling import llm_calling
from langchain_core.documents import Document
from data_preprocessing import data_preprocessing
import os

In [4]:
def file_names(directory):
    """Return the paths of the PDF files located directly in ``directory``.

    Args:
        directory (str): Folder to scan (not recursive).

    Returns:
        list[str]: ``os.path.join(directory, name)`` for every regular file
        whose name ends in ".pdf" (case-insensitive, so ".PDF" matches too),
        sorted by file name. ``os.listdir`` returns entries in arbitrary
        order, so sorting keeps downstream processing reproducible.

    Raises:
        OSError: If ``directory`` cannot be listed (e.g. it does not exist).
    """
    return [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(directory, name))
    ]

In [5]:
def langchain_doc_creation(file_path):
    """Load every given file and return all loaded documents in one list.

    Args:
        file_path: Iterable of file paths (plural, despite the parameter name).

    Returns:
        list: The ``load_data()`` results of one ``data_preprocessing`` loader
        per path, concatenated in input order.
    """
    documents = []
    for single_path in file_path:
        loader = data_preprocessing(file_path=single_path)
        documents.extend(loader.load_data())
    return documents

In [6]:
# Folder holding the user-supplied PDFs.
directory = "./User_File"
# NOTE: despite its singular name, `file_path` is the *list* of PDF paths
# returned by file_names().
file_path = file_names(directory)
# All documents loaded from those PDFs, in one flat list.
doc_input = langchain_doc_creation(file_path)

In [7]:
# Concatenate the text of every loaded document, each followed by a blank line.
doc_txt = "".join(d.page_content + "\n\n" for d in doc_input)

In [8]:
# Alias for the imported classification prompt.
# NOTE(review): `system_prompt` is not referenced by any later visible cell.
system_prompt = classification_prompt

In [None]:
# Classify the concatenated document text with the qwen2.5:7b model; the
# recorded result below is a dict of the form {'rfx_type': 'RFP'}.
rfx_type = classify_rfx(text=doc_txt,model_name="qwen2.5:7b").classify_rfx_solve()

In [9]:
rfx_type

{'rfx_type': 'RFP'}

In [18]:
from creating_retriever import universal_retrieval,user_retriever
from langchain_qdrant import QdrantVectorStore
from llm_calling import llm_calling
from pathlib import Path
from langgraph.types import Command, interrupt

In [2]:
type_of_retrieval = "dense" #@param ["dense", "sparse", "hybrid"]
collection_name = 'SOW-_RFP_Digital_Landscape-v2'
# Root folder of the on-disk Qdrant store holding the user's documents.
path = f"./tmp/langchain_qdrant_user_{type_of_retrieval}"
# Its presence is used as the signal that the collection was already built.
my_file = Path(path) / "collection" / collection_name / "storage.sqlite"

# Shared embedding model; the next cell reuses `embeddings` too.
embeddings = llm_calling(embedding_model="llama3.2:latest").call_embed_model()

if my_file.is_file():
    print("DB Exists")
    retriever_user = universal_retrieval(
        collection_name=collection_name,
        embeddings=embeddings,
        path=path,
    ).load_existing_vdb_collection()
else:
    # Fail here with a clear message instead of leaving `retriever_user`
    # undefined and hitting a NameError several cells later.
    raise FileNotFoundError(
        f"Vector store for collection {collection_name!r} not found at {my_file}; "
        "build the collection before running this cell."
    )

DB Exists


In [3]:
type_of_retrieval = "dense" #@param ["dense", "sparse", "hybrid"]
collection_name = f"jti_rfp_{type_of_retrieval}"
# Root folder of the on-disk Qdrant store holding the reference ("universal") documents.
path = f"./tmp/langchain_qdrant_{type_of_retrieval}"
# Its presence is used as the signal that the collection was already built.
my_file = Path(path) / "collection" / collection_name / "storage.sqlite"
#directory = "./Input_Files"

if my_file.is_file():
    print("DB Exists")
    # `embeddings` is the model created in the previous cell.
    retriever_input = universal_retrieval(
        collection_name=collection_name,
        embeddings=embeddings,
        path=path,
    ).load_existing_vdb_collection()
else:
    # Fail here with a clear message instead of leaving `retriever_input`
    # undefined and hitting a NameError when the graph is built.
    raise FileNotFoundError(
        f"Vector store for collection {collection_name!r} not found at {my_file}; "
        "build the collection before running this cell."
    )

DB Exists


In [4]:
from agents.brief_intake_agent import brief_intake
from pprint import pprint

In [5]:
# Build the brief-intake application that the cells below stream from.
# `un_retriever` receives the retriever over the jti_rfp_* collection and
# `us_retriever` the one over the user's document collection (presumably
# "universal" and "user", matching the routing messages in the run logs).
app = brief_intake(un_retriever=retriever_input,us_retriever=retriever_user
                   ,model_name="qwen2.5:7b").run_brief_intake()

In [6]:
# Run
inputs = {
    "question": "Create an Introduction section for the RFP document, giving a brief overview on JTI (Japan Tobacco International)."
}
thread = {"configurable": {"thread_id": "1"}}

# Pre-bind the loop variables so the summary below cannot raise NameError when
# the stream yields nothing. Later cells read `key` and `value` as well.
key = value = None
for output in app.stream(inputs, thread):
    for key, value in output.items():
        # Node
        pprint(f"Node '{key}':")

# Final generation: only available when the last update carries a
# "generation" entry (e.g. not when the run stops at an interrupt).
try:
    generation = value["generation"]
except (KeyError, TypeError):
    print(f"No generation available; the run stopped at node {key!r}.")
else:
    pprint(generation)

---TRANSFORM QUERY---
---ROUTE QUESTION---
---ROUTE QUESTION TO UNIVERSAL VECTORSTORE---
"Node 'transform_query':"
---RETRIEVE UNIVERSAL---
"Node 'retrieve_universal':"
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
"Node 'grade_documents':"
---GENERATE---
"Node 'generate':"
('To create a brief introduction section for the RFP document that provides an '
 'overview of Japan Tobacco International (JTI), you can use the following '
 'template:\n'
 '\n'
 '---\n'
 '\n'
 '**Introduction**\n'
 '\n'
 'Japan Tobacco International (JTI) is a leading international tobacco and '
 'vaping company headquartered in Geneva, Switzerland. Our goal is to become '
 'the most successful and responsible tobacco company in the world. For more '
 'detailed information about JTI globally, please refer to Appendix 1 of this '
 'document. To learn more about our

In [8]:
# `value` is the loop variable left over from the last streamed update above;
# keep its generated text for post-processing.
gen = value["generation"]

In [11]:
# Keep only the part that follows the first "---" separator line (up to the
# next one, if any). NOTE(review): raises IndexError when the model output
# contains no such separator.
gen.split("\n\n---\n\n")[1]

'**Introduction**\n\nJapan Tobacco International (JTI) is a leading international tobacco and vaping company headquartered in Geneva, Switzerland. Our goal is to become the most successful and responsible tobacco company in the world. For more detailed information about JTI globally, please refer to Appendix 1 of this document. To learn more about our local market presence, visit our website.'

In [17]:
# NOTE(review): this expects `value` to be the payload of an '__interrupt__'
# update (a sequence whose first item exposes `.value`, here the follow-up
# question), i.e. it must run after an interrupted stream -- confirm the
# intended cell order.
value[0].value

'Please provide RELEVANT documents supporting the How can we extract key evaluation metrics from the uploaded document to create a comprehensive Evaluation Criteria section for this RFP? question:'

In [20]:
# Run
inputs = {
    "question": "From the document uploaded, create an Evaluation Criteria section which will point down all the evaluation metrics for this RFP."
}
# A separate thread id from the first run above.
thread = {"configurable": {"thread_id": "3"}}
# Stream per-node updates. `key` and `value` intentionally stay bound after
# the loop: the following cells test `key == '__interrupt__'` to decide
# whether the run is waiting for human input.
for output in app.stream(inputs,thread, stream_mode="updates"):
    for key, value in output.items():
        # Node
        pprint(f"Node '{key}':")
    pprint("\n---\n")

# Final generation
# (disabled: in the recorded run the stream ends on '__interrupt__', so there
# is no "generation" entry to print yet)
#pprint(value["generation"])

---TRANSFORM QUERY---
---ROUTE QUESTION---
---ROUTE QUESTION TO USER VECTORSTORE---
"Node 'transform_query':"
'\n---\n'
---RETRIEVE USER---
"Node 'retrieve_user':"
'\n---\n'
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, TRANSFORM QUERY---
"Node 'grade_documents':"
'\n---\n'
---NEED HUMAN INPUT---
Question: How can we extract key evaluation metrics from the uploaded document to create a comprehensive Evaluation Criteria section for this RFP?
"Node '__interrupt__':"
'\n---\n'


In [23]:
key=='__interrupt__'

True

In [21]:
# Human-supplied context that is sent back to the interrupted run (via
# Command(resume=...)) in the cell below.
additional_txt = """Evaluation Criteria: JTI will evaluate the submissions and presentations based on, but not limited to the following selection criteria:
1. Bidder's capabilities and ability to service JTI and affiliates.
2. Scope of services and delivery standards.
3. Bidder's references and reputation.
4. Bidder's value-added services.
5. Such other factors as may be described elsewhere in this RFP.
6. Cost – demonstrated value proposition.
7. Sustainability – Demonstrating strong evidence that the Supplier’s sustainability commitments will support JTI Supplier Standards and strengthen JT Group Tobacco Business sustainability strategy.
8. Technical Solutions and Services to be compliant with JTI’s Robustness.
"""

In [24]:
# Resume only when the previous stream stopped at an interrupt. `key` is the
# loop variable left over from that stream, and `thread` must be the same
# thread config so the run continues where it paused.
if key == '__interrupt__':

    for output in app.stream(Command(resume=additional_txt),thread, stream_mode="updates"):
        for key, value in output.items():
            # Node
            pprint(f"Node '{key}':")
            # Optional: print full state at each node
            # pprint.pprint(value["keys"], indent=2, width=80, depth=None)
        pprint("\n---\n")

    # NOTE(review): assumes the last streamed update carries a "generation"
    # entry; raises otherwise (e.g. if the run is interrupted again).
    pprint(value["generation"])

---NEED HUMAN INPUT---
Question: How can we extract key evaluation metrics from the uploaded document to create a comprehensive Evaluation Criteria section for this RFP?
"Node 'human_interrupt':"
'\n---\n'
---GENERATE---
"Node 'generate':"
'\n---\n'
('Based on the provided context, you can extract the following key evaluation '
 'metrics for creating a comprehensive Evaluation Criteria section for this '
 'RFP:\n'
 '\n'
 "1. **Bidder's Capabilities and Ability to Service JTI and Affiliates**\n"
 '2. **Scope of Services and Delivery Standards**\n'
 "3. **Bidder's References and Reputation**\n"
 "4. **Bidder's Value-Added Services**\n"
 '5. **Cost – Demonstrated Value Proposition**\n'
 '6. **Sustainability** - This includes demonstrating strong evidence that the '
 'Supplier’s sustainability commitments will support JTI Supplier Standards '
 'and strengthen JT Group Tobacco Business sustainability strategy.\n'
 '7. **Technical Solutions and Services to be Compliant with JTI’s '
 'Robustn

In [30]:
value['generation'].split("\n")

['Based on the provided context, you can extract the following key evaluation metrics for creating a comprehensive Evaluation Criteria section for this RFP:',
 '',
 "1. **Bidder's Capabilities and Ability to Service JTI and Affiliates**",
 '2. **Scope of Services and Delivery Standards**',
 "3. **Bidder's References and Reputation**",
 "4. **Bidder's Value-Added Services**",
 '5. **Cost – Demonstrated Value Proposition**',
 '6. **Sustainability** - This includes demonstrating strong evidence that the Supplier’s sustainability commitments will support JTI Supplier Standards and strengthen JT Group Tobacco Business sustainability strategy.',
 '7. **Technical Solutions and Services to be Compliant with JTI’s Robustness**',
 '',
 'These metrics cover a broad range of factors that can help in evaluating the submissions comprehensively.']

In [1]:
import json
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

In [2]:
# Outline of the generated document: section title -> list of subsection
# titles (empty list = section without subsections). build_doc_from_json()
# derives its lookup keys from these titles: the text before the first "."
# for sections ("A") and the first whitespace-separated token for
# subsections ("A.1").
TOC = {
    "A. INTRODUCTION": ["A.1 JTI", "A.2 Our Engagement"],
    "B. PURPOSE OF THE RFP": ["B.1 Responses", "B.2 Schedule", "B.3 Queries", "B.4 Evaluation Criteria"],
    "C. CONTEXT": ["C.1 Project Scope and Objective", "C.2 JTI Requirements", "C.3 Proposal evaluation criteria"],
    "D. RESPONSE": ["D.1 Executive Summary", "D.2 Additional proposal details"],
    "E. APPENDICES": []
}

In [3]:
def add_heading(doc, text, level):
    """Append a left-aligned heading to ``doc`` and return it.

    Args:
        doc: The python-docx document being built.
        text (str): Heading text.
        level (int): Heading level passed through to ``doc.add_heading``.

    Returns:
        The heading paragraph, mirroring ``add_paragraph`` so callers can
        tweak it further. Callers that ignore the result are unaffected.
    """
    p = doc.add_heading(text, level=level)
    p.alignment = WD_ALIGN_PARAGRAPH.LEFT
    return p

def add_paragraph(doc, text):
    """Append a body paragraph set in 11 pt to ``doc`` and return it.

    The size is applied to the paragraph's own runs. Assigning it through
    ``p.style.font`` would instead modify the shared paragraph style object,
    silently restyling every other paragraph in the document that uses it.

    Args:
        doc: The python-docx document being built.
        text (str): Paragraph text.

    Returns:
        The newly added paragraph.
    """
    p = doc.add_paragraph(text)
    for run in p.runs:  # no runs exist when `text` is empty
        run.font.size = Pt(11)
    return p

def insert_toc(paragraph):
    """Insert a field code for TOC that Word will convert into a clickable TOC"""

    def _field_char(char_type):
        # One <w:fldChar> marker of the given type ('begin'/'separate'/'end').
        marker = OxmlElement('w:fldChar')
        marker.set(qn('w:fldCharType'), char_type)
        return marker

    # The field instruction itself; whitespace inside it must be preserved.
    instruction = OxmlElement('w:instrText')
    instruction.set(qn('xml:space'), 'preserve')
    instruction.text = 'TOC \\o "1-3" \\h \\z \\u'

    # A field is the sequence begin -> instruction -> separate -> end,
    # all placed inside one run of the paragraph.
    toc_run = paragraph.add_run()
    for node in (
        _field_char('begin'),
        instruction,
        _field_char('separate'),
        _field_char('end'),
    ):
        toc_run._r.append(node)

def build_doc_from_json(data_json, output_path="drafts/Generated_Document.docx", toc=None):
    """Render section content into a Word document that follows an outline.

    Args:
        data_json (dict): Content keyed by section letter. For a section with
            subsections the value is a dict keyed by subsection number, e.g.
            ``{"A": {"A.1": "text", "A.2": "text"}}``; for a section without
            subsections the value is the text itself, e.g. ``{"E": "text"}``.
            Missing (or ``None``) entries are rendered as
            "(No content provided)".
        output_path (str): Where to save the .docx file. Missing parent
            folders are created.
        toc (dict | None): Outline mapping section titles to lists of
            subsection titles. Defaults to the module-level ``TOC``.

    Returns:
        str: ``output_path``, unchanged.
    """
    import os  # local import keeps this cell independent of earlier import cells

    placeholder = "(No content provided)"
    outline = TOC if toc is None else toc

    def _as_text(value):
        # Paragraph text must be a string; fall back to the placeholder for None.
        return placeholder if value is None else str(value)

    doc = Document()
    doc.add_heading("Generated Proposal", level=0)

    # Insert TOC Placeholder
    toc_paragraph = doc.add_paragraph()
    insert_toc(toc_paragraph)
    doc.add_page_break()

    for section_title, subsections in outline.items():
        section_key = section_title.split(".")[0]  # "A. INTRODUCTION" -> "A"
        section_data = data_json.get(section_key)
        add_heading(doc, section_title, level=1)
        if subsections:
            # Tolerate a missing or malformed section entry instead of
            # failing on `.get` of a non-dict value.
            if not isinstance(section_data, dict):
                section_data = {}
            for subsection in subsections:
                subsection_key = subsection.split()[0]  # "A.1 JTI" -> "A.1"
                add_heading(doc, subsection, level=2)
                add_paragraph(doc, _as_text(section_data.get(subsection_key)))
        else:
            add_paragraph(doc, _as_text(section_data))

    # Saving fails if the target folder is missing, so create it first
    # (only for real paths; `doc.save` also accepts file-like objects).
    if isinstance(output_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    doc.save(output_path)
    print(f"Document saved as {output_path}")
    print("Open the document in Word and press F9 to update the TOC!")

    return output_path

In [4]:
# Smoke test of the document layout. TOC itself is passed as the data; its
# keys are full titles ("A. INTRODUCTION") rather than the short keys the
# builder looks up ("A", "A.1"), so every section is rendered with the
# "(No content provided)" placeholder.
build_doc_from_json(data_json=TOC)

Document saved as drafts/Generated_Document.docx
Open the document in Word and press F9 to update the TOC!


'drafts/Generated_Document.docx'