In [2]:
from dotenv import load_dotenv

from genai.client import Client
from genai.credentials import Credentials
from genai.schema import TextGenerationParameters, TextGenerationReturnOptions
from genai.extensions.langchain import LangChainInterface
from genai.schema import (
    DecodingMethod,
    ModerationHAP,
    ModerationHAPInput,
    ModerationHAPOutput,
    ModerationParameters,
    TextGenerationParameters,
)

from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader, PyPDFium2Loader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, Milvus, FAISS
from langchain.embeddings import HuggingFaceEmbeddings

load_dotenv()
client = Client(credentials=Credentials.from_env())

model = LangChainInterface(
    model_id="mistralai/mixtral-8x7b-instruct-v0-1",
    client=Client(credentials=Credentials.from_env()),
    parameters=TextGenerationParameters(
        decoding_method=DecodingMethod.GREEDY,
        max_new_tokens=3000,
        min_new_tokens=1,
        # temperature=0.05,
        top_k=50,
        top_p=1,
        # stop_sequences=['```'],
    )
)

In [9]:
import fitz  # PyMuPDF
from PyPDF2 import PdfWriter

# Load the original PDF using PyMuPDFLoader and split it into documents
pdf_path = "requirement example/elm_rm_report.pdf"
doc = fitz.open(pdf_path)
documents = []
for page in doc:
    # Extract the page content and create a new document
    content = page.get_text("text")
    new_doc = fitz.open()
    new_doc.insert_pdf(doc, from_page=page.number, to_page=page.number)
    documents.append(new_doc)

# Update the page content in each document
for document in documents:
    for page in document:
        # Update the page content as desired
        page_content = page.get_text("text")
        updated_content = "yingkit Updated: " + page_content
        # Set the updated content back to the page
        # page.set_text(updated_content)

        # Clear existing page content
        page.delete_text()
        # Insert the updated content to the page
        page.insert_text(page.rect, updated_content)

# Create a new PDF writer
pdf_writer = PdfWriter()

# Add the pages from each updated document to the PDF writer
for document in documents:
    for page in document:
        # Convert the page to a PDF image
        pdf_bytes = page.get_pixmap().tobytes("pdf")
        # Add the PDF image to the PDF writer
        pdf_writer.add_page(pdf_bytes)

# Save the updated PDF file
output_path = 'requirement example/elm_rm_report.out.pdf'
with open(output_path, 'wb') as f:
    pdf_writer.write(f)

print(f"Updated PDF saved: {output_path}")

AttributeError: 'Page' object has no attribute 'delete_text'

In [6]:
file_path = "requirement example/elm_rm_report.pdf"

loader = PyMuPDFLoader(file_path)
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(data)
for doc in docs:
    print(doc.page_content)

Confidential
 Requirements Specification
Project Automotive Requirements
Prepared by Susan
  Apr 12, 2024, 3:17:36 AM UTC
Configuration Body Electronics Requirements
Configuration Type Local Configuration
Component Body
Confidential
Table of Contents
Introduction ....................................................................................................................................5
Documentation Requirements........................................................................................................5
Artifact Content........................................................................................................................................5
The system shall detect the opening of a side door........................................................................6
Artifact Content........................................................................................................................................6
Interior Lights.......................

In [2]:
def build_prompt(longinput):
    prompt_template = """[INST]be a business analyst, summary the requirement provided.\n
        <<SYS>>
        requirement:```{requirement}```
        <</SYS>>
        [/INST]
        summary:"""

    i = 0
    prompts = []
    while i+4000 <= len(longinput):
        chunk = [longinput[i:i+4000]]
        print(f"input>>{chunk}")
        prompts += [prompt_template.format(requirement=chunk)]
        i+=4000
    return prompts

In [3]:
from docx import Document

def build_summary(input_file_path):

    adoc = Document(input_file_path)

    summaries = []
    prompts = []

    longinput = ""

    for paragraph in adoc.paragraphs:
        inputtext = paragraph.text
        longinput += inputtext

    prompts += build_prompt(longinput)
        

    for table in adoc.tables:
        longinput = ""
        for row in table.rows:
            for cell in row.cells:
                inputtext = cell.text
                longinput += inputtext

        prompts += build_prompt(longinput)

    for response in model.generate(prompts):
        summaries += [response.generated_text]
        print(f"summary>>{response.generated_text}")

    return summaries


In [4]:
from pathlib import Path
import os

for path in Path('requirement example').rglob('*.docx'):
    print(path.name)
    inputfile = os.path.join(Path('requirement example'),path.name)
    summaries = build_summary(inputfile)

# output_file_path = input_file_path.replace(
#         '.docx', '-out.docx')

# adoc.save(output_file_path)

MEC - Developer Requirements and LBGUPS.docx
input>>['MEC Discovery and Beta Customer PilotPhase 1 - RequirementsAugust  - December 2021Discovery (Phase 1) Scope / OfferingOur phase 1 will start with offering a BETA offering on TELUS MEC. This beta offering will be tested using early adopter “friendly” Enterprise customers. This initial service will consider the foundational building blocks of our TELUS MEC network, and offer a basic subset of developer features to cater towards use cases which are 10ms or above within the range of our 4 sites. Developers will have partial self service for set up, configuration, lifecycle management, and monitoring of their applications using our TMEC service within Canada only. User / Enterprise Developer FunctionsPartial management from TELUS (creation / provisioning), with partial self service capabilities for user on lifecycle / container management and monitoring / alertingTELUS will manually provision container for customer based on initial consu