In [1]:
import os

from langchain_core.messages import HumanMessage
from langchain_openai import AzureChatOpenAI

# Azure OpenAI connection settings.
# Non-secret defaults are applied only when the variable is not already set,
# so values exported before the kernel starts take precedence.
os.environ.setdefault("OPENAI_API_VERSION", "2024-02-15-preview")
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "https://openaimodelv3si.openai.azure.com/")
# SECURITY: AZURE_OPENAI_API_KEY is deliberately NOT assigned here. Provide it
# through the environment (shell export, .env loader, secrets manager) before
# running the notebook. A key was previously hardcoded on this line; treat it
# as exposed and rotate it.

In [2]:
# Chat model client. The API version is read from OPENAI_API_VERSION (set in
# the configuration cell) so the chat client and the embeddings client target
# the same Azure OpenAI API version instead of a second hardcoded one.
llm = AzureChatOpenAI(
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
    azure_deployment="trial1",
)

In [3]:
# Smoke test: send a single human message through the chat deployment.
# The cell's last expression displays whatever llm.invoke returns.
message = HumanMessage(
    content="Translate this sentence from English to French. I love programming."
)
llm.invoke([message])

AIMessage(content="J'adore la programmation.", response_metadata={'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {}})

In [4]:
from langchain_community.document_loaders import PyPDFLoader

# Load the annual-report PDF and split it into Document chunks.
# NOTE(review): absolute, user-specific path -- the notebook will not run on
# another machine without editing it.
loader = PyPDFLoader("/home/chandan/orange/code/2021-annual-report.pdf")
pages = loader.load_and_split()
# NOTE(review): this displays every chunk; a slice or len(pages) would keep
# the cell output small.
pages

[Document(page_content='© 2022 Wells˜Fargo & Company.  All rights reserved.\nDeposit products offered through Wells˜Fargo Bank, N.A. Member FDIC.CCM7565  (Rev 00, 1/each)WELLS FARGO & COMPANY\n420 MONTGOMERY STREET | SAN FRANCISCO, CA | 941041˜866˜878˜5865 | WELLSFARGO.COMWELLS˜FARGO WELLS˜FARGO & COMPANY °˛°˝ ANNUAL REPORT\n   \n 2021 \nAnnual Report  | Wells Fargo & Company', metadata={'source': '/home/chandan/orange/code/2021-annual-report.pdf', 'page': 0}),
 Document(page_content='Contents \nLetter from CEO II \nOur Performance XV \nOperating Committee XVI \nBoard of Directors XVII \n2021 Financial Report 1 \nStock Performance 203 \nAbout Wells Fargo 204', metadata={'source': '/home/chandan/orange/code/2021-annual-report.pdf', 'page': 2}),
 Document(page_content='A letter from \nCharles W. Scharf \nCEO and President \nWells Fargo & Company As I write this letter refecting on 2021, it is difcult \nto describe what we have all been through and the extent to which COVID-19 has impacte

In [5]:
# from langchain_openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings()

from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Embeddings client for the "embed1" deployment; the API version is taken from
# the OPENAI_API_VERSION environment variable.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="embed1",
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
)
# Build a FAISS vector store from the loaded report chunks.
vector = FAISS.from_documents(pages, embeddings)
# Retriever view of the store; passed to the QA chain in a later cell.
retriever = vector.as_retriever()

In [14]:
from langchain.chains import RetrievalQA
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)

# System-prompt text for the QA chain. {context} is the only placeholder and
# is declared as the template's input variable below.
review_template = """Your job is to use ESG (A sustainability report is a report published by companies on the environmental, social and governance (ESG) impacts of their activities) 
documents and annual reports to answer questions. Use
the following context to answer questions. Be as detailed as possible, but
don't make up any information that's not from the context. If you don't know
an answer, say you don't know.
{context}
"""

# Wrap the template as the system message of the chat prompt.
esg_system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(input_variables=["context"], template=review_template)
)

# esg_human_prompt = HumanMessagePromptTemplate(
#     prompt=PromptTemplate(input_variables=["question"], template="Answer the question: '{question}' by filling the None in the table: ['Start date', 'End date', 'Indicate if you are providing emissions data for past reporting years', 'Select the number of past reporting years you will be providing Scope 2 emissions data for*', 'Select the number of past reporting years you will be providing Scope 2 emissions data for*', 'Select the number of past reporting years you will be providing Scope 3 emissions data for*'][None, None, None, None, None, None] where list is a row of the table")
# )
# esg_human_prompt = HumanMessagePromptTemplate(
#     prompt=PromptTemplate(input_variables=["question"], template="{question}.\n Answer according to the response format provided. The table(if provided) in the response options is passed as list of columns and each list is a rows. Rows are divided by newline. Merge them with contextual meaning and then fill the table")
# )
# Active human-message template: the {question} text followed by fixed
# instructions describing how a response table is serialized (one list per
# row, rows separated by newlines) and asking the model to fill it.
esg_human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(input_variables=["question"], template="{question}.\n Answer according to the response format provided. The table(if provided) in the response options is passed as list of columns and each list is a rows. Rows are divided by newline. Merge them with contextual meaning and then fill the table.")
)
# esg_human_prompt = HumanMessagePromptTemplate(
#     prompt=PromptTemplate(input_variables=["question"], template="{question}.\n Answer according to the response format provided. The table(if provided) in the response options is passed as list of columns comma separated columns. Rows are divided by newline.")
# )
# esg_human_prompt = HumanMessagePromptTemplate(
#     prompt=PromptTemplate(input_variables=["question"], template="{question}.\n Answer according to the response format provided. The table(if provided) in the response options is passed as list of columns comma separated columns. Rows are divided by newline. When you're returning a table, please mention it by flag 'Table:'")
# )
# Chat prompt = system message (carries {context}) + human message (carries
# {question}).
messages = [esg_system_prompt, esg_human_prompt]

esg_prompt = ChatPromptTemplate(
    input_variables=["context", "question"], messages=messages
)

# Retrieval QA chain using the "stuff" strategy. The custom prompt is supplied
# through chain_type_kwargs at construction time instead of overwriting
# combine_documents_chain.llm_chain.prompt afterwards, so the chain is built
# and validated with the prompt it will actually use and no private attribute
# path has to be reached into.
esg_vector_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": esg_prompt},
)

In [11]:
# query = "State the start and end date of the year for which you are reporting data and indicate whether you will be providing emissions data for past reporting years."
# Sample input: question C0.2 followed by its response-options table, written
# as one bracketed list of cell strings per line (the same textual shape that
# Table.get_table produces further down in this notebook).
query = "(C0.2) State the start and end date of the year for which you are reporting data and indicate whether you will be providing emissions data for past reporting years.\nConnection to other frameworks RE100\nResponse options\nPlease complete the following table. * Column/row appearance is dependent on selections in this or other questions.\n['Start date', 'End date', 'Indicate if you are providing', 'Select the number of past', 'Select the number of past', 'Select the number of past']\n['', '', 'emissions data for past', 'reporting years you will be', 'reporting years you will be', 'reporting years you will be']\n['', 'None', 'reporting years', 'providing Scope 1 emissions data for*', 'providing Scope 2 emissions data for*', 'providing Scope 3 emissions data for*']\n['From: [DD/MM/YYYY]', 'To: [DD/MM/YYYY]', 'Select from:', 'Select from:', 'Select from:', 'Select from:']\n['', '', '● Yes', '● 1 year', '● 1 year', ':selected: ● 1 year']\n['', '', ':selected: ● No', '· 2 years', '● 2 years', '· 2 years']\n['', '', '', '· 3 years', '● 3 years', '· 3 years']\n['', '', '', '● 4 years', '● 4 years', '· 4 years']\n['', '', '', '● 5 years', '● 5 years', '● 5 years']\n['', '', '', '● Not providing past', '● Not providing past', '● Not providing past']\n['None', 'None', '', 'emissions data for', 'emissions data for', 'emissions data for']\n['None', 'None', '', 'Scope 1', 'Scope 2', 'Scope 3']"
# query = "(C-FS0.7) Which activities does your organization undertake, and which industry sectors does your organization lend to, invest in, and/orinsure? Question dependencies This question is mandatory, and your response to it determines which questions will be shown throughout the questionnaire and which response options will be presented within these questions. If no selection is made in column 4, the FW-FS module will not appear. Minor change\n Connection to other frameworks\n TCFD\n Financial Sector\n Response options\nPlease complete the following table: (*column/row appearance is dependent on selections in this or other questions)\n\nActivity,Does your organization undertake this activity?,Insurance types underwritten*,Industry sectors your organization lends to, invests in, and/or insures*\nBanking (Bank),Select from:,Select all that apply:,See drop-down options below\nNone,:selected: ● Yes,· General (non-life),\nNone,:selected: ● No,· Life and/or Health,\nInvesting (Asset manager),,,\nInvesting (Asset owner),,,\nInsurance underwriting (Insurance company),,,"
# Run the retrieval QA chain on the query; the returned value is displayed as
# the cell output.
response = esg_vector_chain.invoke(query)
response

{'query': "(C0.2) State the start and end date of the year for which you are reporting data and indicate whether you will be providing emissions data for past reporting years.\nConnection to other frameworks RE100\nResponse options\nPlease complete the following table. * Column/row appearance is dependent on selections in this or other questions.\n['Start date', 'End date', 'Indicate if you are providing', 'Select the number of past', 'Select the number of past', 'Select the number of past']\n['', '', 'emissions data for past', 'reporting years you will be', 'reporting years you will be', 'reporting years you will be']\n['', 'None', 'reporting years', 'providing Scope 1 emissions data for*', 'providing Scope 2 emissions data for*', 'providing Scope 3 emissions data for*']\n['From: [DD/MM/YYYY]', 'To: [DD/MM/YYYY]', 'Select from:', 'Select from:', 'Select from:', 'Select from:']\n['', '', '● Yes', '● 1 year', '● 1 year', ':selected: ● 1 year']\n['', '', ':selected: ● No', '· 2 years', '

In [33]:
import os
import re
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential


def is_non_table_element(element):
    """Return True when the element's "type" entry is anything but "table"."""
    element_type = element["type"]
    return element_type != "table"

def print_non_table_paragraphs(form_recognizer_result):
    """Print the content of nested paragraph elements that are not tables.

    Walks ``form_recognizer_result["document"]["pages"]``. Entries that are
    dicts are treated as containers: each of their values is checked with
    ``is_non_table_element`` and, when its "type" is "paragraph", its
    "content" is printed. Entries that are not dicts are skipped.
    """
    page_entries = form_recognizer_result["document"]["pages"]
    # Lazily yield the values of each dict entry, in order.
    nested_groups = (entry.values() for entry in page_entries if isinstance(entry, dict))
    for group in nested_groups:
        for inner_element in group:
            if not is_non_table_element(inner_element):
                continue
            if inner_element["type"] == "paragraph":
                print(inner_element["content"])
            # Other non-table kinds (e.g. lines, headings) are ignored for now.

# Azure Form Recognizer connection settings, read from the environment.
# The endpoint falls back to the project's resource URL when the variable is
# not set.
endpoint = os.environ.get(
    "AZURE_FORM_RECOGNIZER_ENDPOINT", "https://team-orange.cognitiveservices.azure.com/"
)
# SECURITY: the key must come from the environment and is None when unset, in
# which case analyze_layout() has no credential to authenticate with. A key
# was previously hardcoded on this line; treat it as exposed and rotate it.
key = os.environ.get("AZURE_FORM_RECOGNIZER_KEY")

class Table:
    """Fixed-size grid of cell values addressed by (row, column) index.

    Every cell starts out as ``default_value`` (the string "None" unless
    overridden).
    """

    def __init__(self, rows, columns, default_value="None"):
        self.rows = rows
        self.columns = columns
        # One independent list per row so writes never leak across rows.
        self.table = [[default_value] * columns for _ in range(rows)]

    def get_value(self, row, column):
        """Return the value stored at (row, column)."""
        return self.table[row][column]

    def set_value(self, row, column, value):
        """Store value at (row, column)."""
        self.table[row][column] = value

    def get_table(self):
        """Return the grid as text: the str() of each row list, one per line."""
        return "".join(f"{row}\n" for row in self.table)

    def print_table(self):
        """Print each row list on its own line."""
        for row in self.table:
            print(row)

def find_matching_texts(text):
    """Split text into sections keyed by question identifier.

    An identifier is a parenthesised tag such as ``(C0.1)``, ``(C-FS0.7)`` or
    ``(C1.1a)``. The section for a tag is the text between it and the next tag
    (or the end of the text), with surrounding whitespace stripped. When a tag
    occurs more than once, the text following its last occurrence is kept.

    Returns a dict mapping the tag (without parentheses) to its section text.
    """
    regex_pattern = r"(?s)\((C(?:-FS)?\d+(?:\.\d+)?[a-z]?)\)(.*?)(?=\((C(?:-FS)?\d+(?:\.\d+)?[a-z]?)\)|$)"
    sections = {
        found.group(1): found.group(2).strip()
        for found in re.finditer(regex_pattern, text)
    }

    # Make sure every tag present in the text has an entry, even an empty one.
    for identifier in re.findall(r"(?s)\((C(?:-FS)?\d+(?:\.\d+)?[a-z]?)\)", text):
        sections.setdefault(identifier, "")

    return sections

# Module-level state populated by analyze_layout():
# table index -> Table object built from that table's cells
tables = {}
# span offset of a table cell -> index of the table that contains the cell
tablecells = {}
# table index -> True once that table's text has been added to the output
print_table = {}

def analyze_layout():
    """Run "prebuilt-layout" analysis on the questionnaire PDF and return its text.

    Each detected table is copied into a ``Table`` and recorded in the
    module-level ``tables`` / ``tablecells`` / ``print_table`` dicts. The
    result's paragraphs are then walked in order: a paragraph whose first span
    offset matches a recorded table cell is replaced by that table's
    ``get_table()`` text (emitted once per table); any other paragraph is
    appended followed by a newline, unless its content starts with "Page".

    Returns the assembled text as a single string.
    """
    pdf_path = "/home/chandan/orange/code/Survey-Questionnire-Part1.pdf"

    with open(pdf_path, "rb") as pdf_file:
        pdf_content = pdf_file.read()

    client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    result = client.begin_analyze_document("prebuilt-layout", pdf_content).result()

    # Pass 1: materialise every table and index its cells by span offset.
    for table_idx, table in enumerate(result.tables):
        grid = Table(table.row_count, table.column_count)
        for cell in table.cells:
            grid.set_value(cell.row_index, cell.column_index, cell.content)
            if len(cell.spans) != 0:
                tablecells[cell.spans[0].offset] = table_idx
        tables[table_idx] = grid
        print_table[table_idx] = False

    # Pass 2: stitch paragraphs and tables together in paragraph order.
    pieces = []
    for paragraph in result.paragraphs:
        if len(paragraph.spans) == 0:
            continue
        owner = tablecells.get(paragraph.spans[0].offset)
        if owner is None:
            # Paragraphs starting with "Page" are dropped (presumably page
            # footers -- confirm against the source document).
            if not paragraph.content.startswith("Page"):
                pieces.append(str(paragraph.content) + "\n")
        elif not print_table[owner]:
            pieces.append(tables[owner].get_table())
            print_table[owner] = True

    return "".join(pieces)

In [36]:
# Analyse the questionnaire layout and index the resulting text by question id.
que = find_matching_texts(analyze_layout())
# Ask the QA chain the text captured for question "C0.1" and display the result.
query = que["C0.1"]
response = esg_vector_chain.invoke(query)
response

{'query': 'Give a general description and introduction to your organization.\nResponse options\nThis is an open text question with a limit of 5,000 characters. Please note that when copying from another document into the ORS, formatting is not retained.\nCDP DISCLOSURE INSIGHT ACTION',
 'result': "Wells Fargo & Company is a leading financial services company that provides banking, investment, mortgage, and consumer and commercial finance through more than 7,200 locations, 13,000 ATMs, online, and mobile devices. With over 266,000 team members, Wells Fargo serves one in three households in the United States. The company's vision is to satisfy our customers' financial needs and help them succeed financially, and this is built on the belief that they can only be successful when their customers, communities, and employees are successful. The company has a strong reputation and actively supports employees, customers, and communities, especially those most in need.\n\nThe company is undergoi

In [19]:
import fitz
import re


def extract_specific_questions_from_pdf(pdf_path):
    """Extract (question_id, question_text) pairs from a questionnaire PDF.

    Each page's text is searched independently for a parenthesised id such as
    ``(C0.1)`` or ``(C-FS0.7)`` followed by a space and the question text up to
    the first "." or "?" that is followed by whitespace.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list of ``(question_id, question_text)`` tuples in page order. The
        id has no parentheses and the text excludes the closing "." or "?".
    """
    # Compiled once instead of being re-parsed for every page.
    question_pattern = re.compile(
        r"\((C(?:-FS)?\d+(?:\.\d+)?[a-z]?)\) (.+?)(?:\.|\?)(?=\s)", re.DOTALL
    )

    questions = []
    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            questions.extend(question_pattern.findall(page.get_text()))

    return questions


# Questionnaire PDF to extract questions from.
# NOTE(review): absolute, user-specific path.
pdf_path = '/home/chandan/orange/code/Survey-Questionnire-Part1.pdf'

# List of (question_id, question_text) tuples found in the questionnaire.
specific_questions = extract_specific_questions_from_pdf(pdf_path)

In [25]:
#create table

import textwrap

def create_pdf_with_table(page, result_string, start_y):
    """Draw ``result_string`` as a bordered grid on ``page``.

    Args:
        page: Page-like object providing ``draw_rect(rect)`` and
            ``insert_text(point, text, fontsize=...)``.
        result_string: One table row per line, cells separated by ", ".
        start_y: y coordinate of the top edge of the table.

    Returns:
        The y coordinate of the bottom edge of the table, or ``start_y``
        unchanged when ``result_string`` is blank. No page break is
        performed here; the caller is responsible for available space.
    """
    cell_width = 100
    start_x = 20
    font_size = 10
    line_height = font_size + 2  # baseline-to-baseline distance
    padding = 5                  # gap between cell border and text
    wrap_width = int(cell_width / font_size)  # characters per wrapped line

    stripped = result_string.strip()
    if not stripped:
        return start_y

    table_data = [row.strip().split(', ') for row in stripped.split('\n')]
    # Size the grid to the widest row so ragged input still yields a full grid.
    num_columns = max(len(row) for row in table_data)

    row_top = start_y
    for row_data in table_data:
        cells = row_data + [''] * (num_columns - len(row_data))
        wrapped_cells = [textwrap.wrap(cell, width=wrap_width) for cell in cells]

        # All cells of a row share one height: that of its tallest cell, and
        # at least one text line so empty rows remain visible.
        line_count = max(1, max(len(lines) for lines in wrapped_cells))
        row_bottom = row_top + line_count * line_height + 2 * padding

        for col_index, lines in enumerate(wrapped_cells):
            x0 = start_x + col_index * cell_width
            # (x0, y0, x1, y1) with y0 < y1; the rectangle outline supplies
            # both the column separators and the row rules.
            page.draw_rect((x0, row_top, x0 + cell_width, row_bottom))
            for line_index, line in enumerate(lines):
                baseline = row_top + padding + font_size + line_index * line_height
                page.insert_text((x0 + padding, baseline), line, fontsize=font_size)

        # The next row starts exactly where this one ended.
        row_top = row_bottom

    return row_top

def table(result):
    """Remove every "[", "]" and "'" from result, print it, and return it.

    Turns list-style row text such as ``['a', 'b']`` into ``a, b`` while
    leaving the newlines between rows in place.
    """
    cleaned = result.translate(str.maketrans("", "", "[]'"))
    print(cleaned)
    return cleaned

# Create PDF with a table
#create_pdf_with_table(pdf_path, result_string)

In [38]:
def _ensure_room(output_pdf, page, y_coordinate, needed=20):
    """Return (page, y); start a new page at y=50 when `needed` points do not fit."""
    if y_coordinate + needed >= page.rect.height:
        return output_pdf.new_page(), 50
    return page, y_coordinate


def _write_text_lines(output_pdf, page, y_coordinate, text, **text_options):
    """Write text as lines of at most 99 characters, 20 points apart; return (page, y)."""
    # Split on explicit newlines first: a chunk containing "\n" would be
    # rendered as several lines and overlap the chunk written after it.
    for paragraph in text.split('\n'):
        chunks = [paragraph[i:i + 99] for i in range(0, len(paragraph), 99)] or ['']
        for chunk in chunks:
            page, y_coordinate = _ensure_room(output_pdf, page, y_coordinate)
            page.insert_text((50, y_coordinate), chunk, **text_options)
            y_coordinate += 20
    return page, y_coordinate


def create_pdf_with_answers(input_pdf_path, output_pdf_path, questions):
    """Answer each question with the QA chain and write the results to a PDF.

    For every ``(question_number, question_text)`` pair the output contains
    the number in purple, a horizontal rule, the question text in purple and
    then the chain's answer. Answers starting with "Table:" are rendered with
    ``create_pdf_with_table``; all others are written as wrapped text.

    Args:
        input_pdf_path: Not used by this function; kept so existing calls
            remain valid.
        output_pdf_path: Where the generated PDF is saved.
        questions: Iterable of ``(question_number, question_text)`` pairs.

    The text sent to the chain is ``que[question_number]`` when that entry
    exists and is non-empty, otherwise the question text itself.
    """
    purple = (0.5, 0, 0.5)
    output_pdf = fitz.open()
    try:
        page = output_pdf.new_page()
        y_coordinate = 50  # Initial y-coordinate for the first question

        for question_number, question_text in questions:
            question_text = question_text.replace('\n', '')

            # Keep the number, rule and first question line on the same page.
            page, y_coordinate = _ensure_room(output_pdf, page, y_coordinate, needed=50)

            # Question number, then a horizontal rule just below it.
            page.insert_text((50, y_coordinate), f"{question_number}:", color=purple)
            y_coordinate += 20
            page.draw_line((50, y_coordinate - 2), (550, y_coordinate - 2))
            y_coordinate += 20

            # Short questions are prefixed with their number; long ones are
            # written as-is across several lines.
            if len(question_text) > 99:
                shown_question = question_text
            else:
                shown_question = f"{question_number}: {question_text}"
            page, y_coordinate = _write_text_lines(
                output_pdf, page, y_coordinate, shown_question, color=purple
            )

            # Fall back to the extracted question text when the layout pass
            # has no (or an empty) entry for this question number.
            chain_input = que.get(question_number) or question_text
            response = esg_vector_chain.invoke(chain_input)
            answer = response["result"]

            if answer.startswith("Table:"):
                y_coordinate = create_pdf_with_table(page, table(answer), y_coordinate)
                y_coordinate += 50
            else:
                # Each line carries its own slice of the answer.
                page, y_coordinate = _write_text_lines(output_pdf, page, y_coordinate, answer)
                y_coordinate += 20  # gap before the next question

        # Save the output PDF
        output_pdf.save(output_pdf_path)
    finally:
        # Release the document even when answering or saving fails.
        output_pdf.close()

    print("New PDF created successfully!")

# Destination of the generated PDF.
# NOTE(review): absolute, user-specific path.
output_pdf_path = '/home/chandan/orange/code/output_with_answers.pdf'

# Answer every extracted question with the QA chain and write the results to
# the output PDF.
create_pdf_with_answers(pdf_path, output_pdf_path, specific_questions)

New PDF created successfully!
