In [1]:
import os
import glob
import signal
import sys
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load every PDF found in the source folder and split it into overlapping text chunks.
pdf_folder_path = "pdf"  # Update to your path
CHUNK_SIZE = 500     # chunk_size passed to the text splitter
CHUNK_OVERLAP = 50   # chunk_overlap passed to the text splitter

# glob returns paths in filesystem-dependent order; sorting keeps the chunk order
# (and anything built from it) reproducible between runs and machines.
all_pdf_paths = sorted(glob.glob(os.path.join(pdf_folder_path, "*.pdf")))
if not all_pdf_paths:
    # Fail here with a clear message instead of building an index from zero chunks.
    raise FileNotFoundError(f"No PDF files found in folder: {pdf_folder_path!r}")

# The splitter configuration does not depend on the file, so build it once, not per PDF.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Load each PDF document and collect its chunks
documents = []
for pdf_path in all_pdf_paths:
    pdf_docs = PyPDFLoader(pdf_path).load()
    documents.extend(text_splitter.split_documents(pdf_docs))

print(f"Total loaded document chunks: {len(documents)}")

Total loaded document chunks: 1862


In [3]:
# Set up embeddings and LLM with Google Gemini API.
# The API key is read from the environment so that it is never stored in the notebook
# or its version history. NOTE: the key that was previously hardcoded in this cell has
# been exposed and should be revoked and replaced.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "Gemini API key not found. Set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable before starting the notebook kernel."
    )

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GEMINI_API_KEY)
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=GEMINI_API_KEY)

In [4]:
# Build the FAISS vector index from the document chunks, then expose it as a
# similarity retriever that is configured with k=5 results per query.
vector_db = FAISS.from_documents(documents=documents, embedding=embeddings)
retriever = vector_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

In [8]:
# RAG prompt template (the prompt text itself is in Indonesian).
# Whitespace that is easy to lose in an editor is written as escapes: "\t" for the tabs
# in the scoring legend and "\x20" for the trailing space after item 33. The resulting
# runtime text is exactly the original prompt.
_RAG_PROMPT_TEMPLATE = """
Anda adalah bot yang berperan sebagai penilai indeks sustainability perusahaan yang dapat memberikan nilai sustainability perusahaan berdasarkan indikator carbon disclosure index berikut:

1. Assessment/description/identification of climate change-related risk
2. Assessment/description/identification of climate change-related opportunities
3. Assessment of the impact of future climate-related risk and opportunities
4. Description of scope1 emissions
5. Comparison of scope1 emissions with the previous year
6. Description of scope2 emissions
7. Comparison of scope2 emissions with the previous year
8. Description of scope3 emissions
9. Comparison of scope3 emissions with the previous year
10. Total GHG emissions
11. Emission of Ozone depleting substances
12. Nitrogen oxide, sulfur oxide and other air emissions
13. Disclosure of emission intensity
14. Disclosure of GHG emissions by sources
15. Disclosure of GHG emission by facilities/segment
16. Description of protocol/standard/methodology used to measure GHG emissions
17. Disclosure of reduction in emissions
18. Disclosure of initiatives taken to reduce emissions
19. Description of targets set to reduce emissions
20. Comparison of current year targets with the previous year
21. Are emissions generated within permissible limits given by CPCB/SPCB
22. Environment management system in the company
23. Identify the person/management/committee responsible for climate policies/strategy
24. Description of climate change-related business strategy
25. Consumption of fuel
26. Consumption of purchased/acquired electricity
27. Quantification of total energy consumption
28. Comparison of total energy consumption with the previous year
29. Reduction in energy consumption
30. Targets set to reduce energy consumption
31. Consumption of renewable energy
32. Participation in emission trading schemes
33. Statement from CEO/chairman/head of sustainability committee of the company regarding climate change\x20
34. Inclusion of words “climate change”/“global warming”
35. Section devoted to climate change/natural capital/sustainability
36. Investment in clean energy technologies
37. Awards if any

dan akan dibagi menjadi empat kategori penilaian yaitu:

0 :\tNot Reported
1 :\tQualitative
2 :\tQuantitative
3 :\tQualitative and Quantitative

jika di dalam report PDF tersebut terdapat indikator yang sesuai di-list, maka berikan label berdasarkan kategori yang ada di tiap indikatornya.

PERTANYAAN: '{query}'
KONTEKS: '{context}'
JAWABAN:
"""


def generate_rag_prompt(query, context):
    """Return the carbon-disclosure scoring prompt filled in for one question.

    Args:
        query: The user's question; inserted after ``PERTANYAAN:``.
        context: The retrieved text to ground the answer; inserted after ``KONTEKS:``.

    Returns:
        The full prompt string: the assessor instructions, the 37 indicators, the
        0-3 scoring legend, then the quoted query and context, ending with ``JAWABAN:``.
    """
    return _RAG_PROMPT_TEMPLATE.format(query=query, context=context)

from langchain.chains import RetrievalQA  # Import RetrievalQA

# System prompt (written in Indonesian) for the RetrievalQA chain.
# "{context}" is the slot the "stuff" chain fills with the retrieved documents.
template = """
Anda adalah seorang ahli sustainability yang berpengalaman dalam menjelaskan jawaban akurat dari teks yang kompleks.
Manfaatkan konteks yang diberikan untuk memberikan jawaban yang jelas dan terinci.

Konteks:
{context}

Berikan jawaban yang informatif dan mendalam berdasarkan konteks yang ada:
"""

# Build the chat prompt template.
# RetrievalQA hands the user's query to the combine-documents chain under the key
# "question". The human message must therefore use "{question}"; with "{input}" the
# chain fails on its first call because that variable is never supplied.
prompt = ChatPromptTemplate.from_messages([("system", template), ("human", "{question}")])

# Create RetrievalQA chain with specified chain_type and prompt
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)

In [11]:
from langchain.schema import HumanMessage

# Interactive question loop: retrieve context for each query, ask the LLM to score it
# with the RAG prompt, print the answer and save it to disk. Type 'exit' to stop.

# The file is opened in "w" mode on every query, so it holds only the latest answer.
output_file = "answer.txt"

while True:
    query = input("Query (or type 'exit' to quit): ").strip()
    if query.lower() == 'exit':  # Check if the user wants to exit
        print("Exiting the program. Goodbye!")
        break  # Exit the loop
    if not query:
        # Nothing to search for; ask again instead of sending an empty query.
        continue

    # `invoke` is the current Runnable entry point; it replaces the deprecated
    # `retriever.get_relevant_documents(...)` and `llm(messages=...)` calls.
    retrieved_docs = retriever.invoke(query)
    context = "\n".join(doc.page_content for doc in retrieved_docs)

    # Use a dedicated name so the global `prompt` (the ChatPromptTemplate built for
    # the RetrievalQA chain) is not overwritten with a plain string.
    rag_prompt = generate_rag_prompt(query=query, context=context)

    # Send the generated prompt to the LLM as a single human message
    answer = llm.invoke([HumanMessage(content=rag_prompt)])
    print("Answer:", answer.content)

    # Explicit UTF-8 so non-ASCII characters in the answer are written correctly
    # regardless of the platform's default encoding.
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(answer.content)

    print(f"Answer saved to {output_file}")

Answer: Berdasarkan teks yang diberikan, kategori penilaian untuk indikator "Assessment/description/identification of climate change-related risk" pada PT Pertamina adalah **3: Qualitative and Quantitative**.

Alasannya adalah deskripsi tersebut mencakup:

* **Kualitatif:**  Deskripsi risiko secara kualitatif seperti "Meningkatnya bencana alam dan kejadian cuaca ekstrem akibat perubahan iklim" dan potensi dampaknya ("Potensi dampak pada manusia (cedera dan/atau kematian), kerusakan aset, dan gangguan operasional").  Deskripsi juga menyebutkan penggunaan metode FTA (Fault Tree Analysis) untuk analisis akar penyebab, yang merupakan pendekatan kualitatif.

* **Kuantitatif:**  Teks menyebutkan penggunaan Value at Risk (VaR) untuk menilai risiko secara akurat.  Penggunaan VaR merupakan pendekatan kuantitatif dalam pengukuran risiko.  Meskipun detail perhitungan VaR tidak diberikan, penyebutannya menunjukkan adanya aspek kuantitatif dalam penilaian risiko.  Selain itu,  pengembangan risiko u