In [13]:
# https://python.langchain.com/docs/use_cases/summarization

In [14]:
# API 키를 환경변수로 관리하기 위한 설정 파일
from dotenv import load_dotenv

# API 키 정보 로드
load_dotenv()

True

# 문서 로드(Load Documents)

In [15]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma, FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

In [16]:
# from langchain_community.document_loaders import DirectoryLoader
# 
# loader = DirectoryLoader(".", glob="data/*.pdf", show_progress=True)
# docs = loader.load()

In [17]:
from langchain.document_loaders import PyPDFLoader

# PDF 파일 로드. 파일의 경로 입력
loader = PyPDFLoader("data/개인정보 보호법(법률)(제19234호)(20240315).pdf")

# 페이지 별 문서 로드
docs = loader.load_and_split()

In [18]:
print(f"문서의 수: {len(docs)}\n")
print("[메타데이터]\n")
print(docs[0].metadata)
print("\n========= [앞부분] 미리보기 =========\n")
print(docs[0].page_content[2500:2600])

문서의 수: 41

[메타데이터]

{'source': 'data/개인정보 보호법(법률)(제19234호)(20240315).pdf', 'page': 0}




# 프롬프트
* 프롬프트 엔지니어링
    - 주어진 데이터(context)를 토대로 우리가 원하는 결과를 도출할 때 중요한 역할을 함

# 언어모델 생성(Create LLM)

In [19]:
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0125")

In [20]:
# from langchain import hub
# from langchain.chains import LLMChain
# 
# map_prompt = hub.pull("rlm/map-prompt")
# map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [21]:
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate

# Map
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes 
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [22]:
# Note we can also get this from the prompt hub, as noted above
# reduce_prompt = hub.pull("rlm/reduce-prompt")

In [23]:
# Reduce
reduce_template = """The following is set of summaries:
{docs}
Take these and distill it into a final, consolidated summary of the main themes. 
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [24]:
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain, StuffDocumentsChain

reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(docs)

In [25]:
print(map_reduce_chain.run(split_docs))

  warn_deprecated(


The main themes of the documents provided revolve around the protection of personal information, including regulations, guidelines, and enforcement measures related to the handling, processing, and transfer of personal data. Key topics include the establishment of personal information protection laws, the rights and responsibilities of information subjects and processors, compliance with regulations, oversight by the Protection Commission, international data transfer, dispute resolution mechanisms, enforcement of penalties for violations, and the importance of transparency, consent, and security measures in personal information processing. Additionally, there is a focus on specific areas such as the appointment of personal information protection managers, certification standards, impact assessments, collective lawsuits, and fines for non-compliance with personal information protection laws.
