In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
import os
from dotenv import load_dotenv

# Load environment variables from a .env file
load_dotenv()

# Set the OpenAI API key environment variable
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

llm = ChatOpenAI(model="gpt-4o")

In [None]:
import pandas as pd

file_path = ('./files/jeju/성별카드이용금액비율.csv') # insert the path of the csv file
data = pd.read_csv(file_path, on_bad_lines="skip")

#preview the csv file
data.head()

In [None]:
# 연령대 '알수없음'이 있는 행은 삭제
data = data.drop(data[data['age_range'] == '알수없음'].index)
data.tail()

In [4]:
loader = CSVLoader(file_path=file_path)
docs = loader.load_and_split()

In [5]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

embeddings = OpenAIEmbeddings()
index = faiss.IndexFlatL2(len(OpenAIEmbeddings().embed_query(" ")))
vector_store = FAISS(
    embedding_function=OpenAIEmbeddings(),
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [None]:
vector_store.add_documents(documents=docs)

In [None]:
from langchain_community.document_loaders import DataFrameLoader

base_dir = './files/jeju'

embeddings = OpenAIEmbeddings()
index = faiss.IndexFlatL2(len(OpenAIEmbeddings().embed_query(" ")))
vector_store = FAISS(
    embedding_function=OpenAIEmbeddings(),
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

for roots, dirs, files in os.walk(base_dir):
    for file in files:
        file_path = f'{base_dir}/{file}'
        df = pd.read_csv(file_path, on_bad_lines='skip')
        df = df.drop(df[df['age_range'] == '알수없음'].index)
        
        if '성별' in file:
            loader = DataFrameLoader(df, page_content_column='sex')
            docs = loader.load()
            vector_store.add_documents(documents=docs)
        elif '시간' in file:
            loader = DataFrameLoader(df, page_content_column='time_type')
            docs = loader.load()
            vector_store.add_documents(documents=docs)
        elif '요일' in file:
            loader = DataFrameLoader(df, page_content_column='day_of_week')
            docs = loader.load()
            vector_store.add_documents(documents=docs)
        else:
            print(f'Cannot find a {file} file.')


In [58]:
vector_store.save_local("faiss_index")

new_vector_store = FAISS.load_local(
    "faiss_index", embeddings, allow_dangerous_deserialization=True
)

In [59]:
docs = new_vector_store.similarity_search("20", k=3)
docs

[Document(metadata={'base_year_month': 202401, 'user_type': '내국인', 'age_range': '70대 이상', 'use_amout_by_million': 0.0}, page_content='오전'),
 Document(metadata={'base_year_month': 201804, 'user_type': '내국인', 'age_range': '19세 이하', 'use_amout_by_million': 21.0}, page_content='오전'),
 Document(metadata={'base_year_month': 202401, 'user_type': '내국인', 'age_range': '30대 이상', 'use_amout_by_million': 0.0}, page_content='오전')]

In [86]:
from langchain.text_splitter import CharacterTextSplitter

retriever = new_vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})
question = '공휴일에 제주도민은 어디에 돈을 가장 많이 쓰지?'
question_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator=' ',
    chunk_size=10,
    chunk_overlap=0,
)
queries = question_splitter.split_text(question)
print(queries)

docs = []
for query in queries:
    documents = retriever.get_relevant_documents(query)
    for document in documents:
        row = {
            "page_content": document.page_content,
            "metadata": document.metadata
        }
        docs.append(row)
        
restored_df = pd.DataFrame(docs)

restored_df.head()

Created a chunk of size 11, which is longer than the specified 10
Created a chunk of size 14, which is longer than the specified 10


['공휴일에', '제주도민은', '어디에', '돈을', '가장', '많이', '쓰지?']


Unnamed: 0,page_content,metadata
0,공휴일,"{'base_year_month': 201801, 'age_range': '19세 ..."
1,공휴일,"{'base_year_month': 201801, 'age_range': '20대'..."
2,공휴일,"{'base_year_month': 201801, 'age_range': '20대'..."
3,남성,"{'base_year': 2019, 'base_quarter': '4분기', 'bi..."
4,남성,"{'base_year': 2019, 'base_quarter': '4분기', 'bi..."


# map reduce

In [96]:
# RetrievalQA & map_reduce
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.prompts import ChatPromptTemplate

retriever = new_vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            긴 문서의 다음 부분을 사용하여 질문에 답하는 데 적합한 텍스트가 있는지 확인합니다.
            ---
            {context}
            """
        ),
        ("human", "{question}")
    ]
)

map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    documents = inputs["documents"]
    question = inputs["question"]
    
    question_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        separator=' ',
        chunk_size=10,
        chunk_overlap=0,
    )
    queries = question_splitter.split_text(question)
    print(queries)
    
    docs = []
    for query in queries:
        documents = retriever.get_relevant_documents(query)
        for document in documents:
            row = {
                "page_content": document.page_content,
                "metadata": document.metadata
            }
            docs.append(row)
            
    restored_df = pd.DataFrame(docs)
    return restored_df
    # return "\n".join(
    #     map_doc_chain.invoke(
    #         {"context":doc.metadata, "question":question}
    #     ).content for doc in documents
    # )

map_chain = {"documents": retriever, "question": RunnablePassthrough()} | RunnableLambda(map_docs)

reduce_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            긴 문서는 제주도의 카드 이용 내역서입니다.
            카드 이용 내역서는 성별, 시간대별, 요일별로 구분됩니다.
            긴 문서와 질문에서 다음과 같이 추출한 부분이 주어지면 최종 답을 만듭니다.
            답을 모르면 모른다고 말하세요.
            답을 지어내려고 하지 마세요.
            한국어로 답변해야 합니다.
            ------
            {context}
            """
        ),
        ("human", "{question}"),
    ]
)

chain = {"context": map_chain, "question": RunnablePassthrough()} | reduce_prompt | llm
chain.invoke("공휴일에 제주도민은 어디에 돈을 가장 많이 쓰지?")

Created a chunk of size 11, which is longer than the specified 10
Created a chunk of size 14, which is longer than the specified 10


['공휴일에', '제주도민은', '어디에', '돈을', '가장', '많이', '쓰지?']


AIMessage(content='모르겠습니다.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 645, 'total_tokens': 649, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_7f6be3efb0', 'finish_reason': 'stop', 'logprobs': None}, id='run-d38669bf-9900-4baa-91b4-eb1367205414-0', usage_metadata={'input_tokens': 645, 'output_tokens': 4, 'total_tokens': 649, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

# RetrievalQA & agent

In [None]:
# RetrievalQA
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.agents import initialize_agent, Tool
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

llm=ChatOpenAI(
    model='gpt-4o',
    temperature=0.1,
    callbacks=[StreamingStdOutCallbackHandler()],
)

retriever_qa = RetrievalQA.from_chain_type(
    llm,
    retriever=new_vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5, "fetch_k": 20},
    ),
    chain_type='stuff',
)

tools = [
    Tool(
        name='assistant for start-ups in Jeju.',
        func=retriever_qa.run,
        description="If you want to start a business in Jeju Island, you must use this tool! It gives you an answer in Korean.",
    ),
]

agent=initialize_agent(
    tools,
    llm,
    agent='zero-shot-react-description',
    verbose=True,
)

response = agent.run("공휴일에 제주도민은 어디에 돈을 가장 많이 쓰지?")
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer this question, I need to determine where residents of Jeju Island spend the most money during public holidays. This might involve understanding local spending habits and popular activities or destinations during these times. I will use the tool designed for start-ups in Jeju to gather insights on this topic.

Action: assistant for start-ups in Jeju.
Action Input: 공휴일에 제주도민의 소비 패턴에 대한 정보[0m

Created a chunk of size 11, which is longer than the specified 10
Created a chunk of size 14, which is longer than the specified 10


['공휴일에', '제주도민의', '소비', '패턴에', '대한', '정보']


AttributeError: 'RetrievalQA' object has no attribute 'get_relevant_documents'

# Simple csv rag

In [61]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

retriever = vector_store.as_retriever()

# Set up system prompt
system_prompt = (
    """
    너는 질문에 대한 답을 해주는 역할을 가진 챗봇이야.
    내가 준 문서를 기반으로 질문에 답변을 해줘.
    답변을 5개 정도 준비하고, 준비한 답변 중 가장 높은 카드 이용량을 가진 답변을 최종 답변으로 선택해.
    모르면 모른다고 말해. 모르는 것을 지어내서 답변하면 절대 안돼.
    ---
    {context}
    """
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
    
])

# Create the question-answer chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [62]:
answer= rag_chain.invoke({"input": "공휴일에 제주도민은 어디에 돈을 가장 많이 쓰지?"})
print(answer['answer'])

미안해, 그 질문에 대한 답변을 제공할 수 있는 정보가 없어. 제주도민이 공휴일에 어디에 돈을 가장 많이 쓰는지에 대한 구체적인 데이터는 내게 없어.


# langchain v0.3

In [51]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools.retriever import create_retriever_tool

retriever = new_vector_store.as_retriever()

retriever_tool = create_retriever_tool(
    retriever,
    name="assistant for start-ups in Jeju",
    description="If you want to start a business in Jeju Island, you must use this tool!",
)

tools = [retriever_tool]

llm = ChatOpenAI(
    model='gpt-4o',
    temperature=0.1,
)

# Set up system prompt
system_prompt = (
    """
    너는 질문에 대한 답을 해주는 역할을 가진 챗봇이야.
    내가 준 문서를 기반으로 질문에 답변을 해줘.
    답변을 5개 정도 준비하고, 준비한 답변 중 가장 높은 카드 이용량을 가진 답변을 최종 답변으로 선택해.
    모르면 모른다고 말해. 모르는 것을 지어내서 답변하면 절대 안돼.
    ---
    {context}
    """
)

prompt = ChatPromptTemplate.from_messages([
    ("system", """
    너는 질문에 대한 답을 해주는 역할을 가진 챗봇이야.
    내가 준 문서를 기반으로 질문에 답변을 해줘.
    답변을 5개 정도 준비하고, 준비한 답변 중 가장 높은 카드 이용량을 가진 답변을 최종 답변으로 선택해.
    모르면 모른다고 말해. 모르는 것을 지어내서 답변하면 절대 안돼.
    ---
    {context}
    """),
    MessagesPlaceholder("chat_history", optional=True),
    ("human", "{input}"),
    MessagesPlaceholder("agent_scratchpad"),
])


In [52]:
from langchain.agents import create_openai_tools_agent, AgentExecutor

agent = create_openai_tools_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)