# 랭체인으로 RAG구현하기 소스코드 1

In [None]:
!pip install langchain openai  # 필요한 라이브러리 설치

import os
from google.colab import files
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI


In [None]:
# Configure the OpenAI API key.
# Prompt for the key instead of hard-coding it, so the secret is never saved
# inside the notebook; a key that is already set in the environment is kept.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# 1. Upload a file through the Colab upload dialog
uploaded = files.upload()

# 2. Use the first uploaded file; fail early with a clear message when the
#    dialog was cancelled and nothing was uploaded
if not uploaded:
    raise ValueError("No file was uploaded.")
uploaded_filename = next(iter(uploaded))

# 3. Build a loader for the uploaded file
data_loader = UnstructuredFileLoader(uploaded_filename)

In [None]:
# 4. Configure the text splitter (sizes are measured with the tiktoken encoder)
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500, chunk_overlap=50, separator="\n"
)

# 5. Load the uploaded file and split it into chunks
docs = data_loader.load_and_split(text_splitter=splitter)

# 6. Create the OpenAI embedding instance (wrapped with a cache in the next cell)
embeddings = OpenAIEmbeddings()

In [None]:
# Byte store for cached embeddings, created under ./.cache/ in the working directory
cache_dir = LocalFileStore("./.cache/")

# Wrap the embedding model with the cache. The namespace keeps vectors cached
# for one embedding model from being served for a different model that embeds
# the same text.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir, namespace=embeddings.model
)

# 7. Build the vector store from the document chunks
vectorstore = Chroma.from_documents(docs, cached_embeddings)

# 8. Create a retriever on top of the vector store
retriever = vectorstore.as_retriever()

# 9. Create the chat LLM
model = ChatOpenAI()

# 10. Build the RetrievalQA chain
chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="map_reduce",
    retriever=retriever,
)

In [None]:
# 11. Run a question through the RetrievalQA chain
answer = chain.run("What is the capital of France?")
print(answer)  # prints the chain's answer, e.g. "Paris" (presumably only when the uploaded document supports it — confirm)


In [None]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Stream the reply to stdout through the callback handler so the text shows
# up as it is produced, like real-time typing.
chatgpt = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=1,
)
answer = chatgpt.predict("왜 파이썬이 가장 인기있는 프로그래밍 언어야?")

# 랭체인을 사용하여 RAG구현 소스코드 2

Langchain을 활용한 RAG 구현 소스코드 2


In [None]:
!pip install langchain unstructured pypdf pdf2image docx2txt pdfminer

Collecting unstructured
  Downloading unstructured-0.14.9-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from langchain.document_loaders import WebBaseLoader

# Build a loader for a single web page and load it into `data`
loader = WebBaseLoader("https://www.google.com")
data = loader.load()
# Show the text content of the first loaded document
print(data[0].page_content)

In [None]:
from langchain.document_loaders import UnstructuredURLLoader

# URLs to load; replace the "page" placeholders with real page URLs.
# Every entry needs its own comma: adjacent string literals without one are
# concatenated by Python into a single string.
urls = [
    "page",
    "page",
]
loader = UnstructuredURLLoader(urls=urls)
# Actually load the pages; without this call `data` would still hold whatever
# an earlier cell assigned to it.
data = loader.load()
data

# PDF 문서 로더

PDF Document Loader

In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.2.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.21.3-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensi

In [None]:
from langchain.document_loaders import PyPDFLoader

# Point the loader at the PDF stored on the mounted Google Drive
loader = PyPDFLoader("/content/drive/MyDrive/R152r2E.pdf")
# Load the PDF and split it into documents
pages = loader.load_and_split()

ValueError: File path /content/drive/MyDrive/R152r2E.pdf is not a valid file or url

In [None]:
print(pages[0].page_content)

GE.23 -11213(E)  
  Agreement  
  Concerning the  Adoption of Harmonized Technical United Nations 
Regulations for Wheeled Vehicles, Equipment and Parts which can be 
Fitted and/or be Used on Wheeled Vehicles and the Conditions for 
Reciprocal Recognition of Approvals Granted on the Basis of these 
United Nat ions Regulations * 
  (Revision 3, including the amendments which entered into force on 14 September 2017)  
_________  
  Addendum 1 51 – UN Regulation No. 1 52 
  Revision 2 
Incorporating all valid text up to:  
Supplement 1 to the 01 series of amendments – Date of entry into force: 3 January 2021  
Supplement 2 to the 01 series of amendments – Date of entry into force: 30 September 2021  
02 series of amendments – Date of entry into force: 30 September 2021  
  Uniform  provisions concerning the approval of motor vehicles with 
regard to the Advanced Emergency Braking System (AEBS) for M 1 and 
N1 vehicles  
_________  
UNITED NATIONS  
  
 * Former titles of the Agreement:  


In [None]:
len(pages)

36

In [None]:
print(pages[1].page_content)

E/ECE/TRANS/505/Rev.3/Add.1 51/Rev. 2 
2 This document is meant purely as documentation tool. The authentic and legal binding text s 
are: ECE/TRANS/WP.29/2020/69 , ECE/TRANS/WP.29/2021/16  and 
ECE/TRANS/WP.29/2021/18 .


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter that measures chunk size with len(): chunk_size 1000, overlap 200
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=1000,
    chunk_overlap=200,
)

In [None]:
pages = loader.load_and_split()



In [None]:
texts = text_splitter.split_documents(pages)
len(texts)

122

In [None]:
print(texts[1].page_content)

regard to the Advanced Emergency Braking System (AEBS) for M 1 and 
N1 vehicles  
_________  
UNITED NATIONS  
  
 * Former titles of the Agreement:  
  Agreement concerning the Adoption of Uniform Conditions of Approval and Reciprocal Recognition of 
Approval for Motor Vehicle Equipment and Parts, done at Geneva on 20 March 1958 (original version);  
  Agreement concerning the Adop tion of Uniform Technical Prescriptions for Wheeled Vehicles, 
Equipment and Parts which can be Fitted and/or be Used on Wheeled Vehicles and the Conditions for 
Reciprocal Recognition of Approvals Granted on the Basis of these Prescriptions, done at Geneva on 
5 October 1995 (Revision 2).   E/ECE/TRANS/505/Rev.3/Add.1 51/Rev. 2 
   
 
15 June 2023


In [None]:
# Character length of every chunk, printed to inspect how the splitter sized them
char_list = [len(chunk.page_content) for chunk in texts]
print(char_list)

[997, 736, 220, 879, 909, 856, 695, 994, 996, 235, 964, 952, 977, 375, 928, 953, 967, 995, 216, 980, 993, 910, 598, 934, 961, 993, 583, 994, 997, 435, 923, 966, 563, 992, 978, 219, 982, 977, 848, 993, 992, 459, 959, 944, 912, 944, 419, 933, 916, 988, 477, 995, 995, 956, 836, 926, 945, 943, 260, 976, 973, 994, 651, 943, 968, 983, 781, 918, 965, 959, 645, 969, 944, 983, 462, 969, 996, 385, 941, 973, 989, 452, 681, 984, 934, 994, 569, 919, 949, 935, 996, 197, 977, 942, 953, 882, 997, 962, 990, 974, 322, 984, 966, 981, 704, 969, 892, 976, 917, 307, 701, 968, 995, 684, 912, 975, 465, 990, 784, 969, 894, 520]


# 텍스트 임베딩

**TEXT Embedding**

In [None]:
!pip install openai langchain pypdf tiktoken

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# NOTE(review): the API key is blank here — supply a real key before running this cell
embeddings_model = OpenAIEmbeddings(openai_api_key = "")

In [None]:
# Embed five sample sentences. Every item needs its own comma: adjacent string
# literals without one are concatenated into a single string, which would
# leave only three documents and make embeddings[3] fail later.
embeddings = embeddings_model.embed_documents(
    [
        "Hello world",
        "How are you?",
        "제 이름은 홍길동입니다.",
        "공부가 너무 싫어요",
        "랭체인이 뭔지 아시나요?",
    ]
)
# Number of embedded documents and the length of one embedding vector
len(embeddings), len(embeddings[0])

In [None]:
# Embed a question and a matching answer sentence as single queries.
# The method is `embed_query`; the misspelled `embed_quary` does not exist on
# the embeddings object and raises AttributeError.
embedded_query_q = embeddings_model.embed_query("이 대화에서 언급된 이름은 무엇입니까?")
embedded_query_a = embeddings_model.embed_query("이 대화에서 언급된 이름은 홍길동입니다.")
# Lengths of the two embedding vectors
print(len(embedded_query_q), len(embedded_query_a))

In [None]:
from numpy import dot
from numpy.linalg import norm
import numpy as np

def cos_sim(A,B):
    """Return the cosine similarity of vectors A and B.

    Computed as the dot product of A and B divided by the product of their norms.
    """
    magnitude = norm(A) * norm(B)
    return dot(A, B) / magnitude

In [None]:
# Cosine similarity of the question embedding against the answer embedding,
# then against two of the embedded sample documents
print(cos_sim(embedded_query_q, embedded_query_a))
print(cos_sim(embedded_query_q, embeddings[1]))
print(cos_sim(embedded_query_q, embeddings[3]))
# Check the similarity between the embeddings

허깅페이스 임베딩

# 허깅페이스 모델을 사용한 임베딩

In [None]:
!pip install sentence_transformers

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 377, in run
    requirement_set = resolver.resolve(
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Configure the Hugging Face BGE embedding model.
# NOTE(review): the "-en" suffix suggests an English-only model — confirm before relying on it for Korean text
model_name = "BAAI/bge-small-en"
# Run the model on the CPU
model_kwargs = {'device': 'cpu'}
# Ask the encoder to normalize the embeddings it returns
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

ModuleNotFoundError: Module langchain_community.embeddings not found. Please install langchain-community to access this module. You can install it using `pip install -U langchain-community`

In [None]:
# Embed six sample sentences with `hf`; this rebinds `embeddings`
embeddings = hf.embed_documents(
    [
    "today is monday",
    "weather is nice today",
    "what's the problem?",
    "langhcain in useful",
    "Hello World!",
    "my name is morris"
    ]
)

In [None]:
# Embed a question and a candidate answer as single queries
BGE_query_q = hf.embed_query("Hello? who is this?")
BGE_query_a = hf.embed_query("hi this is harrison")

# Cosine similarity of the question against the answer, then against two of
# the sample sentences ("weather is nice today" and "my name is morris")
print(cos_sim(BGE_query_q, BGE_query_a))
print(cos_sim(BGE_query_q, embeddings[1]))
print(cos_sim(BGE_query_q, embeddings[5]))

In [None]:
# Korean sample sentences to embed with the same `hf` model
sentences = [
    "안녕하세요",
    "제 이름은 홍길동입니다.",
    "이름이 무엇인가요?",
    "랭체인은 유용합니다.",
    "홍길동 아버지의 이름은 홍상직입니다."
    ]
# One embedding per sentence, in the same order as `sentences`
ko_embeddings = hf.embed_documents(sentences)

In [None]:
# Embed a Korean question and a related sentence that does not contain the answer
BGE_query_q_2 = hf.embed_query("홍길동은 아버지를 아버지라 부르지 못하였습니다. 홍길동 아버지의 이름은 무엇입니까?")
BGE_query_a_2 = hf.embed_query("홍길동의 아버지는 엄했습니다.")


# Print the question, then the cosine similarity (rounded to 2 decimals) of the
# question against the related sentence and against sentences 1, 3 and 4
print("질문: 홍길동은 아버지를 아버지라 부르지 못하였습니다. 홍길동 아버지의 이름은 무엇입니까? \n", "-"*100)
print("홍길동의 아버지는 엄했습니다. \t\t 문장 유사도: ", round(cos_sim(BGE_query_q_2, BGE_query_a_2),2))
print(sentences[1] + "\t\t\t 문장 유사도: ", round(cos_sim(BGE_query_q_2, ko_embeddings[1]),2))
print(sentences[3] + "\t\t\t 문장 유사도: ", round(cos_sim(BGE_query_q_2, ko_embeddings[3]),2))
print(sentences[4] + "\t 문장 유사도: ", round(cos_sim(BGE_query_q_2, ko_embeddings[4]),2))

# 벡터스토어를 이용하여 문서 임베딩 및 유사성 검색


벡터스토어

In [None]:
!pip install chromadb tiktoken transformers sentence_transformers openai langchain pypdf

Collecting chromadb
  Downloading chromadb-0.5.3-py3-none-any.whl (559 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.35.10-py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.3/328.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

tokenizer = tiktoken.get_encoding("cl100k_base")

def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the module-level `tokenizer`."""
    return len(tokenizer.encode(text))

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader


# Load the PDF from the mounted Google Drive and split it into documents
loader = PyPDFLoader("/content/drive/MyDrive/R152r2E.pdf")
pages = loader.load_and_split()

# Split the documents again into chunks of at most 500 units with no overlap;
# sizes are measured by tiktoken_len, i.e. in tokens rather than characters
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, length_function = tiktoken_len)
docs = text_splitter.split_documents(pages)

# Create the open-source (Hugging Face) embedding function
from langchain.embeddings import HuggingFaceEmbeddings

# NOTE(review): the model name indicates a 7B-parameter model and it is loaded on
# the CPU below — presumably very slow and memory-hungry; confirm this is intended
model_name = "Alibaba-NLP/gte-Qwen2-7B-instruct"
model_kwargs = {'device': 'cpu'}
# Ask the encoder to normalize the embeddings it returns
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/284 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/145k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/4.78G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/3.66G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Load the chunks into Chroma.
# Use a collection named after the embedding model instead of the shared
# default collection: adding vectors from a model with a different vector size
# to an existing collection raises InvalidDimensionException.
collection_name = "r152-" + model_name.replace("/", "-")
db = Chroma.from_documents(docs, hf, collection_name=collection_name)

# Query it
query = "(AEBS) is used in what situations?"
# NOTE: `docs` is rebound here from the chunk list to the search hits
docs = db.similarity_search(query)

# Print the content of the best hit
print(docs[0].page_content)

InvalidDimensionException: Embedding dimension 384 does not match collection dimensionality 768

In [None]:
tiktoken_len(docs[0].page_content)

343

In [None]:
# Save to disk.
# `docs` was rebound to the search hits by the earlier query cell, so rebuild
# the full chunk list from `pages`; otherwise only those few hits would be
# written to the persisted store.
chunks = text_splitter.split_documents(pages)
db2 = Chroma.from_documents(chunks, hf, persist_directory="./chroma_db")
docs = db2.similarity_search(query)

In [None]:
# Load the store from the ./chroma_db directory, using `hf` as the embedding function
db3 = Chroma(persist_directory="./chroma_db", embedding_function=hf)
# Run the same query against the reloaded store and print the first hit
docs = db3.similarity_search(query)
print(docs[0].page_content)

In [None]:
# Fetch the top 3 hits with scores; each entry is indexed as [0] = document, [1] = score
docs = db3.similarity_search_with_relevance_scores(query, k=3)


# Show the content and the score of the first hit
print(f"가장 유사한 문서:\n\n {docs[0][0].page_content}\n\n")
print(f"문서 유사도:\n {docs[0][1]}")

# huggingface, chromaDB, Langchain을 사용한 RAG

In [None]:
!pip install chromadb tiktoken transformers sentence_transformers openai langchain pypdf

Collecting chromadb
  Downloading chromadb-0.5.4-py3-none-any.whl (581 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.4/581.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.35.13-py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.2.7-py3-none-any.whl (983 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import openai
# NOTE(review): the API key is blank here — supply a real key before running the cells that call OpenAI
os.environ["OPENAI_API_KEY"] = ''

In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the module-level `tokenizer`."""
    return len(tokenizer.encode(text))

In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.2.7-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.21.3-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensi

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader

In [None]:
# Load the merged PDF from the mounted Google Drive and split it into documents
loader = PyPDFLoader("/content/drive/MyDrive/UNECEmergeData.pdf")
pages = loader.load_and_split()

# Split again into chunks of at most 500 units with an overlap of 50;
# sizes are measured by tiktoken_len, i.e. in tokens rather than characters
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50, length_function = tiktoken_len)
texts = text_splitter.split_documents(pages)

from langchain.embeddings import HuggingFaceEmbeddings

# Hugging Face embedding model, run on the CPU, returning normalized embeddings
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Embed the chunks with `hf` and store them in a Chroma vector store
docsearch = Chroma.from_documents(texts, hf)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Chat model that streams its output to stdout through the callback handler.
# NOTE(review): this variable is named `openai`, which rebinds the `openai`
# module imported in an earlier cell — consider a different name.
openai = ChatOpenAI(model_name="gpt-4o-mini-2024-07-18",
                    streaming=True, callbacks=[StreamingStdOutCallbackHandler()],
                    temperature = 0)

# RetrievalQA chain of type "stuff" over the Chroma store; the retriever uses
# search_type "mmr" with k=3 and fetch_k=10, and the chain is asked to return
# the source documents along with the answer.
qa = RetrievalQA.from_chain_type(llm = openai,
                                 chain_type = "stuff",
                                 retriever = docsearch.as_retriever(
                                    search_type="mmr",
                                    search_kwargs={'k':3, 'fetch_k': 10}),
                                 return_source_documents = True)

# Question to ask, and the chain call that answers it
query = "What are the specific requirements and procedures outlined in the braking test conditions for a vehicle seeking approval, including the initial brake temperature range, test speed criteria for different vehicle categories, brake actuation force limits, and the necessary steps to be taken before and during the test to ensure compliance with the approval regulations?"
result = qa(query)

The specific requirements and procedures outlined in the braking test conditions for a vehicle seeking approval are as follows:

1. **Initial Brake Temperature**: The brakes must be at an initial temperature of at least 55 °C and not exceed 100 °C before the test begins.

2. **Test Speed Criteria**:
   - For general brake testing, the speed should be 100 km/h or 0.9 times the maximum speed (Vmax) of the vehicle, whichever is lower.
   - For high-speed testing applicable to vehicle categories L3, L4, L5, and L7 with a Vmax greater than 125 km/h, the test speed should be 0.8 Vmax.

3. **Brake Actuation Force Limits**:
   - Hand control: Up to 250 N.
   - Foot control: Up to 400 N for vehicle categories L3 and L4, and up to 500 N for vehicle categories L5 and L7.

4. **Test Procedure Steps**:
   - **Acceleration and Braking**: For each stop, the vehicle must be accelerated to the specified test speed and then the brakes should be actuated under the specified conditions.
   - **Brake Appli

In [None]:
result

{'query': 'What are the specific conditions under which the ABS function must be disabled or disconnected according to the regulations mentioned in the provided context?',
 'result': 'According to the regulations mentioned in the provided context, the specific conditions under which the ABS function must be disabled or disconnected are between 40 km/h and 20 km/h, when the vehicle is lightly loaded, and when the engine is disconnected.',
 'source_documents': [Document(metadata={'page': 40, 'source': '/content/drive/MyDrive/UNECEmergeData.pdf'}, page_content='(b) The anti -lock system shall be either disconnected or inoperative (ABS \nfunction disabled), between 40 km/h and 20 km/h.  \n(c) Lightly loaded.  \n(d) Engine disconnected.  \n1.3. Test conditions and procedure:  \n(a) Initial brake temperature:   ≥ 55 °C and  ≤ 100 °C. \n(b) Test speed: 60 km/h or 0.9 Vmax, whichever is lower.'),