In [38]:
#!pip install pypdf
#!pip install chromadb
#!pip install gpt4all
#!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Using cached transformers-4.41.1-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.2.2-cp39-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting Pillow (from sentence-transformers)
  Using cached pillow-10.3.0-cp39-cp39-macosx_10_10_x86_64.whl.metadata (9.2 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Downloading networkx-3.2.1-py3-none-any.whl.metadat

In [1]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.bedrock import BedrockEmbeddings

from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
from gpt4all import Embed4All
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

import getpass

#def _set_if_undefined(var: str):
#    if not os.environ.get(var):
#        os.environ[var] = getpass(f"Please provide your {var}")
#
#_set_if_undefined("LANGCHAIN_API_KEY")
#
#os.environ["LANGCHAIN_TRACING_V2"]="true"
#os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
#os.environ["LANGCHAIN_PROJECT"]="AI Assistant"


# Directory used as ``persist_directory`` for the Chroma vector store.
CHROMA_PATH = "chroma"
# Directory handed to ``PyPDFDirectoryLoader`` as the document source.
DATA_PATH = "data"

In [2]:
def get_embedding_function():
    """Build the embedding model used for both indexing and querying.

    Returns:
        A ``SentenceTransformerEmbeddings`` instance configured with the
        ``all-MiniLM-L6-v2`` model.
    """
    # Alternative backends that were tried previously:
    #   BedrockEmbeddings(credentials_profile_name="default", region_name="us-east-1")
    #   OllamaEmbeddings(model="nomic-embed-text")
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [3]:
def load_documents():
    """Load documents from ``DATA_PATH`` through ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(DATA_PATH).load()`` returns
        (presumably a list of ``Document`` objects -- confirm against the
        loader documentation).
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

In [7]:
# Load the source documents from DATA_PATH once so later cells can reuse them.
doc = load_documents()

In [4]:
def split_documents(documents: list[Document]):
    """Split documents into overlapping chunks measured in characters.

    Args:
        documents: Documents to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for
        ``documents`` (chunk size 800, overlap 80, lengths measured with ``len``).
    """
    splitter_options = dict(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

In [9]:
# Split the loaded documents into chunks ready for indexing.
chunks = split_documents(doc)

In [5]:
def calculate_chunk_ids(chunks):
    """Assign a unique ``id`` metadata entry to every chunk, in place.

    IDs look like ``"data/monopoly.pdf:6:2"``, i.e.
    ``<page source>:<page number>:<chunk index within that page>``.

    The chunk index is tracked per page rather than by comparing with the
    previous chunk only, so chunks of the same page still receive distinct
    IDs when they are not adjacent in ``chunks``. For input where each page's
    chunks are contiguous the generated IDs are the same as before.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict. The
            ``"source"`` and ``"page"`` entries are read (a missing entry is
            rendered as ``None``) and the ``"id"`` entry is written.

    Returns:
        The same ``chunks`` object that was passed in, with every chunk's
        metadata updated.
    """
    # Next free chunk index for each "<source>:<page>" key seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Store the ID in the chunk metadata so it travels with the document.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks


In [6]:
def add_to_chroma(chunks: list[Document]):
    """Add chunks that are not yet stored to the persistent Chroma database.

    Every chunk first receives an ID via ``calculate_chunk_ids``; only chunks
    whose ID is absent from the database are inserted, after which the
    database is persisted. Progress is reported with ``print``.

    Args:
        chunks: Chunks to index. Their metadata is updated in place with an
            ``"id"`` entry.
    """
    embedding_function = get_embedding_function()
    store = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=embedding_function
    )

    identified_chunks = calculate_chunk_ids(chunks)

    # An empty ``include`` list still returns the IDs, which is all we need.
    stored_items = store.get(include=[])
    known_ids = set(stored_items["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    missing_chunks = [
        candidate
        for candidate in identified_chunks
        if candidate.metadata["id"] not in known_ids
    ]

    if not missing_chunks:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(missing_chunks)}")
    missing_ids = [candidate.metadata["id"] for candidate in missing_chunks]
    store.add_documents(missing_chunks, ids=missing_ids)
    store.persist()

In [7]:
def clear_database():
    """Delete the ``CHROMA_PATH`` directory tree if it exists; otherwise do nothing."""
    if not os.path.exists(CHROMA_PATH):
        return
    shutil.rmtree(CHROMA_PATH)

In [13]:
# Start from an empty vector store: removes the CHROMA_PATH directory if present.
clear_database()

In [14]:
# Index the chunks; chunks whose ID is already stored are skipped.
add_to_chroma(chunks)

  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



Number of existing documents in DB: 0
👉 Adding new documents: 4


  warn_deprecated(


In [8]:
# Prompt sent to the LLM. The ``{context}`` and ``{question}`` placeholders are
# filled in through ``ChatPromptTemplate`` by the query functions.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [16]:
def query_rag(query_text: str):
    """Answer a question with the Ollama ``openhermes`` model and retrieved context.

    The five closest chunks to ``query_text`` are requested from the Chroma
    database, joined into a context block, inserted into ``PROMPT_TEMPLATE``
    and sent to the model. The answer and the IDs of the chunks used are
    printed.

    Args:
        query_text: The question to answer.

    Returns:
        The text returned by the model.
    """
    # Open the persisted vector store with the same embeddings used for indexing.
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    # Each hit is a (document, score) pair; the score is not used here.
    hits = db.similarity_search_with_score(query_text, k=5)

    context_text = "\n\n---\n\n".join(hit.page_content for hit, _ in hits)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )

    response_text = Ollama(model="openhermes").invoke(prompt)

    sources = [hit.metadata.get("id", None) for hit, _ in hits]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text

In [10]:
# Sample question (in French) passed to the RAG query functions below.
question = "Serge à t il des compétence en python et AWS ?"

In [19]:
# Run the Ollama-backed RAG query on the sample question.
query_rag(question)

Number of requested results 5 is greater than number of elements in index 4, updating n_results = 4


Response: Oui, Serge a des compétences en Python et AWS. Il est mentionné qu'il a travaillé comme Développeur Python & Rust à LEDR Technologies et est certifié AWS Solutions Architect Associate. Il possède également des expériences professionnelles impliquant le déploiement de solutions informatiques sur AWS et l'utilisation d'outils tels que Terraform pour la conception d'architectures AWS.
Sources: ['data/CV Serge Keita.pdf:0:0', 'data/CV Serge Keita.pdf:0:1', 'data/CV Serge Keita.pdf:0:2', 'data/CV Serge Keita.pdf:0:3']


"Oui, Serge a des compétences en Python et AWS. Il est mentionné qu'il a travaillé comme Développeur Python & Rust à LEDR Technologies et est certifié AWS Solutions Architect Associate. Il possède également des expériences professionnelles impliquant le déploiement de solutions informatiques sur AWS et l'utilisation d'outils tels que Terraform pour la conception d'architectures AWS."

In [19]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

In [13]:
# Cache of loaded text-generation pipelines, keyed by model name, so the
# model is not re-instantiated on every query.
_TEXT_GENERATION_PIPELINES = {}


def _get_text_generation_pipeline(model_name: str):
    """Return a cached Hugging Face text-generation pipeline for ``model_name``."""
    # Imported locally under an alias: another cell of this notebook rebinds
    # the global name ``pipeline`` to a pipeline instance, which would shadow
    # the factory function if it were looked up globally.
    from transformers import pipeline as hf_pipeline

    if model_name not in _TEXT_GENERATION_PIPELINES:
        _TEXT_GENERATION_PIPELINES[model_name] = hf_pipeline(
            "text-generation",
            model=model_name,
            framework="pt",  # force PyTorch framework
        )
    return _TEXT_GENERATION_PIPELINES[model_name]


def query_rag2(query_text: str, max_new_tokens: int = 512):
    """Answer a question with a local Hugging Face model and retrieved context.

    Same retrieval flow as ``query_rag`` (five closest chunks from the Chroma
    database, inserted into ``PROMPT_TEMPLATE``), but generation is done by the
    ``openchat/openchat-3.6-8b-20240522`` text-generation pipeline. The answer
    and the IDs of the chunks used are printed.

    Args:
        query_text: The question to answer.
        max_new_tokens: Upper bound on the number of generated tokens. This
            limits the answer only; the previous ``max_length=1000`` also
            counted the prompt tokens, so a long retrieved context could
            leave little or no room for the answer.

    Returns:
        The generated answer, without the prompt echoed back.
    """
    # Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB; each result is a (document, score) pair.
    results = db.similarity_search_with_score(query_text, k=5)

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    llm = _get_text_generation_pipeline("openchat/openchat-3.6-8b-20240522")

    # return_full_text=False: by default "generated_text" starts with the
    # prompt itself, which would put the whole context into the response.
    response = llm(prompt, max_new_tokens=max_new_tokens, return_full_text=False)
    response_text = response[0]["generated_text"]

    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)
    return response_text

In [23]:
# Load the Falcon instruct model as a text-generation pipeline.
# The result is bound to ``falcon_pipeline`` rather than ``pipeline``:
# rebinding ``pipeline`` would shadow the factory function imported from
# ``transformers`` earlier in the notebook and break later calls to it.
model = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model)
falcon_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

Downloading shards:   0%|          | 0/2 [00:01<?, ?it/s]
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Downloading shards:   0%|          | 0/2 [00:01<?, ?it/s]


ValueError: Could not load model tiiuae/falcon-7b-instruct with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>, <class 'transformers.models.falcon.modeling_falcon.FalconForCausalLM'>). See the original errors:

while loading with AutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/transformers/pipelines/base.py", line 283, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 558, in from_pretrained
    return model_class.from_pretrained(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3511, in from_pretrained
    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/transformers/utils/hub.py", line 1040, in get_checkpoint_shard_files
    cached_filename = cached_file(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/transformers/utils/hub.py", line 399, in cached_file
    resolved_file = hf_hub_download(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1221, in hf_hub_download
    return _hf_hub_download_to_cache_dir(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1367, in _hf_hub_download_to_cache_dir
    _download_to_tmp_and_move(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1884, in _download_to_tmp_and_move
    http_get(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 542, in http_get
    temp_file.write(chunk)
OSError: [Errno 28] No space left on device

while loading with FalconForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/transformers/pipelines/base.py", line 283, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3511, in from_pretrained
    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/transformers/utils/hub.py", line 1040, in get_checkpoint_shard_files
    cached_filename = cached_file(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/transformers/utils/hub.py", line 399, in cached_file
    resolved_file = hf_hub_download(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1221, in hf_hub_download
    return _hf_hub_download_to_cache_dir(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1367, in _hf_hub_download_to_cache_dir
    _download_to_tmp_and_move(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1884, in _download_to_tmp_and_move
    http_get(
  File "/Users/sergekeita/opt/anaconda3/envs/llm_langgraph/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 542, in http_get
    temp_file.write(chunk)
OSError: [Errno 28] No space left on device




In [21]:
!pip install accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.30.1


In [14]:
# Run the Hugging Face pipeline variant of the RAG query on the sample question.
query_rag2(question)

Number of requested results 5 is greater than number of elements in index 4, updating n_results = 4
Downloading shards:   0%|          | 0/4 [01:38<?, ?it/s]


KeyboardInterrupt: 