<a href="https://colab.research.google.com/github/rui1011/LLMATCH_RuiKomatsu/blob/main/LLMATCH_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab runtime so files under MyDrive are readable.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install llama-index --upgrade
!pip install llama-index
!pip install llama-index-core
!pip install llama-index transformers sentence-transformers
!pip install llama-index-embeddings-huggingface
!pip install torch
!pip install langchain-community
!pip install --upgrade llama-index
!pip install --upgrade llama-index python-dotenv pydantic PyYAML
!pip install llama-index-llms-huggingface
!pip install --upgrade llama-index-llms-huggingface

!pip uninstall -y llama-index-agent-openai llama-index-embeddings-openai \
               llama-index-llms-openai llama-index-multi-modal-llms-openai \
               llama-index-program-openai llama-index-question-gen-openai

!pip install llama-index-llms-langchain
!pip install langchain_community pypdf
!pip install -U bitsandbytes
!pip install sentence-transformers
!pip install llama-index-embeddings-huggingface

Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Using cached llama_index_agent_openai-0.4.1-py3-none-any.whl.metadata (726 bytes)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Using cached llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-llms-openai<0.4.0,>=0.3.0 (from llama-index)
  Using cached llama_index_llms_openai-0.3.13-py3-none-any.whl.metadata (3.3 kB)
Collecting llama-index-multi-modal-llms-openai<0.5.0,>=0.4.0 (from llama-index)
  Using cached llama_index_multi_modal_llms_openai-0.4.2-py3-none-any.whl.metadata (726 bytes)
Collecting llama-index-program-openai<0.4.0,>=0.3.0 (from llama-index)
  Using cached llama_index_program_openai-0.3.1-py3-none-any.whl.metadata (764 bytes)
Collecting llama-index-question-gen-openai<0.4.0,>=0.3.0 (from llama-index)
  Using cached llama_index_question_gen_openai-0.3.0-py3-none-any.whl.metadata (783 bytes)
Using cached llama_index_agent_opena

In [3]:
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import Document, QueryBundle
from llama_index.core.service_context import ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig, AutoModelForSequenceClassification
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from llama_index.core import Settings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import numpy as np
from huggingface_hub import HfFolder
from huggingface_hub import whoami
from llama_index.core.prompts import PromptTemplate

In [4]:
# Loaded cross-encoders keyed by (model_name, device) so that repeated queries
# do not reload the tokenizer and weights on every call.
_CROSS_ENCODER_CACHE = {}


def _load_cross_encoder(model_name, device):
  """Return a cached ``(tokenizer, model)`` pair for ``model_name`` placed on ``device``."""
  cache_key = (model_name, str(device))
  if cache_key not in _CROSS_ENCODER_CACHE:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    _CROSS_ENCODER_CACHE[cache_key] = (tokenizer, model)
  return _CROSS_ENCODER_CACHE[cache_key]


def rerank_candidates_with_cross_encoder(query, candidates, model_name="cross-encoder/ms-marco-TinyBERT-L-2-v2", device="cpu"):
  """Score each candidate text against ``query`` with a cross-encoder and sort by score.

  Args:
    query: The query string.
    candidates: Iterable of candidate text strings.
    model_name: Hugging Face model id of a sequence-classification cross-encoder.
      The score extraction assumes the model emits one logit per (query, candidate)
      pair, i.e. logits of shape (N, 1).
    device: Device the model and the tokenized inputs are moved to.

  Returns:
    A list of ``(candidate, score)`` tuples sorted by descending score, where
    ``score`` is the sigmoid of the model's logit. Returns ``[]`` when there are
    no candidates.
  """
  # Materialize once: `candidates` is iterated twice below (pairing and zipping).
  candidates = list(candidates)
  if not candidates:
    # Nothing to score; skip loading the model and tokenizing an empty batch.
    return []

  tokenizer, model = _load_cross_encoder(model_name, device)

  pairs = [[query, candidate] for candidate in candidates]
  inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors="pt").to(device)

  with torch.no_grad():
    logits = model(**inputs).logits
    scores = torch.sigmoid(logits).squeeze(1).tolist()

  return sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)

In [5]:
def query_engine(query, index, llm, top_k=5):
  """Answer ``query`` by retrieving chunks, reranking them, and prompting ``llm``.

  Args:
    query: The question string.
    index: An index exposing ``as_retriever(similarity_top_k=...)`` whose retriever
      returns nodes with a ``.text`` attribute (e.g. a LlamaIndex ``VectorStoreIndex``).
    llm: An object exposing ``generate(list_of_prompts)``.
    top_k: Number of chunks to retrieve from the index.

  Returns:
    Whatever ``llm.generate`` returns for the single constructed prompt.
  """
  # Retrieval only: a full query engine would also run LLM answer synthesis,
  # and that answer is never used here (the LLM is called explicitly below).
  retriever = index.as_retriever(similarity_top_k=top_k)
  retrieved_chunks = [node.text for node in retriever.retrieve(query)]

  if retrieved_chunks:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    scored_chunks = rerank_candidates_with_cross_encoder(query, retrieved_chunks, device=device)
    ranked_chunks = [chunk for chunk, _ in scored_chunks]
  else:
    # Nothing was retrieved, so there is nothing to rerank; the prompt is
    # still sent, with an empty context section.
    ranked_chunks = []

  context = "\n\n".join(ranked_chunks)
  prompt = f"""
  あなたは知識豊富なアシスタントです。以下のコンテキストを使用して質問に答えて下さい。
  他の質問文や関連のない情報は出力しないでください。
  回答は必ず簡潔にまとめてください。
  コンテキスト:
  {context}

  質問:{query}
  """

  response = llm.generate([prompt])
  return response

In [6]:
# 4-bit NF4 quantization settings handed to the model loader:
# float16 compute dtype, no double quantization.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": False,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

In [7]:
# Load the tokenizer and the 4-bit quantized causal LM.
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        quantization_config=bnb_config,
        # Use the same dtype the quantization config computes in, so the
        # non-quantized modules and the 4-bit layers agree instead of mixing
        # bfloat16 and float16.
        torch_dtype=bnb_config.bnb_4bit_compute_dtype,
      )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# Sampling-based generation settings: up to 256 new tokens at temperature 0.8.
generation_kwargs = dict(
    max_new_tokens=256,
    temperature=0.8,
    do_sample=True,
)

# Text-generation pipeline built on the model and tokenizer loaded above.
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_kwargs,
)

Device set to use cuda:0


In [9]:
# Wrap the transformers pipeline as a LangChain LLM and register the same
# object as the LLM on the LlamaIndex `Settings`.
my_llm = Settings.llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

  my_llm = HuggingFacePipeline(pipeline = text_generation_pipeline)


In [10]:
# Sentence-transformers model used to embed document chunks and queries.
# NOTE(review): the prompt in `query_engine` is Japanese -- confirm this
# embedding model suits the language of the indexed documents.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL_NAME)

In [14]:
# Source PDF on the mounted Google Drive.
pdf_path = "/content/drive/MyDrive/experimental_file.pdf"

# Read the PDF into LangChain documents.
loader = PyPDFLoader(pdf_path)
documents = loader.load_and_split()

# Splitter used afterwards to re-chunk the text: chunk size and overlap are
# measured with `len`, and each chunk records its start index.
splitter_options = {
    "chunk_size": 256,
    "chunk_overlap": 20,
    "length_function": len,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [15]:
# Re-chunk the loaded text and convert each chunk into a LlamaIndex Document.
page_texts = [page.page_content for page in documents]
split_chunks = text_splitter.create_documents(page_texts)
docs = [
    Document(text=chunk.page_content, metadata=chunk.metadata)
    for chunk in split_chunks
]

# Register the embedding model and the LLM on the global LlamaIndex settings
# before the index is built.
Settings.embed_model = embedding_model
Settings.llm = my_llm

# Build the vector index over the chunks.
index = VectorStoreIndex.from_documents(documents=docs, show_progress=True)

Parsing nodes:   0%|          | 0/116 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/116 [00:00<?, ?it/s]

In [None]:
# Example run: replace the placeholder with a real question about the PDF.
query = "question"
response = query_engine(query=query, index=index, llm=my_llm)
print(response)