<a href="https://colab.research.google.com/github/rui1011/LLMATCH_RuiKomatsu/blob/main/LLMATCH_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install llama-index --upgrade
!pip install llama-index
!pip install llama-index-core
!pip install llama-index transformers sentence-transformers
!pip install llama-index-embeddings-huggingface
!pip install torch
!pip install langchain-community
!pip install --upgrade llama-index
!pip install --upgrade llama-index python-dotenv pydantic PyYAML
!pip install llama-index-llms-huggingface
!pip install --upgrade llama-index-llms-huggingface

!pip uninstall -y llama-index-agent-openai llama-index-embeddings-openai \
               llama-index-llms-openai llama-index-multi-modal-llms-openai \
               llama-index-program-openai llama-index-question-gen-openai

!pip install llama-index-llms-langchain
!pip install langchain_community pypdf
!pip install -U bitsandbytes
!pip install sentence-transformers
!pip install llama-index-embeddings-huggingface
!pip install --upgrade bitsandbytes
!pip install --upgrade transformers
!pip install autoawq

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transforme

In [3]:
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import Document, QueryBundle
from llama_index.core.service_context import ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig, AutoModelForSequenceClassification
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from llama_index.core import Settings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import numpy as np
from huggingface_hub import HfFolder
from huggingface_hub import whoami
from llama_index.core.prompts import PromptTemplate

In [4]:
def rerank_candidates_with_cross_encoder(query, candidates, model_name="cross-encoder/ms-marco-TinyBERT-L-2-v2", device="cpu"):
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

  pairs = [[query, candidate] for candidate in candidates]
  inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors="pt").to(device)

  with torch.no_grad():
    outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits).squeeze(1).tolist()

  ranked_candidates = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
  return ranked_candidates

In [18]:
def query_engine(query, index, llm, top_k=5):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  query_embedding = Settings.embed_model.get_query_embedding(query)

  query_engine_ = index.as_query_engine(similarity_top_k = top_k)
  retrieved_nodes = query_engine_.query(QueryBundle(query_str=query)).source_nodes
  retrieved_chunks = [node.text for node in retrieved_nodes]

  ranked_chunks = rerank_candidates_with_cross_encoder(query, retrieved_chunks, device=device)

  ranked_chunks = [chunk for chunk, _ in ranked_chunks]

  context = "\n\n".join(ranked_chunks)
  prompt = f"""
  あなたは知識豊富な周南公立大学情報科学部情報科学科のアシスタントです。以下のルールに従って、質問に回答してください。

  1. 【出力ルール】
   - 回答は100字以上200字以下の簡潔な文章にまとめること。
   - 指定されたコンテキストおよび質問に関連する情報のみを使用すること。
   - 質問文を繰り返さないこと。
   - 他の質問文や無関係な情報は一切含めないこと。

  2. 【思考手順】
   - Let's think step by step.
   - 最終回答は簡潔にまとめる。
   - 最終的な文字数を必ず確認し、条件を満たすまで調整する。

  【コンテキスト】:{context}

  【質問】:{query}

  【最終回答】:
  """

  response = llm.generate([prompt])
  return response

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
)

In [7]:
!pip install huggingface_hub

from huggingface_hub import login
from google.colab import userdata

HF_TOKEN = userdata.get('HF_TOKEN')

if HF_TOKEN:
    login(HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    print("HF_TOKEN is not set. Please add it to the Colab secrets.")

[0mSuccessfully logged in to Hugging Face!


In [8]:
model_id = "elyza/Llama-3-ELYZA-JP-8B-AWQ"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
      )

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/900 [00:00<?, ?B/s]

We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.


model.safetensors.index.json:   0%|          | 0.00/63.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.68G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

In [9]:
text_generation_pipeline = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    max_new_tokens = 256,
    temperature = 0.8,
    do_sample=True,
)

Device set to use cuda:0


In [10]:
my_llm = HuggingFacePipeline(pipeline = text_generation_pipeline)
Settings.llm = my_llm

  my_llm = HuggingFacePipeline(pipeline = text_generation_pipeline)


In [11]:
embedding_model = HuggingFaceEmbedding(
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
pdf_path = "/content/drive/MyDrive/experimental_file/experimental_file.pdf"

loader = PyPDFLoader(pdf_path)
documents = loader.load_and_split()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 256,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True
)

In [13]:
docs = text_splitter.create_documents([doc.page_content for doc in documents])
docs = [Document(text=doc.page_content, metadata = doc.metadata) for doc in docs]

Settings.llm = my_llm
Settings.embed_model = embedding_model

index = VectorStoreIndex.from_documents(
    documents = docs,
    show_progress=True
)

Parsing nodes:   0%|          | 0/116 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/116 [00:00<?, ?it/s]

In [22]:
query = """
情報科学部の特色を教えてください。
"""
response = query_engine(query, index, my_llm)
final_answer = response.generations[0][0].text.split("【最終回答】:")[-1].strip()
print(final_answer)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


情報科学部の特色として、3年次第2クオーターには対面を必須とする授業が配置されず、オンデマンドやハイブリッドなどの遠隔に対応した授業のみが配置される点が挙げられます。また、長期間のインターンシップや短期海外留学、滞在型のボランティアなどへの参加を推奨しており、学生の自由度が高く、多様な学修スタイルを選択することができます。
