# LLMのRAGを使ったQABOT

1. **データ取得**:
    - ICML 2024の論文PDFを取得し、./icml_2024_papersディレクトリに格納する
2. **データの埋め込みベクトル化**:
    - 取得したPDFからテキストを抽出し、ベクトル化する。    <---- now
3. **LLMのRAGモデルの設定**:
    - 質問に対して関連する文書を検索し、その文書を基に回答を生成するRAGモデルを設定する。
4. **QABOTの構築**:
    - ユーザーインターフェースを作成し、ユーザーからの質問に対してRAGモデルを使って回答するQABOTを構築する。
    
    
- Reference
    - [openai-cookbook/examples/Question_answering_using_embeddings.ipynb](https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb)

In [1]:
# Install the notebook's dependencies via IPython shell escapes.
!pip3 install python-dotenv tiktoken
# The openai client is pinned to version 1.3.4.
!pip3 install openai==1.3.4
!pip3 install PyPDF2

[0m

In [2]:
import ast  # 文字列として保存された埋め込みを配列に変換します。
import openai
from openai import OpenAI
import pandas as pd
import tiktoken
import os
from tqdm import tqdm
from scipy import spatial    # 検索用ベクトルの類似度を計算する
from PyPDF2 import PdfReader

from dotenv import load_dotenv

In [3]:
# Load variables from a .env file into the environment; OPENAI_API_KEY is
# read from the environment in the configuration cell that follows.
load_dotenv()

True

In [4]:
# Read the API key from the environment and set it on the openai module.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Chat model names; the commented-out lines are alternative values.
MODEL_NAME = "gpt-3.5-turbo-0125"
# MODEL_NAME = "gpt-3.5-turbo-instruct"
# MODEL_NAME = "gpt-4-0125-preview"
# MODEL_NAME = "gpt-4-turbo-2024-04-09"
MODEL4o_NAME = "gpt-4o-2024-05-13"

# Model and encoding settings
# embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"  # encoding name passed to tiktoken.get_encoding()
# max_tokens = 8000

max_tokens = 1000  # chunk size in tokens, passed to chunk_text()

# EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_MODEL = "text-embedding-3-small"  # default model for get_embedding()
TEMPERATURE = 0.7

# Initialise the OpenAI client
# NOTE(review): no api_key is passed here; presumably the client picks up
# OPENAI_API_KEY from the environment on its own — confirm.
client = OpenAI()

In [5]:
# PDF source directory and output directory for the per-PDF embedding CSVs
pdf_dir = "./icml_2024_papers"
save_dir = "./icml_2024_embeddings"
# exist_ok=True makes the creation idempotent and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(save_dir, exist_ok=True)

In [6]:
# Collect the PDF file names in pdf_dir. os.listdir() returns every entry in
# arbitrary order, so keep only "*.pdf" names (anything else would be handed
# to the PDF reader) and sort them for a reproducible processing order.
pdf_files = sorted(
    name for name in os.listdir(pdf_dir) if name.lower().endswith(".pdf")
)
pdf_files

['$S^2$IP-LLM: Semantic Space Informed Prompt Learning with LLM for Time Series Forecasting.pdf',
 'A Bias-Variance-Covariance Decomposition of Kernel Scores for Generative Models.pdf',
 'A Closer Look at the Limitations of Instruction Tuning.pdf',
 'A Human-Inspired Reading Agent with Gist Memory of Very Long Contexts.pdf',
 'A Language Model’s Guide Through Latent Space.pdf',
 'A Multimodal Automated Interpretability Agent.pdf',
 'A Sober Look at LLMs for Material Discovery: Are They Actually Good for Bayesian Optimization Over Molecules?.pdf',
 'A Tale of Tails: Model Collapse as a Change of Scaling Laws.pdf',
 'Accurate LoRA-Finetuning Quantization of LLMs via Information Retention.pdf',
 'Active Preference Learning for Large Language Models.pdf',
 'Adaptive Text Watermark for Large Language Models.pdf',
 'Agent Instructs Large Language Models to be General Zero-Shot Reasoners.pdf',
 'Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM Agents Exponentially Fast.pdf

In [7]:
# Extract text from a PDF
def convert_pdf_to_text(pdf_path):
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        pdf_path: File name of the PDF; it is joined onto the module-level
            ``pdf_dir`` directory before being opened.

    Returns:
        One string containing the extracted text of all pages in page order,
        with a ``'\\n'`` appended after each page.
    """
    full_path = os.path.join(pdf_dir, pdf_path)
    pdf = PdfReader(full_path)
    page_texts = [pg.extract_text() + '\n' for pg in pdf.pages]
    return ''.join(page_texts)

In [8]:
# Split text into chunks
def chunk_text(text, max_tokens, encoding):
    """Split ``text`` into consecutive pieces of at most ``max_tokens`` tokens.

    The text is tokenised once with ``encoding.encode`` (called with empty
    ``allowed_special`` and ``disallowed_special`` sets), the token list is
    cut into consecutive windows of ``max_tokens`` tokens, and every window
    is turned back into a string with ``encoding.decode``.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk (the window step).
        encoding: Object providing ``encode`` and ``decode``, e.g. the
            encoding returned by ``tiktoken.get_encoding``.

    Returns:
        A list of chunk strings in original order; empty when the text
        encodes to no tokens.
    """
    token_ids = encoding.encode(text, allowed_special=set(), disallowed_special=set())
    window_starts = range(0, len(token_ids), max_tokens)
    return [
        encoding.decode(token_ids[start:start + max_tokens])
        for start in window_starts
    ]

In [9]:
# Convert text into an embedding vector
def get_embedding(text, model=EMBEDDING_MODEL):
    """Request an embedding for ``text`` from the module-level OpenAI client.

    Args:
        text: The text to embed; every newline is replaced by a space before
            the request is sent.
        model: Name of the embedding model (defaults to ``EMBEDDING_MODEL``).

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    result = client.embeddings.create(input=[single_line], model=model)
    first_item = result.data[0]
    return first_item.embedding

In [10]:
# Look up the tiktoken encoding named by embedding_encoding
encoding = tiktoken.get_encoding(embedding_encoding)

In [11]:
# Extract text from a PDF and generate embeddings for it
def embed_text_from_pdf(file_path):
    """Embed every chunk of one PDF and append the results to ``data``.

    Uses the helpers defined in this notebook (``convert_pdf_to_text``,
    ``chunk_text`` and ``get_embedding``), the same pipeline the per-file
    processing loop runs. It previously called ``extract_text_from_pdf`` and
    ``split_text_into_chunks``, which this notebook does not define.

    Args:
        file_path: File name of the PDF. ``convert_pdf_to_text`` resolves it
            relative to the module-level ``pdf_dir`` directory.

    Returns:
        None. One dict per chunk, with the keys ``'text'``, ``'embedding'``
        and ``'file'``, is appended to the module-level ``data`` list.
    """
    extracted_text = convert_pdf_to_text(file_path)

    # Split into chunks of at most max_tokens tokens.
    text_chunks = chunk_text(extracted_text, max_tokens, encoding)

    # Embed each chunk and store it together with its source file.
    for chunk in text_chunks:
        embedding = get_embedding(chunk)
        data.append({'text': chunk, 'embedding': embedding, 'file': file_path})

In [12]:
# Module-level list that stores each text chunk together with its embedding
data = []

In [13]:
# Process each PDF file: extract text, chunk it, embed every chunk, and write
# one CSV per PDF into save_dir.
for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    data = []  # reset per file so each CSV only holds this PDF's chunks
    text = convert_pdf_to_text(pdf_file)
    text_chunks = chunk_text(text, max_tokens, encoding)
    for chunk in text_chunks:
        embedding = get_embedding(chunk)
        data.append({'text': chunk, 'embedding': embedding, 'file': pdf_file})

    # A PDF with no extractable text yields no chunks; writing an empty
    # DataFrame would leave a CSV without the text/embedding/file columns,
    # so skip the file and report it instead.
    if not data:
        tqdm.write(f"Skipping {pdf_file}: no text could be extracted")
        continue

    # Convert to a DataFrame and save as CSV named after the PDF
    df = pd.DataFrame(data)
    csv_file_path = os.path.join(save_dir, f"{os.path.splitext(pdf_file)[0]}.csv")
    df.to_csv(csv_file_path, index=False, escapechar='\\')

Processing PDFs:   1%|          | 2/306 [00:26<1:09:22, 13.69s/it]unknown widths : 
[0, IndirectObject(746, 0, 140506542847696), 9, 10, 284, 11, IndirectObject(747, 0, 140506542847696), 17, 26, 513, 27, 28, 207, 29, 31, 596, 32, IndirectObject(748, 0, 140506542847696), 97, 98, 513, 99, IndirectObject(749, 0, 140506542847696), 100, 101, 513, 102, IndirectObject(750, 0, 140506542847696), 107, 108, 255, 109, 110, 523, 111, 113, 500, 114, IndirectObject(751, 0, 140506542847696), 122, IndirectObject(752, 0, 140506542847696), 124, 136, 300, 138, IndirectObject(753, 0, 140506542847696), 171, 176, 612, 177, IndirectObject(754, 0, 140506542847696), 178, 181, 492, 182, 185, 239, 186, IndirectObject(755, 0, 140506542847696), 187, 191, 689, 192, IndirectObject(756, 0, 140506542847696), 193, 196, 647, 197, 198, 541, 199, IndirectObject(757, 0, 140506542847696), 200, 205, 482, 206, IndirectObject(758, 0, 140506542847696), 207, 210, 501, 211, 214, 234, 215, IndirectObject(759, 0, 140506542847696), 21

In [14]:
# Preview the first rows of df, which holds the DataFrame built for the last
# PDF processed by the loop above.
df.head()

Unnamed: 0,text,embedding,file
0,Zero-Shot ECG Classification with Multimodal L...,"[0.002399072516709566, -0.029044684022665024, ...",Zero-Shot ECG Classification with Multimodal L...
1,"; Kiyasseh et al., 2021;\nWang et al., 2023). ...","[0.013038376346230507, 0.008787560276687145, 0...",Zero-Shot ECG Classification with Multimodal L...
2,"with images, ECG signals pose\na unique chall...","[0.006792128551751375, -0.007400283589959145, ...",Zero-Shot ECG Classification with Multimodal L...
3,EPE ) is introduced in Sec. 3.4.\n3.2. Cross-M...,"[0.027774350717663765, 0.025056704878807068, 0...",Zero-Shot ECG Classification with Multimodal L...
4,"solution.\nIn summary, our model learns repre...","[-0.010094649158418179, 0.005103956907987595, ...",Zero-Shot ECG Classification with Multimodal L...
