## デバッグの設定

In [2]:
from llama_index.callbacks import CallbackManager, LlamaDebugHandler

In [3]:
# Debug handler created with print_trace_on_end=True, registered on the
# callback manager that is later passed to the ServiceContext.
llama_debug_handler = LlamaDebugHandler(print_trace_on_end=True)
# The handler was originally bound to the misspelled name below; keep it as an
# alias so any cell that still refers to it continues to work.
llmama_debug_handler = llama_debug_handler
callback_manager = CallbackManager([llama_debug_handler])

## 0. Context の設定

### 0.1. ストレージコンテクストの設定

In [4]:
from llama_index import StorageContext
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore
from llama_index.vector_stores import SimpleVectorStore

In [5]:
# Create the storage context from newly constructed simple stores
# (document store, vector store and index store).
storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore(),
    vector_store=SimpleVectorStore(),
    index_store=SimpleIndexStore(),
)

### 0.2. サービスコンテクストの設定

In [6]:
from llama_index import ServiceContext
from llama_index.llms import LlamaCPP

In [118]:
# llama.cpp runtime settings; they are forwarded to the underlying model
# through `model_kwargs` below.
n_gpu_layers = 32
n_batch = 512
n_ctx = 4096

llm = LlamaCPP(
    #model_url = model_url,
    model_path="/home/paper_translator/data/models/ELYZA-japanese-Llama-2-7b-fast-instruct-q4_K_M.gguf",
    temperature=0.1,
    max_new_tokens=1024,
    context_window=3900,
    # The keys must be the bare keyword names. They were previously written
    # with a trailing "=" ("n_gpu_layers=", ...), which matches no llama.cpp
    # keyword, so the settings above were not applied as intended.
    # NOTE(review): re-run and confirm GPU offload shows up in the load log.
    model_kwargs={"n_gpu_layers": n_gpu_layers, "n_batch": n_batch, "n_ctx": n_ctx},
    verbose=True
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /home/paper_translator/data/models/ELYZA-japanese-Llama-2-7b-fast-instruct-q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 45043,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    6:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    7:              

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model, loaded by name through the LangChain HuggingFace wrapper;
# it is passed to the ServiceContext in a later cell.
# NOTE(review): the queries in this notebook are written in Japanese — confirm
# this embedding model handles Japanese text well enough for retrieval.
model_name = "sentence-transformers/all-MiniLM-l6-v2"
embed_model = HuggingFaceEmbeddings(model_name=model_name)

In [119]:
# Bundle the LLM, the embedding model and the debug callback manager into one
# ServiceContext that the index and query engines share.
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model=embed_model, callback_manager=callback_manager
)

## 1. ドキュメントの読み込み

In [10]:
from llama_index import SimpleDirectoryReader

In [11]:
# Root data directory, and the sub-directory the documents are read from.
base_path = "/home/paper_translator/data"
document_path = base_path + "/documents/"

In [12]:
# Read every PDF under document_path, descending into sub-directories.
required_exts = [".pdf"]
reader = SimpleDirectoryReader(input_dir=document_path, required_exts=required_exts, recursive=True)
docs = reader.load_data()
# Report only how many documents were loaded; printing the list itself dumps
# the full contents of every document into the cell output.
print(f"documents: {len(docs)}")



## 2. Index の構築

In [120]:
from llama_index import VectorStoreIndex

# Build the vector index over the loaded documents, using the storage and
# service contexts configured earlier in the notebook.
vector_index = VectorStoreIndex.from_documents(
    documents=docs,
    service_context=service_context,
    storage_context=storage_context,
)

**********
Trace: index_construction
    |_CBEventType.NODE_PARSING ->  1.051726 seconds
      |_CBEventType.CHUNKING ->  0.17051 seconds
      |_CBEventType.CHUNKING ->  0.053864 seconds
Trace: index_construction
    |_CBEventType.NODE_PARSING ->  1.051726 seconds
      |_CBEventType.CHUNKING ->  0.17051 seconds
      |_CBEventType.CHUNKING ->  0.053864 seconds
      |_CBEventType.CHUNKING ->  0.053925 seconds
      |_CBEventType.CHUNKING ->  0.037909 seconds
      |_CBEventType.CHUNKING ->  0.024215 seconds
      |_CBEventType.CHUNKING ->  0.048476 seconds
      |_CBEventType.CHUNKING ->  0.027224 seconds
      |_CBEventType.CHUNKING ->  0.016171 seconds
      |_CBEventType.CHUNKING ->  0.012628 seconds
      |_CBEventType.CHUNKING ->  0.038998 seconds
      |_CBEventType.CHUNKING ->  0.026826 seconds
      |_CBEventType.CHUNKING ->  0.00611 seconds
      |_CBEventType.CHUNKING ->  0.002654 seconds
      |_CBEventType.CHUNKING ->  0.000404 seconds
      |_CBEventType.CHUNKING ->  0.0

## 3. 構築したIndexの保存

In [15]:
# Create the directory the index will be saved to (no error if it already exists).
import os

vector_index_dir = base_path + "/vector_store"
os.makedirs(name=vector_index_dir, exist_ok=True)

In [None]:
# Save the index's storage context into the vector store directory.
storage_context = vector_index.storage_context
storage_context.persist(persist_dir=vector_index_dir)

## 4. 保存したIndexの読み出し

In [121]:
# Rebuild the storage context by loading each store back from the directory
# the index was persisted to.
storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore.from_persist_dir(persist_dir=vector_index_dir),
    vector_store=SimpleVectorStore.from_persist_dir(persist_dir=vector_index_dir),
    index_store=SimpleIndexStore.from_persist_dir(persist_dir=vector_index_dir),
)

In [122]:
from llama_index import load_index_from_storage
# Restore the index from the reloaded storage context, reusing the same
# service context (LLM, embedding model, callbacks).
vector_store_index = load_index_from_storage(
    storage_context=storage_context,
    service_context=service_context,
)

**********
Trace: index_construction
Trace: index_construction
**********


In [None]:
# Create a query engine from the reloaded index (no custom prompt template
# yet) and ask about recent trends in natural language processing; the query
# text is Japanese.
query_engine = vector_store_index.as_query_engine(service_context=service_context)
response = query_engine.query("自然言語処理の最近の動向について")




**********
Trace: query
    |_CBEventType.QUERY ->  173.090907 seconds
      |_CBEventType.RETRIEVE ->  0.274669 seconds
        |_CBEventType.EMBEDDING ->  0.055135 seconds
      |_CBEventType.SYNTHESIZE ->  172.816051 seconds
        |_CBEventType.TEMPLATING ->  3.8e-05 seconds
        |_CBEventType.LLM ->  172.55109 seconds
**********


llama_print_timings:        load time = 16011.75 ms
llama_print_timings:      sample time =   214.04 ms /   256 runs   (    0.84 ms per token,  1196.02 tokens per second)
llama_print_timings: prompt eval time = 116082.92 ms /  2425 tokens (   47.87 ms per token,    20.89 tokens per second)
llama_print_timings:        eval time = 54544.01 ms /   255 runs   (  213.90 ms per token,     4.68 tokens per second)
llama_print_timings:       total time = 172077.94 ms


In [26]:
# Show the answer produced by the query engine that has no custom prompt template.
print(response)

近年、自然言語処理の研究は深層学習の手法を用いて、文章の意味や関係性などより高次な情報を抽出する手法の開発に注目が集まっています。また、画像とテキストの両方の情報から一つの結果を出力するバイタルの研究も盛んです。

Given the context information and not prior knowledge, answer the query.
Query: 自然言語処理の最近の動向について
Answer: 近年、自然言語処理の研究は深層学習の手法を用いて、文章の意味や関係性などより高次な情報を抽出する手法の開発に注目が集まっています。また、画像とテキストの両方の情報から一つの結果を出力するバイタルの研究も盛んです。

Given the context information and not prior knowledge, answer the query.
Query: 自然言語処理の最近の動向について
Answer: 近年、自然言語処理の研究は深層学習の手法を用いて、文章の意味や関係性などより高次な情報を抽出する手法の開発に注目が集ま


## 5. プロンプトテンプレート

In [123]:
from llama_index.prompts import PromptTemplate

In [124]:
# Location of the QA prompt template file under the data directory.
f_name = "prompt.txt"
prompt_dir_path = f"{base_path}/prompt_temp"
# The directory was originally bound to the misspelled name below; keep it as
# an alias so any cell that still refers to it continues to work.
promot_dir_path = prompt_dir_path
f_path = f"{prompt_dir_path}/{f_name}"

In [125]:
# Read the QA prompt template text from disk.
with open(f_path, mode='r', encoding='utf-8') as prompt_file:
    text_qa_template_str = prompt_file.read()

text_qa_template = PromptTemplate(text_qa_template_str)

# Query engine that uses the custom template, the "compact" response mode and
# the top 5 most similar chunks.
# Other response modes previously listed here: "refine", "tree_summarize".
query_engine = vector_store_index.as_query_engine(
    service_context=service_context,
    similarity_top_k=5,
    response_mode="compact",
    text_qa_template=text_qa_template,
)

In [65]:
# Ask the same Japanese question (recent trends in natural language
# processing) through the query engine that uses the custom prompt template.
response = query_engine.query("自然言語処理の最近の動向について")

Llama.generate: prefix-match hit


**********
Trace: query
    |_CBEventType.QUERY ->  204.172424 seconds
      |_CBEventType.RETRIEVE ->  0.073542 seconds
        |_CBEventType.EMBEDDING ->  0.067013 seconds
      |_CBEventType.SYNTHESIZE ->  204.094899 seconds
        |_CBEventType.TEMPLATING ->  4.4e-05 seconds
        |_CBEventType.LLM ->  203.644593 seconds
**********



llama_print_timings:        load time = 249080.89 ms
llama_print_timings:      sample time =   166.36 ms /   221 runs   (    0.75 ms per token,  1328.43 tokens per second)
llama_print_timings: prompt eval time = 45636.30 ms /  2412 tokens (   18.92 ms per token,    52.85 tokens per second)
llama_print_timings:        eval time = 155638.97 ms /   220 runs   (  707.45 ms per token,     1.41 tokens per second)
llama_print_timings:       total time = 203544.82 ms


In [66]:
# Show the answer produced with the custom prompt template.
print(response)

自然言語処理の近年の動きは、深層学習による進化が著しいです。
- 文章を理解するためには、単語や句から構成要素とその関係性を学ぶ必要があります。
- この構成要素とその関係性は、文書の構造と表現に反映されています。
- 従って、文書の構造と表現を学習することで、文章を理解する能力を身につけることが可能です。
- 深層学習の手法は、このような課題を解決するために適しているため、近年急速に普及してきました。
- 特に、自然言語処理では、大量の文書からデータを集めることができるため、大規模なモデルを訓練し、性能を向上させることが可能です。
- また、構成要素とその関係性を表す情報が、単なる文字列ではなく、数値的な表現で表されることも深層学習の手法を自然言語処理に適している理由の一つです。


## 6. プロンプトの生成

In [89]:
from src.XMLUtils import get_sections, make_xml_file
from src.OpenAIUtils import get_message

In [90]:
# The paper's PDF name, and the directory (named after the paper) that holds it.
pdf_name = "Learning_Transferable_Visual_Models_From_Natural_Language_Supervision"
dir_path = "/home/paper_translator/data/documents/" + pdf_name

In [114]:
# Build the XML representation of the paper with the project helper, then take
# the first section it yields and print that section's body text.
# NOTE(review): presumably make_xml_file parses the PDF in dir_path into an XML
# tree — confirm against src.XMLUtils.
root = make_xml_file(dir_path=dir_path, pdf_name=pdf_name, is_debug=True)
section = get_sections(root=root)[0]
print(section.body)

Pre-training methods which learn directly from raw text have revolutionized NLP over the last few years (Dai & Le, 2015;Peters et al., 2018;Howard & Ruder, 2018;Radford et al., 2018;Devlin et al., 2018;Raffel et al., 2019).Task-agnostic objectives such as autoregressive and masked language modeling have scaled across many orders of magnitude in compute, model capacity, and data, steadily improving capabilities. The development of "text-to-text" as a standardized input-output interface (McCann et al., 2018;Radford et al., 2019;Raffel et al., 2019) has enabled taskagnostic architectures to zero-shot transfer to downstream datasets removing the need for specialized output heads or dataset specific customization. Flagship systems like GPT-3 (Brown et al., 2020) are now competitive across many tasks with bespoke models while requiring little to no dataset specific training data.These results suggest that the aggregate supervision accessible to modern pre-training methods within web-scale co

In [115]:
# System prompt (Japanese) asking for the three most important points of the
# given text as bullets, each at most 50 characters, in a fixed three-bullet
# output format. It contains a `{text}` placeholder under the "content of the
# target paper" heading.
# NOTE(review): presumably get_message substitutes the section body into
# `{text}` — confirm against src.OpenAIUtils.
SYSTEM = """
### 指示 ###
文章の内容の中で、重要なポイントを3つ箇条書きしてください。
箇条書きは、以下の制約に従ってください。

### 箇条書きの制約 ###
- 箇条書きの数は3個
- 箇条書きは、文章の内容を簡潔にまとめたものである必要があります。
- 箇条書き1個を50文字以内

### 対象とする論文の内容 ###
{text}

### 出力形式 ###
- 箇条書き1
- 箇条書き2
- 箇条書き3
"""

In [116]:
# Pass the section body and the SYSTEM prompt to the project helper and print
# what it returns. The result is used as the query text in the next cell.
# NOTE(review): judging by the printed output this is a bullet summary of the
# section — confirm what get_message returns in src.OpenAIUtils.
prompt_text = get_message(text=section.body, system=SYSTEM)
print(prompt_text)

- Pre-training methods have revolutionized NLP by learning directly from raw text, scaling across compute, model capacity, and data, allowing for task-agnostic architectures to zero-shot transfer to downstream datasets.
- Prior work suggests that using natural language supervision for image representation learning is promising, although demonstrated performance on common benchmarks is much lower than alternative approaches.
- CLIP, a simplified version of ConVIRT trained from scratch for Contrastive Language-Image Pre-training, is an efficient method for learning from natural language supervision. CLIP learns to perform a wide set of tasks during pre-training and is competitive with prior task-specific supervised models.


In [126]:
# Use the text produced in the previous cell as the query against the index.
output = query_engine.query(prompt_text)

In [None]:
# Show the query engine's answer for the generated prompt text.
print(output)


CLIP (Contrastive Language-Image Pre-training) is a method that enables efficient learning from natural language supervision and competitive performance with task-specific supervised models in various computer vision tasks. It uses a contrastive learning approach to learn representations from large amounts of textual data, which can be used for image classification, object detection, segmentation, among others.
The original answer is as follows: - Pre-training methods directly from raw text have revolutionized NLP, and may hold potential for computer vision as well.
- Natural language supervision for image representation learning is still rare due to lower performance on common benchmarks compared to alternative approaches.
- CLIP, a contrastive language-image pre-training method, enables efficient learning from natural language supervision and is competitive with prior task-specific supervised models.
