In [1]:
import pandas as pd

# Load the page dump. The loader cell below uses its `content` column as the
# document text; the sample document printed further down shows `url` and
# `timestamp` carried along as metadata.
df = pd.read_csv('document/clean_content_4.csv')

In [2]:
from langchain.document_loaders import DataFrameLoader

# One Document per DataFrame row: `content` becomes page_content.
loader = DataFrameLoader(df, page_content_column="content")
docs = loader.load()

In [3]:
# Sanity check: number of loaded documents and the first one.
len(docs),docs[0]

(3105,
 Document(page_content='Close login 登入 國立臺灣大學 農業化學系 忘記密碼? 國立臺灣大學 農業化學系 首頁 網站導覽 舊網頁 國立臺灣大學 農業化學系 MENU  最新消息 活動資訊 招生訊息 徵才資訊 更多 》 更多 》 更多 》 更多 》 Recent News Events 更多 》 更多 》 農化概況 更多 》 研究成果 更多 》 繽紛花絮 更多 》 Powered by RulingDigital 國立臺灣大學 農業化學系 Copyright © 2015 Department of Agricultural Chemistry, National Taiwan University. All rights reserved. 最佳瀏覽畫面建議使用IE 11版本 、Mozilla Firefox或Google Chrome', metadata={'url': 'https://www.ac.ntu.edu.tw/', 'timestamp': 1702927507.0440347}))

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Chunking parameters shared by the cells below (also used for the child
# splitter of the parent/child retriever and in the saved index name).
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 0

# add_start_index=True asks the splitter to record where each chunk starts in
# its source document.
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True
)

# Flat list of chunks for every loaded document; each chunk keeps the
# metadata of the document it came from (later cells read 'url'/'timestamp').
all_splits = child_splitter.split_documents(docs)
print(f'len(all_splits): {len(all_splits)}')


len(all_splits): 19092


In [7]:
# with open('document/split_doc_1024.txt', 'w') as f:
#     [f.write(doc.page_content+'\n') for doc in all_splits]

### Small2Big FAISS

In [6]:
from langchain.vectorstores import FAISS

# Embed every chunk and persist the resulting FAISS index to disk.
# NOTE(review): HuggingFaceBgeEmbeddings is the BGE-specific wrapper, while the
# checkpoint is a plain BERT model (the load log reports mean pooling being
# added) -- confirm this wrapper is the intended one.
embeddings = HuggingFaceBgeEmbeddings(model_name="yentinglin/bert-base-zhtw")
faiss = FAISS.from_documents(all_splits, embeddings)
# NOTE(review): the "_64_8" suffix does not match CHUNK_SIZE=1024 /
# CHUNK_OVERLAP=0 used to build `all_splits`. The path is left unchanged
# because other code may load it -- confirm and rename if it is stale.
faiss.save_local('embeddings/faiss_bert_base-zhtw_64_8')

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/399 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

No sentence-transformers model found with name /home/ai2lab/.cache/torch/sentence_transformers/yentinglin_bert-base-zhtw. Creating a new one with MEAN pooling.
Some weights of BertModel were not initialized from the model checkpoint at /home/ai2lab/.cache/torch/sentence_transformers/yentinglin_bert-base-zhtw and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain.vectorstores import FAISS
from tqdm import tqdm

def monkeypatch_FAISS(embeddings_model):
    """Replace ``FAISS.add_texts`` with a version that embeds via ``embeddings_model``.

    The patched method embeds all incoming texts with a single
    ``embeddings_model.embed_documents`` call (using the model captured here,
    not whatever embedding function the store instance holds) and then hands
    texts and vectors to FAISS's private ``__add``.

    The patch is applied to the ``FAISS`` class itself, so it affects every
    instance in the kernel.

    Args:
        embeddings_model: Object exposing ``embed_documents(texts)``.
    """
    from typing import Iterable, List, Optional, Any

    def _add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of unique IDs.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        # `texts` is consumed twice (embedding, then insertion). Materialise
        # it so a one-shot iterator is not exhausted before `__add` sees it.
        texts = list(texts)
        embeddings = embeddings_model.embed_documents(texts)
        # `_FAISS__add` is the name-mangled spelling of the private `FAISS.__add`.
        return self._FAISS__add(texts, embeddings, metadatas=metadatas, ids=ids)

    FAISS.add_texts = _add_texts

monkeypatch_FAISS(embeddings)

# NOTE(review): PARENT_SIZE (128) is smaller than CHUNK_SIZE (1024, set in the
# splitting cell), so "parent" chunks are smaller than the child chunk limit --
# confirm the two sizes are not swapped or stale.
PARENT_SIZE = 128
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=PARENT_SIZE, chunk_overlap=CHUNK_OVERLAP)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# Parent chunks are kept only in this in-memory store; the save_local call at
# the end persists the FAISS index but not this store.
bigchunk_store = InMemoryStore()

# The index starts out populated with the pre-split chunks; the retriever then
# adds its own child chunks to the same index.
faiss = FAISS.from_documents(all_splits, embeddings)

retriever = ParentDocumentRetriever(
    vectorstore=faiss,
    docstore=bigchunk_store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# Add documents in about `fold` batches. The batch size is rounded UP so the
# final partial batch is included (floor division would silently skip the last
# len(docs) % fold documents), and is at least 1 so range() gets a valid step
# even when there are fewer documents than folds.
fold = 100
sz = max(1, -(-len(docs) // fold))
for start in tqdm(range(0, len(docs), sz)):
    retriever.add_documents(docs[start:start + sz], ids=None)

faiss.save_local(f'embeddings/faiss_big2small_{PARENT_SIZE}_{CHUNK_SIZE}')

In [None]:
# faiss.save_local(f'embeddings/faiss_big2small_{PARENT_SIZE}_64')
# Probe the raw index: fetch the single nearest chunk for a sample question
# and print its length followed by its text.
sub_docs = faiss.similarity_search("學生身份別有哪些類別？", 1)
for doc in sub_docs:
    print(len(doc.page_content), doc.page_content)

In [None]:
# Same probe through the retriever interface (top-1).
# NOTE: this rebinds `retriever`, replacing the ParentDocumentRetriever built
# in the earlier cell with a plain FAISS retriever.
retriever = faiss.as_retriever(search_kwargs={"k": 1})
retrieved_docs = retriever.get_relevant_documents("學生身份別有哪些類別？")
for doc in retrieved_docs:
    print(len(doc.page_content), doc.page_content)

### Split paragraph by LLM

In [5]:
# One-shot prompt for proposition splitting: five numbered instructions
# followed by a single worked Input/Output example.
# The string deliberately ends with an opening `Input:"`: the loops below
# append the chunk text and then `"\nOutput:` to close the quote and cue the
# model's answer.
# NOTE(review): several outputs saved in this notebook show an "Example 1"
# line that is not in this string, so they appear to come from an earlier
# version of the template.
split_prompt_template = '''
Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
2. Remove the irrelevant content. 
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
5. Present the results as a list of strings, formatted in JSON.
Input:"English  Version 國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單
					 主選單 學士班獎助學金 僅提供系上開設獎學金消息 圖資系專屬獎學金一覽表 2023.09更新 臺大文學院獎助學金 臺大獎助學金一覽表 臺大弱勢助學金 臺大生活學習獎助金 國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金) LIS, NTU © All rights reserved since 2014"
Output:["國立臺灣大學圖書資訊學系", "Department and Graduate Institute of Library and Information Science, National Taiwan University", "圖書資訊學系學士班獎助學金", "僅提供圖書資訊學系系上開設獎學金消息", "圖資系專屬獎學金一覽表 2023.09更新", "臺大文學院獎助學金", "臺大獎助學金一覽表", "臺大弱勢助學金", "臺大生活學習獎助金", "國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金)" ]
Input:"'''

# print(num_tokens_from_string(split_prompt_template+'"\nOutput:', "gpt-3.5-turbo"))

### Token Check

In [11]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens ``string`` encodes to for a given model.

    Despite its name, ``encoding_name`` is passed to
    ``tiktoken.encoding_for_model``, so it must be a model name
    (e.g. ``"gpt-3.5-turbo"``), not an encoding name.

    Args:
        string: Text to tokenize.
        encoding_name: Model name used to look up the tokenizer.

    Returns:
        Number of tokens in the encoded text.
    """
    return len(tiktoken.encoding_for_model(encoding_name).encode(string))

In [13]:
# Token budget per request.
MAX_TOKEN = 4096
# Fixed part of every prompt: the template plus the closing quote / "Output:"
# suffix that the generation loops append after the chunk text.
prompt_scaffold = split_prompt_template + '"\nOutput:'
templelate_token = num_tokens_from_string(prompt_scaffold, "gpt-3.5-turbo")
# What remains for the chunk text itself, minus a 5-token safety margin.
avalible_content_token = MAX_TOKEN - 5 - templelate_token
print(f'avalible_content_token: {avalible_content_token}')


avalible_content_token: 3482


In [132]:
# set([doc.metadata['url'] for doc in all_splits[-226:]])==set(testset['URL'])

True

### ChatGPT

In [6]:
import os

import openai

# Take the key from the environment so the secret never has to be pasted into
# (and saved with) the notebook. Falls back to the previous empty value when
# OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def ask_chatgpt(prompt):
    """Send ``prompt`` to gpt-3.5-turbo as a single user message.

    Uses temperature 0 and a generic system message.

    Args:
        prompt: Full prompt text for the user turn.

    Returns:
        The raw completion object returned by ``openai.ChatCompletion.create``;
        callers index into ``['choices'][0]['message']['content']`` for the text.
    """
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    return completion

In [102]:
from tqdm import tqdm

# Smoke test of the ChatGPT splitter: stops after the first three chunks.
# NOTE(review): the slice drops the last 266 splits, while the commented-out
# check in the "Token Check" section looks at the last 226 -- confirm which
# count is intended.
timestamp_list = []
url_list = []
content_list = []
for idx, split in enumerate(tqdm(all_splits[:-266])):
    content, meta = split.page_content, split.metadata
    url, timestamps = meta['url'], meta['timestamp']
    # Template + chunk text + closing quote and "Output:" cue.
    prompt = ''.join((split_prompt_template, content, '"\nOutput:'))
    print(prompt)
    reply = ask_chatgpt(prompt)
    output = reply['choices'][0]['message']['content']
    print(output)
    content_list.append(output)
    url_list.append(url)
    timestamp_list.append(timestamps)
    if idx >= 2:
        break

  0%|          | 0/19092 [00:00<?, ?it/s]


Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
2. Remove the irrelevant content. 
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
5. Present the results as a list of strings, formatted in JSON.
Example 1
Input:"English  Version 國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單
					 主選單 學士班獎助學金 僅提供系上開設獎學金消息 圖資系專屬獎學金一覽表 2023.09更新 臺大文學院獎助學金 臺大獎助學金一覽表 臺大弱勢助學金 臺大生活學習獎助金 國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金) LIS, NTU © All rights reserved since 2014"
Output:[

  0%|          | 1/19092 [00:10<54:05:42, 10.20s/it]

["國立臺灣大學農業化學系", "忘記密碼?", "國立臺灣大學農業化學系首頁", "網站導覽", "舊網頁", "國立臺灣大學農業化學系最新消息", "活動資訊", "招生訊息", "徵才資訊", "國立臺灣大學農業化學系農化概況", "研究成果", "繽紛花絮", "Powered by RulingDigital", "國立臺灣大學農業化學系Copyright © 2015 Department of Agricultural Chemistry, National Taiwan University. All rights reserved.", "最佳瀏覽畫面建議使用IE 11版本、Mozilla Firefox或Google Chrome"]

Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
2. Remove the irrelevant content. 
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
5. Present the results as a list of strings, form

  0%|          | 2/19092 [00:48<142:26:34, 26.86s/it]

["跳到主要內容", "全站搜尋", "點擊時關閉搜尋", "點擊時前往搜尋", "新生招生訊息", "選課專區", "本校行事曆", "臺大課程網", "場地借用專區", "學雜費專區", "兼任助理專區", "相關法規", "檔案與表單下載", "嗨教育", "最新消息", "臺大校學士教學創新推動計畫", "臺大國際學院", "全國夏季學院", "基礎學科認證課程暨認證免修", "教務長室", "信箱：ntudeanacademic@ntu.edu.tw", "地址：臺北市羅斯福路四段一號行政大樓 212 室", "教務處秘書室", "信箱：academic@ntu.edu.tw", "地址：臺北市羅斯福路四段一號行政大樓 211 室", "招生辦公室", "信箱：ntuadm@ntu.edu.tw", "地址：臺北市羅斯福路四段一號行政大樓 106 室", "註冊組", "信箱：chiajin@ntu.edu.tw", "地址：臺北市羅斯福路四段一號行政大樓 106 室", "課務組", "信箱：curri@ntu.edu.tw", "地址：臺北市羅斯福路四段一號行政大樓 208 室", "研究生教務組", "信箱：graduate@ntu.edu.tw", "地址：臺北市羅斯福路四段一號行政大樓 210 室", "資訊組", "信箱：acaif@ntu.edu.tw", "地址：臺北市羅斯福路四段一號行政大樓 302 室", "醫學院教務分處", "信箱：macd@ntu.edu.tw", "地址：臺北市中正區仁愛路一段 1 號 (基礎大樓 3 樓)", "教學發展中心&數位學習中心", "信箱：ntuctld@ntu.edu.tw", "地址：臺北市羅斯福路四段一號 博雅教學館 5 樓", "10617 臺北市羅斯福路四段一號", "No. 1, Sec. 4, Roosevelt Road, Taipei, 10617 Taiwan (ROC)", "Copyright © 2021", "國立臺灣大學教務處", "Office of Academic Affairs, National Taiwan University", "TOP"]

Decompose the "Content" into clear and simple propositi

  0%|          | 2/19092 [01:06<175:04:07, 33.01s/it]

["國立臺灣大學總務處", "國立臺灣大學總務處", "112年度內各項付款及113年1月薪資入帳作業", "各單位務必於規定報帳期間內，提早完成報帳作業", "逾時無法受理，延誤公務", "2023-11-29 科研採購:水下設備定位傳輸發報器（採購案號:1121418）決標資訊", "2023-12-19 戲劇學系水源校區多功能教學空間裝修工程", "2023-12-19 本校明達館平面機車停車場擬自112年12月25日起停止開放", "敬請將機車停放至周邊其他停車場域", "2023-12-16 人文館新建工程工區東側圍籬因應景觀工程及道路重鋪等施工需求第二階段移設", "樂學館及人類學博物館前道路將封閉施工無法通行", "最新消息", "關於本處", "前往各組", "服務簡介", "服務滿意度", "總務人說故事", "臺灣大學總務處", "版權所有", "連結各組"]





### Qwen

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "Qwen/Qwen-14B-Chat-Int4"

# trust_remote_code=True allows transformers to run the modelling/tokenizer
# code shipped in the hub repository; device_map="auto" leaves weight placement
# to the library. `.eval()` puts the module in inference mode.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True).eval()

In [14]:
def generate_text(prompt_text):
    """Return the chat model's reply to ``prompt_text``.

    Calls the model's ``chat`` helper with no prior history and sampling
    disabled. The conversation history it returns is discarded.

    Args:
        prompt_text: Prompt sent as the user turn.

    Returns:
        The reply text produced by ``model.chat``.
    """
    response, _ = model.chat(tokenizer, prompt_text, history=None, do_sample=False)
    return response

In [15]:
from tqdm import tqdm
# import json


# Smoke test of the Qwen splitter: stops after the first two chunks.
timestamp_list = []
url_list = []
content_list = []
for idx, split in enumerate(tqdm(all_splits)):
    content, meta = split.page_content, split.metadata
    url, timestamps = meta['url'], meta['timestamp']
    # Template + chunk text + closing quote and "Output:" cue.
    prompt = ''.join((split_prompt_template, content, '"\nOutput:'))
    print(prompt)
    output = generate_text(prompt)
    print(output)
    # The reply is stored as raw text; it is not parsed as JSON here.
    content_list.append(output)
    url_list.append(url)
    timestamp_list.append(timestamps)
    if idx >= 1:
        break

  0%|          | 0/19092 [00:00<?, ?it/s]


Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
2. Remove the irrelevant content. 
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
5. Present the results as a list of strings, formatted in JSON.
Example 1
Input:"English  Version 國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單
					 主選單 學士班獎助學金 僅提供系上開設獎學金消息 圖資系專屬獎學金一覽表 2023.09更新 臺大文學院獎助學金 臺大獎助學金一覽表 臺大弱勢助學金 臺大生活學習獎助金 國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金) LIS, NTU © All rights reserved since 2014"
Output:[

  0%|          | 1/19092 [00:28<152:39:35, 28.79s/it]

["國立臺灣大學", "農業化學系", "登入", "忘記密碼?", "首頁", "網站導覽", "舊網頁", "最新消息", "活動資訊", "招生訊息", "徵才資訊", "更多", "農化概況", "研究成果", "繚紛花絮"]

Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
2. Remove the irrelevant content. 
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
5. Present the results as a list of strings, formatted in JSON.
Example 1
Input:"English  Version 國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單
					 主選單 學士班獎助學金 僅提供系上開設獎學金消息 圖資系專屬獎學金一覽表 2023.09更

  0%|          | 1/19092 [00:37<198:38:34, 37.46s/it]

["國立臺灣大學教務處", "Office of Academic Affairs, National Taiwan University"]





### Taiwan llama

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "yentinglin/Taiwan-LLM-13B-v2.0-chat"

# load_in_8bit=True requests 8-bit quantised weights; device_map="auto" leaves
# placement to the library. trust_remote_code=True allows code shipped in the
# hub repository to run. `.eval()` puts the module in inference mode.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True, trust_remote_code=True).eval()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
def generate_text(prompt_text):
    """Generate a continuation of ``prompt_text`` with the loaded causal LM.

    Args:
        prompt_text: Prompt to tokenize and feed to ``model.generate``.

    Returns:
        The whole generated sequence decoded to text with special tokens
        removed. The decoded text covers the prompt tokens as well as the new
        ones, which is why callers slice the prompt off.
    """
    # Put the inputs on the same device as the model instead of leaving them
    # on the CPU.
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    # Take the padding id from this model's tokenizer. The previous hard-coded
    # 50256 is GPT-2's id and is not valid for every vocabulary.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    outputs = model.generate(**inputs, pad_token_id=pad_id, max_new_tokens=2048)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [8]:
from tqdm import tqdm

# Smoke test of the Taiwan-LLM splitter: stops after the first two chunks.
timestamp_list = []
url_list = []
content_list = []
for idx, split in enumerate(tqdm(all_splits)):
    content, meta = split.page_content, split.metadata
    url, timestamps = meta['url'], meta['timestamp']
    # Template + chunk text + closing quote and "Output:" cue.
    prompt = ''.join((split_prompt_template, content, '"\nOutput:'))
    output = generate_text(prompt)
    print(output)
    # Keep only what follows the prompt in the decoded text.
    # NOTE(review): assumes the decoded text reproduces the prompt
    # character-for-character -- confirm for this tokenizer.
    content_list.append(output[len(prompt):])
    url_list.append(url)
    timestamp_list.append(timestamps)
    if idx >= 1:
        break

2024-01-08 00:30:11.879587: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-08 00:30:11.903734: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  0%|          | 1/19092 [01:44<555:06:34, 104.68s/it]


Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
2. Remove the irrelevant content. 
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
5. Present the results as a list of strings, formatted in JSON.
Example 1
Input:"English  Version 國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單
					 主選單 學士班獎助學金 僅提供系上開設獎學金消息 圖資系專屬獎學金一覽表 2023.09更新 臺大文學院獎助學金 臺大獎助學金一覽表 臺大弱勢助學金 臺大生活學習獎助金 國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金) LIS, NTU © All rights reserved since 2014"
Output:[

  0%|          | 1/19092 [01:59<632:26:46, 119.26s/it]


Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
2. Remove the irrelevant content. 
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
5. Present the results as a list of strings, formatted in JSON.
Example 1
Input:"English  Version 國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單
					 主選單 學士班獎助學金 僅提供系上開設獎學金消息 圖資系專屬獎學金一覽表 2023.09更新 臺大文學院獎助學金 臺大獎助學金一覽表 臺大弱勢助學金 臺大生活學習獎助金 國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金) LIS, NTU © All rights reserved since 2014"
Output:[




### 01-ai/Yi-6B-Chat

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "01-ai/Yi-6B-Chat"

# trust_remote_code=True allows code shipped in the hub repository to run;
# device_map="auto" leaves weight placement to the library. `.eval()` puts the
# module in inference mode.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True).eval()

Downloading tokenizer_config.json:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/2.21G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

In [15]:
def generate_text(prompt_text):
    """Generate a continuation of ``prompt_text`` with the loaded causal LM.

    Args:
        prompt_text: Prompt to tokenize and feed to ``model.generate``.

    Returns:
        The whole generated sequence decoded to text with special tokens
        removed. Callers slice the prompt off the front of this text.
    """
    # Put the inputs on the same device as the model instead of leaving them
    # on the CPU.
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    # Take the padding id from this model's tokenizer. The previous hard-coded
    # 50256 is GPT-2's id, not this model's.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    outputs = model.generate(**inputs, pad_token_id=pad_id, max_new_tokens=2048)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
from tqdm import tqdm

# Smoke test of the Yi splitter: stops after the first two chunks.
timestamp_list = []
url_list = []
content_list = []
for idx, split in enumerate(tqdm(all_splits)):
    content, meta = split.page_content, split.metadata
    url, timestamps = meta['url'], meta['timestamp']
    # Template + chunk text + closing quote and "Output:" cue.
    prompt = ''.join((split_prompt_template, content, '"\nOutput:'))
    output = generate_text(prompt)
    print(output)
    # Keep only what follows the prompt in the decoded text.
    # NOTE(review): assumes the decoded text reproduces the prompt
    # character-for-character -- confirm for this tokenizer.
    content_list.append(output[len(prompt):])
    url_list.append(url)
    timestamp_list.append(timestamps)
    if idx >= 1:
        break

In [None]:
# Inspect the first stored completion (the loop above already sliced the
# prompt-length prefix off before storing it).
content_list[0]

### microsoft/phi-2

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "microsoft/phi-2"

# trust_remote_code=True allows code shipped in the hub repository to run;
# device_map="auto" leaves weight placement to the library. `.eval()` puts the
# module in inference mode.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True).eval()

Downloading (…)of-00002.safetensors:   0%|          | 0.00/577M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

In [12]:
def generate_text(prompt_text):
    """Generate a continuation of ``prompt_text`` with the loaded causal LM.

    Args:
        prompt_text: Prompt to tokenize and feed to ``model.generate``.

    Returns:
        The whole generated sequence decoded to text with special tokens
        removed. Callers slice the prompt off the front of this text.
    """
    # Follow the model's own device rather than hard-coding device index 0.
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    # Take the padding id from the tokenizer rather than a literal, so this
    # definition stays correct if the model is swapped.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # NOTE(review): the saved run warns that prompt + 2048 new tokens exceeds
    # the model's predefined maximum length (2048) -- consider a smaller cap.
    outputs = model.generate(**inputs, pad_token_id=pad_id, max_new_tokens=2048)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [13]:
from tqdm import tqdm

# Smoke test of the phi-2 splitter: stops after the first two chunks.
timestamp_list = []
url_list = []
content_list = []
for idx, split in enumerate(tqdm(all_splits)):
    content, meta = split.page_content, split.metadata
    url, timestamps = meta['url'], meta['timestamp']
    # Template + chunk text + closing quote and "Output:" cue.
    prompt = ''.join((split_prompt_template, content, '"\nOutput:'))
    output = generate_text(prompt)
    print(output)
    # Keep only what follows the prompt in the decoded text.
    # NOTE(review): assumes the decoded text reproduces the prompt
    # character-for-character -- confirm for this tokenizer.
    content_list.append(output[len(prompt):])
    url_list.append(url)
    timestamp_list.append(timestamps)
    if idx >= 1:
        break

  0%|          | 0/19092 [00:00<?, ?it/s]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
2024-01-08 00:48:02.931934: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-08 00:48:03.062900: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  0%|          | 1/19092 [00:47<251:10:25, 47.36s/it]


Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
2. Remove the irrelevant content. 
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
5. Present the results as a list of strings, formatted in JSON.
Example 1
Input:"English  Version 國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單
					 主選單 學士班獎助學金 僅提供系上開設獎學金消息 圖資系專屬獎學金一覽表 2023.09更新 臺大文學院獎助學金 臺大獎助學金一覽表 臺大弱勢助學金 臺大生活學習獎助金 國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金) LIS, NTU © All rights reserved since 2014"
Output:[

  0%|          | 1/19092 [01:29<474:51:50, 89.55s/it]


KeyboardInterrupt: 

### Open-Orca/Mixtral-SlimOrca-8x7B

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "Open-Orca/Mixtral-SlimOrca-8x7B"

# trust_remote_code=True allows code shipped in the hub repository to run;
# device_map="auto" leaves weight placement to the library. `.eval()` puts the
# module in inference mode.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True).eval()

In [4]:
def generate_text(prompt_text):
    """Generate a continuation of ``prompt_text`` with the loaded causal LM.

    Args:
        prompt_text: Prompt to tokenize and feed to ``model.generate``.

    Returns:
        The whole generated sequence decoded to text with special tokens
        removed. Callers slice the prompt off the front of this text.
    """
    # Follow the model's own device rather than hard-coding device index 0.
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    # Take the padding id from this model's tokenizer. The previous hard-coded
    # 50256 is GPT-2's id, not this model's.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    outputs = model.generate(**inputs, pad_token_id=pad_id, max_new_tokens=2048)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [12]:
from tqdm import tqdm

# Smoke test of the Mixtral splitter: stops after the first two chunks.
timestamp_list = []
url_list = []
content_list = []
for idx, split in enumerate(tqdm(all_splits)):
    content, meta = split.page_content, split.metadata
    url, timestamps = meta['url'], meta['timestamp']
    # Template + chunk text + closing quote and "Output:" cue.
    prompt = ''.join((split_prompt_template, content, '"\nOutput:'))
    output = generate_text(prompt)
    print(output)
    # Keep only what follows the prompt in the decoded text.
    # NOTE(review): assumes the decoded text reproduces the prompt
    # character-for-character -- confirm for this tokenizer.
    content_list.append(output[len(prompt):])
    url_list.append(url)
    timestamp_list.append(timestamps)
    if idx >= 1:
        break

  0%|          | 1/19092 [04:57<1580:06:03, 297.96s/it]


Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
2. Remove the irrelevant content. 
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
5. Present the results as a list of strings, formatted in JSON.
Input:"English  Version 國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單
					 主選單 學士班獎助學金 僅提供系上開設獎學金消息 圖資系專屬獎學金一覽表 2023.09更新 臺大文學院獎助學金 臺大獎助學金一覽表 臺大弱勢助學金 臺大生活學習獎助金 國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金) LIS, NTU © All rights reserved since 2014"
Output:["國立臺灣大學圖書資

  0%|          | 1/19092 [5:04:59<97041:44:08, 18299.21s/it]


Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
2. Remove the irrelevant content. 
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
5. Present the results as a list of strings, formatted in JSON.
Input:"English  Version 國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單
					 主選單 學士班獎助學金 僅提供系上開設獎學金消息 圖資系專屬獎學金一覽表 2023.09更新 臺大文學院獎助學金 臺大獎助學金一覽表 臺大弱勢助學金 臺大生活學習獎助金 國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金) LIS, NTU © All rights reserved since 2014"
Output:["國立臺灣大學圖書資


