# Arxiv Accelerator
In this notebook we are going to develop a simple gradio application that will search papers and will analize it with Llama 2

In [2]:
import urllib.request
import fitz
import re
import numpy as np
import tensorflow_hub as hub
import gradio as gr
import os
from sklearn.neighbors import NearestNeighbors
# Compability with Hugging Face models
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
import torch
import gradio as gr
import re
# Compability with ChatGPT
import openai

# Step 1 - PDF Analizer

The first thing that we want to create is the program that download and read and summarize papers

<img title="a title" alt="Alt text" src="./assets/paper.png">
The first example that we will use in this notebookwill we  : 
Attention Is All You Need

[https://arxiv.org/abs/1706.03762](https://arxiv.org/abs/1706.03762)

In [3]:
url = "https://arxiv.org/pdf/1706.03762.pdf"

In [4]:
def download_pdf(url, output_path):
    urllib.request.urlretrieve(url, output_path)

In [5]:
downloaded_file = "data.pdf"

In [6]:
download_pdf(url, downloaded_file)

In [7]:
def pdf_to_text(path, start_page=1, end_page=None):
    doc = fitz.open(path)
    total_pages = doc.page_count

    if end_page is None:
        end_page = total_pages

    text_list = []

    for i in range(start_page-1, end_page):
        text = doc.load_page(i).get_text("text")
        text = preprocess(text)
        text_list.append(text)

    doc.close()
    return text_list

def preprocess(text):
    text = text.replace('\n', ' ')
    text = re.sub('\s+', ' ', text)
    return text


In [8]:
text_list = pdf_to_text(downloaded_file)

In [9]:
len(text_list)

15

In [10]:
def text_to_chunks(texts, word_length=150, start_page=1):
    text_toks = [t.split(' ') for t in texts]
    page_nums = []
    chunks = []
    
    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i+word_length]
            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
                len(text_toks) != (idx+1)):
                text_toks[idx+1] = chunk + text_toks[idx+1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks

In [11]:
chunks= text_to_chunks(text_list)

In [12]:
parts = len(chunks)

In [13]:
print("We have {} pieces of the article now".format(parts))

We have 41 pieces of the article now


Now above all the list of 41 pieces, we should reduce the amount of pieces, this is possible by usuing the Semantic Search.
Thie is great model used by Google that his the universal sentence encoder.

In [14]:
class SemanticSearch:
    def __init__(self):
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False
    def fit(self, data, batch=1000, n_neighbors=5):
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True
    def __call__(self, text, return_data=True):
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors
    def get_text_embedding(self, texts, batch=1000):
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i+batch)]
            emb_batch = self.use(text_batch)
            embeddings.append(emb_batch)
        embeddings = np.vstack(embeddings)
        return embeddings

We summarize all the previous steps

In [15]:
recommender = SemanticSearch()

In [16]:
def load_recommender(path, start_page=1):
    global recommender
    texts = pdf_to_text(path, start_page=start_page)
    chunks = text_to_chunks(texts, start_page=start_page)
    recommender.fit(chunks)
    return 'Corpus Loaded.'

In [17]:
downloaded_file

'data.pdf'

In [18]:
load_recommender(downloaded_file , start_page=1)

'Corpus Loaded.'

In [19]:
question="Give me a summary of the abstract"

In [20]:
def generate_prompt(question):
    topn_chunks = recommender(question)
    results = ""
    results += 'search results:\n\n'
    for c in topn_chunks:
        results += c + '. \n\n'       
    instruction = "Instructions: Compose a comprehensive reply to the query using the search results given. "\
              "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
              "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
              "with the same name, create separate answers for each. Only include information found in the results and "\
              "don't add any additional information. Make sure the answer is correct and don't output false content. "\
              "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
              "search results which has nothing to do with the question. Only answer what is asked. The "\
              "answer should be short and concise."
    
    prompt = instruction + "\n\nQuery: {}".format(question) + " \n\n" + results + " \nAnswer:"
    return prompt

In [21]:
generate_prompt(question)

'Instructions: Compose a comprehensive reply to the query using the search results given. Cite each reference using [ Page Number] notation (every result has this number at the beginning). Citation should be done at the end of each sentence. If the search results mention multiple subjects with the same name, create separate answers for each. Only include information found in the results and don\'t add any additional information. Make sure the answer is correct and don\'t output false content. If the text does not relate to the query, simply state \'Found Nothing\'. Ignore outlier search results which has nothing to do with the question. Only answer what is asked. The answer should be short and concise.\n\nQuery: Give me a summary of the abstract \n\nsearch results:\n\n[Page no. 1] "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish 

In [22]:
DEFAULT_SYSTEM_PROMPT = """\Compose a comprehensive reply to the query using the search results given. 
Cite each reference using [ Page Number] notation (every result has this number at the beginning). 
Citation should be done at the end of each sentence. If the search results mention multiple subjects 
with the same name, create separate answers for each. Only include information found in the results and 
don't add any additional information. Make sure the answer is correct and don't output false content. 
If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier 
search results which has nothing to do with the question. Only answer what is asked. The 
answer should be short and concise.\
"""

In [23]:
def get_prompt(message: str, chat_history: list[tuple[str, str]],
               system_prompt: str) -> str:
    texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
    # The first user input is _not_ stripped
    do_strip = False
    for user_input, response in chat_history:
        user_input = user_input.strip() if do_strip else user_input
        do_strip = True
        texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
    message = message.strip() if do_strip else message
    texts.append(f'{message} [/INST]')
    return ''.join(texts)

In [24]:
def get_simple_prompt(question: str) -> str:
    topn_chunks = recommender(question)
    results = ""
    results += 'Search results:\n\n'
    for c in topn_chunks:
        results += c + '. \n\n' 
    message =  "\n\nQuery: {}".format(question) + " \n\n" + results + " \nAnswer:"
    texts = [f'<s>[INST] <<SYS>>\n{DEFAULT_SYSTEM_PROMPT}\n<</SYS>>\n\n']
    texts.append(f'{message} [/INST]')
    return ''.join(texts)

In [25]:
get_simple_prompt(question)

'<s>[INST] <<SYS>>\n\\Compose a comprehensive reply to the query using the search results given. \nCite each reference using [ Page Number] notation (every result has this number at the beginning). \nCitation should be done at the end of each sentence. If the search results mention multiple subjects \nwith the same name, create separate answers for each. Only include information found in the results and \ndon\'t add any additional information. Make sure the answer is correct and don\'t output false content. \nIf the text does not relate to the query, simply state \'Found Nothing\'. Ignore outlier \nsearch results which has nothing to do with the question. Only answer what is asked. The \nanswer should be short and concise.\n<</SYS>>\n\n\n\nQuery: Give me a summary of the abstract \n\nSearch results:\n\n[Page no. 1] "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works

In [26]:
prompt=get_simple_prompt(question)

In [27]:
prompt

'<s>[INST] <<SYS>>\n\\Compose a comprehensive reply to the query using the search results given. \nCite each reference using [ Page Number] notation (every result has this number at the beginning). \nCitation should be done at the end of each sentence. If the search results mention multiple subjects \nwith the same name, create separate answers for each. Only include information found in the results and \ndon\'t add any additional information. Make sure the answer is correct and don\'t output false content. \nIf the text does not relate to the query, simply state \'Found Nothing\'. Ignore outlier \nsearch results which has nothing to do with the question. Only answer what is asked. The \nanswer should be short and concise.\n<</SYS>>\n\n\n\nQuery: Give me a summary of the abstract \n\nSearch results:\n\n[Page no. 1] "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works

# Model Creation

In [28]:
from transformers import AutoTokenizer
import transformers
import torch
model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
def make_question(question):
    sequences = pipeline(
        '{}\n'.format(question),
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,
    )
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

In [30]:
make_question("What is the capital of Italy")

Result: What is the capital of Italy

Answer: The capital of Italy is Rome.


In [43]:
def simple_run(question):
    prompt=generate_prompt(question)
    #prompt = get_simple_prompt(question)
    max_new_tokens=200
    sequences = pipeline(
        prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=max_new_tokens+200,
    )
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

In [44]:
question="Give me a summary of the abstract"

In [45]:
simple_run(question)



OutOfMemoryError: CUDA out of memory. Tried to allocate 122.00 MiB (GPU 0; 8.00 GiB total capacity; 6.93 GiB already allocated; 0 bytes free; 7.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Modeling

In [30]:
from threading import Thread
from typing import Iterator
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

In [31]:
model_id = 'meta-llama/Llama-2-7b-chat-hf'
if torch.cuda.is_available():
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map='auto'
    )
else:
    model = None
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
def get_input_token_length(prompt: str) -> int:
    #prompt = get_simple_prompt(question)
    input_ids = tokenizer([prompt], return_tensors='np', add_special_tokens=False)['input_ids']
    return input_ids.shape[-1]

In [33]:
get_input_token_length(prompt)

1440

In [34]:
def run(question: str,
        max_new_tokens: int = 1024,
        temperature: float = 0.7,
        top_p: float = 0.90,
        top_k: int = 20,
        repetition_penalty=1.15,
       ) -> Iterator[str]:
    prompt = get_simple_prompt(question)
    max_new_tokens=get_input_token_length(prompt)
    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
    streamer = TextIteratorStreamer(tokenizer,
                                    timeout=10.,
                                    skip_prompt=True,
                                    skip_special_tokens=True)
    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        num_beams=1,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield ''.join(outputs)

In [35]:
question

'Give me a summary of the abstract'

In [36]:
run(question)

<generator object run at 0x000001CCD8A01540>

In [37]:
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 0.90,
    top_k: int = 20,
    repetition_penalty=1.15,    
) -> Iterator[list[tuple[str, str]]]:
    generator = run(message, max_new_tokens, temperature, top_p, top_k,repetition_penalty , )
    try:
        first_response = next(generator)
        yield history + [(message, first_response)]
    except StopIteration:
        yield history + [(message, '')]
    for response in generator:
        yield history + [(message, response)]

In [39]:
def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
    try:
        generator = generate(message, [], 1024, 1, 0.95, 50,1.5)
    except Exception as error:
      print("An error occurred:", error) # An error occurred: name 'x' is not defined        
            
    for x in generator:
        pass
    return '', x

In [40]:
process_example("Give me a summary of the abstract")

NameError: name 'history' is not defined

Exception in thread Thread-9 (generate):
Traceback (most recent call last):
  File "C:\Users\rusla\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\rusla\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "D:\ArxivChat\.arxiv_env\lib\site-packages\torch\utils\_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "D:\ArxivChat\.arxiv_env\lib\site-packages\transformers\generation\utils.py", line 1648, in generate
    return self.sample(
  File "D:\ArxivChat\.arxiv_env\lib\site-packages\transformers\generation\utils.py", line 2730, in sample
    outputs = self(
  File "D:\ArxivChat\.arxiv_env\lib\site-packages\torch\nn\modules\module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "D:\ArxivChat\.arxiv_env\lib\site-packages\accelerate\hooks.py", line 165, in new_forward
    outpu