# Arxiv Accelerator
In this notebook we are going to develop a simple gradio application that will search papers and will analize it with Llama 2

In [7]:
import urllib.request
import fitz
import re
import numpy as np
import tensorflow_hub as hub
import gradio as gr
import os
from sklearn.neighbors import NearestNeighbors
# Compability with Hugging Face models
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
import torch
import gradio as gr
import re
# Compability with ChatGPT
import openai

# Step 1 - PDF Analizer

The first thing that we want to create is the program that download and read and summarize papers

<img title="a title" alt="Alt text" src="./assets/paper.png">
The first example that we will use in this notebookwill we  : 
Attention Is All You Need

[https://arxiv.org/abs/1706.03762](https://arxiv.org/abs/1706.03762)

In [8]:
url = "https://arxiv.org/pdf/1706.03762.pdf"

In [9]:
def download_pdf(url, output_path):
    urllib.request.urlretrieve(url, output_path)

In [10]:
downloaded_file = "data.pdf"

In [11]:
download_pdf(url, downloaded_file)

In [12]:
def pdf_to_text(path, start_page=1, end_page=None):
    doc = fitz.open(path)
    total_pages = doc.page_count

    if end_page is None:
        end_page = total_pages

    text_list = []

    for i in range(start_page-1, end_page):
        text = doc.load_page(i).get_text("text")
        text = preprocess(text)
        text_list.append(text)

    doc.close()
    return text_list

def preprocess(text):
    text = text.replace('\n', ' ')
    text = re.sub('\s+', ' ', text)
    return text


In [13]:
text_list = pdf_to_text(downloaded_file)

In [14]:
len(text_list)

15

In [15]:
def text_to_chunks(texts, word_length=150, start_page=1):
    text_toks = [t.split(' ') for t in texts]
    page_nums = []
    chunks = []
    
    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i+word_length]
            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
                len(text_toks) != (idx+1)):
                text_toks[idx+1] = chunk + text_toks[idx+1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks

In [16]:
chunks= text_to_chunks(text_list)

In [17]:
parts = len(chunks)

In [18]:
print("We have {} pieces of the article now".format(parts))

We have 41 pieces of the article now


Now above all the list of 41 pieces, we should reduce the amount of pieces, this is possible by usuing the Semantic Search.
Thie is great model used by Google that his the universal sentence encoder.

In [19]:
class SemanticSearch:
    def __init__(self):
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False
    def fit(self, data, batch=1000, n_neighbors=5):
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True
    def __call__(self, text, return_data=True):
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors
    def get_text_embedding(self, texts, batch=1000):
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i+batch)]
            emb_batch = self.use(text_batch)
            embeddings.append(emb_batch)
        embeddings = np.vstack(embeddings)
        return embeddings

We summarize all the previous steps

In [20]:
recommender = SemanticSearch()

In [21]:
def load_recommender(path, start_page=1):
    global recommender
    texts = pdf_to_text(path, start_page=start_page)
    chunks = text_to_chunks(texts, start_page=start_page)
    recommender.fit(chunks)
    return 'Corpus Loaded.'

In [22]:
downloaded_file

'data.pdf'

In [23]:
load_recommender(downloaded_file , start_page=1)

'Corpus Loaded.'

In [24]:
question="Give me a summary of the abstract"

In [25]:
def generate_prompt(question):
    topn_chunks = recommender(question)
    results = ""
    results += 'search results:\n\n'
    for c in topn_chunks:
        results += c + '. \n\n'       
    instruction = "Instructions: Compose a comprehensive reply to the query using the search results given. "\
              "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
              "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
              "with the same name, create separate answers for each. Only include information found in the results and "\
              "don't add any additional information. Make sure the answer is correct and don't output false content. "\
              "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
              "search results which has nothing to do with the question. Only answer what is asked. The "\
              "answer should be short and concise."
    
    prompt = instruction + "\n\nQuery: {}".format(question) + " \n\n" + results + " \nAnswer:"
    return prompt

In [26]:
def generate_prompt3(question):
    topn_chunks = recommender(question)
    results = ""
    results += 'search results:\n\n'
    for c in topn_chunks:
        results += c + '. \n\n'       
    instruction = "Instructions: Compose a comprehensive reply to the query using the search results given. "\
              "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
              "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
              "with the same name, create separate answers for each. Only include information found in the results and "\
              "don't add any additional information. Make sure the answer is correct and don't output false content. "\
              "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
              "search results which has nothing to do with the question. Only answer what is asked. The "\
              "answer should be short and concise."\
              "Do not include the instructions in the answer."
    
    prompt = instruction + "\n\nQuery: {}".format(question) + " \n\n" + results + " \nAnswer:"
    return prompt

In [27]:
prompt1=generate_prompt(question)

In [28]:
prompt3=generate_prompt3(question)

In [29]:
DEFAULT_SYSTEM_PROMPT = """\Compose a comprehensive reply to the query using the search results given. 
Cite each reference using [ Page Number] notation (every result has this number at the beginning). 
Citation should be done at the end of each sentence. If the search results mention multiple subjects 
with the same name, create separate answers for each. Only include information found in the results and 
don't add any additional information. Make sure the answer is correct and don't output false content. 
If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier 
search results which has nothing to do with the question. Only answer what is asked. The 
answer should be short and concise.\
"""

In [30]:
def get_prompt(message: str, chat_history: list[tuple[str, str]],
               system_prompt: str) -> str:
    texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
    # The first user input is _not_ stripped
    do_strip = False
    for user_input, response in chat_history:
        user_input = user_input.strip() if do_strip else user_input
        do_strip = True
        texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
    message = message.strip() if do_strip else message
    texts.append(f'{message} [/INST]')
    return ''.join(texts)

In [31]:
def get_simple_prompt(question: str) -> str:
    topn_chunks = recommender(question)
    results = ""
    results += 'Search results:\n\n'
    for c in topn_chunks:
        results += c + '. \n\n' 
    message =  "\n\nQuery: {}".format(question) + " \n\n" + results + " \nAnswer:"
    texts = [f'<s>[INST] <<SYS>>\n{DEFAULT_SYSTEM_PROMPT}\n<</SYS>>\n\n']
    texts.append(f'{message} [/INST]')
    return ''.join(texts)

In [32]:
prompt2=get_simple_prompt(question)

In [33]:
prompt2

'<s>[INST] <<SYS>>\n\\Compose a comprehensive reply to the query using the search results given. \nCite each reference using [ Page Number] notation (every result has this number at the beginning). \nCitation should be done at the end of each sentence. If the search results mention multiple subjects \nwith the same name, create separate answers for each. Only include information found in the results and \ndon\'t add any additional information. Make sure the answer is correct and don\'t output false content. \nIf the text does not relate to the query, simply state \'Found Nothing\'. Ignore outlier \nsearch results which has nothing to do with the question. Only answer what is asked. The \nanswer should be short and concise.\n<</SYS>>\n\n\n\nQuery: Give me a summary of the abstract \n\nSearch results:\n\n[Page no. 1] "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works

# Model Creation

https://stackoverflow.com/questions/76772509/llama-2-7b-hf-repeats-context-of-question-directly-from-input-prompt-cuts-off-w

In [27]:
from transformers import AutoTokenizer
import transformers
import torch
model_id = "meta-llama/Llama-2-7b-chat-hf"
#model_id = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
def make_question_1(question):
    sequences = pipeline(
        '{}\n'.format(question),
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,
    )
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

In [29]:
make_question_1("What is the capital of Italy")

Result: What is the capital of Italy

Answer: The capital of Italy is Rome.


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Define the model name (Llama-2 7B-hf)
model_name = "meta-llama/Llama-2-7b-chat-hf" #model requires at least 12GB of GPU memory to run. 
#model_name ='EleutherAI/gpt-neo-2.7B' #16GB RAM NEEDED
#model_name = 'EleutherAI/gpt-neo-1.3B'
# Load the pre-trained model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [2]:
# Function to generate text based on a prompt
def generate_text(prompt, max_length=100, num_return_sequences=1):
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=max_length)
    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
        )

    # Decode and print generated text
    generated_text = [tokenizer.decode(output_seq, skip_special_tokens=True) for output_seq in output]
    return generated_text

In [3]:
generate_text("What is the capital of Italy")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["What is the capital of Italy?\n\nThe capital city of the Italian Republic is Rome. The capital is also the seat of government, the centre of commerce, and the administrative centre for the country.\nIt is located in the central part of Lazio, in central Italy, on the Adriatic coast. It is a city with a population of about 1.2 million people. Rome is one of Europe's most important cities, with an international airport, a large number of museums,"]

In [None]:
#Lama2 7b
'''
['What is the capital of Italy?\n\nThe capital city of the Italian Republic is Rome (Roma in Italian). Rome is located in central-western Italy and is home to many famous landmarks such as the Colosseum, the Pantheon, and the Vatican City. Rome has a population of over 2.8 million people and has been the seat of Italian government since 1865.']
'''

In [4]:
# Prompt for user input and generate text
while True:
    user_prompt = input("Enter a prompt (type 'exit' to quit): ")
    if user_prompt.lower() == 'exit':
        break
    generated_text = generate_text(user_prompt)
    print("\nGenerated Text:")
    for i, text in enumerate(generated_text):
        print(f"{i + 1}. {text}\n")

Enter a prompt (type 'exit' to quit):  What is the capital of Italy?



Generated Text:
1. What is the capital of Italy?
- Rome
The capital city of the Italian Republic is Rome. Rome is a special city that has a unique administrative status within Italy. It is home to the national government, the Prime Minister's office, and many other national institutions, but it is also a regional entity in its own right. As such, it has its mayor and local administration, as well as a number of special powers and responsibilities that set it apart from other Italian cities



Enter a prompt (type 'exit' to quit):  exit


If we want to avoid repetition of the input , we can encode the prompt using Llama tokenizer, find the length of the prompt token ids and remove them from the model output:

In [4]:
# Function to generate text based on a prompt
def generate_text_clean(prompt):
    prompt_tokens = tokenizer(prompt, return_tensors="pt")["input_ids"]
    start_index = prompt_tokens.shape[-1]
    output = model.generate(prompt_tokens, num_return_sequences=1)
    generation_output = output[0][start_index:]
    generation_text = tokenizer.decode(generation_output, skip_special_tokens=True)
    return generation_text

In [5]:
generate_text_clean("What is the capital of Italy")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'?\n\nThe capital of Italy is Rome, the largest city in'

In [14]:
generate_text_clean("What is the capital of Italy")

'?\n\nThe capital of Italy is Rome (Roma).'

In [43]:
prompt3[:2000]

'Instructions: Compose a comprehensive reply to the query using the search results given. Cite each reference using [ Page Number] notation (every result has this number at the beginning). Citation should be done at the end of each sentence. If the search results mention multiple subjects with the same name, create separate answers for each. Only include information found in the results and don\'t add any additional information. Make sure the answer is correct and don\'t output false content. If the text does not relate to the query, simply state \'Found Nothing\'. Ignore outlier search results which has nothing to do with the question. Only answer what is asked. The answer should be short and concise.Do not include the instructions in the answer.\n\nQuery: Give me a summary of the abstract \n\nsearch results:\n\n[Page no. 1] "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scho

In [44]:
generate_text_clean(prompt3[:2000])

'ention mechanism, which is a key component of the Transformer architecture, allows the model to attend to different parts of the input sequence simultaneously and weigh their importance. This is in contrast to traditional recurrent neural network (RNN) architectures, which only consider the previous elements in the sequence when making predictions. The Transformer architecture has been shown to be highly effective in a variety of natural language processing tasks, including machine translation and text generation".\n\n[Page no. 4] "The Transformer architecture was introduced in a paper by Vaswani et al. in 2017. The authors proposed a new neural network architecture that uses self-attention mechanisms to process input sequences in parallel, rather than sequentially as in traditional RNNs. The Transformer architecture has since become widely used in natural language processing tasks, including machine translation, text generation, and question answering".\n\n[Page no. 5] "The Transform

In [32]:
generate_text_clean(prompt3[:2000])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'ention'

In [33]:
# Function to generate text based on a prompt
def generate_text_clean_new(prompt):
    prompt_tokens = tokenizer(prompt, return_tensors="pt")["input_ids"]
    start_index = prompt_tokens.shape[-1]
    output = model.generate(prompt_tokens, 
                            num_return_sequences=1,
                            max_length=457)
    generation_output = output[0][start_index:]
    generation_text = tokenizer.decode(generation_output, skip_special_tokens=True)
    return generation_text

In [34]:
generate_text_clean_new(prompt3[:2000])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'ention'

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
#model_name = 'EleutherAI/gpt-neo-1.3B'
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Load the pre-trained model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set the pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token
# Function to generate text based on a prompt
def generate_text(prompt, max_new_tokens=500):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    start_index = input_ids.shape[-1]
    # Generate text with attention_mask and increased max_new_tokens
    with torch.no_grad():
        output = model.generate(input_ids, attention_mask=attention_mask, num_return_sequences=1, max_new_tokens=max_new_tokens)
    generation_output = output[0][start_index:]
    generation_text = tokenizer.decode(generation_output, skip_special_tokens=True)
    return generation_text


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
generate_text("What is the capital of Italy")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


'?\n\nThe capital of Italy is Rome (Roma).'

In [7]:
instruc='Instructions: Compose a comprehensive reply to the query using the search results given. Cite each reference using [ Page Number] notation (every result has this number at the beginning). Citation should be done at the end of each sentence. If the search results mention multiple subjects with the same name, create separate answers for each. Only include information found in the results and don\'t add any additional information. Make sure the answer is correct and don\'t output false content. If the text does not relate to the query, simply state \'Found Nothing\'. Ignore outlier search results which has nothing to do with the question. Only answer what is asked. The answer should be short and concise.Do not include the instructions in the answer.\n\nQuery: Give me a summary of the abstract \n\nsearch results:\n\n[Page no. 1] "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani∗ Google Brain avaswani@google.com Noam Shazeer∗ Google Brain noam@google.com Niki Parmar∗ Google Research nikip@google.com Jakob Uszkoreit∗ Google Research usz@google.com Llion Jones∗ Google Research llion@google.com Aidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu Łukasz Kaiser∗ Google Brain lukaszkaiser@google.com Illia Polosukhin∗ ‡ illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly". \n\n[Page no. 3] "self-att'

In [4]:
generate_text(instruc)

'ention mechanism, which is a key component of the Transformer efficiency. Unlike traditional recurrent neural networks (RNNs), the Transformer does not rely on recurrence to process sequences. Instead, it uses a self-attention mechanism to parallelize the computation of attention across all positions in a sequence. This allows the Transformer to process long sequences efficiently and scale to larger models".\n\n[Page no. 5] "In this paper, we introduce the Transformer, a new architecture for sequence-to-sequence translation that replaces traditional recurrent neural networks (RNNs) and convolutional neural networks (CNNs) with attention mechanisms. The Transformer relies solely on self-attention mechanisms to process sequences, allowing it to parallelize the computation of attention across all positions in a sequence. This allows the Transformer to efficiently process long sequences and scale to larger models".\n\n\nAnswer: Based on the search results provided, the'

In [8]:
generate_text(instruc)

'ention is a mechanism that allows the model to focus on specific parts of the input sequence when generating the output. This is particularly useful in natural language processing tasks, where the input sequences can be long and unstructured. In this paper, we propose a new attention mechanism called multi-head attention, which allows the model to jointly attend to information from different representation subspaces at different positions. This allows the model to capture a wider range of contextual relationships between different parts of the input sequence, leading to improved performance on a variety of natural language processing tasks".\n\n[Page no. 5] "attention is a key component of many state-of-the-art natural language processing models, including the Transformer architecture proposed in this paper. Attention allows the model to selectively focus on certain parts of the input sequence when generating the output, rather than using a fixed context or relying on the entire input

In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Load the pre-trained model and tokenizer
#The following GPT-2 models from Hugging Face are compatible with GPT2LMHeadModel:
#model_name = "gpt2"
#model_name = "gpt2-medium"
model_name = "gpt2-large"
#model_name = "gpt2-xl"
#model_name = "gpt2-distilgpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set the model to evaluation mode for faster inference
model.eval()
# Function to generate text based on a prompt
def generate_text(prompt, max_length=100, num_return_sequences=1):
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=max_length).to(device)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
        )

    # Decode and print generated text
    generated_text = [tokenizer.decode(output_seq, skip_special_tokens=True) for output_seq in output]
    return generated_text

# Prompt for user input and generate text
while True:
    user_prompt = input("Enter a prompt (type 'exit' to quit): ")
    if user_prompt.lower() == 'exit':
        break
    generated_text = generate_text(user_prompt)
    print("\nGenerated Text:")
    for i, text in enumerate(generated_text):
        print(f"{i + 1}. {text}\n")


Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Enter a prompt (type 'exit' to quit):  What is the capital of Italy


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generated Text:
1. What is the capital of Italy?

The capital is Rome.
...
"The city of Rome is a city in which the people are united in a common interest. It is not a place where the rich and the poor live side by side. The people of the city are not divided by class, but by race, language, and religion. Rome has a great tradition of democracy, which is reflected in the fact that the Roman people have always been the most democratic of all



Enter a prompt (type 'exit' to quit):  exit


In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained model and tokenizer
model_name = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the padding token to '[PAD]'
tokenizer.pad_token = '[PAD]'

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the model to evaluation mode for faster inference
model.eval()

# Function to generate text based on a prompt
def generate_text(prompt, max_length=100, num_return_sequences=1):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Generate text with attention_mask
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,  # Pass the attention_mask
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
        )

    # Decode and print generated text
    generated_text = [tokenizer.decode(output_seq, skip_special_tokens=True) for output_seq in output]
    return generated_text

# Prompt for user input and generate text
while True:
    user_prompt = input("Enter a prompt (type 'exit' to quit): ")
    if user_prompt.lower() == 'exit':
        break
    generated_text = generate_text(user_prompt)
    print("\nGenerated Text:")
    for i, text in enumerate(generated_text):
        print(f"{i + 1}. {text}\n")


Enter a prompt (type 'exit' to quit):  What is the capital of Italy?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generated Text:
1. What is the capital of Italy?

The capital is Rome.
...
"The city of Rome is a city in which the people are united in a common interest. It is not a place where the rich and the poor live side by side. The people of the city are not divided by class, but by race, language, and religion. Rome has a great tradition of democracy, which is reflected in the fact that the Roman people have always been the most democratic of all



Enter a prompt (type 'exit' to quit):  What is a chatbot?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generated Text:
1. What is a chatbot?

A chat bot is an artificial intelligence program that can be programmed to respond to a user's questions and provide answers.
. A chat robot is not a person. It is programmed with the intention of answering questions. The user can ask the bot questions, and the chatbots will respond. This is similar to the way a human would ask a question. However, the user is the one who is interacting with a bot. In this case, a conversation is



Enter a prompt (type 'exit' to quit):  exit


In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained model and tokenizer
model_name = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the padding token to '[PAD]'
tokenizer.pad_token = '[PAD]'

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the model to evaluation mode for faster inference
model.eval()

# Function to generate text based on a prompt
def generate_text(prompt, max_length=200, num_return_sequences=1):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Generate text with attention_mask and max_new_tokens
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,  # Pass the attention_mask
            max_length=max_length + 200,  # Increase max_length to accommodate the generated text
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            max_new_tokens=200,  # Set max_new_tokens to control total tokens generated
        )

    # Decode and print generated text without the prompt
    generated_text = [tokenizer.decode(output_seq, skip_special_tokens=True)[len(prompt):] for output_seq in output]
    return generated_text

# Prompt for user input and generate text
while True:
    user_prompt = input("Enter a prompt (type 'exit' to quit): ")
    if user_prompt.lower() == 'exit':
        break
    generated_text = generate_text(user_prompt)
    print("\nGenerated Text:")
    for i, text in enumerate(generated_text):
        print(f"{i + 1}. {text}\n")


Enter a prompt (type 'exit' to quit):  what is the capital of Italy


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=400) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generated Text:
1. ? Rome.

The capital is Rome, and the city is a city. It is not a country. Rome is an island. The city of Rome has no country, no state, nor a government. There is no government, but there is government in Rome and it is called the Roman Republic. This is why the Romans called it the Republic, because it was a republic. But the word "Republic" is used in the sense of a state. So the term "Roman Republic" means a political entity, not an entity. And the name "Rome" does not mean a place. "The city" in this case means the whole of the country of which Rome was the center. In other words, the "city" of "the city," the entire country is referred to as " Rome."
...
In the same way, in a sense, "America" refers to the United States of America. That is, it refers not to a particular place



Enter a prompt (type 'exit' to quit):  exit


In [6]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained model and tokenizer
model_name = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the padding token to '[PAD]'
tokenizer.pad_token = '[PAD]'

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the model to evaluation mode for faster inference
model.eval()

# Function to generate text based on a prompt
def generate_text(prompt, max_length=400, num_return_sequences=1):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Calculate the number of tokens already present in the input prompt
    prompt_token_count = input_ids.shape[1]

    # Calculate the total number of tokens to be generated
    total_max_tokens = max_length + prompt_token_count

    # Generate text with attention_mask and max_new_tokens
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,  # Pass the attention_mask
            max_length=total_max_tokens,  # Set max_length to include both prompt and generated text
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            max_new_tokens=400,  # Set max_new_tokens to control total tokens generated
            do_sample=True,  # This flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
        )

    # Define a delimiter to separate the instruction from the generated content
    delimiter = ">>>"
    
    # Decode and print generated text without the prompt and instructions
    generated_text = [tokenizer.decode(output_seq, skip_special_tokens=True) for output_seq in output]
    cleaned_generated_text = []

    for text in generated_text:
        parts = text.split(delimiter)
        if len(parts) >= 2:
            # Extract the generated content and remove leading/trailing whitespaces
            generated_content = parts[1].strip()
            cleaned_generated_text.append(generated_content)
        else:
            # No delimiter found, use the entire generated text
            cleaned_generated_text.append(text)

    return cleaned_generated_text

# Prompt for user input and generate text
while True:
    user_prompt = input("Enter a prompt (type 'exit' to quit): ")
    if user_prompt.lower() == 'exit':
        break
    generated_text = generate_text(user_prompt)
    print("\nGenerated Text:")
    for i, text in enumerate(generated_text):
        print(f"{i + 1}. {text}\n")


Enter a prompt (type 'exit' to quit):  what is the capital of Italy


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=400) and `max_length`(=406) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generated Text:
1. what is the capital of Italy, Rome?' "

The question that was raised was: "How many people live in Rome?"
, or "Who has the biggest city in Italy?".




Enter a prompt (type 'exit' to quit):  What is the capital of Italy?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=400) and `max_length`(=407) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generated Text:
1. What is the capital of Italy? Rome. Rome is one of the most beautiful cities in the world. It's a great city. I've lived in Italy for the last 20 years. That's why I'm here."

His next stop: the Vatican.
, a former president of Germany, is scheduled to attend the first meeting of a Vatican advisory board on Saturday, and his trip to Rome could be the subject of an international media frenzy. As a result, the Italian press has been busy speculating about what a pope's visit to the Holy See will mean for Europe and the Catholic Church. What is he going to do there? What will he talk about? And what is his message for world Catholics?
...
The pope will meet with heads of state and government from the United States, Russia and Japan, as well as religious leaders from several African nations. He will also meet Pope Benedict XVI, who is retiring after a papacy that lasted more than five decades. "He is a man of great experience," said an Italian diplomat who attended the 

Enter a prompt (type 'exit' to quit):  exit


In [4]:
instruc='Instructions: Compose a comprehensive reply to the query using the search results given. Cite each reference using [ Page Number] notation (every result has this number at the beginning). Citation should be done at the end of each sentence. If the search results mention multiple subjects with the same name, create separate answers for each. Only include information found in the results and don\'t add any additional information. Make sure the answer is correct and don\'t output false content. If the text does not relate to the query, simply state \'Found Nothing\'. Ignore outlier search results which has nothing to do with the question. Only answer what is asked. The answer should be short and concise.Do not include the instructions in the answer.\n\nQuery: Give me a summary of the abstract \n\nsearch results:\n\n[Page no. 1] "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani∗ Google Brain avaswani@google.com Noam Shazeer∗ Google Brain noam@google.com Niki Parmar∗ Google Research nikip@google.com Jakob Uszkoreit∗ Google Research usz@google.com Llion Jones∗ Google Research llion@google.com Aidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu Łukasz Kaiser∗ Google Brain lukaszkaiser@google.com Illia Polosukhin∗ ‡ illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly". \n\n[Page no. 3] "self-att'

In [34]:
prompt2

'<s>[INST] <<SYS>>\n\\Compose a comprehensive reply to the query using the search results given. \nCite each reference using [ Page Number] notation (every result has this number at the beginning). \nCitation should be done at the end of each sentence. If the search results mention multiple subjects \nwith the same name, create separate answers for each. Only include information found in the results and \ndon\'t add any additional information. Make sure the answer is correct and don\'t output false content. \nIf the text does not relate to the query, simply state \'Found Nothing\'. Ignore outlier \nsearch results which has nothing to do with the question. Only answer what is asked. The \nanswer should be short and concise.\n<</SYS>>\n\n\n\nQuery: Give me a summary of the abstract \n\nSearch results:\n\n[Page no. 1] "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works

In [36]:
generate_text(instruc)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=400) and `max_length`(=800) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


['Instructions: Compose a comprehensive reply to the query using the search results given. Cite each reference using [ Page Number] notation (every result has this number at the beginning). Citation should be done at the end of each sentence. If the search results mention multiple subjects with the same name, create separate answers for each. Only include information found in the results and don\'t add any additional information. Make sure the answer is correct and don\'t output false content. If the text does not relate to the query, simply state \'Found Nothing\'. Ignore outlier search results which has nothing to do with the question. Only answer what is asked. The answer should be short and concise.Do not include the instructions in the answer.\n\nQuery: Give me a summary of the abstract \n\nsearch results:\n\n[Page no. 1] "Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or sch

In [2]:
Fix the following code :
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained model and tokenizer
model_name = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the padding token to '[PAD]'
tokenizer.pad_token = '[PAD]'

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the model to evaluation mode for faster inference
model.eval()
# Function to generate text based on a prompt
def generate_text(prompt, max_length=400, num_return_sequences=1):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Calculate the number of tokens already present in the input prompt
    prompt_token_count = input_ids.shape[1]

    # Calculate the total number of tokens to be generated
    total_max_tokens = max_length + prompt_token_count

    # Generate text with attention_mask and max_new_tokens
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,  # Pass the attention_mask
            max_length=total_max_tokens,  # Set max_length to include both prompt and generated text
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            max_new_tokens=400,  # Set max_new_tokens to control total tokens generated
            do_sample=True,  # This flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
        )

    # Define a delimiter to separate the instruction from the generated content
    delimiter = ">>>"
    
    # Decode and print generated text without the prompt and instructions
    generated_text = [tokenizer.decode(output_seq, skip_special_tokens=True) for output_seq in output]
    cleaned_generated_text = []

    for text in generated_text:
        parts = text.split(delimiter)
        if len(parts) >= 2:
            # Extract the generated content and remove leading/trailing whitespaces
            generated_content = parts[1].strip()
            cleaned_generated_text.append(generated_content)
        else:
            # No delimiter found, use the entire generated text
            cleaned_generated_text.append(text)

    return cleaned_generated_text

# Prompt for user input and generate text
while True:
    user_prompt = input("Enter a prompt (type 'exit' to quit): ")
    if user_prompt.lower() == 'exit':
        break
    generated_text = generate_text(user_prompt)
    print("\nGenerated Text:")
    for i, text in enumerate(generated_text):
        print(f"{i + 1}. {text}\n")


Enter a prompt (type 'exit' to quit):  What is the capital of Italy?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=400) and `max_length`(=407) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generated Text:
1. What is the capital of Italy?

The capital, Rome, is in Italy, not in the Republic of Rome. In the Roman Republic, the capitol was in Rome and the senators and plebeians elected their local representatives in their own cities. The capital was the seat of government.
, it is called the Italian city and not the 'capital' of the country. Rome is not a state in itself. It is a city in a region of country, and there is no city of its own. Italy has been a province of a larger country since ancient times.



Enter a prompt (type 'exit' to quit):  >>>What is the capital of Italy>>>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=400) and `max_length`(=408) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generated Text:
1. What is the capital of Italy



Enter a prompt (type 'exit' to quit):  What is the capital of italy?>>>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=400) and `max_length`(=409) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generated Text:
1. Capital of Italia
- In italia, we have capital city, the city of Rome.



What about the rest of the world?
, italian capital
.

.



Enter a prompt (type 'exit' to quit):  exit


In [43]:
def simple_run(question):
    prompt=generate_prompt(question)
    #prompt = get_simple_prompt(question)
    max_new_tokens=200
    sequences = pipeline(
        prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=max_new_tokens,
    )
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

In [44]:
question="Give me a summary of the abstract"

In [37]:
#simple_run(question)

# Modeling

In [30]:
from threading import Thread
from typing import Iterator
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

In [31]:
model_id = 'meta-llama/Llama-2-7b-chat-hf'
if torch.cuda.is_available():
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map='auto'
    )
else:
    model = None
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
def get_input_token_length(prompt: str) -> int:
    #prompt = get_simple_prompt(question)
    input_ids = tokenizer([prompt], return_tensors='np', add_special_tokens=False)['input_ids']
    return input_ids.shape[-1]

In [33]:
get_input_token_length(prompt)

1440

In [34]:
def run(question: str,
        max_new_tokens: int = 1024,
        temperature: float = 0.7,
        top_p: float = 0.90,
        top_k: int = 20,
        repetition_penalty=1.15,
       ) -> Iterator[str]:
    prompt = get_simple_prompt(question)
    max_new_tokens=get_input_token_length(prompt)
    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
    streamer = TextIteratorStreamer(tokenizer,
                                    timeout=10.,
                                    skip_prompt=True,
                                    skip_special_tokens=True)
    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        num_beams=1,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield ''.join(outputs)

In [35]:
question

'Give me a summary of the abstract'

In [36]:
run(question)

<generator object run at 0x000001CCD8A01540>

In [37]:
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 0.90,
    top_k: int = 20,
    repetition_penalty=1.15,    
) -> Iterator[list[tuple[str, str]]]:
    generator = run(message, max_new_tokens, temperature, top_p, top_k,repetition_penalty , )
    try:
        first_response = next(generator)
        yield history + [(message, first_response)]
    except StopIteration:
        yield history + [(message, '')]
    for response in generator:
        yield history + [(message, response)]

In [39]:
def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
    try:
        generator = generate(message, [], 1024, 1, 0.95, 50,1.5)
    except Exception as error:
      print("An error occurred:", error) # An error occurred: name 'x' is not defined        
            
    for x in generator:
        pass
    return '', x

In [38]:
#process_example("Give me a summary of the abstract")