# Get most recent arXiv manifest

In [1]:
!gsutil cp gs://arxiv-dataset/metadata-v5/arxiv-metadata-oai.json .


Copying gs://arxiv-dataset/metadata-v5/arxiv-metadata-oai.json...
If you experience problems with multiprocessing on MacOS, they might be related to https://bugs.python.org/issue33725. You can disable multiprocessing by editing your .boto config or by adding the following flag to your command: `-o "GSUtil:parallel_process_count=1"`. Note that multithreading is still available even if you disable multiprocessing.

| [1 files][  4.2 GiB/  4.2 GiB]   25.8 MiB/s                                   
Operation completed over 1 objects/4.2 GiB.                                      


In [15]:
import os
import glob

# Get all the wikipedia parquet files from the specified directory
parquet_files = glob.glob('/Users/s2/Library/Mobile Documents/com~apple~CloudDocs/Datasets/dataclysm/wikipedia-titles/parquet/*.parquet')

for file in parquet_files:
    # Check if 'large' is in the filename
    if 'large' in file:
        # Replace 'large' with 'small' in the filename
        new_file = file.replace('large', 'small')
        # Rename the file
        os.rename(file, new_file)


# Generate Embeddings based on arXiv data manifest

In [None]:
import os
import json
import jsonlines
from tqdm import tqdm
from FlagEmbedding import FlagModel
from datasets import load_dataset
import psutil


# Define the model
model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Write a representation of the following title which is optimized for retrieval:",
                  use_fp16=True)

# Define the directory
directory = '/Users/s2/Repos/harness/output'
if not os.path.exists(directory):
    os.makedirs(directory)

# Load the dataset
dataset = load_dataset("json", data_files="arxiv-metadata-oai.jsonl", split="train", cache_dir=directory)

# Define the process_data function
def process_data(data):
    # Remove delimiters
    title = data['title'].replace('\n', ' ')
    abstract = data['abstract'].replace('\n', ' ')
    
    # Embed the 'title' and 'abstract' fields
    title_embedding = model.encode([title])
    abstract_embedding = model.encode([abstract])
    
    # Add the embeddings to the data
    data['title_embedding'] = title_embedding.tolist()
    data['abstract_embedding'] = abstract_embedding.tolist()
    
    return data

# Define the batch size
batch_size = 100000
batch_data = []
batch_index = 0

# Load the last processed index from a file if it exists
last_processed_index_file = os.path.join(directory, "last_processed_index.txt")
if os.path.isfile(last_processed_index_file):
    with open(last_processed_index_file, "r") as file:
        batch_index = int(file.read())

# Process the dataset
for i in tqdm(range(batch_index, len(dataset)), desc="Processing dataset"):
    data = dataset[i]
    processed_data = process_data(data)
    batch_data.append(processed_data)
    
    # If batch size is reached, write to file and reset batch data
    if len(batch_data) == batch_size:
        file_path = os.path.join(directory, f"{batch_index}_arxiv_metadata_oai.jsonl")
        with jsonlines.open(file_path, mode='w') as writer:
            for item in batch_data:
                writer.write(item)
        batch_data = []
        batch_index += 1
        # Save the last processed index to a file
        with open(last_processed_index_file, "w") as file:
            file.write(str(batch_index))

# Write remaining data to file
if batch_data:
    file_path = os.path.join(directory, f"{batch_index}_arxiv_metadata_oai.jsonl")
    with jsonlines.open(file_path, mode='w') as writer:
        for item in batch_data:
            writer.write(item)
    batch_data = []
    # Save the last processed index to a file
    with open(last_processed_index_file, "w") as file:
        file.write(str(batch_index))


# Convert to Parquet file

In [16]:
import os
import json
import jsonlines
from tqdm import tqdm
from FlagEmbedding import FlagModel
from datasets import load_dataset
import psutil
import pandas as pd

directory = '/Users/s2/Library/Mobile Documents/com~apple~CloudDocs/Datasets/dataclysm/wikipedia-titles-lite'
# Function to convert JSONL to Parquet
def convert_jsonl_to_parquet(jsonl_file_path, parquet_file_path):
    # Read the JSONL file into a pandas DataFrame
    df = pd.read_json(jsonl_file_path, lines=True)
    
    # Write the DataFrame to a Parquet file
    df.to_parquet(parquet_file_path)

# Convert all JSONL files in the directory to Parquet
for file_name in tqdm(os.listdir(directory), desc="Converting files"):
    if file_name.endswith(".jsonl"):
        jsonl_file_path = os.path.join(directory, file_name)
        parquet_file_path = os.path.join(directory, file_name.replace(".jsonl", ".parquet"))
        convert_jsonl_to_parquet(jsonl_file_path, parquet_file_path)

Converting files: 100%|██████████| 65/65 [05:43<00:00,  5.28s/it]


# Correct wrong ID type in Parquet file

In [14]:
import pyarrow.parquet as pq
from tqdm import tqdm
directory = "../output/parquet"

# Function to modify the first column type of a Parquet file
def modify_first_column_type(parquet_file_path):
    # Read the Parquet file into a pandas DataFrame
    df = pd.read_parquet(parquet_file_path)
    
    # Convert the first column to string type
    df[df.columns[0]] = df[df.columns[0]].astype(str)
    
    # Write the DataFrame back to the Parquet file
    df.to_parquet(parquet_file_path)

# Iterate through all the Parquet files in the directory
for file_name in tqdm(os.listdir(directory), desc="Processing files"):
    if file_name.endswith(".parquet"):
        # Get the full file path
        parquet_file_path = os.path.join(directory, file_name)
        # Modify the first column type of the Parquet file
        modify_first_column_type(parquet_file_path)


Processing files: 100%|██████████| 34/34 [01:22<00:00,  2.43s/it]


# Correct model name on Parquet files

# Grab a random Dataclysm arXiv paper's PDF

In [None]:
import random
%pip install wget
import wget
import os
from tqdm import tqdm
from FlagEmbedding import FlagModel
from datasets import load_dataset
import psutil
import pandas as pd

from datasets import load_dataset

# Load the huggingface dataset
dataset = load_dataset('somewheresystems/dataclysm-arxiv')

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset['train'])

# Grab a random entry from the DataFrame
random_entry = df.sample()

# Get the ID of the random entry
id = random_entry['id'].values[0]

# Construct the URL
url = f"https://arxiv.org/pdf/{id}.pdf"

# Download the PDF using wget
wget.download(url, out=os.path.join(directory, f"{id}.pdf"))


# Install some packages

In [5]:
%conda install -c pytorch faiss-cpu transformers datasets jsonlines
%pip install -U FlagEmbedding datasets
%pip install llama-cpp-python httpx

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.7.4
  latest version: 23.11.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.11.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
Collecting transformers==4.34.0 (from FlagEmbedding)
  Using cached transformers-4.34.0-py3-none-any.whl.metadata (121 kB)
Collecting tokenizers<0.15,>=0.14 (from transformers==4.34.0->FlagEmbedding)
  Using cached tokenizers-0.14.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.7 kB)
INFO: pip is looking at multiple versions of tokenizers to determine which version is compatible with other requirements. This could take a while.
  Downloading tokenizers-0.14.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting sentence-transformers (from FlagEmbed

# Initialize arXiv Abstract + Title Indices
This process takes ~15 minutes to index (M3 Max)

# Initialize Wikipedia Database + Index
This process takes 2x as much time as arXiv to download, about ~12 minutes to index (M3 Max)

In [31]:
import numpy as np
from tqdm import tqdm
from FlagEmbedding import FlagModel
from datasets import load_dataset
import pandas as pd
import psutil

def print_memory_usage():
    print(f"Current memory usage: {psutil.Process().memory_info().rss / 1024 ** 2} MB")

print("Loading dataset...")
print_memory_usage()
dataclysm_wikipedia = load_dataset('somewheresystems/dataclysm-wikipedia', split="train")
print_memory_usage()

# Check the structure of the dataset, particularly the 'title_embedding' and 'abstract_embedding' columns
print(dataclysm_wikipedia)
print(dataclysm_wikipedia.column_names)
print(dataclysm_wikipedia.features)
print_memory_usage()

# Define a function to flatten the embeddings and add FAISS index
def flatten_and_add_faiss_index(dataset, column_name):
    embedding_shape = np.array(dataset[0][column_name]).shape
    if len(embedding_shape) == 2:
        print(f"Flattening {column_name} and adding FAISS index...")
        # Flatten the column before adding the FAISS index
        dataset = dataset.map(lambda x: {column_name: np.concatenate(x[column_name])})
        dataset = dataset.add_faiss_index(column=column_name)
        print(f"FAISS index for {column_name} added.")
    else:
        print(f"Cannot add FAISS index for {column_name}.")
    print_memory_usage()
    return dataset

# Add FAISS indices for 'title_embedding' and 'abstract_embedding' and save them to different datasets
dataclysm_wikipedia_indexed = flatten_and_add_faiss_index(dataclysm_wikipedia, 'title_embedding')
print_memory_usage()

print("Datasets loaded.")

# Define the model
print("Initializing model...")
model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Write a representation of the following query which is optimized for using a similarity search for retrieval:",
                  use_fp16=True)
print("Model initialized.")
print_memory_usage()

Loading dataset...
Current memory usage: 35964.890625 MB


Resolving data files: 100%|██████████| 65/65 [00:00<00:00, 113.29it/s]


Current memory usage: 36104.484375 MB
Dataset({
    features: ['id', 'url', 'title', 'text', 'title_embedding'],
    num_rows: 6458670
})
['id', 'url', 'title', 'text', 'title_embedding']
{'id': Value(dtype='int64', id=None), 'url': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'title_embedding': Sequence(feature=Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), length=-1, id=None)}
Current memory usage: 36104.484375 MB
Flattening title_embedding and adding FAISS index...


Map: 100%|██████████| 6458670/6458670 [12:42<00:00, 8470.21 examples/s]
100%|██████████| 6459/6459 [00:39<00:00, 165.03it/s]


FAISS index for title_embedding added.
Current memory usage: 77425.046875 MB
Current memory usage: 77425.234375 MB
Datasets loaded.
Initializing model...
Model initialized.
Current memory usage: 75355.0 MB


# Initialize arXiv Abstract + Title Indices
This process takes ~15 minutes to index (M3 Max)

In [33]:
import numpy as np
from tqdm import tqdm
from FlagEmbedding import FlagModel
from datasets import load_dataset
import pandas as pd
import psutil

def print_memory_usage():
    print(f"Current memory usage: {psutil.Process().memory_info().rss / 1024 ** 2} MB")

print("Loading dataset...")
print_memory_usage()
dataclysm_arxiv = load_dataset('somewheresystems/dataclysm-arxiv', split="train")
print_memory_usage()

# Check the structure of the dataset, particularly the 'title_embedding' and 'abstract_embedding' columns
print(dataclysm_arxiv)
print(dataclysm_arxiv.column_names)
print(dataclysm_arxiv.features)
print_memory_usage()

# Define a function to flatten the embeddings and add FAISS index
def flatten_and_add_faiss_index(dataset, column_name):
    embedding_shape = np.array(dataset[0][column_name]).shape
    if len(embedding_shape) == 2:
        print(f"Flattening {column_name} and adding FAISS index...")
        # Flatten the column before adding the FAISS index
        dataset = dataset.map(lambda x: {column_name: np.concatenate(x[column_name])})
        dataset = dataset.add_faiss_index(column=column_name)
        print(f"FAISS index for {column_name} added.")
    else:
        print(f"Cannot add FAISS index for {column_name}.")
    print_memory_usage()
    return dataset

# Add FAISS indices for 'title_embedding' and 'abstract_embedding' and save them to different datasets
dataclysm_title_indexed = flatten_and_add_faiss_index(dataclysm_arxiv, 'title_embedding')
dataclysm_abstract_indexed = flatten_and_add_faiss_index(dataclysm_arxiv, 'abstract_embedding')
print_memory_usage()

print("Datasets loaded.")

# Define the model
print("Initializing model...")
model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Write a representation of the following query which is optimized for using a similarity search for retrieval:",
                  use_fp16=True)
print("Model initialized.")
print_memory_usage()



Loading dataset...
Current memory usage: 71617.296875 MB


Resolving data files: 100%|██████████| 34/34 [00:00<00:00, 127.51it/s]


Current memory usage: 71782.09375 MB
Dataset({
    features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'abstract', 'report-no', 'categories', 'versions', 'title_embedding', 'abstract_embedding'],
    num_rows: 3360984
})
['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'abstract', 'report-no', 'categories', 'versions', 'title_embedding', 'abstract_embedding']
{'id': Value(dtype='string', id=None), 'submitter': Value(dtype='string', id=None), 'authors': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'comments': Value(dtype='string', id=None), 'journal-ref': Value(dtype='string', id=None), 'doi': Value(dtype='string', id=None), 'abstract': Value(dtype='string', id=None), 'report-no': Value(dtype='string', id=None), 'categories': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'versions': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'title_embedding': Sequence(

100%|██████████| 3361/3361 [00:19<00:00, 172.06it/s]


FAISS index for title_embedding added.
Current memory usage: 81655.0 MB
Flattening abstract_embedding and adding FAISS index...


Map: 100%|██████████| 3360984/3360984 [08:51<00:00, 6318.74 examples/s]
100%|██████████| 3361/3361 [00:20<00:00, 166.10it/s]


FAISS index for abstract_embedding added.
Current memory usage: 65721.453125 MB
Current memory usage: 65720.3125 MB
Datasets loaded.
Initializing model...
Model initialized.
Current memory usage: 64288.28125 MB


#  arXiv Composite Search with regex Rerank
Search by both Abstract and Title similarity, rank both descending by score. 
1. If a duplicate (title and abstract hit) is found, it increases the score by a factor of 2. 
2. If regex finds the query in the abstract, it increases the score by 0.1 (additive).

In [49]:
query = "Attention Is All You Need"
print("Encoding query...")
query_embedding = model.encode([query])
print("Query encoded.")

print("Retrieving examples by abstract similarity...")
scores_abstract, retrieved_examples_abstract = dataclysm_abstract_indexed.get_nearest_examples('abstract_embedding', query_embedding, k=10)
print("Examples retrieved.")

print("Retrieving examples by title similarity...")
scores_title, retrieved_examples_title = dataclysm_title_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
print("Examples retrieved.")

from IPython.display import display, HTML
import pandas as pd
import re

# Convert retrieved examples to DataFrame
df_abstract = pd.DataFrame(retrieved_examples_abstract)
df_title = pd.DataFrame(retrieved_examples_title)

# Calculate similarity score in percentage
df_abstract['similarity_score'] = scores_abstract
df_title['similarity_score'] = scores_title

# Add a column to denote the source of retrieval
df_abstract['source'] = 'A'
df_title['source'] = 'T'

# Drop 'title_embedding' and 'abstract_embedding' columns
df_abstract = df_abstract.drop(columns=['title_embedding', 'abstract_embedding'])
df_title = df_title.drop(columns=['title_embedding', 'abstract_embedding'])

# Drop empty columns
df_abstract = df_abstract.dropna(axis=1, how='all')
df_title = df_title.dropna(axis=1, how='all')

# Create a "click to expand" for the abstract so it doesn't take up much space
df_abstract['abstract'] = df_abstract['abstract'].apply(lambda x: f'<details><summary>Abstract</summary>{x}</details>')
df_title['abstract'] = df_title['abstract'].apply(lambda x: f'<details><summary>Abstract</summary>{x}</details>')

# Create a URL field with a hyperlink which is constructed by appending the id onto the end of arxiv.org/abs/
df_abstract['URL'] = df_abstract['id'].apply(lambda x: f'<a href="https://arxiv.org/abs/{x}">Link</a>')
df_title['URL'] = df_title['id'].apply(lambda x: f'<a href="https://arxiv.org/abs/{x}">Link</a>')

# Concatenate the two dataframes
df = pd.concat([df_abstract, df_title])

# Normalize the similarity score to be between 0 and 1
df['similarity_score'] = df['similarity_score'] / df['similarity_score'].max()

# Increase the score if the query is found in the abstract
df['similarity_score'] = df.apply(lambda row: row['similarity_score'] + 0.1 if re.search(query, row['abstract'], re.IGNORECASE) else row['similarity_score'], axis=1)

# Remove duplicates
df = df.drop_duplicates(subset=['id'])

# Sort by ascending similarity score
df = df.sort_values(by='similarity_score', ascending=False)

# Display the DataFrame
from IPython.display import Markdown, display
display(Markdown(f'QUERY: **{query}**'))
display(HTML(df.to_html(escape=False)))


Encoding query...
Query encoded.
Retrieving examples by abstract similarity...
Examples retrieved.
Retrieving examples by title similarity...
Examples retrieved.


QUERY: **Attention Is All You Need**

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,abstract,report-no,categories,versions,similarity_score,source,URL
6,math/9409228,,Alphonse P. Magnus,Painlev\'e equations for semi-classical recurrence coefficients,,,,Abstract The title says it all.\n,OP-SF 6 Sep 1994,[math.CA],[v1],1.0,A,Link
7,math/9803101,Robion C. Kirby,Robion C. Kirby and Laurence R. Taylor,A survey of 4-manifolds through the eyes of surgery,25 pages. To appear in Wall's 60th birthday volume,,,Abstract The title says it all.\n,,[math.GT],[v1],1.0,A,Link
4,1702.04226,Wenyun Ju,Wenyun Ju,Cascading Outage Simulation Based on Dynamic Fast Decoupled Load Flow\n Model,There is an error in equation (23),,,Abstract Frequency is an important\n,,[cs.SY],"[v1, v2, v3]",0.979529,A,Link
2,math/9803061,Hans Schneider,Hans Schneider (U Wisconsin - Madison),Some personal reminiscences of Olga Taussky,,,,Abstract The title says it all\n,,[math.HO math.RA],[v1],0.959123,A,Link
0,1806.06771,Christoph Anderson,"Christoph Anderson, Isabel H\""ubener, Ann-Kathrin Seipp, Sandra Ohly,\n Klaus David, Veljko Pejovic",A Survey of Attention Management Systems in Ubiquitous Computing\n Environments,"27 pages, 7 figures","Proceedings of the ACM on Interactive, Mobile, Wearable and\n Ubiquitous Technologies, vol. 2, no. 2, pp. 58:1-58:27, June 2018",10.1145/3214261,"Abstract Today's information and communication devices provide always-on connectivity,\ninstant access to an endless repository of information, and represent the most\ndirect point of contact to almost any person in the world. Despite these\nadvantages, devices such as smartphones or personal computers lead to the\nphenomenon of attention fragmentation, continuously interrupting individuals'\nactivities and tasks with notifications. Attention management systems aim to\nprovide active support in such scenarios, managing interruptions, for example,\nby postponing notifications to opportune moments for information delivery. In\nthis article, we review attention management system research with a particular\nfocus on ubiquitous computing environments. We first examine cognitive theories\nof attention and extract guidelines for practical attention management systems.\nMathematical models of human attention are at the core of these systems, and in\nthis article, we review sensing and machine learning techniques that make such\nmodels possible. We then discuss design challenges towards the implementation\nof such systems, and finally, we investigate future directions in this area,\npaving the way for new approaches and systems supporting users in their\nattention management.\n",,[cs.HC],[v1],0.956954,A,Link
8,1710.03743,Mat\=iss Rikters,"Mat\=iss Rikters, Mark Fishel",Confidence through Attention,,"Machine Translation Summit XVI, Nagoya, Japan, September 2017",,"Abstract Attention distributions of the generated translations are a useful bi-product\nof attention-based recurrent neural network translation models and can be\ntreated as soft alignments between the input and output tokens. In this work,\nwe use attention distributions as a confidence metric for output translations.\nWe present two strategies of using the attention distributions: filtering out\nbad translations from a large back-translated corpus, and selecting the best\ntranslation in a hybrid setup of two different translation systems. While\nmanual evaluation indicated only a weak correlation between our confidence\nscore and human judgments, the use-cases showed improvements of up to 2.22 BLEU\npoints for filtering and 0.99 points for hybrid translation, tested on\nEnglish<->German and English<->Latvian translation.\n",,[cs.CL],[v1],0.735877,T,Link
6,1810.10126,Yang Li,"Yang Li, Lukasz Kaiser, Samy Bengio, Si Si",Area Attention,8 pages plus references,,,"Abstract Existing attention mechanisms are trained to attend to individual items in a\ncollection (the memory) with a predefined, fixed granularity, e.g., a word\ntoken or an image grid. We propose area attention: a way to attend to areas in\nthe memory, where each area contains a group of items that are structurally\nadjacent, e.g., spatially for a 2D memory such as images, or temporally for a\n1D memory such as natural language sentences. Importantly, the shape and the\nsize of an area are dynamically determined via learning, which enables a model\nto attend to information with varying granularity. Area attention can easily\nwork with existing model architectures such as multi-head attention for\nsimultaneously attending to multiple areas in the memory. We evaluate area\nattention on two tasks: neural machine translation (both character and\ntoken-level) and image captioning, and improve upon strong (state-of-the-art)\nbaselines in all the cases. These improvements are obtainable with a basic form\nof area attention that is parameter free.\n",,[cs.LG cs.AI cs.CL stat.ML],"[v1, v2, v3, v4, v5, v6]",0.710768,T,Link
4,1810.13409,Ofir Press,"Ofir Press, Noah A. Smith",You May Not Need Attention,,,,"Abstract In NMT, how far can we get without attention and without separate encoding\nand decoding? To answer that question, we introduce a recurrent neural\ntranslation model that does not use attention and does not have a separate\nencoder and decoder. Our eager translation model is low-latency, writing target\ntokens as soon as it reads the first source token, and uses constant memory\nduring decoding. It performs on par with the standard attention-based model of\nBahdanau et al. (2014), and better on long sentences.\n",,[cs.CL],[v1],0.582988,T,Link
2,1804.02391,Saumya Jetley,"Saumya Jetley, Nicholas A. Lord, Namhoon Lee, Philip H.S. Torr",Learn To Pay Attention,International Conference on Learning Representations 2018,,,"Abstract We propose an end-to-end-trainable attention module for convolutional neural\nnetwork (CNN) architectures built for image classification. The module takes as\ninput the 2D feature vector maps which form the intermediate representations of\nthe input image at different stages in the CNN pipeline, and outputs a 2D\nmatrix of scores for each map. Standard CNN architectures are modified through\nthe incorporation of this module, and trained under the constraint that a\nconvex combination of the intermediate 2D feature vectors, as parameterised by\nthe score matrices, must \textit{alone} be used for classification.\nIncentivised to amplify the relevant and suppress the irrelevant or misleading,\nthe scores thus assume the role of attention values. Our experimental\nobservations provide clear evidence to this effect: the learned attention maps\nneatly highlight the regions of interest while suppressing background clutter.\nConsequently, the proposed function is able to bootstrap standard CNN\narchitectures for the task of image classification, demonstrating superior\ngeneralisation over 6 unseen benchmark datasets. When binarised, our attention\nmaps outperform other CNN-based attention maps, traditional saliency maps, and\ntop object proposals for weakly supervised segmentation as demonstrated on the\nObject Discovery dataset. We also demonstrate improved robustness against the\nfast gradient sign method of adversarial attack.\n",,[cs.CV cs.AI],"[v1, v2]",0.580535,T,Link
0,1706.03762,Ashish Vaswani,"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion\n Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin",Attention Is All You Need,"15 pages, 5 figures",,,"Abstract The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-French\ntranslation task, our model establishes a new single-model state-of-the-art\nBLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction\nof the training costs of the best models from the literature. We show that the\nTransformer generalizes well to other tasks by applying it successfully to\nEnglish constituency parsing both with large and limited training data.\n",,[cs.CL cs.LG],"[v1, v2, v3, v4, v5]",0.0,T,Link


# Wikipedia simple search (Title)

In [None]:
query = "Retrieval Augmented Generation"
print("Encoding query...")
query_embedding = model.encode([query])
print("Query encoded.")

print("Retrieving examples by title similarity...")
scores, retrieved_examples = dataclysm_wikipedia_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
print("Examples retrieved.")

from IPython.display import display, HTML
import pandas as pd

# Convert retrieved examples to DataFrame
df = pd.DataFrame(retrieved_examples)

# Calculate similarity score in percentage
df['similarity_score'] = scores


# Drop 'title_embedding' and 'abstract_embedding' columns
df = df.drop(columns=['title_embedding'])

# Drop empty columns
df = df.dropna(axis=1, how='all')

# Create a "click to expand" for the abstract so it doesn't take up much space
df['text'] = df['text'].apply(lambda x: f'<details><summary>Article Text</summary>{x}</details>')


# Create a URL field with a hyperlink 
df['url'] = df['url'].apply(lambda x: f'<a href="{url}">Link</a>')

# Sort by ascending similarity score
df = df.sort_values(by='similarity_score', ascending=False)

# Display the DataFrame
from IPython.display import Markdown, display
display(Markdown(f'QUERY: **{query}**'))
display(HTML(df.to_html(escape=False)))


# Download OpenHermes-2.5-Mistral-7B

In [102]:
%pip install huggingface-cli
!huggingface-cli download TheBloke/OpenHermes-2.5-Mistral-7B-GGUF openhermes-2.5-mistral-7b.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Consider using `hf_transfer` for faster downloads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
downloading https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf to /Users/s2/.cache/huggingface/hub/tmpdpa1h_vs
openhermes-2.5-mistral-7b.Q4_K_M.gguf: 100%|█| 4.37G/4.37G [02:37<00:00, 27.8MB/
./openhermes-2.5-mistral-7b.Q4_K_M.gguf


# Retrieval Augmented Generation

In [117]:
from llama_cpp import Llama
from llama_cpp import LlamaGrammar
import pandas as pd
import json
import httpx

model = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"
prompt = f"{df[['id', 'title', 'abstract']].to_html(escape=False)} ### Instruction: Use the information above to answer the query: EXPLAIN {query} ### Response:"


llm = Llama(model_path=model, n_ctx=8096, last_n_tokens_size=256, n_threads=4, n_gpu_layers=0)

stream = llm.create_completion(prompt, stream=True, repeat_penalty=1.1, max_tokens=256, stop=["\n"], echo=False, temperature=0, mirostat_mode = 2, mirostat_tau=4.0, mirostat_eta=1.1)
result = ""
for output in stream:
    result += output['choices'][0]['text']

print(result)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from openhermes-2.5-mistral-7b.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = teknium_openhermes-2.5-mistral-7b
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 l

 This is the title of a paper published in 2017 that introduced a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. The paper achieved significant improvements in machine translation tasks and established a new single-model state-of-the-art BLEU score of 41.8 on the WMT 2014 English-to-French translation task.



llama_print_timings:        load time =    6858.21 ms
llama_print_timings:      sample time =       9.76 ms /    94 runs   (    0.10 ms per token,  9628.19 tokens per second)
llama_print_timings: prompt eval time =   37015.65 ms /  2298 tokens (   16.11 ms per token,    62.08 tokens per second)
llama_print_timings:        eval time =    5096.66 ms /    93 runs   (   54.80 ms per token,    18.25 tokens per second)
llama_print_timings:       total time =   42232.10 ms /  2391 tokens


# Rerank results using an LLM (experimental)

In [121]:
from llama_cpp import Llama
from llama_cpp import LlamaGrammar
import pandas as pd
import json
import httpx
grammar_text = httpx.get("https://raw.githubusercontent.com/ggerganov/llama.cpp/master/grammars/json_arr.gbnf").text
grammar = LlamaGrammar.from_string(grammar_text)

model = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"
prompt = f"""You are an expert at generating valid JSON.
###
Instruction:
Return a valid JSON Array containing arXiv ['id'] field reranked according to how relevant the result is to the query based on its other columns at that ['id']. Drop any items that are not relevant to the query. Return just an array of the IDs, like [x,y,z] and so on in the correct order:
        INDEX: {df[['id', 'title', 'abstract']].to_html(escape=False)}
        QUERY: {query}
        Take a deep breath, and solve the problem step-by-step.
###
Response:"""


llm = Llama(model_path=model, n_ctx=8096, last_n_tokens_size=256, n_threads=4, n_gpu_layers=0)

    
stream = llm.create_completion(prompt, stream=True, repeat_penalty=1.1, max_tokens=256, stop=["]"], echo=False, temperature=0, mirostat_mode = 2, mirostat_tau=4.0, mirostat_eta=1.1, grammar=grammar)
result = ""
for output in stream:
    result += output['choices'][0]['text']

result = result + "]"

# Check if the result is a string, an array string, or a single ID in an array and convert it to a list of IDs
if isinstance(result, str):
    result_ids = [result.strip('[]')]
elif isinstance(result, list):
    if isinstance(result[0], str):
        result_ids = [json.loads(res) for res in result]
    else:
        result_ids = result
# Print the result
print(result_ids)
import re

# Extract IDs from the potentially broken string using regex
result_ids = re.findall(r'"(.*?)"', result_ids[0])

# Filter the dataframe to only include rows with IDs in the result
filtered_df = df[df['id'].isin(result_ids)]

# Create a categorical type for sorting based on the order in result_ids
filtered_df['id'] = pd.Categorical(filtered_df['id'], categories=result_ids, ordered=True)

# Sort the dataframe based on the 'id' column
filtered_df = filtered_df.sort_values('id')

# Drop the similarity score column
filtered_df = filtered_df.drop(columns=['similarity_score'])

# Display the filtered dataframe as a table with hyperlinks
display(HTML(filtered_df.to_html(escape=False)))


from_string grammar:
root ::= arr 
arr ::= [[] [<U+000A>] ws arr_12 []] 
value ::= object | array | string | number | value_7 ws 
object ::= [{] ws object_16 [}] ws 
array ::= [[] ws array_20 []] ws 
string ::= ["] string_23 ["] ws 
number ::= number_24 number_30 number_34 ws 
value_7 ::= [t] [r] [u] [e] | [f] [a] [l] [s] [e] | [n] [u] [l] [l] 
ws ::= ws_36 
arr_9 ::= value arr_11 
arr_10 ::= [,] [<U+000A>] ws value 
arr_11 ::= arr_10 arr_11 | 
arr_12 ::= arr_9 | 
object_13 ::= string [:] ws value object_15 
object_14 ::= [,] ws string [:] ws value 
object_15 ::= object_14 object_15 | 
object_16 ::= object_13 | 
array_17 ::= value array_19 
array_18 ::= [,] ws value 
array_19 ::= array_18 array_19 | 
array_20 ::= array_17 | 
string_21 ::= [^"\] | [\] string_22 
string_22 ::= ["\/bfnrt] | [u] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 
string_23 ::= string_21 string_23 | 
number_24 ::= number_25 number_26 
number_25 ::= [-] | 
number_26 ::= [0-9] | [1-9] number_27 
number_27 ::= [0

['\n "1706.03762",\n "1804.02391",\n "1810.13409",\n "1810.10126"\n']



llama_print_timings:        load time =    6855.33 ms
llama_print_timings:      sample time =     271.12 ms /    55 runs   (    4.93 ms per token,   202.86 tokens per second)
llama_print_timings: prompt eval time =   38534.66 ms /  2402 tokens (   16.04 ms per token,    62.33 tokens per second)
llama_print_timings:        eval time =    2956.72 ms /    54 runs   (   54.75 ms per token,    18.26 tokens per second)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['id'] = pd.Categorical(filtered_df['id'], categories=result_ids, ordered=True)
llama_print_timings:       total time =   41870.97 ms /  2456 tokens


Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,abstract,report-no,categories,versions,source,URL
0,1706.03762,Ashish Vaswani,"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion\n Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin",Attention Is All You Need,"15 pages, 5 figures",,,"Abstract The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-French\ntranslation task, our model establishes a new single-model state-of-the-art\nBLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction\nof the training costs of the best models from the literature. We show that the\nTransformer generalizes well to other tasks by applying it successfully to\nEnglish constituency parsing both with large and limited training data.\n",,[cs.CL cs.LG],"[v1, v2, v3, v4, v5]",T,Link
2,1804.02391,Saumya Jetley,"Saumya Jetley, Nicholas A. Lord, Namhoon Lee, Philip H.S. Torr",Learn To Pay Attention,International Conference on Learning Representations 2018,,,"Abstract We propose an end-to-end-trainable attention module for convolutional neural\nnetwork (CNN) architectures built for image classification. The module takes as\ninput the 2D feature vector maps which form the intermediate representations of\nthe input image at different stages in the CNN pipeline, and outputs a 2D\nmatrix of scores for each map. Standard CNN architectures are modified through\nthe incorporation of this module, and trained under the constraint that a\nconvex combination of the intermediate 2D feature vectors, as parameterised by\nthe score matrices, must \textit{alone} be used for classification.\nIncentivised to amplify the relevant and suppress the irrelevant or misleading,\nthe scores thus assume the role of attention values. Our experimental\nobservations provide clear evidence to this effect: the learned attention maps\nneatly highlight the regions of interest while suppressing background clutter.\nConsequently, the proposed function is able to bootstrap standard CNN\narchitectures for the task of image classification, demonstrating superior\ngeneralisation over 6 unseen benchmark datasets. When binarised, our attention\nmaps outperform other CNN-based attention maps, traditional saliency maps, and\ntop object proposals for weakly supervised segmentation as demonstrated on the\nObject Discovery dataset. We also demonstrate improved robustness against the\nfast gradient sign method of adversarial attack.\n",,[cs.CV cs.AI],"[v1, v2]",T,Link
4,1810.13409,Ofir Press,"Ofir Press, Noah A. Smith",You May Not Need Attention,,,,"Abstract In NMT, how far can we get without attention and without separate encoding\nand decoding? To answer that question, we introduce a recurrent neural\ntranslation model that does not use attention and does not have a separate\nencoder and decoder. Our eager translation model is low-latency, writing target\ntokens as soon as it reads the first source token, and uses constant memory\nduring decoding. It performs on par with the standard attention-based model of\nBahdanau et al. (2014), and better on long sentences.\n",,[cs.CL],[v1],T,Link
6,1810.10126,Yang Li,"Yang Li, Lukasz Kaiser, Samy Bengio, Si Si",Area Attention,8 pages plus references,,,"Abstract Existing attention mechanisms are trained to attend to individual items in a\ncollection (the memory) with a predefined, fixed granularity, e.g., a word\ntoken or an image grid. We propose area attention: a way to attend to areas in\nthe memory, where each area contains a group of items that are structurally\nadjacent, e.g., spatially for a 2D memory such as images, or temporally for a\n1D memory such as natural language sentences. Importantly, the shape and the\nsize of an area are dynamically determined via learning, which enables a model\nto attend to information with varying granularity. Area attention can easily\nwork with existing model architectures such as multi-head attention for\nsimultaneously attending to multiple areas in the memory. We evaluate area\nattention on two tasks: neural machine translation (both character and\ntoken-level) and image captioning, and improve upon strong (state-of-the-art)\nbaselines in all the cases. These improvements are obtainable with a basic form\nof area attention that is parameter free.\n",,[cs.LG cs.AI cs.CL stat.ML],"[v1, v2, v3, v4, v5, v6]",T,Link
