I suggest using Python 3.10 in a conda environment with this.

## Install Dependencies

In [None]:
%pip install -r requirements.txt
%pip install faiss-cpu faiss-gpu

# Initialize Wikipedia Database + Index
This process takes 2x as much time as arXiv to download, about ~12 minutes to index (M3 Max)

In [None]:
import numpy as np
from tqdm import tqdm
from FlagEmbedding import FlagModel
from datasets import load_dataset
import pandas as pd
import psutil

def print_memory_usage():
    print(f"Current memory usage: {psutil.Process().memory_info().rss / 1024 ** 2} MB")

print("Loading dataset...")
print_memory_usage()
dataclysm_wikipedia = load_dataset('somewheresystems/dataclysm-wikipedia', split="train")
print_memory_usage()

# Check the structure of the dataset, particularly the 'title_embedding' and 'abstract_embedding' columns
print(dataclysm_wikipedia)
print(dataclysm_wikipedia.column_names)
print(dataclysm_wikipedia.features)
print_memory_usage()

# Define a function to flatten the embeddings and add FAISS index
def flatten_and_add_faiss_index(dataset, column_name):
    embedding_shape = np.array(dataset[0][column_name]).shape
    if len(embedding_shape) == 2:
        print(f"Flattening {column_name} and adding FAISS index...")
        # Flatten the column before adding the FAISS index
        dataset = dataset.map(lambda x: {column_name: np.concatenate(x[column_name])})
        dataset = dataset.add_faiss_index(column=column_name)
        print(f"FAISS index for {column_name} added.")
    else:
        print(f"Cannot add FAISS index for {column_name}.")
    print_memory_usage()
    return dataset

# Add FAISS indices for 'title_embedding' and 'abstract_embedding' and save them to different datasets
dataclysm_wikipedia_indexed = flatten_and_add_faiss_index(dataclysm_wikipedia, 'title_embedding')
print_memory_usage()

print("Datasets loaded.")

# Define the model
print("Initializing model...")
model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=True)
print("Model initialized.")
print_memory_usage()

# Initialize arXiv Abstract + Title Indices
This process takes ~15 minutes to index (M3 Max)

In [None]:
import numpy as np
from tqdm import tqdm
from FlagEmbedding import FlagModel
from datasets import load_dataset
import pandas as pd
import psutil

def print_memory_usage():
    print(f"Current memory usage: {psutil.Process().memory_info().rss / 1024 ** 2} MB")

print("Loading dataset...")
print_memory_usage()
dataclysm_arxiv = load_dataset('somewheresystems/dataclysm-arxiv', split="train")
print_memory_usage()

# Check the structure of the dataset, particularly the 'title_embedding' and 'abstract_embedding' columns
print(dataclysm_arxiv)
print(dataclysm_arxiv.column_names)
print(dataclysm_arxiv.features)
print_memory_usage()

# Define a function to flatten the embeddings and add FAISS index
def flatten_and_add_faiss_index(dataset, column_name):
    embedding_shape = np.array(dataset[0][column_name]).shape
    if len(embedding_shape) == 2:
        print(f"Flattening {column_name} and adding FAISS index...")
        # Flatten the column before adding the FAISS index
        dataset = dataset.map(lambda x: {column_name: np.concatenate(x[column_name])})
        dataset = dataset.add_faiss_index(column=column_name)
        print(f"FAISS index for {column_name} added.")
    else:
        print(f"Cannot add FAISS index for {column_name}.")
    print_memory_usage()
    return dataset

# Add FAISS indices for 'title_embedding' and 'abstract_embedding' and save them to different datasets
dataclysm_title_indexed = flatten_and_add_faiss_index(dataclysm_arxiv, 'title_embedding')
dataclysm_abstract_indexed = flatten_and_add_faiss_index(dataclysm_arxiv, 'abstract_embedding')
print_memory_usage()

print("Datasets loaded.")

# Define the model
print("Initializing model...")
model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=True)
print("Model initialized.")
print_memory_usage()



# Initialize PubMed Title Indices

In [None]:
import numpy as np
from tqdm import tqdm
from FlagEmbedding import FlagModel
from datasets import load_dataset
import pandas as pd
import psutil

def print_memory_usage():
    print(f"Current memory usage: {psutil.Process().memory_info().rss / 1024 ** 2} MB")

print("Loading dataset...")
print_memory_usage()
dataclysm_pubmed = load_dataset('somewheresystems/dataclysm-pubmed', split="train")
print_memory_usage()

# Check the structure of the dataset, particularly the 'title_embedding' and 'abstract_embedding' columns
print(dataclysm_pubmed)
print(dataclysm_pubmed.column_names)
print(dataclysm_pubmed.features)
print_memory_usage()

def flatten_and_add_faiss_index(dataset, column_name):
    embedding_shape = np.array(dataset[0][column_name]).shape
    try:
        if len(embedding_shape) == 2:
            print(f"Flattening {column_name} and adding FAISS index...")
            # Flatten the column before adding the FAISS index
            dataset = dataset.map(lambda x: {column_name: np.concatenate(x[column_name])} if x[column_name] is not None else {})
            # Remove entries with no embeddings
            dataset = dataset.filter(lambda x: column_name in x)
            # If the column is 'abstract_embedding', remove entries with empty abstracts
            if column_name == 'abstract_embedding':
                dataset = dataset.filter(lambda x: len(x['abstract_embedding']) != 0)
            dataset = dataset.add_faiss_index(column=column_name)
            print(f"FAISS index for {column_name} added.")
        else:
            print(f"Cannot add FAISS index for {column_name}.")
    except Exception as e:
        print(f"Failed to add FAISS index for {column_name}. Error: {e}")
    print_memory_usage()
    return dataset

# Add FAISS indices for 'title_embedding' and 'abstract_embedding' and save them to different datasets
dataclysm_pubmed_title_indexed = flatten_and_add_faiss_index(dataclysm_pubmed, 'title_embedding')
dataclysm_pubmed_abstract_indexed = flatten_and_add_faiss_index(dataclysm_pubmed, 'abstract_embedding')
print_memory_usage()

print("Datasets loaded.")

# Define the model
print("Initializing model...")
model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=True)
print("Model initialized.")
print_memory_usage()



#  arXiv Composite Search with regex Rerank
Search by both Abstract and Title similarity, rank both descending by score. 
1. If a duplicate (title and abstract hit) is found, it increases the score by a factor of 2. 
2. If regex finds the query in the abstract, it increases the score by 0.1 (additive).

In [None]:
query = "Attention Is All You Need"
print("Encoding query...")
query_embedding = model.encode([query])
print("Query encoded.")

print("Retrieving examples by abstract similarity...")
scores_abstract, retrieved_examples_abstract = dataclysm_abstract_indexed.get_nearest_examples('abstract_embedding', query_embedding, k=10)
print("Examples retrieved.")

print("Retrieving examples by title similarity...")
scores_title, retrieved_examples_title = dataclysm_title_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
print("Examples retrieved.")

from IPython.display import display, HTML
import pandas as pd
import re

# Convert retrieved examples to DataFrame
df_abstract = pd.DataFrame(retrieved_examples_abstract)
df_title = pd.DataFrame(retrieved_examples_title)

# Calculate similarity score in percentage
df_abstract['similarity_score'] = scores_abstract
df_title['similarity_score'] = scores_title

# Add a column to denote the source of retrieval
df_abstract['source'] = 'A'
df_title['source'] = 'T'

# Drop 'title_embedding' and 'abstract_embedding' columns
df_abstract = df_abstract.drop(columns=['title_embedding', 'abstract_embedding'])
df_title = df_title.drop(columns=['title_embedding', 'abstract_embedding'])

# Drop empty columns
df_abstract = df_abstract.dropna(axis=1, how='all')
df_title = df_title.dropna(axis=1, how='all')

# Create a "click to expand" for the abstract so it doesn't take up much space
df_abstract['abstract'] = df_abstract['abstract'].apply(lambda x: f'<details><summary>Abstract</summary>{x}</details>')
df_title['abstract'] = df_title['abstract'].apply(lambda x: f'<details><summary>Abstract</summary>{x}</details>')

# Create a URL field with a hyperlink which is constructed by appending the id onto the end of arxiv.org/abs/
df_abstract['URL'] = df_abstract['id'].apply(lambda x: f'<a href="https://arxiv.org/abs/{x}">Link</a>')
df_title['URL'] = df_title['id'].apply(lambda x: f'<a href="https://arxiv.org/abs/{x}">Link</a>')

# Concatenate the two dataframes
df = pd.concat([df_abstract, df_title])

# Normalize the similarity score to be between 0 and 1
df['similarity_score'] = df['similarity_score'] / df['similarity_score'].max()

# Increase the score if the query is found in the abstract
df['similarity_score'] = df.apply(lambda row: row['similarity_score'] + 0.1 if re.search(query, row['abstract'], re.IGNORECASE) else row['similarity_score'], axis=1)

# Remove duplicates
df = df.drop_duplicates(subset=['id'])

# Sort by ascending similarity score
df = df.sort_values(by='similarity_score', ascending=False)

# Display the DataFrame
from IPython.display import Markdown, display
display(Markdown(f'QUERY: **{query}**'))
display(HTML(df.to_html(escape=False)))


# PubMed Simple Search (by title)

In [22]:
query = "bioinformatics"
print("Encoding query...")
query_embedding = model.encode([query])
print("Query encoded.")

print("Retrieving examples by title similarity...")
scores, retrieved_examples = dataclysm_pubmed_title_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
print("Examples retrieved.")

from IPython.display import display, HTML
import pandas as pd

# Convert retrieved examples to DataFrame
df = pd.DataFrame(retrieved_examples)

# Calculate similarity score in percentage
df['similarity_score'] = scores


# Drop 'title_embedding' and 'abstract_embedding' columns
df = df.drop(columns=['title_embedding'])

# Drop empty columns
df = df.dropna(axis=1, how='all')

# Create a collapsible element for 'abstract_embedding'
df['abstract_embedding'] = df['abstract_embedding'].apply(lambda x: f'<details><summary>Abstract Embedding</summary>{x}</details>')
# Sort by ascending similarity score
df = df.sort_values(by='similarity_score', ascending=False)

# Display the DataFrame
from IPython.display import Markdown, display
display(Markdown(f'QUERY: **{query}**'))
display(HTML(df.to_html(escape=False)))


Encoding query...
Query encoded.
Retrieving examples by title similarity...
Examples retrieved.


QUERY: **bioinformatics**

Unnamed: 0,PMID,ArticleTitle,AbstractText,abstract_embedding,similarity_score
9,1078478,[Interactive data processing in the area of medical biology research].,,Abstract Embedding[],0.432746
8,10197053,Genomics in the real world.,"The term genomics has evolved into a catch-all term for a variety of information intensive biological methodologies. While the promise of genomics in the bio/pharmaceutical industry is great, its impact on the drug discovery pipeline has not yet been realized, excluding a few notable exceptions. As companies acquire several years of experience in working with genomic data, it is likely that the impact on the discovery process will slowly emerge as we learn to integrate these new technologies into individual discovery programs. It is clear that extracting novel biologically valid targets targets from exponentially growing amounts of sequence data requires time and considerable investment in biological research infrastructure. In order to accelerate the process of target validation, a variety of functional genomics technologies are also being developed to try to predict the effect of inhibitory compounds in advance of development. Resources spent on early stage exploratory efforts such as these can pay off by improving the success rate for screening and medicinal chemistry.","Abstract Embedding[[-0.02972412109375, -0.0218505859375, -0.0156402587890625, -0.049652099609375, 0.0282440185546875, -0.01373291015625, 0.01904296875, 0.03106689453125, -0.01119232177734375, -0.005401611328125, 0.00621795654296875, 0.006938934326171875, 0.036407470703125, -0.0278472900390625, -0.028778076171875, 0.01494598388671875, -0.0170745849609375, -0.00878143310546875, -0.016815185546875, -0.0162506103515625, -0.00872802734375, 0.00682830810546875, 0.03985595703125, -0.0199737548828125, -0.00035119056701660156, 0.056640625, -0.005901336669921875, -0.04107666015625, -0.044708251953125, -0.2412109375, 0.002429962158203125, 0.027252197265625, 0.06005859375, -0.006435394287109375, -0.004337310791015625, 0.0404052734375, -0.0169525146484375, 0.0364990234375, -0.023193359375, 0.06732177734375, 0.0226898193359375, 0.048919677734375, -0.02874755859375, 0.0343017578125, 0.016754150390625, -0.038360595703125, -0.07293701171875, 0.0024261474609375, -0.0110321044921875, -0.02001953125, -0.060150146484375, -0.0460205078125, 0.000682830810546875, 0.04132080078125, -0.0194854736328125, 0.027740478515625, 0.01071929931640625, 0.0152130126953125, -0.0166778564453125, 0.0228118896484375, 0.04827880859375, 0.01195526123046875, -0.10736083984375, 0.039459228515625, 0.023223876953125, 0.0333251953125, -0.0233001708984375, -0.01274871826171875, 0.056854248046875, 0.04693603515625, -0.0200347900390625, 0.017608642578125, 0.0298919677734375, 0.0203857421875, 0.037689208984375, 0.046966552734375, 0.005489349365234375, -0.01146697998046875, 0.06787109375, -0.0205535888671875, 0.062286376953125, -0.0195770263671875, 0.0053863525390625, -0.068603515625, -0.030609130859375, 0.0090484619140625, -0.016998291015625, 0.02679443359375, 0.036895751953125, 0.018829345703125, 0.05462646484375, -0.0275726318359375, 0.01190948486328125, -4.029273986816406e-05, -0.08758544921875, -0.01280975341796875, 0.031097412109375, -0.05413818359375, 0.02117919921875, 0.39794921875, -0.051177978515625, 0.04034423828125, -0.08868408203125, -0.0183563232421875, 0.00841522216796875, -0.047088623046875, -0.027740478515625, -0.070068359375, 0.0309295654296875, -0.0352783203125, 0.052276611328125, 0.01080322265625, 0.02154541015625, 0.01629638671875, 0.009429931640625, -0.005878448486328125, -0.007598876953125, 0.0119781494140625, -0.033966064453125, 0.0204010009765625, -0.025634765625, 0.02191162109375, 0.0306396484375, 0.040313720703125, 0.00598907470703125, 0.048065185546875, -0.01715087890625, 0.08099365234375, 0.01151275634765625, -0.00931549072265625, 0.0550537109375, 0.04315185546875, -0.05670166015625, 0.035308837890625, 0.001903533935546875, -0.016845703125, -0.035186767578125, 0.005634307861328125, -0.0294342041015625, 0.0120849609375, -0.0181121826171875, 0.01141357421875, -0.0211181640625, -0.055938720703125, -0.081298828125, 0.1116943359375, 0.07818603515625, 0.045166015625, -0.040191650390625, -0.054412841796875, -0.0181121826171875, 0.0712890625, 0.0254669189453125, 0.037750244140625, -0.01291656494140625, 0.06121826171875, 0.0199737548828125, -0.0511474609375, -0.04779052734375, 0.0263519287109375, -0.07696533203125, 0.030670166015625, -0.061126708984375, 0.11712646484375, 0.0095367431640625, -0.036285400390625, 0.00738525390625, -0.0036468505859375, 0.041107177734375, 0.0210723876953125, -0.0001533031463623047, -0.028289794921875, 0.02850341796875, 0.0261688232421875, -0.1029052734375, -0.048919677734375, -0.03143310546875, -0.0196075439453125, -0.01483917236328125, -0.0090484619140625, 0.0389404296875, -0.013092041015625, 0.01470947265625, 0.0175628662109375, -0.001068115234375, -0.054412841796875, -0.036346435546875, 0.0215606689453125, 0.01250457763671875, 0.03814697265625, -0.04071044921875, 0.0670166015625, -0.00115203857421875, -0.0093994140625, -0.044647216796875, -0.08746337890625, -0.00955963134765625, 0.045745849609375, -0.010589599609375, -0.072265625, 0.023101806640625, 0.049835205078125, 0.048309326171875, 0.05615234375, -0.00431060791015625, -0.01180267333984375, -0.00550079345703125, -0.0227508544921875, 0.04510498046875, 0.00939178466796875, -0.053680419921875, 0.0212249755859375, -0.01363372802734375, 0.0084991455078125, -0.0283203125, 0.003833770751953125, 0.0140228271484375, 0.0216064453125, 0.00811767578125, 0.039947509765625, 0.0007691383361816406, 0.031402587890625, -0.01641845703125, -0.299560546875, 0.004253387451171875, -0.01285552978515625, 0.032073974609375, -0.024871826171875, -0.00743865966796875, -0.007511138916015625, 0.0162506103515625, -0.038177490234375, 0.0120849609375, -0.00452423095703125, 0.11517333984375, -0.011138916015625, -0.045013427734375, -0.037841796875, -0.055511474609375, -0.00026297569274902344, -0.020050048828125, -0.033233642578125, -0.027923583984375, -0.007724761962890625, -0.0161285400390625, 0.0019855499267578125, -0.0299224853515625, 0.03021240234375, -0.0216522216796875, 0.1373291015625, 0.026702880859375, -0.059783935546875, 0.032501220703125, 0.01287078857421875, 0.021453857421875, -0.026031494140625, -0.1156005859375, 0.0122528076171875, -0.01050567626953125, 0.05987548828125, -0.0193939208984375, 0.004425048828125, 0.00470733642578125, -0.0097503662109375, -0.003009796142578125, -0.025115966796875, -0.07421875, -0.034271240234375, 0.0185546875, -0.010406494140625, -0.01366424560546875, -0.02008056640625, 0.02789306640625, 0.0699462890625, 0.06353759765625, 0.07904052734375, -0.0226287841796875, 0.009765625, -0.036956787109375, -0.03619384765625, 0.056915283203125, -0.0193023681640625, -0.033355712890625, 0.0343017578125, 0.041015625, -0.0235748291015625, 0.03326416015625, 0.008758544921875, -0.0927734375, -0.0396728515625, -0.01715087890625, 0.043792724609375, -0.060150146484375, 0.029937744140625, 0.108642578125, -0.0273590087890625, 0.037078857421875, 0.0731201171875, -0.00366973876953125, -0.0133819580078125, -0.11102294921875, -0.055999755859375, 0.06805419921875, 0.0293121337890625, -0.01224517822265625, 0.031219482421875, -0.01476287841796875, -0.031036376953125, 0.002773284912109375, 0.0428466796875, -0.0341796875, 0.01171875, -0.034210205078125, 0.002994537353515625, -0.030609130859375, -0.00394439697265625, -0.08441162109375, 0.042022705078125, 0.007053375244140625, -0.2008056640625, 0.053802490234375, 0.00765228271484375, 0.0236358642578125, 0.036712646484375, -0.01262664794921875, 0.06732177734375, -0.1002197265625, 0.0164642333984375, 0.033416748046875, -0.006378173828125, -0.0036602020263671875, 0.07421875, -0.0305633544921875, 0.00408935546875, 0.0274200439453125, 0.1270751953125, -0.05841064453125, 0.0152130126953125, -0.01371002197265625, 0.06964111328125, 0.0294647216796875, 0.181884765625, -0.031494140625, 0.01155853271484375, 0.0008349418640136719, -0.013397216796875, -0.040802001953125, -0.026336669921875, 0.0195770263671875, -0.003330230712890625, 0.024169921875, 0.050445556640625, -0.034423828125, 0.0011043548583984375, 0.074462890625, 0.033416748046875, -0.039764404296875, -0.039031982421875, -0.01328277587890625, 0.047271728515625, -0.04949951171875, -0.052215576171875, -0.005962371826171875, 0.06158447265625, -0.077392578125, -0.049468994140625, -0.05657958984375, -0.0290069580078125, 0.055908203125, -0.042510986328125, -0.0111541748046875, 0.017852783203125, -0.0016431808471679688, -0.00701904296875, -0.0114593505859375, 0.0169830322265625, 0.0031032562255859375, 0.033447265625, 0.0052947998046875, -0.026275634765625, 0.0355224609375, -0.08453369140625, 0.07550048828125, -0.01177978515625]]",0.432628
7,10047737,[Utility of genome databases: future perspectives].,,Abstract Embedding[],0.431294
6,10203836,Microbial genomics.,,Abstract Embedding[],0.431236
5,10193187,Genomics and the biology of parasites.,"Despite the advances of modern medicine, the threat of chronic illness, disfigurement, or death that can result from parasitic infection still affects the majority of the world population, retarding economic development. For most parasitic diseases, current therapeutics often leave much to be desired in terms of administration regime, toxicity, or effectiveness and potential vaccines are a long way from market. Our best prospects for identifying new targets for drug, vaccine, and diagnostics development and for dissecting the biological basis of drug resistance, antigenic diversity, infectivity and pathology lie in parasite genome analysis, and international mapping and gene discovery initiatives are under way for a variety of protozoan and helminth parasites. These are far from ideal experimental organisms, and the influence of biological and genomic characteristics on experimental approaches is discussed, progress is reviewed and future prospects are examined.","Abstract Embedding[[-0.01416015625, -0.01340484619140625, 0.038238525390625, 0.020843505859375, 0.00539398193359375, -0.026885986328125, 0.052703857421875, 0.110107421875, -0.05792236328125, -0.058074951171875, 0.02374267578125, -0.061279296875, 0.00754547119140625, -0.00139617919921875, -0.04156494140625, 0.0092926025390625, -0.061370849609375, 0.01433563232421875, 0.0229644775390625, 0.03375244140625, -0.007472991943359375, -0.0049591064453125, 0.031219482421875, -0.061279296875, -0.04742431640625, 0.04010009765625, 0.0219879150390625, 0.01035308837890625, -0.0301361083984375, -0.192626953125, -0.01238250732421875, -0.0296783447265625, -0.016510009765625, -0.03558349609375, -0.004352569580078125, -0.039276123046875, -0.04278564453125, 0.0173492431640625, -0.009063720703125, 0.0433349609375, 0.0419921875, 0.057708740234375, -0.012237548828125, 0.027313232421875, 0.01523590087890625, -0.07501220703125, -0.0780029296875, 0.019256591796875, 0.045928955078125, -0.038848876953125, -0.034454345703125, -0.07171630859375, 0.00010001659393310547, 0.076416015625, -0.002529144287109375, -0.022003173828125, 0.0004782676696777344, 0.006107330322265625, 0.0015153884887695312, 0.051025390625, 0.00678253173828125, 0.037353515625, -0.0858154296875, 0.06304931640625, 0.040924072265625, 0.047088623046875, -0.04498291015625, -0.035003662109375, 0.0184326171875, -0.017333984375, -0.048065185546875, 0.006359100341796875, 0.044158935546875, 0.03509521484375, 0.00691986083984375, 0.053863525390625, -0.00876617431640625, -0.052734375, 0.06500244140625, 0.0162506103515625, 0.036163330078125, -0.007965087890625, 0.04278564453125, -0.0007901191711425781, -0.03973388671875, -0.004245758056640625, 0.004497528076171875, 0.0102081298828125, 0.0209808349609375, 0.01043701171875, 0.04766845703125, -0.0175628662109375, 0.0287017822265625, 0.0171966552734375, -0.0789794921875, -0.004314422607421875, -0.00971221923828125, -0.0181884765625, -0.0007534027099609375, 0.44287109375, 0.019805908203125, -0.0220947265625, -0.04803466796875, 0.002834320068359375, -0.0185394287109375, 0.00868988037109375, 0.0168304443359375, -0.06134033203125, 0.0288543701171875, 0.056640625, 0.01274871826171875, 0.0110626220703125, -0.0249786376953125, 0.0293121337890625, 0.0091400146484375, 0.0013113021850585938, 0.01090240478515625, 0.027679443359375, -0.0330810546875, 0.039703369140625, -0.0104522705078125, -0.039093017578125, 0.040924072265625, -0.03533935546875, 0.049072265625, 0.034088134765625, -0.016876220703125, 0.12060546875, 0.01279449462890625, 0.005558013916015625, 0.050201416015625, 0.045440673828125, -0.0236968994140625, 0.06597900390625, 0.01453399658203125, 0.0008058547973632812, -0.0305328369140625, -0.01470947265625, -0.03265380859375, 0.03826904296875, -0.059783935546875, -0.044036865234375, -0.044464111328125, -0.08966064453125, -0.09033203125, 0.08746337890625, 0.02880859375, 0.0227203369140625, -0.06292724609375, -0.020660400390625, 0.0029239654541015625, 0.06597900390625, 0.0251312255859375, 0.05084228515625, -0.0587158203125, 0.056854248046875, 0.0239105224609375, 0.003040313720703125, -0.004695892333984375, -0.02850341796875, -0.0853271484375, 0.00013518333435058594, -0.047088623046875, 0.1500244140625, -0.0194091796875, -0.05511474609375, -0.00972747802734375, 0.0037822723388671875, 0.018310546875, 0.042205810546875, -0.0546875, -0.00433349609375, 0.02679443359375, 0.00041294097900390625, -0.05615234375, -0.01873779296875, -0.091552734375, 0.007213592529296875, 0.02728271484375, 0.013427734375, -0.0094146728515625, -0.01538848876953125, -0.039947509765625, 0.0034503936767578125, -0.02130126953125, -0.0030384063720703125, -0.025238037109375, 0.07086181640625, -0.023162841796875, 0.01018524169921875, -0.002964019775390625, 0.002025604248046875, -0.0540771484375, 0.04083251953125, 0.0263214111328125, -0.0482177734375, -0.005977630615234375, -0.0188140869140625, -0.01137542724609375, -0.043914794921875, 0.03240966796875, 0.061065673828125, 0.0269622802734375, 0.0545654296875, -0.006961822509765625, -0.020904541015625, 0.05792236328125, -0.050262451171875, 0.0576171875, 0.003692626953125, -0.0147857666015625, 0.023101806640625, -0.0498046875, 0.0176544189453125, -0.055419921875, 0.01525115966796875, -0.0087738037109375, 0.03271484375, 0.10400390625, 0.0260162353515625, 0.0184783935546875, 0.055694580078125, -0.049072265625, -0.28759765625, 0.0125274658203125, -0.046722412109375, -0.032135009765625, -0.055572509765625, -0.0199432373046875, -0.023834228515625, 0.006107330322265625, 0.036590576171875, 0.01922607421875, 0.026947021484375, 0.08319091796875, 0.0301361083984375, 0.01549530029296875, -0.07757568359375, 0.0138397216796875, -0.01922607421875, -0.026824951171875, 0.00255584716796875, 0.01983642578125, 0.040496826171875, -0.04827880859375, 0.0955810546875, -0.078125, -0.022674560546875, -0.055206298828125, 0.1297607421875, 0.04052734375, -0.004878997802734375, 0.00836181640625, 0.01122283935546875, -0.002361297607421875, -0.0245819091796875, -0.0736083984375, -0.005191802978515625, -0.0251922607421875, 0.01506805419921875, -0.049407958984375, -0.034454345703125, -0.009735107421875, -0.01221466064453125, -0.00992584228515625, 0.01385498046875, -0.035675048828125, -0.03302001953125, 0.03875732421875, -0.017974853515625, 0.01538848876953125, 0.04315185546875, 0.0325927734375, 0.0010728836059570312, 0.040740966796875, 0.03704833984375, -0.0284881591796875, -0.053436279296875, 0.03631591796875, 0.0299224853515625, 0.014556884765625, -0.04339599609375, 0.028961181640625, 0.007625579833984375, 0.0224456787109375, -0.01494598388671875, 0.0290985107421875, -0.02313232421875, -0.0262908935546875, -0.03680419921875, -0.0137786865234375, 0.032958984375, -0.0391845703125, 0.011383056640625, 0.12469482421875, 0.01322174072265625, 0.0087738037109375, 0.031768798828125, 0.0014495849609375, 0.049774169921875, -0.00983428955078125, -0.01055145263671875, 0.07354736328125, 0.032623291015625, -0.004566192626953125, -0.02984619140625, 0.01953125, 0.0031032562255859375, 0.03924560546875, 0.0400390625, -0.00301361083984375, -0.0077056884765625, -0.02935791015625, -0.0018863677978515625, -0.020965576171875, 0.005107879638671875, -0.085205078125, -0.043426513671875, 0.0440673828125, -0.209716796875, 0.057769775390625, 0.006107330322265625, 0.02569580078125, -0.03350830078125, -0.0009388923645019531, 0.0478515625, -0.0496826171875, -0.0029582977294921875, -0.017974853515625, 0.0888671875, 0.035247802734375, 0.06268310546875, 0.0093994140625, 0.0139617919921875, -0.0059356689453125, 0.06396484375, -0.061492919921875, 0.017059326171875, -0.028717041015625, -0.01207733154296875, -0.005550384521484375, 0.1802978515625, -0.04498291015625, -0.004344940185546875, 0.038116455078125, -0.001384735107421875, 0.045745849609375, -0.0251007080078125, 0.01922607421875, -0.0074615478515625, -0.020660400390625, 0.0062103271484375, -0.04168701171875, -0.00030422210693359375, 0.039581298828125, 0.0243682861328125, -0.020965576171875, -0.046783447265625, -0.031402587890625, -0.0290679931640625, 0.004180908203125, 0.01226806640625, -0.0010900497436523438, 0.03753662109375, -0.0159454345703125, -0.032806396484375, -0.0325927734375, -0.050506591796875, 0.03326416015625, -0.11529541015625, -0.018280029296875, 0.0096893310546875, -0.028076171875, -0.00432586669921875, -0.03973388671875, 0.0235443115234375, -0.017852783203125, 0.0011816024780273438, -0.0194854736328125, 0.0160980224609375, -0.0032444000244140625, -0.1124267578125, 0.08453369140625, -0.0005488395690917969]]",0.423589
4,10089485,The Nucleic Acid Database: A resource for nucleic acid science.,The Nucleic Acid Database (NDB) distributes information about nucleic acid-containing structures. Here the information content of the database as well as the query capabilities are described. A summary of how the technology developed by this project has been used to develop other macromolecular databases is given.,"Abstract Embedding[[-0.017120361328125, -0.010833740234375, -0.056640625, -0.0198822021484375, 0.04217529296875, 0.004749298095703125, 0.01934814453125, -0.01849365234375, 0.0102386474609375, 0.05279541015625, 0.007022857666015625, -0.0014896392822265625, 0.05224609375, -0.0247802734375, -0.0070343017578125, 0.0068817138671875, -0.040252685546875, -0.03900146484375, 0.00853729248046875, 0.010894775390625, 0.0290374755859375, 0.0075531005859375, -0.0160369873046875, -0.01352691650390625, -0.0034809112548828125, 0.1119384765625, 0.0282440185546875, -0.01410675048828125, -0.05120849609375, -0.1971435546875, 0.0276336669921875, 0.0175018310546875, 0.0187530517578125, -0.05499267578125, 0.020050048828125, -0.0203704833984375, 0.026123046875, -0.0226593017578125, -0.0423583984375, 0.015899658203125, 0.0518798828125, -0.020751953125, -0.0109405517578125, -0.006687164306640625, 0.08270263671875, -0.04656982421875, -0.010345458984375, -0.01009368896484375, 0.0259246826171875, -0.0290679931640625, -0.0367431640625, -0.06597900390625, -0.040252685546875, 0.0087890625, 0.0015773773193359375, 0.0213775634765625, 0.021484375, 0.0067901611328125, -0.02105712890625, -0.0203857421875, 0.075927734375, 0.0024776458740234375, -0.089111328125, 0.091064453125, 0.0501708984375, 0.045867919921875, 0.01067352294921875, -0.018951416015625, 0.04180908203125, 0.032379150390625, -0.0155487060546875, -0.0247039794921875, 0.006359100341796875, 0.047698974609375, 0.02276611328125, -0.031005859375, 0.050201416015625, -0.0274658203125, 0.0288848876953125, -0.0191650390625, 0.001708984375, -0.0506591796875, 0.040740966796875, -0.05401611328125, -0.0238800048828125, 0.01410675048828125, -0.07666015625, -0.0350341796875, -0.01580810546875, 0.0147705078125, 0.017486572265625, 0.0098419189453125, 0.038299560546875, 0.0233917236328125, -0.0645751953125, -0.02874755859375, -0.016632080078125, -0.039703369140625, 0.082275390625, 0.34814453125, -0.04266357421875, 0.03173828125, -0.03057861328125, -0.036956787109375, -0.049835205078125, -0.032745361328125, 0.0011377334594726562, -0.0433349609375, -0.01446533203125, 0.01090240478515625, 0.03643798828125, -0.029815673828125, 0.0204315185546875, -0.0841064453125, 0.02734375, -0.0081329345703125, -0.015899658203125, -0.0216827392578125, -0.049224853515625, -0.01206207275390625, -0.06439208984375, 0.01337432861328125, -0.0038700103759765625, 0.06158447265625, 0.06756591796875, -0.011749267578125, -0.05267333984375, 0.075927734375, 0.056182861328125, 0.0113372802734375, 0.056488037109375, 0.037353515625, -0.030792236328125, -0.031982421875, -0.002399444580078125, 0.006069183349609375, 0.02020263671875, -0.01241302490234375, 0.0031585693359375, 0.037200927734375, -0.01190948486328125, -0.02227783203125, -0.06976318359375, 0.0031147003173828125, -0.1383056640625, 0.061920166015625, -0.061279296875, 0.0430908203125, -0.060333251953125, -0.034393310546875, 0.028106689453125, 0.0775146484375, 0.015777587890625, -0.0254974365234375, 0.01371002197265625, 0.033050537109375, -0.0261688232421875, -0.04901123046875, -0.050048828125, -0.023223876953125, -0.053436279296875, 0.006076812744140625, 0.022369384765625, 0.10101318359375, -0.0084991455078125, -0.09149169921875, -0.020538330078125, 0.02056884765625, 0.05584716796875, -0.0301666259765625, 0.0294189453125, 0.01161956787109375, -0.051177978515625, 0.0465087890625, -0.0290374755859375, -0.0072021484375, -0.0004143714904785156, 0.035003662109375, 0.057769775390625, -0.00974273681640625, 0.025543212890625, -0.0498046875, -0.038909912109375, -0.0032672882080078125, 0.0157928466796875, -0.033203125, -0.084716796875, 0.035552978515625, 0.045013427734375, 0.034515380859375, -0.029632568359375, 0.043731689453125, -0.0225372314453125, 0.0190277099609375, -0.0034046173095703125, -0.058929443359375, 0.073974609375, 0.02978515625, -0.01248931884765625, -0.044036865234375, 0.07073974609375, 0.03570556640625, -0.00339508056640625, 0.11236572265625, 0.00537109375, -0.0290985107421875, -0.0225067138671875, -0.02789306640625, -0.00246429443359375, -0.0214691162109375, 0.004192352294921875, -0.0136566162109375, -0.10772705078125, -0.002368927001953125, -0.0894775390625, -0.058349609375, 0.033843994140625, 0.041168212890625, 0.051025390625, -0.01190948486328125, -0.01654052734375, -0.03973388671875, -0.007205963134765625, -0.279296875, 0.031402587890625, 0.0237579345703125, 0.01788330078125, -0.018096923828125, 0.002048492431640625, -0.00806427001953125, -0.040985107421875, -0.022064208984375, 0.02313232421875, 0.0318603515625, 0.0145111083984375, 0.0132904052734375, -0.08221435546875, -0.0679931640625, 0.06781005859375, 0.07049560546875, -0.0253143310546875, -0.07855224609375, -0.002117156982421875, 0.051513671875, -0.017303466796875, -0.0133056640625, -0.01238250732421875, 0.032623291015625, -0.0240631103515625, 0.1099853515625, -0.0609130859375, 0.01325225830078125, 0.038116455078125, 0.0501708984375, 0.034454345703125, -0.07061767578125, -0.08154296875, 0.0030422210693359375, -0.036224365234375, -0.07025146484375, -0.0003921985626220703, -0.031982421875, -0.033538818359375, 0.005115509033203125, 0.028594970703125, 0.036651611328125, -0.0909423828125, -0.0017671585083007812, -0.005756378173828125, 0.02752685546875, -0.0030193328857421875, -0.004596710205078125, -0.01180267333984375, 0.00464630126953125, 0.000606536865234375, 0.017974853515625, 0.0771484375, 0.033416748046875, -0.004352569580078125, -0.0182647705078125, -0.03631591796875, -0.0249481201171875, 0.0255126953125, 0.03765869140625, 0.00861358642578125, -0.04791259765625, 0.02667236328125, -0.01210784912109375, -0.066650390625, 0.04302978515625, 0.0209503173828125, 0.06298828125, -0.07769775390625, -0.0014715194702148438, 0.1072998046875, -0.05078125, 0.071533203125, 0.035003662109375, 0.034881591796875, -0.019439697265625, -0.0843505859375, -0.049163818359375, -0.0133056640625, 0.02734375, -0.056854248046875, 0.087646484375, -0.0007505416870117188, 0.056610107421875, 0.05096435546875, 0.1007080078125, 0.001972198486328125, -0.010009765625, 0.0174407958984375, 0.0321044921875, -0.048095703125, 0.0186767578125, -0.032623291015625, 0.01544189453125, -0.0060882568359375, -0.2496337890625, 0.09503173828125, 0.0002777576446533203, 0.033203125, 0.04730224609375, 0.0501708984375, 0.073974609375, -0.002567291259765625, 0.01241302490234375, 0.0263519287109375, 0.0860595703125, 0.0277862548828125, -0.00739288330078125, -0.065673828125, -0.014892578125, 0.04376220703125, 0.1312255859375, -0.056396484375, 0.031463623046875, 0.03814697265625, 0.0198974609375, 0.034881591796875, 0.1578369140625, 0.01430511474609375, 0.01416015625, 0.002773284912109375, -0.016326904296875, 0.0034999847412109375, -0.0194549560546875, 0.002716064453125, -0.008392333984375, 0.03765869140625, 0.061370849609375, -0.0621337890625, -0.0433349609375, 0.06243896484375, 0.0015840530395507812, 0.0020084381103515625, -0.001331329345703125, -0.0053863525390625, -0.0428466796875, 0.0093231201171875, -0.053802490234375, 0.00577545166015625, 0.0733642578125, -0.006778717041015625, -0.061920166015625, -0.07012939453125, 0.026519775390625, 0.0079345703125, -0.01125335693359375, -0.01114654541015625, 0.0635986328125, -0.002521514892578125, -0.0283203125, -0.036773681640625, 0.04315185546875, 0.016448974609375, 0.02288818359375, -0.0022106170654296875, 0.022735595703125, -0.01192474365234375, -0.0675048828125, 0.06683349609375, 0.0009241104125976562]]",0.419733
3,10175124,The monster code: biology and the computer sciences.,,Abstract Embedding[],0.416959
2,10203763,"Bioinformatics, pharma and farmers.",,Abstract Embedding[],0.397592
1,10191386,How will bioinformatics influence metabolic engineering?,"Ten microbial genomes have been fully sequenced to date, and the sequencing of many more genomes is expected to be completed before the end of the century. The assignment of function to open reading frames (ORFs) is progressing, and for some genomes over 70% of functional assignments have been made. The majority of the assigned ORFs relate to metabolic functions. Thus, the complete genetic and biochemical functions of a number of microbial cells may be soon available. From a metabolic engineering standpoint, these developments open a new realm of possibilities. Metabolic analysis and engineering strategies can now be built on a sound genomic basis. An important question that now arises; how should these tasks be approached? Flux-balance analysis (FBA) has the potential to play an important role. It is based on the fundamental principle of mass conservation. It requires only the stoichiometric matrix, the metabolic demands, and some strain specific parameters. Importantly, no enzymatic kinetic data is required. In this article, we show how the genomically defined microbial metabolic genotypes can be analyzed by FBA. Fundamental concepts of metabolic genotype, metabolic phenotype, metabolic redundancy and robustness are defined and examples of their use given. We discuss the advantage of this approach, and how FBA is expected to find uses in the near future. FBA is likely to become an important analysis tool for genomically based approaches to metabolic engineering, strain design, and development.","Abstract Embedding[[-0.0275421142578125, 0.00980377197265625, -0.045135498046875, 0.00040531158447265625, 0.070556640625, -0.0007672309875488281, -0.03692626953125, 0.01983642578125, -0.048492431640625, -0.0031642913818359375, -0.00417327880859375, -0.11627197265625, 0.0122833251953125, -0.0229034423828125, -0.026458740234375, -0.01158905029296875, -0.035308837890625, 0.038482666015625, 0.034210205078125, -0.053985595703125, 0.064208984375, -0.003948211669921875, -0.039215087890625, 0.0034027099609375, 0.041900634765625, -0.0081024169921875, 0.007724761962890625, -0.023101806640625, -0.090087890625, -0.269287109375, 0.0227813720703125, 0.0263671875, 0.04595947265625, -0.030609130859375, -0.0133514404296875, 0.0307769775390625, -0.00766754150390625, -0.021484375, -0.041290283203125, 0.0777587890625, 0.00789642333984375, 0.0240020751953125, 0.038848876953125, 0.0293426513671875, -0.0117645263671875, -0.0198822021484375, -0.0278472900390625, 0.0406494140625, 0.03350830078125, -0.033721923828125, -0.04229736328125, -0.05712890625, 0.004974365234375, 0.0377197265625, -0.0335693359375, 0.0118560791015625, 0.04736328125, -0.01038360595703125, -0.0215606689453125, 0.006076812744140625, 0.045074462890625, -0.00821685791015625, -0.11993408203125, 0.0111846923828125, 0.05517578125, -0.00719451904296875, -0.030059814453125, -0.0272216796875, 0.043365478515625, 0.0196990966796875, -0.076904296875, 0.037109375, -0.0035190582275390625, -0.00994110107421875, 0.04302978515625, 0.041351318359375, 0.010833740234375, -0.0155487060546875, 0.005767822265625, 0.023651123046875, 0.00640869140625, 0.02825927734375, 0.01393890380859375, -0.0085296630859375, -0.060211181640625, 0.009429931640625, 0.01103973388671875, -0.02301025390625, 0.037078857421875, 0.052215576171875, -0.0107421875, -0.04791259765625, 0.024810791015625, 0.0296783447265625, -0.06573486328125, -0.02532958984375, 0.045135498046875, -0.035369873046875, 0.04638671875, 0.4306640625, -0.0404052734375, 0.05560302734375, 0.0239410400390625, -0.0005412101745605469, 0.04559326171875, -0.043853759765625, 0.01271820068359375, 0.00499725341796875, 0.01181793212890625, -0.042144775390625, 0.0302734375, -0.0010786056518554688, -0.029052734375, 0.01210784912109375, 0.0120086669921875, -0.006988525390625, -0.0008611679077148438, 0.005725860595703125, -0.05523681640625, 0.036102294921875, -0.002079010009765625, -0.0205230712890625, -0.0014495849609375, 0.01184844970703125, 0.05853271484375, -0.0248870849609375, -0.005626678466796875, 0.0758056640625, 0.032135009765625, 0.0178070068359375, 0.055450439453125, 0.0748291015625, -0.0279541015625, -0.0005159378051757812, -0.054901123046875, 0.01416778564453125, -0.0081939697265625, -0.024444580078125, 0.01303863525390625, -0.0022602081298828125, -0.00620269775390625, 0.020172119140625, -0.037750244140625, -0.0943603515625, -0.10052490234375, 0.07745361328125, 0.0284576416015625, 0.047882080078125, -0.0175933837890625, -0.0281219482421875, 0.043243408203125, 0.054656982421875, 0.01403045654296875, -0.03173828125, 0.07354736328125, 0.07891845703125, -0.01336669921875, 0.0004892349243164062, -0.0126800537109375, 0.0063323974609375, -0.0970458984375, 0.014007568359375, 0.00905609130859375, 0.07373046875, 0.037445068359375, -0.0165863037109375, -0.0369873046875, 0.0197601318359375, -0.0069580078125, -0.01654052734375, 0.03436279296875, -0.0097808837890625, 0.040679931640625, -0.0218505859375, -0.018310546875, -0.0191192626953125, -0.00319671630859375, -0.0206146240234375, 0.034332275390625, 0.044036865234375, 0.02862548828125, 0.00530242919921875, -0.01340484619140625, 0.01290130615234375, 0.035308837890625, 0.020782470703125, -0.031768798828125, -0.0016336441040039062, -0.038848876953125, 0.037445068359375, -0.01739501953125, 0.0004849433898925781, -0.03216552734375, 0.09564208984375, -0.025787353515625, -0.02764892578125, 0.022369384765625, 0.037109375, -0.022216796875, -0.06268310546875, 0.022430419921875, 0.05517578125, 0.01454925537109375, 0.0191192626953125, -0.0101165771484375, -0.03704833984375, -0.04229736328125, 0.0243682861328125, -0.00689697265625, 0.052947998046875, -0.06646728515625, -0.0041046142578125, -0.08184814453125, 0.019989013671875, -0.061004638671875, 0.0313720703125, 0.04034423828125, 0.00603485107421875, 0.0311126708984375, -0.0026798248291015625, 0.059112548828125, -0.02349853515625, -0.035919189453125, -0.31591796875, -0.032257080078125, 0.0111846923828125, -0.0291595458984375, -0.00402069091796875, -0.00827789306640625, -0.007389068603515625, -0.01526641845703125, -0.0293731689453125, -0.0103912353515625, 0.05206298828125, 0.05029296875, -0.05523681640625, -0.031585693359375, -0.039031982421875, 0.00677490234375, 0.0126495361328125, -0.0770263671875, -0.04736328125, -0.0017328262329101562, 0.0631103515625, -0.0113525390625, 0.0919189453125, 0.0032138824462890625, 0.058349609375, -0.0178680419921875, 0.092529296875, -0.03045654296875, 0.08966064453125, 0.0097198486328125, 0.007228851318359375, 0.0023632049560546875, 0.035308837890625, -0.028778076171875, -0.0124664306640625, -0.00994110107421875, 0.0189971923828125, -0.05633544921875, 0.024017333984375, 0.004299163818359375, -0.004222869873046875, 0.0243072509765625, -0.011688232421875, -0.130859375, -0.026031494140625, -0.0289154052734375, -0.03662109375, -0.0751953125, -0.009613037109375, -0.0270538330078125, -0.022369384765625, 0.01043701171875, 0.041229248046875, -0.0272216796875, 0.0157928466796875, -0.0222015380859375, 0.005443572998046875, -0.02886962890625, -0.057830810546875, 0.00928497314453125, -0.00563812255859375, 0.0193328857421875, -0.00672149658203125, 0.058837890625, 0.0079345703125, 0.0009250640869140625, -0.06488037109375, 0.0256195068359375, 0.0452880859375, -0.06427001953125, 0.0251617431640625, 0.07666015625, -0.0008788108825683594, 0.01558685302734375, 0.0301055908203125, 0.0215911865234375, 0.01800537109375, -0.06512451171875, -0.038787841796875, -0.01123809814453125, 0.047637939453125, -0.04412841796875, 0.03973388671875, -0.01392364501953125, 0.007640838623046875, 0.060760498046875, 0.0019588470458984375, -0.0296630859375, 0.005817413330078125, 0.042999267578125, 0.001377105712890625, -0.0196380615234375, -0.0193328857421875, -0.0369873046875, 0.0479736328125, 0.03643798828125, -0.2181396484375, 0.027801513671875, 0.032562255859375, 0.04840087890625, -0.0014085769653320312, 0.01007080078125, 0.07159423828125, -0.072998046875, 0.015716552734375, 0.0239715576171875, 0.06878662109375, 0.0092010498046875, 0.0307769775390625, 0.0406494140625, 0.04034423828125, 0.0404052734375, 0.0771484375, -0.0523681640625, 0.059356689453125, -0.040679931640625, 0.03826904296875, 0.007274627685546875, 0.1546630859375, -0.0134429931640625, 0.025146484375, 0.0055694580078125, -0.0253143310546875, 0.0281982421875, -0.024169921875, 0.0071563720703125, 0.021453857421875, -0.00467681884765625, 0.07464599609375, -0.024139404296875, -0.00130462646484375, 0.046173095703125, 0.031341552734375, -0.04608154296875, -0.043487548828125, -0.0252838134765625, 0.04217529296875, -0.02978515625, -0.0266571044921875, -0.041748046875, 0.059356689453125, -0.056549072265625, -0.092041015625, -0.07452392578125, -0.0213775634765625, 0.0037670135498046875, -0.0295257568359375, 0.01092529296875, 0.019683837890625, -0.001888275146484375, 0.00618743896484375, -0.028839111328125, -0.0280609130859375, -0.00981903076171875, -0.045166015625, -0.028045654296875, 0.0193328857421875, 0.0166473388671875, -0.06396484375, 0.08599853515625, -0.022064208984375]]",0.395189
0,10066490,Genomics and computational molecular biology.,"There has been a dramatic increase in the number of completely sequenced bacterial genomes during the past two years as a result of the efforts both of public genome agencies and the pharmaceutical industry. The availability of completely sequenced genomes permits more systematic analyses of genes, evolution and genome function than was otherwise possible. Using computational methods - which are used to identify genes and their functions including statistics, sequence similarity, motifs, profiles, protein folds and probabilistic models - it is possible to develop characteristic genome signatures, assign functions to genes, identify pathogenic genes, identify metabolic pathways, develop diagnostic probes and discover potential drug-binding sites. All of these directions are critical to understanding bacterial growth, pathogenicity and host-pathogen interactions.","Abstract Embedding[[-0.05792236328125, -0.0160369873046875, -0.038909912109375, -0.031280517578125, 0.05096435546875, -0.01432037353515625, -0.0167388916015625, 0.0762939453125, -0.002201080322265625, -0.004703521728515625, 0.007671356201171875, -0.023468017578125, 0.06329345703125, -0.032379150390625, -0.02178955078125, -0.006542205810546875, -0.052886962890625, 0.0234222412109375, 0.004009246826171875, -0.022735595703125, 0.034912109375, 0.03472900390625, -0.0192108154296875, -0.0517578125, 0.0248565673828125, 0.037322998046875, 0.0242462158203125, 0.019195556640625, -0.05242919921875, -0.2178955078125, -0.00045871734619140625, 0.01629638671875, 0.06622314453125, -0.044891357421875, 0.0118255615234375, -0.02520751953125, 0.032196044921875, -0.005840301513671875, -0.00463104248046875, -0.01122283935546875, 0.036163330078125, -0.0143585205078125, 0.059417724609375, -0.0093994140625, 0.0254974365234375, -0.00605010986328125, -0.01629638671875, 0.034942626953125, 0.042388916015625, -0.07562255859375, -0.07086181640625, -0.04571533203125, -0.02728271484375, 0.0654296875, -0.033477783203125, -0.01493072509765625, 0.012542724609375, -0.0103759765625, 0.0083160400390625, 0.003086090087890625, 0.022674560546875, -0.0006613731384277344, -0.094970703125, 0.056488037109375, 0.00299072265625, 0.043426513671875, 0.00077056884765625, -0.050628662109375, 0.08251953125, 0.07476806640625, -0.0587158203125, -0.0127716064453125, -0.0206298828125, 0.041534423828125, 0.016693115234375, 0.0301513671875, -0.0198974609375, -0.037139892578125, 0.040740966796875, -0.0183563232421875, 0.0264434814453125, -0.0270233154296875, 0.04998779296875, -0.022705078125, -0.058380126953125, 0.032318115234375, -0.00933074951171875, -0.0272979736328125, -0.0018110275268554688, 0.0217132568359375, 0.0401611328125, -0.02301025390625, 0.04486083984375, 0.00559234619140625, -0.085205078125, -0.035125732421875, 0.018035888671875, -0.03863525390625, 0.05279541015625, 0.3916015625, -0.1064453125, -0.026214599609375, -0.02215576171875, 0.03240966796875, 0.0196990966796875, -0.00556182861328125, 0.028564453125, -0.043731689453125, -0.0282135009765625, -0.019683837890625, -0.0270538330078125, 0.0164642333984375, -0.0092620849609375, -0.006031036376953125, 0.0115203857421875, 0.01235198974609375, -0.03546142578125, 0.0162353515625, -0.055938720703125, -0.006038665771484375, 0.0086822509765625, -0.0011529922485351562, 0.0284271240234375, 0.039703369140625, 0.024322509765625, 0.060150146484375, -0.042083740234375, 0.08526611328125, -0.0015583038330078125, 0.01157379150390625, 0.057708740234375, 0.04815673828125, -0.0526123046875, 0.0711669921875, 0.00997161865234375, 0.004283905029296875, -0.0289306640625, -0.031341552734375, 0.00655364990234375, 0.002246856689453125, -0.034149169921875, 0.0116729736328125, -0.011322021484375, -0.08154296875, -0.104736328125, 0.11004638671875, 0.0086822509765625, 0.041259765625, -0.027252197265625, -0.04766845703125, 0.040435791015625, 0.042877197265625, 0.0291748046875, -0.0394287109375, 0.00946807861328125, 0.07232666015625, 0.040130615234375, -0.05487060546875, 0.0081024169921875, 0.032806396484375, -0.1319580078125, 0.0215301513671875, -0.004711151123046875, 0.1365966796875, 0.0360107421875, -0.07086181640625, -0.006175994873046875, 0.009765625, 0.052520751953125, 0.0005292892456054688, -0.01386260986328125, 0.041748046875, -0.0005130767822265625, 0.0195465087890625, -0.05377197265625, -0.0005478858947753906, -0.036102294921875, 0.01114654541015625, 0.0029754638671875, 0.0350341796875, -0.00750732421875, -0.035797119140625, -0.03814697265625, 0.058990478515625, -0.0028247833251953125, -0.032470703125, -0.0247802734375, -0.0067138671875, -0.0212554931640625, 0.0158233642578125, -0.006023406982421875, 0.00931549072265625, -0.001964569091796875, -0.0242919921875, 0.01195526123046875, -0.057281494140625, 0.0003497600555419922, 0.051849365234375, -0.003772735595703125, -0.01276397705078125, 0.03265380859375, 0.04144287109375, -0.015655517578125, -0.03155517578125, 0.056732177734375, -0.0494384765625, 0.032806396484375, 0.00826263427734375, -0.01311492919921875, 0.042236328125, -0.04644775390625, -0.006458282470703125, -0.07244873046875, 0.010467529296875, -0.06768798828125, -0.016510009765625, 0.041229248046875, 0.056060791015625, 0.032440185546875, 0.01082611083984375, 0.027435302734375, 0.017791748046875, -0.041229248046875, -0.31494140625, -0.01139068603515625, -0.0340576171875, -0.0307159423828125, -0.006744384765625, 0.0181121826171875, -0.0318603515625, 0.0015735626220703125, -0.0194854736328125, -0.0007467269897460938, 0.02081298828125, 0.056854248046875, -0.0187530517578125, -0.0452880859375, -0.052947998046875, 0.03472900390625, 0.020050048828125, -0.0594482421875, -0.09368896484375, -0.01493072509765625, 0.03826904296875, 0.01491546630859375, 0.018585205078125, -0.03326416015625, 0.01122283935546875, -0.005733489990234375, 0.10858154296875, -0.021331787109375, 0.0860595703125, 0.01351165771484375, -0.00972747802734375, -0.0292510986328125, -0.022369384765625, -0.07440185546875, -0.007625579833984375, -0.00994873046875, 0.0187225341796875, -0.0364990234375, 0.046783447265625, -0.018829345703125, 0.00894927978515625, 0.0043487548828125, -0.0250091552734375, -0.00372314453125, -0.024200439453125, -0.0091400146484375, -0.0177459716796875, -0.05621337890625, 0.01166534423828125, 0.05328369140625, -0.008575439453125, 0.027191162109375, 0.0264892578125, 0.005458831787109375, -0.016387939453125, 0.0017385482788085938, -0.0066070556640625, -0.023590087890625, -0.0174713134765625, 0.0291748046875, 0.0157318115234375, 0.01287841796875, -0.0262603759765625, 0.0367431640625, 0.01052093505859375, -0.042083740234375, -0.0177001953125, 0.0360107421875, 0.07415771484375, -0.0677490234375, -0.04144287109375, 0.085205078125, -0.036041259765625, 0.004909515380859375, 0.0457763671875, 0.0114593505859375, -0.0121917724609375, -0.12457275390625, -0.046142578125, 0.0012054443359375, 0.0645751953125, -0.0289306640625, 0.0243988037109375, 0.0085906982421875, 0.0305633544921875, 0.01493072509765625, 0.01041412353515625, -0.007373809814453125, 0.01441192626953125, 0.0689697265625, -0.044830322265625, -0.04327392578125, 0.02264404296875, -0.02398681640625, -0.005817413330078125, 0.028472900390625, -0.206298828125, 0.0592041015625, -0.0184173583984375, 0.06646728515625, -0.0218658447265625, -0.01357269287109375, 0.0794677734375, -0.09429931640625, 0.01180267333984375, -0.0028896331787109375, 0.0469970703125, 0.0389404296875, 0.06610107421875, -0.048553466796875, 0.0238800048828125, 0.053955078125, 0.10931396484375, -0.032867431640625, 0.02557373046875, -0.0002510547637939453, 0.032196044921875, 0.05303955078125, 0.17919921875, -0.08087158203125, 0.06329345703125, 0.0135955810546875, 0.0198516845703125, -0.0243988037109375, -0.043701171875, -0.01261138916015625, 0.0533447265625, -0.0374755859375, 0.023895263671875, -0.0058135986328125, -0.03570556640625, 0.0570068359375, 0.08331298828125, -0.01739501953125, -0.032318115234375, -0.013275146484375, -0.01103973388671875, 0.0074920654296875, -0.056060791015625, -0.033416748046875, 0.07537841796875, -0.0133819580078125, -0.0175323486328125, -0.052520751953125, 0.00334930419921875, 0.03472900390625, -0.0806884765625, 0.048736572265625, 0.03619384765625, 0.03192138671875, -0.001506805419921875, -0.009857177734375, 0.01294708251953125, -0.0019817352294921875, -0.0231475830078125, -0.014892578125, 0.032623291015625, 0.00025725364685058594, -0.0574951171875, 0.11383056640625, -0.056488037109375]]",0.338992


# Wikipedia simple search (Title)
Searches for a Wikipedia article based on title similarity to query. Useful for looking up terms.

In [None]:
query = "Retrieval Augmented Generation"
print("Encoding query...")
query_embedding = model.encode([query])
print("Query encoded.")

print("Retrieving examples by title similarity...")
scores, retrieved_examples = dataclysm_wikipedia_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
print("Examples retrieved.")

from IPython.display import display, HTML
import pandas as pd

# Convert retrieved examples to DataFrame
df = pd.DataFrame(retrieved_examples)

# Calculate similarity score in percentage
df['similarity_score'] = scores


# Drop 'title_embedding' and 'abstract_embedding' columns
df = df.drop(columns=['title_embedding'])

# Drop empty columns
df = df.dropna(axis=1, how='all')

# Create a "click to expand" for the abstract so it doesn't take up much space
df['text'] = df['text'].apply(lambda x: f'<details><summary>Article Text</summary>{x}</details>')


# Create a URL field with a hyperlink 
df['url'] = df['url'].apply(lambda x: f'<a href="{url}">Link</a>')

# Sort by ascending similarity score
df = df.sort_values(by='similarity_score', ascending=False)

# Display the DataFrame
from IPython.display import Markdown, display
display(Markdown(f'QUERY: **{query}**'))
display(HTML(df.to_html(escape=False)))


# Download OpenHermes-2.5-Mistral-7B

In [None]:
%pip install huggingface-cli
!huggingface-cli download TheBloke/OpenHermes-2.5-Mistral-7B-GGUF openhermes-2.5-mistral-7b.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False

# Retrieval Augmented Generation

In [None]:
from llama_cpp import Llama
from llama_cpp import LlamaGrammar
import pandas as pd
import json
import httpx

model = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"
prompt = f"{df[['id', 'title', 'abstract']].to_html(escape=False)} ### Instruction: Use the information above to answer the query: EXPLAIN {query} ### Response:"


llm = Llama(model_path=model, n_ctx=8096, last_n_tokens_size=256, n_threads=4, n_gpu_layers=0)

stream = llm.create_completion(prompt, stream=True, repeat_penalty=1.1, max_tokens=256, stop=["\n"], echo=False, temperature=0, mirostat_mode = 2, mirostat_tau=4.0, mirostat_eta=1.1)
result = ""
for output in stream:
    result += output['choices'][0]['text']

print(result)

# Rerank results using an LLM (experimental)
This uses LLaMA grammars / llama.cpp to return back a list instructing the LLM to rerank and drop irrelevant results. May or may not work.

In [None]:
from llama_cpp import Llama
from llama_cpp import LlamaGrammar
import pandas as pd
import json
import httpx
grammar_text = httpx.get("https://raw.githubusercontent.com/ggerganov/llama.cpp/master/grammars/json_arr.gbnf").text
grammar = LlamaGrammar.from_string(grammar_text)

model = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"
prompt = f"""You are an expert at generating valid JSON.
###
Instruction:
Return a valid JSON Array containing arXiv ['id'] field reranked according to how relevant the result is to the query based on its other columns at that ['id']. Drop any items that are not relevant to the query. Return just an array of the IDs, like [x,y,z] and so on in the correct order:
        INDEX: {df[['id', 'title', 'abstract']].to_html(escape=False)}
        QUERY: {query}
        Take a deep breath, and solve the problem step-by-step.
###
Response:"""


llm = Llama(model_path=model, n_ctx=8096, last_n_tokens_size=256, n_threads=4, n_gpu_layers=0)

    
stream = llm.create_completion(prompt, stream=True, repeat_penalty=1.1, max_tokens=256, stop=["]"], echo=False, temperature=0, mirostat_mode = 2, mirostat_tau=4.0, mirostat_eta=1.1, grammar=grammar)
result = ""
for output in stream:
    result += output['choices'][0]['text']

result = result + "]"

# Check if the result is a string, an array string, or a single ID in an array and convert it to a list of IDs
if isinstance(result, str):
    result_ids = [result.strip('[]')]
elif isinstance(result, list):
    if isinstance(result[0], str):
        result_ids = [json.loads(res) for res in result]
    else:
        result_ids = result
# Print the result
print(result_ids)
import re

# Extract IDs from the potentially broken string using regex
result_ids = re.findall(r'"(.*?)"', result_ids[0])

# Filter the dataframe to only include rows with IDs in the result
filtered_df = df[df['id'].isin(result_ids)]

# Create a categorical type for sorting based on the order in result_ids
filtered_df['id'] = pd.Categorical(filtered_df['id'], categories=result_ids, ordered=True)

# Sort the dataframe based on the 'id' column
filtered_df = filtered_df.sort_values('id')

# Drop the similarity score column
filtered_df = filtered_df.drop(columns=['similarity_score'])

# Display the filtered dataframe as a table with hyperlinks
display(HTML(filtered_df.to_html(escape=False)))
