In [1]:
def cleanup(input_string):
    """Return ``input_string`` with every uppercase ``'N'`` character removed.

    All other characters, including lowercase ``'n'``, are kept as they are.
    """
    # Splitting on 'N' and gluing the pieces back together drops each 'N'.
    pieces = input_string.split('N')
    return ''.join(pieces)

In [2]:
import pandas as pd
import lmdb
import pickle
import ast

# Function to store CSV data in LMDB
def store_csv_in_lmdb(db_name, csv_file, lmdb_path):
    """Write the embeddings from a CSV file into a named LMDB sub-database.

    For every CSV row, the ``sequence`` column (after ``cleanup`` strips its
    ``'N'`` characters) is used as the key, and the ``dna_embedding`` column --
    a string holding a Python literal -- is parsed and pickled as the value.

    Args:
        db_name: Name of the sub-database inside the LMDB environment.
        csv_file: Path of a CSV file with ``sequence`` and ``dna_embedding``
            columns.
        lmdb_path: Path of the LMDB environment.

    Returns:
        None. Prints the number of CSV rows that were processed.
    """
    # Read CSV file
    df = pd.read_csv(csv_file)

    # Upper bound for the LMDB memory map: 4 TiB, expressed in bytes.
    map_size = 4 * 1024 * 1024 * 1024 * 1024
    env = lmdb.open(lmdb_path, map_size=map_size, max_dbs=10)
    try:
        db_ = env.open_db(str.encode(db_name))

        # A single write transaction covers all rows.
        with env.begin(write=True) as txn:
            # Walk only the two columns that are needed instead of building a
            # Series per row with iterrows().
            for sequence, raw_embedding in zip(df['sequence'], df['dna_embedding']):
                # Use DNA sequence as key
                key = cleanup(sequence).encode()

                # Convert string representation of list to actual list
                embedding = ast.literal_eval(raw_embedding)

                # Serialize embedding as value
                value = pickle.dumps(embedding)

                # Store in LMDB
                txn.put(key, value, db=db_)
    finally:
        # Always release the environment so that a later lmdb.open() on the
        # same path in this process does not collide with a stale handle.
        env.close()

    # NOTE: this is the CSV row count; rows whose cleaned sequences collide
    # share one key, so the database may hold fewer entries.
    print(f"Stored {len(df)} rows in LMDB")

In [3]:
# Function to retrieve a value from LMDB
def get_embedding_from_lmdb(db_name, lmdb_path, dna_sequence):
    """Look up the embedding stored for ``dna_sequence`` in a named sub-database.

    Args:
        db_name: Name of the sub-database inside the LMDB environment.
        lmdb_path: Path of the LMDB environment (opened read-only).
        dna_sequence: Key to look up; it is encoded to bytes as-is, so the
            caller must apply the same normalisation used when storing.

    Returns:
        The unpickled value, or ``None`` when the key is absent (or the stored
        value is empty).

    Raises:
        lmdb.Error: if the named sub-database does not exist (the read-only
            environment cannot create it).
    """
    env = lmdb.open(lmdb_path, readonly=True, max_dbs=10)
    try:
        # Raises if the database doesn't exist.
        db_ = env.open_db(str.encode(db_name))

        with env.begin(write=False) as txn:
            value = txn.get(dna_sequence.encode(), db=db_)
            if not value:
                return None
            # SECURITY: pickle.loads can execute arbitrary code; only read
            # LMDB files that were produced by a trusted writer.
            return pickle.loads(value)
    finally:
        # Close the environment on every path, including the error path, so
        # repeated lookups do not leak handles.
        env.close()

In [4]:
# Example usage: inputs shared by the cells below.
# The CSV is expected to provide the `sequence` and `dna_embedding` columns
# that store_csv_in_lmdb reads.
csv_file  = "dna_embeddings.csv"  # Replace with your CSV file path
lmdb_path = "dna_embeddings_lmdb"  # Replace with desired LMDB database path

In [5]:
# Load the same CSV into two named sub-databases of one LMDB environment.
for target_db in ("database1", "database2"):
    store_csv_in_lmdb(target_db, csv_file, lmdb_path)

Stored 9 rows in LMDB
Stored 9 rows in LMDB


In [6]:
# DNA sequence whose embedding is looked up in the following cells.
dna_sequence = "TTCTCACGTAATGAACATTATAATCTTCTCACTCAAGATG"

In [7]:
# Apply the same cleanup() used when storing so the lookup key matches.
embedding = get_embedding_from_lmdb('database1', lmdb_path, cleanup(dna_sequence))

In [8]:
# Number of elements in the retrieved embedding.
len(embedding)

480

In [9]:
# Asking for a sub-database that was never created raises lmdb.ReadonlyError:
# the environment is opened read-only, so the missing database cannot be created.
# The error is caught and displayed so the notebook still runs top to bottom.
try:
    embedding = get_embedding_from_lmdb('database3', lmdb_path, cleanup(dna_sequence))
except lmdb.ReadonlyError as exc:
    print(f"{type(exc).__name__}: {exc}")

ReadonlyError: mdb_dbi_open: Permission denied

In [10]:
def summarize_lmdb(lmdb_path):
    """Print entry count, page size and tree depth for an LMDB environment.

    The main database is reported first; every key found in it is then
    treated as the name of a sub-database, which is opened and reported the
    same way. The environment is opened read-only and closed before returning.
    """

    def show_stat_lines(stat):
        # The three figures reported for every database.
        print(f"  Entries: {stat['entries']}")
        print(f"  Page size: {stat['psize']} bytes")
        print(f"  Tree depth: {stat['depth']}")

    env = lmdb.open(lmdb_path, readonly=True, max_dbs=10)

    try:
        with env.begin() as main_txn:
            name_cursor = main_txn.cursor()

            print(f"LMDB at {lmdb_path}:")

            # Main (unnamed) database first.
            print("\nDatabase: main")
            show_stat_lines(main_txn.stat())

            # Each key of the main database is taken to be a sub-database name.
            for raw_name in name_cursor.iternext(keys=True, values=False):
                db_name = raw_name.decode('utf-8')
                sub_db = env.open_db(raw_name)

                # Stats are read through a transaction bound to the sub-database.
                with env.begin(db=sub_db) as sub_txn:
                    sub_stat = sub_txn.stat()

                print(f"\nDatabase: {db_name}")
                show_stat_lines(sub_stat)

    finally:
        env.close()

# Usage: report the main database and each named sub-database under lmdb_path.
summarize_lmdb(lmdb_path)

LMDB at dna_embeddings_lmdb:

Database: main
  Entries: 2
  Page size: 4096 bytes
  Tree depth: 1

Database: database1
  Entries: 9
  Page size: 4096 bytes
  Tree depth: 1

Database: database2
  Entries: 9
  Page size: 4096 bytes
  Tree depth: 1
