In [1]:
def cleanup(input_string):
    """Return ``input_string`` with every uppercase ``'N'`` character removed.

    Lowercase ``'n'`` and all other characters are left untouched.
    """
    # Splitting on 'N' drops each occurrence; joining stitches the rest back.
    pieces = input_string.split('N')
    return ''.join(pieces)

In [2]:
import pandas as pd
import lmdb
import pickle
import ast

# Function to store CSV data in LMDB
def store_csv_in_lmdb(csv_file, lmdb_path, map_size=4 * 1024 ** 4):
    """Load a CSV of DNA embeddings and write them into an LMDB database.

    Each row's ``sequence`` column is passed through ``cleanup`` and encoded
    to bytes to form the key; the ``dna_embedding`` column (a string holding
    a Python literal) is parsed with ``ast.literal_eval`` and pickled to form
    the value. Rows whose cleaned sequences collide overwrite one another,
    so the database may end up with fewer entries than the CSV has rows.

    Args:
        csv_file: Path to a CSV file with ``sequence`` and ``dna_embedding``
            columns.
        lmdb_path: Directory path of the LMDB environment to write to.
        map_size: Maximum size in bytes the database may grow to.
            Defaults to 4 TiB (the value previously hard-coded here).

    NOTE(review): LMDB enforces a maximum key size (see
    ``env.max_key_size()``); very long sequences may be rejected -- confirm
    for your data.
    """
    # Read CSV file
    df = pd.read_csv(csv_file)

    # Create LMDB environment
    env = lmdb.open(lmdb_path, map_size=map_size)
    try:
        # A single write transaction: either every row is committed or,
        # if any row raises, none are.
        with env.begin(write=True) as txn:
            for _, row in df.iterrows():
                # Use the cleaned DNA sequence as key
                key = cleanup(row['sequence']).encode()

                # Convert string representation of list to actual list
                embedding = ast.literal_eval(row['dna_embedding'])

                # Serialize embedding as value and store it
                txn.put(key, pickle.dumps(embedding))
    finally:
        # Always release the environment so the same path can be reopened
        # safely later in this process.
        env.close()

    print(f"Stored {len(df)} rows in LMDB")

# Function to retrieve a value from LMDB
def get_embedding_from_lmdb(lmdb_path, dna_sequence):
    """Look up the embedding stored for ``dna_sequence`` in an LMDB database.

    The sequence is normalized with ``cleanup`` before the lookup, matching
    how ``store_csv_in_lmdb`` builds its keys, so callers may pass either the
    raw or the already-cleaned sequence.

    Args:
        lmdb_path: Directory path of the LMDB environment to read from.
        dna_sequence: DNA sequence string identifying the entry.

    Returns:
        The unpickled embedding, or ``None`` when no (non-empty) value is
        stored under that key.
    """
    env = lmdb.open(lmdb_path)
    try:
        with env.begin() as txn:
            # With default settings txn.get returns a bytes copy, so the
            # value remains valid after the transaction ends.
            value = txn.get(cleanup(dna_sequence).encode())
    finally:
        # Release the environment; leaving it open leaks a handle on every call.
        env.close()

    if not value:
        return None
    # SECURITY: pickle.loads can execute arbitrary code; only read databases
    # that were written by trusted code.
    return pickle.loads(value)

# Example usage: both paths are read by later cells, so set them here once.
csv_file = "dna_embeddings.csv"  # Input CSV; replace with your CSV file path
lmdb_path = "dna_embeddings_lmdb"  # LMDB database path; replace with the desired location

# Store CSV in LMDB (prints how many CSV rows were processed)
store_csv_in_lmdb(csv_file, lmdb_path)

Stored 9 rows in LMDB


In [3]:
# Retrieve a specific embedding by its DNA sequence.
# The result is None when the sequence is not present in the database.
dna_sequence = "TTCTCACGTAATGAACATTATAATCTTCTCACTCAAGATG"  # Replace with a DNA sequence from your data
embedding = get_embedding_from_lmdb(lmdb_path, dna_sequence)
# Optional not-found check (disabled); re-enable to report a missing key
# instead of failing later when the result is used:
# if embedding:
#     print("Retrieved embedding:", embedding)
# else:
#     print("DNA sequence not found")

In [4]:
# Number of elements in the retrieved embedding (raises TypeError if the lookup returned None).
len(embedding)

480

In [5]:
# Repeat the lookup with the key normalized by cleanup(), the same way keys are built at store time.
embedding = get_embedding_from_lmdb(lmdb_path, cleanup(dna_sequence))

In [6]:
# Number of elements in the embedding found via the cleaned key (raises TypeError if it was None).
len(embedding)

480

In [7]:
# Open the database and report how many key/value entries it holds.
# NOTE(review): `env` is left open after this cell; call env.close() once no
# later cell needs it, since reopening the same LMDB path in one process
# while it is still open is unsafe.
env = lmdb.open(lmdb_path)

# Read-only transaction; stat() reports database statistics including the entry count.
with env.begin() as txn:
    num_entries = txn.stat()['entries']

print(f"Number of entries: {num_entries}")

Number of entries: 9
