# Fasta to LMDB

This notebook provides a simple utility to convert a `fasta` file to an `lmdb` (Lightning Memory-Mapped Database) file, which offers higher-performance reads and writes when training a model than loading data directly from a `fasta` file.

Specifically, we run this notebook on the three `fasta` files (train, validation, and test) created in notebook 1, which contain all sequences with a sequence length < 1024.

Running this notebook is required before the TAPE models can be run on subcellular location prediction, which is shown in notebook 3 on Google Colab.

In [17]:
from collections import defaultdict
from pathlib import Path
import pickle
import os
import sys

from Bio import SeqIO
import lmdb

In [18]:
# Per-split record buffers. Each one starts empty and is filled in place
# with one dict per parsed sequence.
FASTA_TRAIN = []
FASTA_VALID = []
FASTA_TEST = []

In [19]:
# Function to read and parse a fasta file and extract the sequence ID, label, and sequence
def read_fasta(fasta_path, fasta_list):
    """Parse a FASTA file and append one record dict per sequence to ``fasta_list``.

    The label is taken from the second whitespace-separated field of each
    record's description line (the first field being the sequence ID).

    Args:
        fasta_path: Path of the FASTA file; ``../`` is prepended before opening.
        fasta_list: List extended in place with dicts holding the keys
            ``'id'``, ``'label'`` and ``'primary'`` (the sequence as a string).

    Raises:
        IndexError: If a record's description has no label field after the ID.
    """
    fasta_file = f'../{fasta_path}'
    for record in SeqIO.parse(fasta_file, 'fasta'):
        header_fields = record.description.split()
        if len(header_fields) < 2:
            # Same exception type a bare ``[1]`` lookup would raise, but the
            # message identifies the offending record.
            raise IndexError(
                f'FASTA record {record.id!r} in {fasta_file} has no label in its header'
            )
        fasta_list.append({
            'id': record.id,
            'label': header_fields[1],
            'primary': str(record.seq),
        })

In [20]:
# Function to write a key/value pair to a local LMDB database
def write_record_to_lmdb(db, key, value):
    """Write one ``(key, value)`` pair to ``db``, growing the map when it is full.

    Each attempt runs in its own write transaction. If LMDB raises
    ``MapFullError`` the map size is doubled and the write is retried until it
    succeeds. Any other exception propagates to the caller.

    Args:
        db: Open LMDB environment to write into.
        key: Record key (bytes).
        value: Record value (bytes).
    """
    while True:
        try:
            # As a context manager the transaction commits on success and
            # aborts on *any* exception, so a failed write never leaves the
            # write transaction dangling.
            with db.begin(write=True) as txn:
                txn.put(key, value)
            return
        except lmdb.MapFullError:
            # The failed transaction has already been aborted at this point,
            # so the map can be resized before retrying.
            new_limit = db.info()['map_size'] * 2
            print(f'Doubling LMDB map size to {new_limit}')
            db.set_mapsize(new_limit)

In [21]:
# Given the sequence ID, label, and sequence for each protein, write the records to an LMDB file
def write_sc_lmdb(split, fasta_list):
    """Write every record of ``fasta_list`` to ``../data/deeploc/deeploc_{split}.lmdb``.

    Each record is pickled and stored under its list index (encoded as
    bytes); the number of records is stored pickled under ``b'num_examples'``.

    Args:
        split: Split name, used to build the database path.
        fasta_list: Sequence of picklable records to store.
    """
    # Initial map size: ten times the pickled size of the whole list.
    # write_record_to_lmdb doubles the map if this turns out to be too small.
    map_size = sys.getsizeof(pickle.dumps(fasta_list)) * 10
    # Close the environment when done: py-lmdb warns against having the same
    # database open more than once in one process, and read_sc_lmdb reopens
    # this path right after writing.
    with lmdb.open(f'../data/deeploc/deeploc_{split}.lmdb', map_size=map_size) as env:
        for index, record in enumerate(fasta_list):
            write_record_to_lmdb(env, str(index).encode(), pickle.dumps(record))
        write_record_to_lmdb(env, b'num_examples', pickle.dumps(len(fasta_list)))

In [22]:
# Utility function to read sequences from an LMDB file to verify that the files were converted correctly
def read_sc_lmdb(split, verbose_flag=False):
    """Print the record count (and optionally the label counts) of a split's LMDB file.

    Opens ``../data/deeploc/deeploc_{split}.lmdb``, reads the pickled value
    stored under ``b'num_examples'`` and prints it.

    Args:
        split: Split name, used to build the database path.
        verbose_flag: When true, additionally unpickle every record (keys
            ``b'0'`` .. ``b'<num_examples - 1>'``) and print how many records
            carry each ``'label'`` value.
    """
    # NOTE: pickle.loads can execute arbitrary code; only point this at
    # databases you created yourself.
    # The environment is closed on exit so repeated calls do not leak handles.
    with lmdb.open(f'../data/deeploc/deeploc_{split}.lmdb') as env, \
            env.begin(write=False) as txn:
        num_examples = pickle.loads(txn.get(b'num_examples'))
        print(f'{split} has num_examples={num_examples}')

        if verbose_flag:
            label_counts = defaultdict(int)
            for index in range(num_examples):
                item = pickle.loads(txn.get(str(index).encode()))
                label_counts[item['label']] += 1
            print(label_counts)

In [23]:
# Create the output directory for the LMDB files (shell command run via IPython's `!`).
# `-p` creates missing parent directories and is a no-op if the directory already exists.
!mkdir -p ../data/deeploc

In [24]:
# Create LMDB files for train/valid/test
MAX_SEQ_LENGTH = 1024

# One pass per split: parse the FASTA file, write the LMDB file, then read it
# back in verbose mode to check the record and label counts.
for split, fasta_list in (
    ('train', FASTA_TRAIN),
    ('valid', FASTA_VALID),
    ('test', FASTA_TEST),
):
    # read_fasta appends to the list, so empty it first; otherwise re-running
    # this cell would store every record twice.
    fasta_list.clear()
    read_fasta(f'data/deeploc_{split}_{MAX_SEQ_LENGTH}.fasta', fasta_list)
    write_sc_lmdb(split, fasta_list)
    read_sc_lmdb(split, True)

train has num_examples=9097
defaultdict(<class 'int'>, {'6': 2532, '0': 854, '5': 1073, '9': 1381, '1': 1559, '3': 242, '4': 217, '8': 536, '2': 592, '7': 111})
valid has num_examples=1011
defaultdict(<class 'int'>, {'6': 281, '9': 154, '5': 119, '0': 95, '8': 60, '3': 27, '1': 173, '2': 66, '4': 24, '7': 12})
test has num_examples=2462
defaultdict(<class 'int'>, {'0': 224, '1': 424, '2': 162, '3': 63, '4': 57, '5': 299, '6': 675, '7': 28, '8': 147, '9': 383})
