In [17]:
import pandas as pd
import subprocess
import os

In [18]:
def blastn_dic(path_input, path_output):
    """
    Creation of a nucleotide BLAST database from a FASTA file (e.g. a whole genome). It uses the
    BLAST :sup:`R` command line (``makeblastdb``), see BLAST `Command Line Application User Manual`_
    for more information.

    ``makeblastdb`` is called with ``-out path_output``, so the database files are written using
    ``path_output`` as their base path. The folder part of ``path_output`` is not created here.
    It is recommended to use a dedicated folder for the database so its files are kept together.

    The command is run without a shell, so paths containing spaces are passed through intact.
    All ``makeblastdb`` output (stdout and stderr) is discarded.

    :param path_input: path to a FASTA file.
    :type path_input: string

    :param path_output: base path (folder plus file-name prefix) handed to ``makeblastdb -out``.
    :type path_output: string

    :return: nothing is returned. If ``makeblastdb`` cannot be launched or exits with a non-zero
        status, an error message is printed instead of raising.
    :rtype: None. On disk: multiple files (**.nhr**, **.nin**, **.nog**, **.nsd**, **.nsi** and **.nsq** extensions)
    """

    # Remember: it is "path.input.dic_path" for "argparse".
    # "parse_seqids" is used to keep the sequence ID in the output.
    cmd = [
        "makeblastdb",
        "-in", str(path_input),
        "-dbtype", "nucl",
        "-parse_seqids",
        "-out", str(path_output),
    ]
    try:
        # check=True is what turns a non-zero exit status into an exception; without it
        # a failed run would go unnoticed and the message below would never be printed.
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: makeblastdb ran but failed.
        # OSError (e.g. FileNotFoundError): makeblastdb could not be launched at all.
        print("\nError: Blast Dictionary couldn't be created")
        

In [19]:
# Resolve the genome FASTA location ("~" is expanded to the user's home directory).
genome_path = os.path.expanduser(
    "~/Documents/Work_CBMSO/Bringaud_testing/0.1.data/LmjF_V4.0_20040630_SIMPLE_NAMES.fasta"
)
genome_dir, genome_name = os.path.split(genome_path)

# The database goes into its own sub-folder next to the FASTA file.
dict_folder = os.path.join(genome_dir, "blastn_dict_simple_names")
os.makedirs(dict_folder, exist_ok=True)

# The database base path reuses the FASTA file name inside that sub-folder.
genome_save_path = os.path.join(dict_folder, genome_name)
blastn_dic(genome_path, genome_save_path)

In [20]:
def get_sequence(start_coor, end_coor, strand, chromosome, path_genome):
    """
    Extract a sub-sequence from a BLAST database with the ``blastdbcmd`` command line tool.

    The tool is invoked directly (no shell, no hard-coded interpreter path), so it only needs
    ``blastdbcmd`` to be reachable through ``PATH`` and database paths containing spaces are safe.

    :param start_coor: first coordinate of the range, passed as ``-range start_coor-end_coor``.
    :type start_coor: int

    :param end_coor: last coordinate of the range, passed as ``-range start_coor-end_coor``.
    :type end_coor: int

    :param strand: value passed to ``-strand``.
    :type strand: string

    :param chromosome: sequence identifier passed to ``-entry``.
    :type chromosome: string

    :param path_genome: BLAST database path passed to ``-db``.
    :type path_genome: string

    :return: the standard output of ``blastdbcmd`` with surrounding whitespace removed. The exit
        status is not checked, so whatever the tool printed on stdout (possibly nothing) is returned.
    :rtype: string

    :raises FileNotFoundError: if the ``blastdbcmd`` executable cannot be found.
    """
    cmd = [
        "blastdbcmd",
        "-db", str(path_genome),
        "-entry", str(chromosome),
        "-range", f"{start_coor}-{end_coor}",
        "-strand", str(strand),
        "-outfmt", "%s",  # "%s" asks blastdbcmd for the sequence data only
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    return result.stdout.strip()


In [5]:
# Load the table of elements that were not captured (first output vs. Bringaud).
not_captured = pd.read_csv('../not_captured-first_output_vs_bringaud.csv', sep=',', header=0)

# Quick sanity summary: dimensions first, then the column dtypes.
for summary in (not_captured.shape, not_captured.dtypes):
    print(summary)

# Preview of the first rows (rendered as the cell output).
not_captured.head()

(400, 3)
sseqid    object
sstart     int64
send       int64
dtype: object


Unnamed: 0,sseqid,sstart,send
0,LmjF.01,226657,226940
1,LmjF.05,90841,91072
2,LmjF.06,57422,57917
3,LmjF.06,178220,178503
4,LmjF.08,54807,55176


In [None]:
# Update DataFrame with the 'sseq' column.
# `not_captured` only has the columns 'sseqid', 'sstart' and 'send', so those are the ones read
# per row; the database is the one whose base path is stored in `genome_save_path`.
# NOTE(review): the table carries no strand column, so the plus strand is assumed — confirm.
not_captured['sseq'] = not_captured.apply(
    lambda row: get_sequence(row['sstart'], row['send'], 'plus', row['sseqid'], genome_save_path),
    axis=1,
)