# 🧬 BLAST Search
Purpose: Perform a BLAST-like sequence similarity search using a custom database. Includes database preparation, parameter display, and result export.

In [6]:
import pandas as pd
import numpy as np

from Database_comparator import db_compare

## ⚙️ Configuration

In [None]:
# --- Configuration workbook ---
# Path to the configuration file containing database connection details.
CONFIG_FILE = "_DEFAULT_SIMPLE_config_file.xlsx"

# --- Output ---
# Path to the output file where results will be saved.
OUTPUT_FILE = "Outputs/blast_out.csv"
# Output format: 'csv', 'xlsx', 'tsv' or 'md'.
OUTPUT_FORMAT = "csv"

# --- Query ---
# Path to the file containing query sequences.
QUERY_FILE = "test_query.xlsx"
# Column name in the query file that contains sequences.
QUERY_SEQUENCE_COLUMN = "sequence"

# --- Execution ---
# Number of processors to use for parallel processing.
NUMBER_OF_PROCESSORS = 3
# Index of the database to search against (0-based); check the config file for available databases.
# NOTE(review): no cell shown in this notebook reads this constant — confirm where it is consumed.
INDEX_OF_DATABASE_TO_SEARCH = 0

## 🔧 Advanced Configuration
Warning ⚠️: Advanced configuration options. Please refer to the documentation for detailed information. These settings are optional and can be omitted for standard use cases. These values **will override** the values in the config file.

In [None]:
# Settings passed to the comparator alongside the config file.
# The string keys are kept exactly as written; only the layout is regrouped.
configuration_dict = {
    # --- Query input ---
    # Path to the query database file
    "Query_path": QUERY_FILE,
    # Column name in the query database containing sequences
    "Query_sequence_column": QUERY_SEQUENCE_COLUMN,

    # --- Parallelism ---
    # Number of processors to use for parallel processing
    "Number_of_processors": NUMBER_OF_PROCESSORS,

    # --- BLAST settings ---
    # E-value threshold for BLAST searches
    "Blast_e_value": 0.05,
    # Name of the BLAST database to create
    "Blast_name_of_created_database": "blast_db",
    # Name of the BLAST output file
    "Blast_output_name": "blast_output.txt",
}

## 🧪 Testing

In [None]:
# Build the comparator from the config workbook; configuration_dict carries
# the notebook-level settings defined in the cells above.
db = db_compare.DB_comparator(
    CONFIG_FILE,
    log_tag="BLAST",
    log_project="BLAST Project",
    configuration_dict=configuration_dict,
)

# Start the comparator's test routine before any search is run.
db.test.start()

# Print the comparator's text representation.
print(db)

## 🔍 Run BLAST Workflow
BLAST search on all databases defined in the config file.

In [9]:
# Report the BLAST setup first. The saved output below lists the inserted
# databases, the scoring matrix, gap penalties, word threshold and hit window.
db.blast.blast_database_info() # Provides information about the BLAST database

# NOTE(review): the name passed here ("BLAST_Database") differs from the
# "Blast_name_of_created_database" value ("blast_db") set in configuration_dict —
# confirm which name is intended.
db.blast.blast_make_database(name="BLAST_Database") # Creates BLAST database
# Run the search. Presumably this fills one result column per database, as in the
# saved output's "Databases/test_database.csv" column — TODO confirm.
db.blast.blast_search_and_analyze_matches_in_database() # Query is input database

Inserted databases:
{'path': 'Databases/test_database.csv', 'sequence_column_name': 'sequence', 'results_column': 'Databases/test_database.csv', 'identifier_of_seq': ['Value_identifier', 'Text_identifier']}
-----------------------------------------------------
Matrix: BLOSUM62
GaPenalties: Existence: 11, Extension: 1
Neighboring words threshold: 11
Window for multiple hits: 40


Unnamed: 0,sequence,info,Databases/test_database.csv
0,CPTGGAQGKHIPQSF,62,[seq: CPTGGAQGKHIPQSF identifier:(Value_identi...
1,CKASIPQGTHGGQPF,6,[seq: CKASIPQGTHGGQPF identifier:(Value_identi...
2,CPIQGHPASQGGKTF,83,[seq: CPIQGHPASQGGKTF identifier:(Value_identi...
3,CATGIHGQSQPKPGF,22,[seq: CATGIHGQSQPKPGF identifier:(Value_identi...
4,CPKTGQQSAHGGPIF,73,[seq: CPKTGQQSAHGGPIF identifier:(Value_identi...
...,...,...,...
95,CQHQTAPKIPSGGGF,38,[seq: CQHQTAPKIPSGGGF identifier:(Value_identi...
96,CQKPPGTGGHQISAF,50,[seq: CQKPPGTGGHQISAF identifier:(Value_identi...
97,CQGAIHSKGQPTGPF,50,[seq: CQGAIHSKGQPTGPF identifier:(Value_identi...
98,CGGTPQSQHAIPKGF,43,[seq: CGGTPQSQHAIPKGF identifier:(Value_identi...


## 📤 Export and Display Results

In [10]:
# Write the result frame to OUTPUT_FILE in the format chosen in the configuration cell.
db.export_data_frame(output_file=OUTPUT_FILE, data_format=OUTPUT_FORMAT)

# Read the exported file back with a reader that matches OUTPUT_FORMAT.
# The read-back used to be hard-wired to pd.read_csv, which breaks or mis-parses
# the file as soon as OUTPUT_FORMAT is set to 'xlsx', 'tsv' or 'md'.
if OUTPUT_FORMAT == "csv":
    display(pd.read_csv(OUTPUT_FILE))
elif OUTPUT_FORMAT == "tsv":
    # assumes the 'tsv' export is tab-delimited — TODO confirm against export_data_frame
    display(pd.read_csv(OUTPUT_FILE, sep="\t"))
elif OUTPUT_FORMAT == "xlsx":
    display(pd.read_excel(OUTPUT_FILE))
elif OUTPUT_FORMAT == "md":
    # A Markdown export is not a tabular file pandas can parse; show its text as-is.
    with open(OUTPUT_FILE, encoding="utf-8") as exported_file:
        print(exported_file.read())
else:
    raise ValueError(
        f"Unsupported OUTPUT_FORMAT {OUTPUT_FORMAT!r}; expected 'csv', 'xlsx', 'tsv' or 'md'."
    )

Unnamed: 0,sequence,info,Databases/test_database.csv
0,CPTGGAQGKHIPQSF,62,[seq: CPTGGAQGKHIPQSF identifier:(Value_identi...
1,CKASIPQGTHGGQPF,6,[seq: CKASIPQGTHGGQPF identifier:(Value_identi...
2,CPIQGHPASQGGKTF,83,[seq: CPIQGHPASQGGKTF identifier:(Value_identi...
3,CATGIHGQSQPKPGF,22,[seq: CATGIHGQSQPKPGF identifier:(Value_identi...
4,CPKTGQQSAHGGPIF,73,[seq: CPKTGQQSAHGGPIF identifier:(Value_identi...
...,...,...,...
95,CQHQTAPKIPSGGGF,38,[seq: CQHQTAPKIPSGGGF identifier:(Value_identi...
96,CQKPPGTGGHQISAF,50,[seq: CQKPPGTGGHQISAF identifier:(Value_identi...
97,CQGAIHSKGQPTGPF,50,[seq: CQGAIHSKGQPTGPF identifier:(Value_identi...
98,CGGTPQSQHAIPKGF,43,[seq: CGGTPQSQHAIPKGF identifier:(Value_identi...
