# 🧬 Aligner Search
Purpose: Run an alignment-based search against one configured database (or, optionally, all of them) using multiprocessing, then export the results in the selected file format.

In [None]:
import pandas as pd
import numpy as np

from Database_comparator import db_compare

## ⚙️ Configuration

In [2]:
# Configuration file with the database connection details.
CONFIG_FILE = "_DEFAULT_SIMPLE_config_file.xlsx"

# Result file and its format. Supported formats: 'csv', 'xlsx', 'tsv' or 'md'.
OUTPUT_FILE = "aligner_out.csv"
OUTPUT_FORMAT = "csv"

# Query input: the file holding the query sequences and the column that stores them.
QUERY_FILE = "test_query.xlsx"
QUERY_SEQUENCE_COLUMN = "sequence"

# Number of processors used for parallel processing.
NUMBER_OF_PROCESSORS = 3

# 0-based index of the database to search; the config file lists the available databases.
INDEX_OF_DATABASE_TO_SEARCH = 0

## 🔧 Advanced Configuration
Warning ⚠️: These are advanced configuration options; please refer to the documentation for details. The settings are optional and can be omitted for standard use cases. Any value set here **will override** the corresponding value in the config file.

In [None]:
# Overrides handed to the comparator; key names and their order are kept exactly.
configuration_dict = dict(
    # --- Query input ---
    Query_path=QUERY_FILE,                        # file with the query sequences
    Query_sequence_column=QUERY_SEQUENCE_COLUMN,  # column in that file holding the sequences

    # --- Parallel processing ---
    Number_of_processors=NUMBER_OF_PROCESSORS,    # number of processors to use

    # --- Aligner settings ---
    Aligner_tolerance=0.9,       # tolerance between 0 and 1; higher means more similar
    Aligner_gap_score=-1000,     # penalty for gaps in the alignment
    Aligner_mismatch_score=-1,   # penalty for mismatches in the alignment
    Aligner_match_score=2,       # score for matches in the alignment
    Aligner_matrix=None,         # scoring matrix (e.g., BLOSUM62, PAM250); None uses the match/mismatch scores
    Aligner_mode="global",       # alignment mode: 'global' or 'local'
)

## 🧪 Testing

In [4]:
# Build the comparator from the config file plus the overrides defined above.
# The same `db` object is reused by the search and export cells further down.
db = db_compare.DB_comparator(
    config_file=CONFIG_FILE,
    log_tag="Aligner",
    log_project="Aligner Project",
    configuration_dict=configuration_dict,
)

# Run the package's built-in tests, then print the comparator.
db.test.start()
print(db)

╒════════════════════════╤════════════╤═══════════════════╤══════════════════════╕
│ Test Name              │ Status     │ File Comparison   │   Execution Time (s) │
╞════════════════════════╪════════════╪═══════════════════╪══════════════════════╡
│ Initialization Test    │ ✅ Success │ N/A               │                 0.55 │
├────────────────────────┼────────────┼───────────────────┼──────────────────────┤
│ Exporting Test         │ ✅ Success │ N/A               │                 0.12 │
├────────────────────────┼────────────┼───────────────────┼──────────────────────┤
│ Exact Match Test       │ ✅ Success │ ✅ Match          │                 0.09 │
├────────────────────────┼────────────┼───────────────────┼──────────────────────┤
│ Hamming Distances Test │ ✅ Success │ ✅ Match          │                 5.22 │
├────────────────────────┼────────────┼───────────────────┼──────────────────────┤
│ Aligner Test           │ ✅ Success │ ✅ Match          │                 1.26 │
├───────────

## 🔍 Run Aligner Search
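Run the search against the database selected by `INDEX_OF_DATABASE_TO_SEARCH` (check the config file for the available databases). To search all databases instead, comment out the single-database call in the cell below and uncomment the `aligner_search_in_all_databases` line.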

In [16]:
# Search only the database selected by INDEX_OF_DATABASE_TO_SEARCH.
# parallel=True enables multiprocessing.
db.aligner.aligner_search_in_single_database(
    database_index=INDEX_OF_DATABASE_TO_SEARCH,
    parallel=True,
)

# To search all databases instead, comment out the call above and uncomment the line below
# db.aligner.aligner_search_in_all_databases(parallel=True) # Multiprocessing enabled (parallel=True)

## 📤 Export and Display Results

In [17]:
# Write the search results to OUTPUT_FILE in the configured format.
db.export_data_frame(output_file=OUTPUT_FILE, data_format=OUTPUT_FORMAT)

# Read the export back with a reader that matches OUTPUT_FORMAT. Previously this was
# always pd.read_csv, which fails (xlsx) or shows a garbled frame (tsv, md) for
# every format other than 'csv'.
output_format = str(OUTPUT_FORMAT).lower()
if output_format == "xlsx":
    display(pd.read_excel(OUTPUT_FILE))
elif output_format == "tsv":
    # NOTE(review): assumes the exporter writes tab-separated values — confirm.
    display(pd.read_csv(OUTPUT_FILE, sep="\t"))
elif output_format == "md":
    # A Markdown table is not a tabular file pandas can parse; show the raw text.
    # NOTE(review): assumes the file was written as UTF-8 — confirm.
    with open(OUTPUT_FILE, encoding="utf-8") as exported_file:
        print(exported_file.read())
else:
    # 'csv' (and any other value) keeps the original behaviour.
    display(pd.read_csv(OUTPUT_FILE))

Unnamed: 0,sequence,info,Databases/test_database.csv
0,CPTGGAQGKHIPQSF,62,[seq: CPTGGAQGKHIPQSF (Value_identifier: 8) (T...
1,CKASIPQGTHGGQPF,6,[seq: CKASIPQGTHGGQPF (Value_identifier: 10) (...
2,CPIQGHPASQGGKTF,83,[seq: CPIQGHPASQGGKTF (Value_identifier: 16) (...
3,CATGIHGQSQPKPGF,22,[seq: CATGIHGQSQPKPGF (Value_identifier: 20) (...
4,CPKTGQQSAHGGPIF,73,[seq: CPKTGQQSAHGGPIF (Value_identifier: 16) (...
...,...,...,...
95,CQHQTAPKIPSGGGF,38,[seq: CQHQTAPKIPSGGGF (Value_identifier: 388) ...
96,CQKPPGTGGHQISAF,50,[seq: CQKPPGTGGHQISAF (Value_identifier: 392) ...
97,CQGAIHSKGQPTGPF,50,[seq: CQGAIHSKGQPTGPF (Value_identifier: 396) ...
98,CGGTPQSQHAIPKGF,43,[seq: CGGTPQSQHAIPKGF (Value_identifier: 400) ...
