# 🧬 Exact Match Search
Purpose: Perform an exact match search across all configured databases and export the results in the selected format.

In [1]:
import pandas as pd
import numpy as np
from Database_comparator import db_compare

ModuleNotFoundError: No module named 'Database_comparator'

## ⚙️ Configuration

In [None]:
# --- Inputs ---------------------------------------------------------------
# Configuration file containing the database connection details
CONFIG_FILE = "_DEFAULT_SIMPLE_config_file.xlsx"
# File containing the query sequences
QUERY_FILE = "test_query.xlsx"
# Name of the column in the query file that holds the sequences
QUERY_SEQUENCE_COLUMN = "sequence"

# --- Outputs --------------------------------------------------------------
# File the results are saved to
OUTPUT_FILE = "exact_out.csv"
# One of 'csv', 'xlsx', 'tsv' or 'md'
OUTPUT_FORMAT = "csv"

# --- Execution ------------------------------------------------------------
# Number of processors used for parallel processing
NUMBER_OF_PROCESSORS = 3
# 0-based index of the database to search; see the config file for the
# list of available databases
INDEX_OF_DATABASE_TO_SEARCH = 0

## 🔧 Advanced Configuration
Warning ⚠️: Advanced configuration options. Please refer to the documentation for detailed information. These settings are optional and can be omitted for standard use cases. These values **will override** the values in the config file.

In [None]:
# Overrides handed to the comparator alongside the config file.
# Built key by key so each override can be commented out on its own.
configuration_dict = {}

# Query database: path to the file and the column containing the sequences
configuration_dict["Query_path"] = QUERY_FILE
configuration_dict["Query_sequence_column"] = QUERY_SEQUENCE_COLUMN

# Number of processors to use for parallel processing
configuration_dict["Number_of_processors"] = NUMBER_OF_PROCESSORS

## 🧪 Testing

In [None]:
# Build the comparator from the config file plus the overrides defined above.
db = db_compare.DB_comparator(
    CONFIG_FILE,
    log_tag="Exact_match",
    log_project="ExactMatch Project",
    configuration_dict=configuration_dict,
)

# Run the package's self-test, then print the comparator's text representation.
# NOTE(review): the captured output shows a test-status table; confirm whether
# it is emitted by start() or by print(db).
db.test.start()
print(db)

Initialization test failed with error: [Errno 2] No such file or directory: 'TMP_testing_folder/test_config_file.txt'
╒════════════════════════╤═══════════╤═══════════════════╤══════════════════════╕
│ Test Name              │ Status    │ File Comparison   │ Execution Time (s)   │
╞════════════════════════╪═══════════╪═══════════════════╪══════════════════════╡
│ Initialization Test    │ ❌ Failed │ N/A               │ N/A                  │
├────────────────────────┼───────────┼───────────────────┼──────────────────────┤
│ Exporting Test         │ ❌ Failed │ N/A               │ N/A                  │
├────────────────────────┼───────────┼───────────────────┼──────────────────────┤
│ Exact Match Test       │ ❌ Failed │ ❌ Not compared   │ N/A                  │
├────────────────────────┼───────────┼───────────────────┼──────────────────────┤
│ Hamming Distances Test │ ❌ Failed │ ❌ Not compared   │ N/A                  │
├────────────────────────┼───────────┼───────────────────┼──────────

## 🔍 Run Exact Match Search
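Run the search on the database with the specified index (see the config file for the available databases). To search all databases instead, comment out the first call in the cell below and uncomment the `exact_match_search_in_all_databases` call at the end of that cell.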

In [None]:
# Search only the database selected by INDEX_OF_DATABASE_TO_SEARCH.
# parallel=True enables multiprocessing.
db.exact_match.exact_match_search_in_single_database(
    database_index=INDEX_OF_DATABASE_TO_SEARCH,
    parallel=True,
)

# Alternative: search every configured database. To use it, comment out the
# call above and uncomment the line below (parallel=True enables multiprocessing).
# db.exact_match.exact_match_search_in_all_databases(parallel=True)

## 📤 Export and Display Results

In [None]:
# Write the search results to OUTPUT_FILE in the selected OUTPUT_FORMAT.
db.export_data_frame(output_file=OUTPUT_FILE, data_format=OUTPUT_FORMAT)

# Preview the exported file with a reader that matches OUTPUT_FORMAT.
# pd.read_csv with default settings only parses the 'csv' case correctly; it
# mis-parses 'tsv' and cannot read 'xlsx' or 'md' files.
preview_format = OUTPUT_FORMAT.lower()
if preview_format == "csv":
    display(pd.read_csv(OUTPUT_FILE))
elif preview_format == "tsv":
    # assumes the exporter writes tab-separated values for 'tsv' — TODO confirm
    display(pd.read_csv(OUTPUT_FILE, sep="\t"))
elif preview_format == "xlsx":
    display(pd.read_excel(OUTPUT_FILE))
elif preview_format == "md":
    # Markdown export is plain text, so show it as-is.
    # assumes the exporter writes UTF-8 — TODO confirm
    with open(OUTPUT_FILE, encoding="utf-8") as exported_file:
        print(exported_file.read())
else:
    raise ValueError(
        f"Cannot preview OUTPUT_FORMAT={OUTPUT_FORMAT!r}; "
        "expected 'csv', 'xlsx', 'tsv' or 'md'."
    )

Unnamed: 0,sequence,info,Databases/test_database.csv
0,CPTGGAQGKHIPQSF,62,[seq: CPTGGAQGKHIPQSF (Value_identifier: 8) (T...
1,CKASIPQGTHGGQPF,6,[seq: CKASIPQGTHGGQPF (Value_identifier: 10) (...
2,CPIQGHPASQGGKTF,83,[seq: CPIQGHPASQGGKTF (Value_identifier: 16) (...
3,CATGIHGQSQPKPGF,22,[seq: CATGIHGQSQPKPGF (Value_identifier: 20) (...
4,CPKTGQQSAHGGPIF,73,[seq: CPKTGQQSAHGGPIF (Value_identifier: 16) (...
...,...,...,...
95,CQHQTAPKIPSGGGF,38,[seq: CQHQTAPKIPSGGGF (Value_identifier: 388) ...
96,CQKPPGTGGHQISAF,50,[seq: CQKPPGTGGHQISAF (Value_identifier: 392) ...
97,CQGAIHSKGQPTGPF,50,[seq: CQGAIHSKGQPTGPF (Value_identifier: 396) ...
98,CGGTPQSQHAIPKGF,43,[seq: CGGTPQSQHAIPKGF (Value_identifier: 400) ...
