In [None]:
import os

# Name of the project root folder that must appear somewhere in the working directory.
PROJECT_DIR_NAME = 'HIV_pipeline_main'

current_dir = os.getcwd()

# str.rfind returns -1 when the folder name is absent. Slicing with -1 would
# silently drop the last character of the path and produce a bogus directory,
# so stop here with a message that names the real problem.
project_marker_index = current_dir.rfind(PROJECT_DIR_NAME)
if project_marker_index == -1:
    raise RuntimeError(
        f"Cannot find '{PROJECT_DIR_NAME}' in the working directory {current_dir!r}; "
        "start the notebook from inside the project folder."
    )

# Everything up to (not including) the last occurrence of the project folder name.
project_parent_dir = current_dir[:project_marker_index]

# The working directory is switched to each config folder right before importing
# from it (presumably so the bare module names resolve from the current directory).
config_database_dir = os.path.join(project_parent_dir, PROJECT_DIR_NAME, 'config', 'general')
os.chdir(config_database_dir)
from db_operations import db_wrapper, extract_table
from data_uploader import upload_df_to_table
from user_prompter import data_upload_and_header_matching
from stats_plotter import plot_distribution, calculate_stats

# NOTE: the working directory is left at config/seq after this cell has run.
config_seq_dir = os.path.join(project_parent_dir, PROJECT_DIR_NAME, 'config', 'seq')
os.chdir(config_seq_dir)
from mafft_mac_installer import install_and_activate_mafft
from qc import process_sequences, categorize_hiv_typing, categorize_hiv1_subtyping
from hiv_typing_alignment_worker import perform_hiv_typing
from hiv_subtyping_alignment_worker import perform_hiv_subtyping
from parallel_alignment_processor import process_sequence_alignment_parallel

In [None]:
# Database connection settings.
# Each value can be overridden with an environment variable so that credentials
# do not have to be edited into (and committed with) the notebook.
# NOTE(review): the literal defaults, including the password, are kept only so
# existing setups keep working; consider rotating that password and dropping
# the default once SWE_DB_PASSWORD is set wherever the notebook runs.
database = os.environ.get("SWE_DB_NAME", "swe_db")
user = os.environ.get("SWE_DB_USER", "raeuf")
password = os.environ.get("SWE_DB_PASSWORD", "SWE_db_raeuf_pw")
host = os.environ.get("SWE_DB_HOST", "localhost")
# The port stays a string, exactly as it was passed to the helpers before.
port = os.environ.get("SWE_DB_PORT", "5432")

In [None]:
def process_sequence_data(database, user, password, host, port):
    """
    Run the sequence pipeline: data upload, QC, HIV typing, HIV-1 subtyping and DB upload.

    Any exception raised along the way is caught and printed as
    "Error occurred: ..."; the function then returns whatever had been produced
    up to that point, leaving every later element as None. Nothing after the
    data upload step runs when that step yields None or an empty table.

    Args:
        database (str): The name of the database.
        user (str): The username for database access.
        password (str): The password for database access.
        host (str): The host address of the database.
        port: The port of the database; forwarded unchanged to the DB helpers.

    Returns:
        tuple: Eleven elements, in this order:
            - processed_rows: first value returned by data_upload_and_header_matching().
            - filtered_rows: second value returned by data_upload_and_header_matching().
            - sequence_processing_result: first value returned by process_sequences().
            - post_qc_sequences_df: second value returned by process_sequences().
            - typed_hiv_sequences_df: result of the typing alignment run on 'seq_cleaned'.
            - categorized_hiv_typing_results: output of categorize_hiv_typing(); its
              'hiv1_df' entry is the input of the subtyping run.
            - hiv1_subtyped_sequences_df: result of the subtyping alignment run on
              'extracted_pol_query_seq_cleaned'.
            - categorized_hiv1_subtyping_results: first value returned by categorize_hiv1_subtyping().
            - known_hiv1_subtypes: second value returned by categorize_hiv1_subtyping();
              this is what gets uploaded to the 'seq' table.
            - uploaded_sequences: first element of the upload result.
            - not_uploaded_sequences: second element of the upload result.
    """
    # The connection parameters always travel together, in this order.
    connection = (database, user, password, host, port)

    # Start every output as None so a partial run still returns a full tuple.
    processed_rows = filtered_rows = None
    sequence_processing_result = post_qc_sequences_df = None
    typed_hiv_sequences_df = categorized_hiv_typing_results = None
    hiv1_subtyped_sequences_df = categorized_hiv1_subtyping_results = None
    known_hiv1_subtypes = None
    uploaded_sequences = not_uploaded_sequences = None

    try:
        db_wrapper(*connection)
        processed_rows, filtered_rows = data_upload_and_header_matching()

        # The remaining stages only make sense with a non-empty input table.
        has_input = processed_rows is not None and not processed_rows.empty
        if has_input:
            # Stage 1: quality control.
            sequence_processing_result, post_qc_sequences_df = process_sequences(processed_rows)

            # Stage 2: HIV typing against the type reference sequences.
            typing_references = extract_table(*connection, 'hiv_type_ref_seq')
            mafft_path = install_and_activate_mafft()
            typed_hiv_sequences_df = process_sequence_alignment_parallel(
                post_qc_sequences_df,
                typing_references,
                'seq_cleaned',
                perform_hiv_typing,
                mafft_path,
            )
            # A string coming back from the alignment step is treated as an error message.
            if isinstance(typed_hiv_sequences_df, str):
                raise ValueError(f"Error: {typed_hiv_sequences_df}")

            # Stage 3: HIV-1 subtyping of the sequences categorized as HIV-1.
            categorized_hiv_typing_results = categorize_hiv_typing(typed_hiv_sequences_df)
            subtyping_references = extract_table(*connection, table_name='hiv_subtype_con_ref_seq')
            hiv1_subtyped_sequences_df = process_sequence_alignment_parallel(
                categorized_hiv_typing_results['hiv1_df'],
                subtyping_references,
                'extracted_pol_query_seq_cleaned',
                perform_hiv_subtyping,
                mafft_path,
            )
            if isinstance(hiv1_subtyped_sequences_df, str):
                raise ValueError(f"Error: {hiv1_subtyped_sequences_df}")

            # Stage 4: upload the sequences with a known subtype to the 'seq' table.
            (categorized_hiv1_subtyping_results,
             known_hiv1_subtypes) = categorize_hiv1_subtyping(hiv1_subtyped_sequences_df)
            upload_outcome = upload_df_to_table(*connection, table_name='seq', df=known_hiv1_subtypes)
            # Here too a string result is an error message, raised as-is.
            if isinstance(upload_outcome, str):
                raise ValueError(upload_outcome)
            uploaded_sequences, not_uploaded_sequences = upload_outcome
    except Exception as exc:
        # Best effort: report the failure and fall through to return partial results.
        print(f"Error occurred: {str(exc)}")

    return (
        processed_rows,
        filtered_rows,
        sequence_processing_result,
        post_qc_sequences_df,
        typed_hiv_sequences_df,
        categorized_hiv_typing_results,
        hiv1_subtyped_sequences_df,
        categorized_hiv1_subtyping_results,
        known_hiv1_subtypes,
        uploaded_sequences,
        not_uploaded_sequences,
    )


In [None]:
# Run the whole pipeline once and unpack its eleven results into notebook-level
# names, which the cells below inspect one by one.
# process_sequence_data catches and prints its own errors, so any stage that did
# not run (or failed) leaves its name bound to None here.
(processed_rows, filtered_rows, sequence_processing_result, post_qc_sequences_df, 
 typed_hiv_sequences_df, categorized_hiv_typing_results, hiv1_subtyped_sequences_df, 
 categorized_hiv1_subtyping_results, known_hiv1_subtypes, uploaded_sequences, 
 not_uploaded_sequences) = process_sequence_data(database, user, password, host, port)

In [None]:
# Heading and caption for the processed-rows table; the table itself is the
# cell's last expression, so the notebook renders it.
print(
    "Processed Sequences DataFrame:\n",
    "Contains the DataFrame with processed sequences.",
    sep="\n",
)
processed_rows

In [None]:
# Heading and caption for the rows rejected during data upload and header
# matching; the table is the cell's last expression, so the notebook renders it.
print("Filtered Rows DataFrame:\n")
print("Contains rows that have been filtered out because of an invalid datatype.")
filtered_rows

----

In [None]:
# Heading and caption first, then the 'summary' entry of the processing result.
# The lookup stays in its own statement so the heading is printed even if it fails.
print(
    "Sequence Processing Result:\n",
    "Contains the result of sequence processing.",
    sep="\n",
)
print(sequence_processing_result["summary"])

In [None]:
# For every category reported by sequence processing, print its statement and
# show the matching table. Each category <name> uses the keys '<name>_statement'
# and '<name>_df'; the order below is the order the cells originally had.
# `display` is the rich-output function the IPython kernel provides in notebooks.
for qc_category in ("empty", "duplicate", "n_only", "low_acgt_ratio", "short"):
    print(sequence_processing_result[f"{qc_category}_statement"])
    display(sequence_processing_result[f"{qc_category}_df"])

-----

In [None]:
# Heading and caption for the post-QC table; the table is the cell's last
# expression, so the notebook renders it.
print(
    "Post-QC Sequences DataFrame:\n",
    "Contains the DataFrame with sequences after quality control.",
    sep="\n",
)
post_qc_sequences_df


In [None]:
# Plot the distribution of the 'seq_cleaned_len' column of the post-QC table,
# with the title and axis labels given below.
# NOTE(review): the meaning of tick_interval=50 and switch='no' is defined by
# stats_plotter.plot_distribution and is not visible here — confirm there.
plot_distribution(post_qc_sequences_df, 'seq_cleaned_len', 'lightblue', tick_interval=50,
                  switch='no', title='Distribution of Sequence Lengths', xlabel='Sequence Length', 
                  ylabel='Frequency')

In [None]:
# Statistics for the same 'seq_cleaned_len' column that is plotted above; as the
# cell's last expression, whatever calculate_stats returns is shown by the notebook.
calculate_stats(post_qc_sequences_df, 'seq_cleaned_len')

In [None]:
# Heading and caption for the typing output; the table is the cell's last
# expression, so the notebook renders it.
print(
    "Typed HIV Sequences DataFrame:\n",
    "Contains the DataFrame with typed HIV sequences.",
    sep="\n",
)
typed_hiv_sequences_df

---

In [None]:
# Heading and caption first, then the 'summary' entry of the typing results.
# The lookup stays in its own statement so the heading is printed even if it fails.
print(
    "Categorized HIV Typing Results:\n",
    "Contains the categorized results of HIV typing.",
    sep="\n",
)
print(categorized_hiv_typing_results["summary"])

In [None]:
# For the 'short', 'hiv2' and 'not_hiv' typing categories, print the statement
# and show the matching table. Each category <name> uses the keys
# '<name>_statement' and '<name>_df'.
# `display` is the rich-output function the IPython kernel provides in notebooks.
for typing_category in ("short", "hiv2", "not_hiv"):
    print(categorized_hiv_typing_results[f"{typing_category}_statement"])
    display(categorized_hiv_typing_results[f"{typing_category}_df"])

# The HIV-1 table is shown without a statement; as the last expression of the
# cell it is rendered by the notebook.
categorized_hiv_typing_results['hiv1_df']

---

In [None]:
# Heading and caption for the subtyping output; the table is the cell's last
# expression, so the notebook renders it.
print(
    "HIV-1 Subtyped Sequences DataFrame:\n",
    "Contains the DataFrame with HIV-1 subtyped sequences.",
    sep="\n",
)
hiv1_subtyped_sequences_df

In [None]:
# Subtype codes in the order they were originally presented. Each code maps to
# two keys of the subtyping results: 'subtype_<code>_statement' (text) and
# 'subtype_<code>' (table).
SUBTYPE_CODES = (
    "UN", "UI",
    "A1", "A2", "A3", "A6",
    "B", "C", "D",
    "F1", "F2",
    "G", "H", "J", "K", "L",
    "01_AE", "02_AG",
    "A4", "03_A6B", "A7", "A8",
)

# Print the statement and show the table for every subtype.
# `display` is the rich-output function the IPython kernel provides in notebooks.
for subtype_code in SUBTYPE_CODES:
    print(categorized_hiv1_subtyping_results[f"subtype_{subtype_code}_statement"])
    display(categorized_hiv1_subtyping_results[f"subtype_{subtype_code}"])

In [None]:
# Heading and caption for the known-subtype table (the data handed to the
# upload step); the table is the cell's last expression, so the notebook renders it.
print(
    "Known HIV-1 Subtypes DataFrame:\n",
    "Contains the DataFrame with known HIV-1 subtypes.",
    sep="\n",
)
known_hiv1_subtypes

---

In [None]:
# Heading and caption for the first element of the upload result; the table is
# the cell's last expression, so the notebook renders it.
print(
    "Uploaded Sequences DataFrame:\n",
    "Contains the DataFrame with uploaded sequences.",
    sep="\n",
)
uploaded_sequences

In [None]:
# Heading and caption for the second element of the upload result; the table is
# the cell's last expression, so the notebook renders it.
print(
    "Not Uploaded Sequences DataFrame:\n",
    "Contains the DataFrame with sequences that were not uploaded.",
    sep="\n",
)
not_uploaded_sequences

---

In [None]:
# Fetch the 'seq' table (the table the pipeline uploads to) through extract_table
# and show it, so the stored rows can be inspected after the run.
seq_table = extract_table(database, user, password, host, port, 'seq')
seq_table