In [None]:
# @title Click on the Start Button or Press Ctrl(Cmd on Mac)+Enter (Might slow down the browser! If crashed just reload the page)
# Widget UI Imports
import ipywidgets as widgets
from ipywidgets import Layout
from IPython.display import display, clear_output
import requests, gdown, time, datetime
import numpy as np
import pandas as pd
import re
import os, shutil
import urllib.request
from matplotlib import pyplot as plt
import seaborn as sns
from io import BytesIO
import pathlib
import zipfile
import pathlib

# Colab specific (Comment these out on local)
from google.colab import files
os.chdir('/content')


# Workspace preparation (runs at import / cell execution time).
# Every re-run of the cell wipes the previous run's outputs inside the
# "AlphaCrossXL Files" folder so stale results do not accumulate; only the
# "AlphaFold Structures" sub-folder is kept, so it survives between runs.
# This section can be commented out when working locally.

main_base_dir_parent = os.getcwd()
main_base_dir_parent_path = pathlib.Path(main_base_dir_parent)

# If a previous run left the process inside 'AlphaCrossXL Files', step up one level.
# NOTE(review): this is a substring test on the whole path and only climbs a
# single level, so a cwd nested deeper than one level below the folder (or any
# unrelated path containing that text) is not resolved to the folder's parent.
if 'AlphaCrossXL Files' in main_base_dir_parent:
    os.chdir(main_base_dir_parent_path.parent)
    main_base_dir_parent = os.getcwd()

# Folder that holds everything this tool writes.
main_base_dir = os.path.join(main_base_dir_parent, "AlphaCrossXL Files")

if os.path.exists(main_base_dir):
    # The folder exists: delete its contents, except 'AlphaFold Structures'
    for item in os.listdir(main_base_dir):
        item_path = os.path.join(main_base_dir, item)
        if os.path.isdir(item_path) and item == "AlphaFold Structures":
            # Keep the 'AlphaFold Structures' directory untouched
            continue
        elif os.path.isdir(item_path):
            # Any other directory is removed recursively
            shutil.rmtree(item_path)
        else:
            # Plain files are removed individually
            os.remove(item_path)
else:
    # The folder does not exist yet: create it
    os.mkdir(main_base_dir)

# All subsequent relative paths resolve inside the workspace folder.
os.chdir(main_base_dir)



class MultipleChainError(Exception):
    """Raised when an uploaded structure does not contain exactly one chain."""


class ResidueLocationError(Exception):
    """Raised when a link-site value does not contain exactly one integer."""


class MultiplePeptideIterationsError(Exception):
    """Raised when a peptide occurs more than once in a structure's sequence."""


class PeptideNotFoundError(Exception):
    """Raised when a peptide does not occur in a structure's sequence."""


class AlphaCrossXL:
    def __init__(self):
        self.data_file = None
        self.fasta_db = None
        self.main_base_dir_parent = None
        self.base_dir = None
        self.residue_distance_threshold = None
        self.plddt_threshold = None
        self.input_file_columns = None
        self.input_uniprotid_column = None
        self.input_peptide_a_column = None
        self.input_peptide_b_column = None
        self.input_link_site_a_column = None
        self.input_link_site_b_column = None
        self.input_xlink_types_column = None
        self.x_link_types = None
        self.user_x_link_type_chosen = None
        self.is_visualization_allowed = False
        self.is_manual_protein_struct = False
        self.manual_protein_struct_file = None
        self.are_manual_structures_verified = False

        self.XLMS_cif_files_protein_names = None
        self.erroneous_XLMS_cif_files_protein_names = []

        self.XLMS_raw_input = None
        self.XLMS_input = None
        self.XLMS_proteins = None
        self.XLMS_Chain_1 = None
        self.XLMS_Chain_2 = None
        self.fa = None
        self.XLMS_DF = None
        self.XLMS_DF_NO_DUPES_NO_SHARED = None
        self.XLMS_proteins_with_structure_info = None
        self.XLMS_proteins_with_structure_info_df = None
        self.XLMS_proteins_without_structure_info = None
        self.df_bplt = None
        self.df_hplt = None
        self.df_cplt = None

    def initialize_input_file(self):
        '''
        Initializes the input data file for processing.

        args: None
        creates: self.input_file_columns, self.XLMS_raw_input
        return: None

        Improvement Suggestion:
        Try to dynamically choose the required columns from the input file

        '''
        if self.data_file.endswith(".csv"):
            self.XLMS_raw_input = pd.read_csv(self.data_file)
        elif self.data_file.endswith(".xlsx"):
            self.XLMS_raw_input = pd.read_excel(self.data_file)
        else:
            raise Exception("File Format Error")

        input_columns = list((self.XLMS_raw_input.columns))
        self.input_file_columns = input_columns

    def get_input_file_columns(self):
        '''
        This function returns the columns of the input file.

        args: None
        creates: None
        return: input_file_columns
        '''
        return self.input_file_columns

    def set_input_file_columns(self, user_chosen_columns_dict, reset=False):
        '''
        This function sets the columns of the input file.

        args: user_chosen_columns_dict
        creates: self.XLMS_raw_input
        return: None
        '''
        if reset:
            self.XLMS_raw_input.rename(
                {
                    "Peptide A": self.input_peptide_a_column,
                    "Residue 1": self.input_link_site_a_column,
                    "Peptide B": self.input_peptide_b_column,
                    "Residue 2": self.input_link_site_b_column,
                    "X-link type": self.input_xlink_types_column,
                    "uniprotID": self.input_uniprotid_column,
                },
                axis=1,
                inplace=True,
            )
        else:
            self.input_peptide_a_column = user_chosen_columns_dict["Peptide A"]
            self.input_link_site_a_column = user_chosen_columns_dict["Residue 1"]
            self.input_peptide_b_column = user_chosen_columns_dict["Peptide B"]
            self.input_link_site_b_column = user_chosen_columns_dict["Residue 2"]
            self.input_xlink_types_column = user_chosen_columns_dict["X-link type"]
            self.input_uniprotid_column = user_chosen_columns_dict["uniprotID"]

            self.XLMS_raw_input.rename(
                    {
                        self.input_peptide_a_column: "Peptide A",
                        self.input_link_site_a_column: "Residue 1",
                        self.input_peptide_b_column: "Peptide B",
                        self.input_link_site_b_column: "Residue 2",
                        self.input_xlink_types_column: "X-link type",
                        self.input_uniprotid_column: "uniprotID",
                    },
                    axis=1,
                    inplace=True,
            )


    def get_input_xlink_types(self):
        '''
        This function returns the unique cross-link types found in input file.

        args: None
        creates: None
        return: x_link_types
        '''
        x_link_types = list(self.XLMS_raw_input["X-link type"].unique())
        self.x_link_types = x_link_types

        return self.x_link_types

    def set_input_xlink_type_threshold_dist(self, user_x_link_type_chosen, user_threshold_dist_chosen, user_threshold_plddt_chosen, reset=False):
        '''
        This function sets the cross-link type and threshold distance
        chosen by the user.

        args: user_x_link_type_chosen
        creates: self.XLMS_input, self.XLMS_proteins,
                self.XLMS_Chain_1, self.XLMS_Chain_2,
                self.residue_distance_threshold
        return: None
        '''
        if reset:
            self.XLMS_input = None
            self.XLMS_proteins = None
            self.XLMS_Chain_1 = None
            self.XLMS_Chain_2 = None
            self.residue_distance_threshold = None
            self.user_x_link_type_chosen = None
            self.plddt_threshold = None
        else:
            self.residue_distance_threshold = user_threshold_dist_chosen
            self.user_x_link_type_chosen = user_x_link_type_chosen
            self.plddt_threshold = user_threshold_plddt_chosen

            self.XLMS_input = self.XLMS_raw_input[
                self.XLMS_raw_input["X-link type"] == self.user_x_link_type_chosen
            ]
            # Store the Input File Restricted to Chosen Cross-Link
            self.XLMS_input.to_csv(os.path.join(self.base_dir, f"xlms_input_{self.user_x_link_type_chosen}.csv"))

            self.XLMS_proteins = self.XLMS_input["uniprotID"].unique()
            self.XLMS_Chain_1 = self.XLMS_input["Peptide A"].unique()
            self.XLMS_Chain_2 = self.XLMS_input["Peptide B"].unique()

    def process_fasta(self):
        '''
        Open the FASTA database at self.fasta_db with pyfastx.

        Records are keyed by the second "|"-separated field of their header
        (presumably the accession in UniProt-style "sp|ACCESSION|NAME"
        headers -- confirm against the database in use).

        Improvement Suggestion: Instead of FASTA Database Upload
        Just download FASTA on the fly
        Snippet:
        uurl = "https://rest.uniprot.org/uniprotkb/stream?"
        output_format = "compressed=false&fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Csequence%2Cxref_pdb&format=tsv"
        uurl += output_format
        query = "model_organism%3A9606%20AND%20(reviewed:true)%20AND%20(database:pdb)%20AND%20(database:alphafolddb)"
        uurl += "&query=" + query
        ureq = requests.get(uurl)
        udata = pd.read_csv(BytesIO(ureq.content), delimiter="\t")

        args: None
        creates: self.fa
        return: None
        '''
        from pyfastx import Fasta

        def second_header_field(header):
            # "a|b|c" -> "b"
            return header.split("|")[1]

        self.fa = Fasta(self.fasta_db, key_func=second_header_field)

    def verify_and_process_manual_protein_struct_file(self, reset=False):
        """
        Verify and process uploaded protein structure files.

        args:
            reset (bool, optional): If True, reset all manual structure data. Defaults to False.
        creates:
            self.XLMS_cif_files_protein_names
            self.are_manual_structures_verified
            self.manual_protein_struct_file
            self.is_manual_protein_struct
        raises:
            Exception: If processing fails
        return:
            None
        """
        if self.manual_protein_struct_file is not None and self.is_manual_protein_struct and not reset:
            try:
                os.mkdir(os.path.join(self.base_dir,"Uploaded Structures"))
                os.chdir(os.path.join(self.base_dir,"Uploaded Structures"))
            except:
                shutil.rmtree(os.path.join(self.base_dir, "Uploaded Structures"))
                os.mkdir(os.path.join(self.base_dir,"Uploaded Structures"))
                os.chdir(os.path.join(self.base_dir,"Uploaded Structures"))

            with zipfile.ZipFile(self.manual_protein_struct_file, 'r') as zip_ref:
                zip_ref.extractall(os.path.join(self.base_dir,"Uploaded Structures"))

            self.XLMS_cif_files_protein_names = [name[:-4] for name in os.listdir(".") if name.endswith(".cif")]

            if len(self.XLMS_cif_files_protein_names) == 0:
                # clear_output()
                os.chdir(self.base_dir)
                shutil.rmtree(os.path.join(self.base_dir, "Uploaded Structures"))
                self.XLMS_cif_files_protein_names = None
                self.are_manual_structures_verified = False
                self.manual_protein_struct_file = None
                self.is_manual_protein_struct = False
                print("No .CIF Structure Files Found")
            else:
                self.are_manual_structures_verified = True
                print("Verified Manual Protein Structure Files")
                print(".CIF Files for UniProt IDs: ", self.XLMS_cif_files_protein_names)
                print("Please note: It's your responsibility to make sure the .CIF files are named appropriately.")
            os.chdir(self.base_dir)
        elif reset:
            os.chdir(self.base_dir)
            shutil.rmtree(os.path.join(self.base_dir, "Uploaded Structures"))
            self.XLMS_cif_files_protein_names = None
            self.are_manual_structures_verified = False
            self.manual_protein_struct_file = None
            self.is_manual_protein_struct = False

        else:
            raise Exception("Error in workflow: verify_and_process_manual_protein_struct_file. Contact Development Team.")


    def cleanList(self, list_var):
        """
        Render a list as text without brackets and single quotes.

        args:
            list_var: list (any object is accepted; its str() form is used)
        return:
            the str() form of list_var with every "[", "]" and "'" removed,
            e.g. [1, 2] -> "1, 2"
        """
        strip_table = str.maketrans("", "", "[]'")
        return str(list_var).translate(strip_table)

    def get_peptide_starts(self, row):
        """
        This function returns the start positions of the peptides in the FASTA file
        args:
            row: row of the dataframe
        return:
            start_list_a: list of start positions of peptide A
            start_list_b: list of start positions of peptide B
        """

        peptide_a = re.compile(row["Peptide A"])
        peptide_b = re.compile(row["Peptide B"])
        start_list_a = []
        start_list_b = []

        for m in peptide_a.finditer(self.fa[row["uniprotID"]].seq):
            start_list_a.append(m.start())

        for m in peptide_b.finditer(self.fa[row["uniprotID"]].seq):
            start_list_b.append(m.start())

        return [start_list_a, start_list_b]

    def peptide_start_a(self, row):
        """
        This function cleans the list of the start positions of peptides
        args:
            row: row of the dataframe
        return:
            list_str: string of the list of peptide A
        """
        return self.cleanList(row[0])

    def peptide_start_b(self, row):
        """
        This function cleans the list of the start positions of peptides
        args:
            row: row of the dataframe
        return:
            list_str: string of the list of peptide B
        """
        return self.cleanList(row[1])

    def remove_shared_peptides(self, row):
        """
        Tell whether a row should be kept, i.e. both peptides map to exactly
        one position in the protein.

        A row is rejected when either peptide is
            - not found in the protein (empty start-position text), or
            - found multiple times (start-position text contains a comma).
        args:
            row: row of the dataframe; reads the text columns
                "Peptide A Starting Position" and "Peptide B Starting Position"
        return:
            True: keep the row (peptides are not shared)
            False: drop the row (peptides are shared or missing)

        Need to update this (or optimize the previous steps to treat multiple peptide occurences)
        """
        # Guard: a peptide that was not located at all.
        if len(row["Peptide A Starting Position"]) == 0 or len(row["Peptide B Starting Position"]) == 0:
            return False

        starts_a = row["Peptide A Starting Position"]
        starts_b = row["Peptide B Starting Position"]
        print(starts_a, starts_b)

        # A comma means the text lists more than one start position.
        occurs_multiple_times = ',' in starts_a or ',' in starts_b
        return not occurs_multiple_times

    def get_actual_pos_from_residue_pep_a(self, row):
        """
        Return the position of the first linked residue within the protein.

        The result is the single integer found in "Residue 1" (e.g. "K3" -> 3)
        added to the peptide's start position. Numeric cells (which pandas
        produces when the link-site column holds plain numbers) are accepted
        as well as text.

        args:
            row: row of the dataframe; reads "Peptide A Starting Position"
                and "Residue 1"
        return:
            residue_a_loc: integer in "Residue 1" + peptide A start position
        raises:
            ResidueLocationError: if "Residue 1" does not contain exactly one
                integer (subclass of Exception, like the error raised before)
        """
        peptide_a_start = int(row["Peptide A Starting Position"])
        residue_a = row["Residue 1"]
        # A whole-number float such as 3.0 would otherwise read as "3" and "0".
        if isinstance(residue_a, float) and residue_a.is_integer():
            residue_a = int(residue_a)
        residue_a_loc = re.findall(r"\d+", str(residue_a))
        if len(residue_a_loc) != 1:
            raise ResidueLocationError("Residue Location Format Incorrect.")
        return int(residue_a_loc[0]) + peptide_a_start

    def get_actual_pos_from_residue_pep_b(self, row):
        """
        Return the position of the second linked residue within the protein.

        The result is the single integer found in "Residue 2" (e.g. "K3" -> 3)
        added to the peptide's start position. Numeric cells (which pandas
        produces when the link-site column holds plain numbers) are accepted
        as well as text.

        args:
            row: row of the dataframe; reads "Peptide B Starting Position"
                and "Residue 2"
        return:
            residue_b_loc: integer in "Residue 2" + peptide B start position
        raises:
            ResidueLocationError: if "Residue 2" does not contain exactly one
                integer (subclass of Exception, like the error raised before)
        """
        peptide_b_start = int(row["Peptide B Starting Position"])
        residue_b = row["Residue 2"]
        # A whole-number float such as 3.0 would otherwise read as "3" and "0".
        if isinstance(residue_b, float) and residue_b.is_integer():
            residue_b = int(residue_b)
        residue_b_loc = re.findall(r"\d+", str(residue_b))
        if len(residue_b_loc) != 1:
            raise ResidueLocationError("Residue Location Format Incorrect.")
        return int(residue_b_loc[0]) + peptide_b_start

    def convert_to_xlms_format(self):

        temp = self.XLMS_input.apply(lambda row: self.get_peptide_starts(row), axis=1)
        temp2 = temp.copy()

        self.XLMS_input["Peptide A Starting Position"] = temp2.apply(
            lambda row: self.peptide_start_a(row)
        ).copy()
        self.XLMS_input["Peptide B Starting Position"] = temp2.apply(
            lambda row: self.peptide_start_b(row)
        ).copy()

        self.XLMS_DF = self.XLMS_input[
            [
                "uniprotID",
                "X-link type",
                "Peptide A",
                "Residue 1",
                "Peptide A Starting Position",
                "Peptide B",
                "Residue 2",
                "Peptide B Starting Position",
            ]
        ].copy()

        print(self.XLMS_DF.head())

    def calculate_absolute_chain_pos(self):
        """
            Calculate absolute positions for both peptides in parallel.

            Args:
                df: DataFrame with peptide positions and residue information

            Returns:
                DataFrame with added absolute position columns
            """
        #self.XLMS_DF['Is Shared'] = self.XLMS_DF.apply(lambda row: self.remove_shared_peptides(row), axis=1)
        XLMS_DF_NO_SHARED = self.XLMS_DF[
            self.XLMS_DF.apply(lambda row: self.remove_shared_peptides(row), axis=1)
        ].copy()
        #print(XLMS_DF_NO_SHARED)
        XLMS_DF_NO_SHARED["Residue 1 Position"] = XLMS_DF_NO_SHARED.apply(
            lambda row: self.get_actual_pos_from_residue_pep_a(row), axis=1
        )
        XLMS_DF_NO_SHARED["Residue 2 Position"] = XLMS_DF_NO_SHARED.apply(
            lambda row: self.get_actual_pos_from_residue_pep_b(row), axis=1
        )

        self.XLMS_DF_NO_DUPES_NO_SHARED = XLMS_DF_NO_SHARED.drop_duplicates().copy()

        #print(self.XLMS_DF_NO_DUPES_NO_SHARED.head())

    def proteins_from_alphafold(self, model_version="v4"):
        '''
        This function downloads mmCIF protein structure files from AlphaFold Protein Structure Database
        and stores them in a directory named AlphaFold Structures
        (one sub-directory per protein, holding "<protein>.cif").

        Sub-directories left over from an earlier run are reused only when they
        belong to a protein of the current input AND actually contain a
        non-empty "<protein>.cif"; every other sub-directory is removed.
        A failed download never leaves a sub-directory behind, so it cannot be
        mistaken for a cached structure on the next run.

        args:
            model_version: version tag used in the AlphaFold file name
                ("AF-<protein>-F1-model_<version>.cif"). Defaults to "v4",
                the value that was previously hard-coded.
        creates:
            self.XLMS_proteins_with_structure_info,
            self.XLMS_proteins_without_structure_info,
            self.XLMS_proteins_left
            AlphaFold Structures directory is created if it doesn't exist
        return:
            None
        '''
        structures_dir = os.path.join(self.base_dir, "AlphaFold Structures")
        clear_output()

        directory_existed = os.path.isdir(structures_dir)
        os.makedirs(structures_dir, exist_ok=True)
        os.chdir(structures_dir)

        wanted_proteins = list(self.XLMS_proteins)
        wanted_lookup = set(wanted_proteins)
        cached_proteins = []

        if directory_existed:
            print("Directory for AlphaFold Structures already exists: ", os.getcwd())
            removed_sub_dirs = []
            # Only the immediate sub-directories are protein folders.
            for entry in sorted(os.listdir(structures_dir)):
                entry_path = os.path.join(structures_dir, entry)
                if not os.path.isdir(entry_path):
                    continue
                cif_path = os.path.join(entry_path, f"{entry}.cif")
                has_structure = os.path.isfile(cif_path) and os.path.getsize(cif_path) > 0
                if entry in wanted_lookup and has_structure:
                    cached_proteins.append(entry)
                else:
                    shutil.rmtree(entry_path, ignore_errors=True)
                    removed_sub_dirs.append(entry)
            print("Irrelevant Sub-Directories were removed: ", removed_sub_dirs)
        else:
            print("Created Directory for AlphaFold Structures: ", os.getcwd())

        self.XLMS_proteins_with_structure_info = cached_proteins
        self.XLMS_proteins_left = [
            protein for protein in wanted_proteins if protein not in cached_proteins
        ]

        print("Downloading Structures from AlphaFold.")
        self.XLMS_proteins_without_structure_info = []
        for protein in self.XLMS_proteins_left:
            protein_dir = os.path.join(structures_dir, protein)
            os.makedirs(protein_dir, exist_ok=True)
            try:
                # Change model_version for updates to AlphaFold Structures
                urllib.request.urlretrieve(
                    f"https://alphafold.ebi.ac.uk/files/AF-{protein}-F1-model_{model_version}.cif",
                    os.path.join(protein_dir, f"{protein}.cif"),
                )
            except urllib.error.HTTPError:
                shutil.rmtree(protein_dir, ignore_errors=True)
                self.XLMS_proteins_without_structure_info.append(protein)
                print(
                    f"AlphaFold Structure not downloaded/found for the following protein: {protein}"
                )
            except Exception as e:
                # Network failure, truncated download, ...: drop the partial
                # folder and report the protein as lacking a structure.
                shutil.rmtree(protein_dir, ignore_errors=True)
                self.XLMS_proteins_without_structure_info.append(protein)
                print(
                    f"An unexpected error occurred while processing protein {protein}: {e}"
                )
            else:
                print(
                    f"AlphaFold Structure downloaded for the following protein: {protein}"
                )
                self.XLMS_proteins_with_structure_info.append(protein)

        clear_output()
        print("AlphaFold Structures downloaded in directory:", structures_dir)
        print("AlphaFold Structures not found for: ", self.XLMS_proteins_without_structure_info)
        os.chdir(self.base_dir)

    def getResidueDistance(self, row):
        """
        Return the CA-CA distance between the two linked residues in the
        downloaded AlphaFold structure of the row's protein.

        args:
            row: row of the dataframe; reads "uniprotID",
                "Residue 1 Position" and "Residue 2 Position"
        return:
            dist: distance between the two residues, or "N/A" when the protein
            has no downloaded structure or the residues cannot be located in it
        """
        clear_output()
        import prody as prd

        protein = row["uniprotID"]
        if protein not in self.XLMS_proteins_with_structure_info:
            return "N/A"

        path = os.path.join(
            self.base_dir,
            "AlphaFold Structures",
            protein,
            f"{protein}.cif",
        )
        print(path)
        try:
            protein_struct = prd.parseMMCIF(path)
            pep_a_pos = int(row["Residue 1 Position"])
            pep_b_pos = int(row["Residue 2 Position"])
            res_1 = protein_struct.select(f"resnum {pep_a_pos} and name CA")
            res_2 = protein_struct.select(f"resnum {pep_b_pos} and name CA")
            # select() yields None when nothing matches; handle that here
            # instead of letting calcDistance fail on a None argument.
            if res_1 is None or res_2 is None:
                raise AttributeError("residue not present in structure")
            return prd.calcDistance(res_1, res_2)[0]
        except (AttributeError, TypeError):
            print(
                "Unable to compute distance for the protein: ",
                row["uniprotID"],
                ".\nThe structure file is not appropriate.",
            )
            return "N/A"

    def getFirstResidueBeta(self, row):
        """
        Return the beta value of the CA atom of the first linked residue
        (chain "A") in the downloaded AlphaFold structure; the caller stores
        it as "Residue 1 pLDDT".

        args:
            row: row of the dataframe; reads "uniprotID" and
                "Residue 1 Position"
        return:
            the beta value, or 'N/A' when the protein has no downloaded
            structure or the residue / its CA atom cannot be found
        """
        clear_output()
        import prody as prd

        protein = row['uniprotID']
        if protein not in self.XLMS_proteins_with_structure_info:
            return 'N/A'

        try:
            protein_struct = prd.parseMMCIF(os.path.join(self.base_dir,'AlphaFold Structures', protein, f'{protein}.cif'))
            protein_struct_hv = protein_struct.getHierView()
            pep_a_pos = int(row['Residue 1 Position'])
            res_1 = protein_struct_hv.getResidue("A", pep_a_pos)
            # res_1 is None when the residue is absent; indexing None raises
            # TypeError, which is handled below together with AttributeError.
            res_1_ca = res_1["CA"]
            return res_1_ca.getBeta()
        except (AttributeError, TypeError):
            print('Unable to read pLDDT for the protein: ', row['uniprotID'], '.\nThe structure file is not appropriate.')
            return 'N/A'

    def getSecondResidueBeta(self, row):
        """
        Return the beta value of the CA atom of the second linked residue
        (chain "A") in the downloaded AlphaFold structure; the caller stores
        it as "Residue 2 pLDDT".

        args:
            row: row of the dataframe; reads "uniprotID" and
                "Residue 2 Position"
        return:
            the beta value, or 'N/A' when the protein has no downloaded
            structure or the residue / its CA atom cannot be found
        """
        clear_output()
        import prody as prd

        protein = row['uniprotID']
        if protein not in self.XLMS_proteins_with_structure_info:
            return 'N/A'

        try:
            protein_struct = prd.parseMMCIF(os.path.join(self.base_dir,'AlphaFold Structures', protein, f'{protein}.cif'))
            protein_struct_hv = protein_struct.getHierView()
            pep_b_pos = int(row['Residue 2 Position'])
            res_2 = protein_struct_hv.getResidue("A", pep_b_pos)
            # res_2 is None when the residue is absent; indexing None raises
            # TypeError, which is handled below together with AttributeError.
            res_2_ca = res_2["CA"]
            return res_2_ca.getBeta()
        except (AttributeError, TypeError):
            print('Unable to read pLDDT for the protein: ', row['uniprotID'], '.\nThe structure file is not appropriate.')
            return 'N/A'

    def get_manual_structure_distance_data(self, row):
        '''
        This function returns the distance between the two residues
        for uploaded .CIF structures, together with the identifiers of the
        two residues in the uploaded structure.

        The peptides are located in the sequence of the structure's single
        chain (not in the FASTA sequence), so the result does not depend on
        the numbering used in the FASTA database.

        args:
            row: row of the dataframe; reads 'uniprotID', 'Peptide A',
                'Peptide B', 'Residue 1' and 'Residue 2'
        return:
            data_dict

        Data_Dict has 5 keys:
        residue-1-start-resindex
        residue-2-start-resindex
        residue-1-start-resnum
        residue-2-start-resnum
        residue-distance

        Every value is 'N/A' when the protein has no uploaded structure.
        On a handled failure 'residue-distance' holds an error label
        ('Structure Error', 'Multiple Chain Error', 'Residue Location Error',
        'Multiple Peptide Iterations Error' or 'Peptide Not Found Error'),
        the other values stay 'N/A', and the protein is appended to
        self.erroneous_XLMS_cif_files_protein_names.

        The resindex / resnum values are whatever ProDy's getResindex() /
        getResnum() report for the residue.
        '''
        clear_output()
        import prody as prd
        # Default result, returned unchanged when no uploaded structure exists.
        data_dict = {
                        'residue-1-start-resindex': 'N/A',
                        'residue-2-start-resindex': 'N/A',
                        'residue-1-start-resnum': 'N/A',
                        'residue-2-start-resnum': 'N/A',
                        'residue-distance': 'N/A'
                    }
        if row['uniprotID'] in self.XLMS_cif_files_protein_names:
            try:
                protein = row['uniprotID']
                protein_struct = prd.parseMMCIF(os.path.join(self.base_dir,'Uploaded Structures', f'{protein}.cif'))
                protein_struct_hv = protein_struct.getHierView()

                # Only structures with exactly one entry in the hierarchical
                # view (one chain) are supported.
                if len(list(protein_struct_hv)) != 1:
                    raise MultipleChainError
                else:
                    protein_struct_seq = list(protein_struct_hv)[0].getSequence()

                # Residues of the single chain, in iteration order.
                protein_struct_residue_list = list(list(protein_struct_hv)[0])
                print(list(list(protein_struct_hv)[0]))
                print(protein_struct_seq)
                print(len(protein_struct_seq))
                # NOTE(review): the peptides are compiled as regular
                # expressions, so regex metacharacters in a peptide are not
                # matched literally.
                peptide_a = re.compile(row['Peptide A'])
                peptide_b = re.compile(row['Peptide B'])
                print(peptide_a, 'pep a')
                print(peptide_b, 'pep b')
                start_list_a = []
                start_list_b = []

                # 0-based start of every (non-overlapping) match in the chain sequence.
                for m in peptide_a.finditer(protein_struct_seq):
                    start_list_a.append(m.start())

                for m in peptide_b.finditer(protein_struct_seq):
                    start_list_b.append(m.start())

                # Each peptide must occur exactly once to be placed unambiguously.
                if (len(start_list_a) > 1) or (len(start_list_b) > 1):
                    raise MultiplePeptideIterationsError
                elif (len(start_list_a) == 0) or (len(start_list_b) == 0):
                    raise PeptideNotFoundError
                else:
                    peptide_a_start = start_list_a[0]
                    peptide_b_start = start_list_b[0]

                print(peptide_a_start, 'pep_a_starr')
                print(peptide_b_start, 'pep_b_starr')
                # The link site must contain exactly one integer (e.g. "K3" -> 3).
                residue_a = row['Residue 1']
                residue_a_loc = re.findall(r"\d+", residue_a)
                print('residue_a_loc', residue_a_loc)
                if len(residue_a_loc) == 1:
                    residue_a_pos = int(residue_a_loc[0]) + peptide_a_start
                else:
                    raise ResidueLocationError

                residue_b = row['Residue 2']
                residue_b_loc = re.findall(r"\d+", residue_b)
                print('residue_b_loc', residue_b_loc)
                if len(residue_b_loc) == 1:
                    residue_b_pos = int(residue_b_loc[0]) + peptide_b_start
                else:
                    raise ResidueLocationError

                # residue_*_pos is used as a 1-based position; -1 turns it into
                # a 0-based index into the residue list.
                # NOTE(review): assumes the residue list lines up one-to-one
                # with the chain sequence -- confirm.
                res_1 = protein_struct_residue_list[residue_a_pos-1]
                res_2 = protein_struct_residue_list[residue_b_pos-1]

                print(res_1, res_1.getResindex())
                res_1_ca = res_1["CA"]
                res_2_ca = res_2["CA"]

                dist = prd.calcDistance(res_1_ca, res_2_ca)

                data_dict = {
                            'residue-1-start-resindex': res_1.getResindex(),
                            'residue-2-start-resindex': res_2.getResindex(),
                            'residue-1-start-resnum': res_1.getResnum(),
                            'residue-2-start-resnum': res_2.getResnum(),
                            'residue-distance': dist
                            }

                return data_dict
            # NOTE(review): only the exception types below are handled; any
            # other error (e.g. IndexError from the list lookups above)
            # propagates to the caller.
            except AttributeError:
                print('Unable to compute distance for the protein: ', row['uniprotID'], '.\nThe structure file is not appropriate.')
                data_dict['residue-distance'] = 'Structure Error'
                self.erroneous_XLMS_cif_files_protein_names.append(row['uniprotID'])

                return data_dict
            except MultipleChainError:
                print('Unable to compute distance for the protein: ', row['uniprotID'], '.\nThe structure file has multiple chains. The structure file should have only one chain.')
                data_dict['residue-distance'] = 'Multiple Chain Error'
                self.erroneous_XLMS_cif_files_protein_names.append(row['uniprotID'])

                return data_dict
            except ResidueLocationError:
                print('Unable to compute distance for the protein: ', row['uniprotID'], '.\nThe Residue Location format is incorrect.')
                data_dict['residue-distance'] = 'Residue Location Error'
                self.erroneous_XLMS_cif_files_protein_names.append(row['uniprotID'])

                return data_dict
            except MultiplePeptideIterationsError:
                print('Unable to compute distance for the protein: ', row['uniprotID'], '.\nThe Peptide is found multiple times.')
                data_dict['residue-distance'] = 'Multiple Peptide Iterations Error'
                self.erroneous_XLMS_cif_files_protein_names.append(row['uniprotID'])

                return data_dict
            except PeptideNotFoundError:
                print('Unable to compute distance for the protein: ', row['uniprotID'], '.\nThe Peptide is not found.')
                data_dict['residue-distance'] = 'Peptide Not Found Error'
                self.erroneous_XLMS_cif_files_protein_names.append(row['uniprotID'])

                return data_dict
        else:
            return data_dict

    def get_residue_1_number_manual(self, row):
        """Return the 'residue-1-start-resnum' entry of a manual-structure result dict."""
        resnum_1 = row['residue-1-start-resnum']
        return resnum_1

    def get_residue_2_number_manual(self, row):
        """Return the 'residue-2-start-resnum' entry of a manual-structure result dict."""
        resnum_2 = row['residue-2-start-resnum']
        return resnum_2

    def get_residue_1_index_manual(self, row):
        """Return the 'residue-1-start-resindex' entry of a manual-structure result dict."""
        resindex_1 = row['residue-1-start-resindex']
        return resindex_1

    def get_residue_2_index_manual(self, row):
        """Return the 'residue-2-start-resindex' entry of a manual-structure result dict."""
        resindex_2 = row['residue-2-start-resindex']
        return resindex_2

    def get_residue_distance_manual(self, row):
        """Return the 'residue-distance' entry of a manual-structure result dict."""
        distance = row['residue-distance']
        return distance


    def calculate_residue_distance_and_betas_all(self):
        """
        Fill in the distance and pLDDT columns of
        self.XLMS_DF_NO_DUPES_NO_SHARED, one row at a time.

        Always adds "Residue Distance", "Residue 1 pLDDT" and
        "Residue 2 pLDDT" (from the AlphaFold structures). When uploaded
        structures have been verified, additionally adds the five
        "... (Manual)" columns taken from get_manual_structure_distance_data.

        args: None
        return: None
        """
        table = self.XLMS_DF_NO_DUPES_NO_SHARED

        # (column name, per-row function) -- evaluated in this order.
        alphafold_columns = (
            ("Residue Distance", self.getResidueDistance),
            ("Residue 1 pLDDT", self.getFirstResidueBeta),
            ("Residue 2 pLDDT", self.getSecondResidueBeta),
        )
        for column_name, per_row in alphafold_columns:
            table[column_name] = table.apply(per_row, axis=1)

        if not self.are_manual_structures_verified:
            return

        # One result dict per row; the columns below each pick one key from it.
        manual_results = table.apply(self.get_manual_structure_distance_data, axis=1)
        manual_columns = (
            ('Residue Distance (Manual)', self.get_residue_distance_manual),
            ('Residue 1 Index (Manual)', self.get_residue_1_index_manual),
            ('Residue 2 Index (Manual)', self.get_residue_2_index_manual),
            ('Residue 1 Number (Manual)', self.get_residue_1_number_manual),
            ('Residue 2 Number (Manual)', self.get_residue_2_number_manual),
        )
        for column_name, pick_value in manual_columns:
            table[column_name] = manual_results.apply(pick_value)


    def calculate_violation_status(self, distance):
        """
        Determines if a Cα-Cα distance violates the threshold.

        Args:
            distance (float or str): The calculated distance or an error string/NaN.

        Returns:
            str: "Satisfied", "Violated", or "N/A".
        """
        # Check if distance is a valid number
        if pd.isna(distance) or not isinstance(distance, (int, float)):
            return "N/A" # Status is Not Applicable if distance couldn't be calculated

        # Compare numeric distance to threshold
        if distance <= self.residue_distance_threshold:
            return "Satisfied"
        else:
            return "Violated"

    def calculate_plddt_status(self, plddt):
        """
        Determines if a pLDDT score meets the confidence threshold.

        Args:
            plddt (float or str): The pLDDT score or an error string/NaN.

        Returns:
            str: "Pass", "Fail", or "N/A".
        """
        # Check if pLDDT is a valid number
        if pd.isna(plddt) or not isinstance(plddt, (int, float)):
            return "N/A" # Status is Not Applicable if pLDDT couldn't be calculated or isn't numeric

        # Compare numeric pLDDT to the threshold
        # Note: Higher pLDDT is better
        if plddt >= self.plddt_threshold:
            return "Pass"
        else:
            return "Fail"

    def insert_analysis_results_to_raw_input(self):
        """
        Inserts residue distances and related metrics (pLDDT, manual structure data)
        from the de-duplicated DataFrame (XLMS_DF_NO_DUPES_NO_SHARED)
        back into the original raw input DataFrame (XLMS_input).

        Raises:
            Exception: If multiple matching unique entries are found, indicating a data integrity issue.
        """
        #Create the XLMS Output Object
        self.XLMS_output = self.XLMS_input.copy(deep=True)

        # Define columns from XLMS_DF_NO_DUPES_NO_SHARED to be inserted into XLMS_input
        columns_to_insert = [
            'Residue Distance',
            'Residue 1 pLDDT',
            'Residue 2 pLDDT',
            'Residue 1 Position',
            'Residue 2 Position'
        ]

        # Add manual structure columns if applicable
        if self.are_manual_structures_verified:
            columns_to_insert.extend([
                'Residue Distance (Manual)',
                'Residue 1 Index (Manual)',
                'Residue 2 Index (Manual)',
                'Residue 1 Number (Manual)',
                'Residue 2 Number (Manual)'
            ])

        # Initialize these columns in XLMS_input with 'N/A' to avoid KeyError
        # and ensure all rows have these columns before assignment.
        for col in columns_to_insert:
            self.XLMS_output[col] = 'N/A'

        # Iterate over each row in the original XLMS_input
        for index, row in self.XLMS_output.iterrows():
            if row['uniprotID'] not in self.XLMS_proteins_with_structure_info and \
               (not self.are_manual_structures_verified or row['uniprotID'] not in self.XLMS_cif_files_protein_names):
                # If protein structure info is not available for AlphaFold or manual (if applicable),
                # all relevant columns remain 'N/A' (already initialized).
                continue

            # Define the key columns for matching unique cross-links
            # These columns should uniquely identify an entry after de-duplication
            match_conditions = (
                (self.XLMS_DF_NO_DUPES_NO_SHARED["uniprotID"] == row["uniprotID"]) &
                (self.XLMS_DF_NO_DUPES_NO_SHARED["Peptide A"] == row["Peptide A"]) &
                (self.XLMS_DF_NO_DUPES_NO_SHARED["Peptide B"] == row["Peptide B"]) &
                (self.XLMS_DF_NO_DUPES_NO_SHARED["Residue 1"] == row["Residue 1"]) &
                (self.XLMS_DF_NO_DUPES_NO_SHARED["Residue 2"] == row["Residue 2"])
            )

            # Retrieve matching rows from the de-duplicated DataFrame
            matching_unique_rows = self.XLMS_DF_NO_DUPES_NO_SHARED[match_conditions]

            # Sanity Check: Ensure no more than one matching unique row is found
            if len(matching_unique_rows) > 1:
                raise Exception(
                    f"Fatal Error: Multiple matching unique entries found for protein {row['uniprotID']}, "
                    f"peptides {row['Peptide A']}-{row['Peptide B']}, and residues {row['Residue 1']}-{row['Residue 2']} "
                    f"in the de-duplicated DataFrame (XLMS_DF_NO_DUPES_NO_SHARED). "
                    f"This indicates an issue in the de-duplication logic or input data integrity."
                )
            elif len(matching_unique_rows) == 1:
                # If exactly one unique match is found, extract its values
                unique_row_data = matching_unique_rows.iloc[0]
                for col in columns_to_insert:
                    value_to_assign = unique_row_data[col]
                    # Attempt conversion to float for numeric columns if they are not 'N/A' or similar strings
                    if col in ['Residue Distance', 'Residue 1 pLDDT', 'Residue 2 pLDDT'] and value_to_assign != 'N/A':
                        try:
                            self.XLMS_output.at[index, col] = float(value_to_assign)
                        except ValueError:
                            self.XLMS_output.at[index, col] = value_to_assign # Keep as string if conversion fails
                    else:
                        self.XLMS_output.at[index, col] = value_to_assign
            else:
                # If no unique match is found (e.g., due to 'N/A as Shared Peptide' logic or other filtering),
                # mark relevant columns as 'N/A as Shared Peptide'
                # This explicitly sets columns for entries that were effectively "removed"
                # during the de-duplication/shared peptide filtering step.
                for col in columns_to_insert:
                    # Only overwrite if it wasn't already determined as missing due to structure info
                    if self.XLMS_output.at[index, col] == 'N/A': # Check if it's still 'N/A' from initialization
                        self.XLMS_output.at[index, col] = 'N/A as Shared Peptide'

    # Outputting the distances
    def output_distances(self):
        """
        Save all analysis results to output files.

        Generates:
            - Main Output File (XLMS_Final_Output.csv)
            - Violated Distances File (Violated_Distances.csv)
            - Satisfied Distances File (Satisfied_Distances.csv
            - Confident pLDDT File (Confident_pLDDT.csv)
            - Low Confidence pLDDT File (Low_Confidence_pLDDT.csv)
            - pLDDT Summary File (pLDDT_Summary.csv)
            - Detailed pLDDT Summary File (pLDDT_Overall_Summary.csv)
            - Distance Summary File (Distance_Summary.csv)
            - XLMS_Distances_WO_Duplicates_WO_Shared.csv (For BarPlot and Visualization)
        args:
            None
        return:
            None
        """

        base_filename = pathlib.Path(self.data_file).stem

        # Calculate status based on AlphaFold distance first
        final_df = self.XLMS_output.copy() # Work on a copy for final output

        # Calculate distance status based on AlphaFold distance first
        residue_distance_numeric = pd.to_numeric(final_df['Residue Distance'], errors='coerce')
        final_df["CA-CA Distance status"] = residue_distance_numeric.apply(self.calculate_violation_status)

        # Calculate pLDDT Status
        residue_1_plddt_numeric = pd.to_numeric(final_df['Residue 1 pLDDT'], errors='coerce')
        residue_2_plddt_numeric = pd.to_numeric(final_df['Residue 2 pLDDT'], errors='coerce')
        final_df["Residue 1 pLDDT status"] = residue_1_plddt_numeric.apply(self.calculate_plddt_status)
        final_df["Residue 2 pLDDT status"] = residue_2_plddt_numeric.apply(self.calculate_plddt_status)

        # If manual structures were used, add/update status based on manual distance
        if self.are_manual_structures_verified and 'Residue Distance (Manual)' in final_df.columns:
                # Convert manual distance to numeric where possible for comparison
                manual_dist_numeric = pd.to_numeric(final_df['Residue Distance (Manual)'], errors='coerce')
                final_df["CA-CA Distance status (Manual)"] = manual_dist_numeric.apply(self.calculate_violation_status)
                # Optionally, create a combined status or prioritize one?
                # For now, keeping them separate.

        # Save the analysis results
        output_csv_path = os.path.join(
            self.base_dir,
            pathlib.Path(self.data_file).stem + "_XLMS_Final_Output.csv"
        )
        final_df.to_csv(output_csv_path)

        # 1. Violated Distance
        # Ensure Distance Status exists
        if "CA-CA Distance status" in final_df.columns:
            violated_df = final_df[final_df["CA-CA Distance status"] == "Violated"].copy()
            violated_output_path = os.path.join(self.base_dir, f"{base_filename}_Violated_Distances.csv")
            violated_df.to_csv(violated_output_path, index=False)
            print(f" - Saved Violated Distances ({len(violated_df)} rows) to: {violated_output_path}")

            # 2. Satisfied Distance
            satisfied_df = final_df[final_df["CA-CA Distance status"] == "Satisfied"].copy()
            satisfied_output_path = os.path.join(self.base_dir, f"{base_filename}_Satisfied_Distances.csv")
            satisfied_df.to_csv(satisfied_output_path, index=False)
            print(f" - Saved Satisfied Distances ({len(satisfied_df)} rows) to: {satisfied_output_path}")
        else:
            print(" - Skipping Distance-based filtering: 'CA-CA Distance status' column not found.")

        # 3. Confident pLDDT (Both Residues)
        # Ensure the status columns exist before filtering
        if "Residue 1 pLDDT status" in final_df.columns and "Residue 2 pLDDT status" in final_df.columns:
            confident_mask = (final_df["Residue 1 pLDDT status"] == "Pass") & \
                                (final_df["Residue 2 pLDDT status"] == "Pass")
            confident_df = final_df[confident_mask].copy()
            confident_output_path = os.path.join(self.base_dir, f"{base_filename}_Confident_pLDDT.csv")
            confident_df.to_csv(confident_output_path, index=False)
            print(f" - Saved Confident pLDDT ({len(confident_df)} rows) to: {confident_output_path}")

            # 4. Low Confidence pLDDT (At least one residue is Low Confidence, excluding N/A)
            low_conf_mask = (
                ((final_df["Residue 1 pLDDT status"] == "Fail") | \
                    (final_df["Residue 2 pLDDT status"] == "Fail")) & \
                (final_df["Residue 1 pLDDT status"] != "N/A") & \
                (final_df["Residue 2 pLDDT status"] != "N/A")
            )
            low_confidence_df = final_df[low_conf_mask].copy()
            low_conf_output_path = os.path.join(self.base_dir, f"{base_filename}_Low_Confidence_pLDDT.csv")
            low_confidence_df.to_csv(low_conf_output_path, index=False)
            print(f" - Saved Low Confidence pLDDT ({len(low_confidence_df)} rows) to: {low_conf_output_path}")
        else:
            print(" - Skipping pLDDT-based filtering as status columns are missing.")

        # SUMMARY STATS
        # 1. Distance Status Summary
        if "CA-CA Distance status" in final_df.columns:
            distance_summary = final_df["CA-CA Distance status"].value_counts().reset_index()
            distance_summary.columns = ['Status', 'Count']
            distance_summary_path = os.path.join(self.base_dir, f"{base_filename}_Distance_Summary.csv")
            distance_summary.to_csv(distance_summary_path, index=False)
            print(f" - Saved Distance Summary to: {distance_summary_path}")
        else:
            print(" - Skipping Distance Summary: 'CA-CA Distance status' column not found.")

        # 2. pLDDT Confidence Summary
        if "Residue 1 pLDDT status" in final_df.columns and "Residue 2 pLDDT status" in final_df.columns:
            # Filter out rows where pLDDT status is N/A for accurate counting
            plddt_valid_df = final_df[
                (final_df["Residue 1 pLDDT status"] != "N/A") &
                (final_df["Residue 2 pLDDT status"] != "N/A")
            ].copy()

            if not plddt_valid_df.empty:
                # Count rows where both are Confident
                both_confident_count = len(plddt_valid_df[
                    (plddt_valid_df["Residue 1 pLDDT status"] == "Pass") &
                    (plddt_valid_df["Residue 2 pLDDT status"] == "Pass")
                ])

                # Count rows where at least one is Low Confidence
                at_least_one_low_conf_count = len(plddt_valid_df[
                    (plddt_valid_df["Residue 1 pLDDT status"] == "Fail") |
                    (plddt_valid_df["Residue 2 pLDDT status"] == "Fail")
                ])

                # Create summary DataFrame
                plddt_summary_data = {
                    'Category': ['Both Residues Pass', 'At Least One Residue Fail'],
                    'Count': [both_confident_count, at_least_one_low_conf_count]
                }
                plddt_summary_df = pd.DataFrame(plddt_summary_data)
                plddt_summary_path = os.path.join(self.base_dir, f"{base_filename}_pLDDT_Summary.csv")
                plddt_summary_df.to_csv(plddt_summary_path, index=False)
                print(f" - Saved pLDDT Summary to: {plddt_summary_path}")
            else:
                    print(" - Skipping pLDDT Summary: No rows with valid pLDDT status found.")

        else:
            print(" - Skipping pLDDT Summary: pLDDT status columns not found.")

        # 3. Detailed pLDDT Confidence Level Summary
        if "Residue 1 pLDDT" in final_df.columns and "Residue 2 pLDDT" in final_df.columns:
            print(" - Generating detailed pLDDT confidence summary...")
            # Define bins and labels for numeric pLDDTs
            bins = [-np.inf, 50, 70, 90, np.inf]
            labels = ['Very Low (pLDDT < 50)', 'Low (70 > pLDDT > 50)', 'High (90 > pLDDT > 70)', 'Very High (pLDDT > 90)']
            all_numeric_categories = labels # Categories for numeric values

            # --- Count Original NAs/Non-Numerics FIRST ---
            # Create boolean masks to identify rows where coercion to numeric results in NaN
            plddt1_na_mask = pd.to_numeric(final_df["Residue 1 pLDDT"], errors='coerce').isna()
            plddt2_na_mask = pd.to_numeric(final_df["Residue 2 pLDDT"], errors='coerce').isna()
            # Sum the masks to get the count of original NAs/non-numerics
            na_count1 = plddt1_na_mask.sum()
            na_count2 = plddt2_na_mask.sum()

            # --- Process Numeric Values ---
            # Coerce to numeric, getting NaN for original non-numerics
            plddt1_numeric = pd.to_numeric(final_df["Residue 1 pLDDT"], errors='coerce')
            plddt2_numeric = pd.to_numeric(final_df["Residue 2 pLDDT"], errors='coerce')

            # Categorize only the valid numeric values (drop NaNs resulting from coercion)
            # Use dropna() on the numeric series before applying cut
            plddt1_cat_numeric = pd.cut(plddt1_numeric.dropna(), bins=bins, labels=labels, right=False).astype(str)
            plddt2_cat_numeric = pd.cut(plddt2_numeric.dropna(), bins=bins, labels=labels, right=False).astype(str)

            # Get value counts for the numeric categories
            plddt1_counts_numeric = plddt1_cat_numeric.value_counts()
            plddt2_counts_numeric = plddt2_cat_numeric.value_counts()

            # --- Combine Counts into Summary DataFrame ---
            # Start with all possible categories (numeric + N/A)
            all_categories_with_na = all_numeric_categories + ['N/A']
            plddt_overall_summary_df = pd.DataFrame({
                'Confidence Level': all_categories_with_na
            }).set_index('Confidence Level')

            # Add counts for numeric categories using reindex to handle missing categories gracefully
            plddt_overall_summary_df['Residue 1 Count'] = plddt1_counts_numeric.reindex(plddt_overall_summary_df.index, fill_value=0)
            plddt_overall_summary_df['Residue 2 Count'] = plddt2_counts_numeric.reindex(plddt_overall_summary_df.index, fill_value=0)

            # Explicitly set the counts for the 'N/A' category using the pre-calculated counts
            plddt_overall_summary_df.loc['N/A', 'Residue 1 Count'] = na_count1
            plddt_overall_summary_df.loc['N/A', 'Residue 2 Count'] = na_count2

            # Ensure integer type
            plddt_overall_summary_df = plddt_overall_summary_df.astype(int)

            # Reorder index if needed (already done by starting with all_categories_with_na)
            # plddt_overall_summary_df = plddt_overall_summary_df.reindex(all_categories_with_na)

            # Save the summary
            plddt_overall_summary_path = os.path.join(self.base_dir, f"{base_filename}_pLDDT_Overall_Summary.csv")
            plddt_overall_summary_df.reset_index().to_csv(plddt_overall_summary_path, index=False)
            print(f" - Saved Detailed pLDDT Summary to: {plddt_overall_summary_path}")

        else:
            print(" - Skipping Detailed pLDDT Summary: pLDDT columns not found.")

        path = os.path.join(
            self.base_dir,
            base_filename + "_XLMS_Distances_WO_Duplicates_WO_Shared.csv",
        )
        self.XLMS_DF_NO_DUPES_NO_SHARED.to_csv(
            os.path.join(
                self.base_dir,
                base_filename + "_XLMS_Distances_WO_Duplicates_WO_Shared.csv",
            ),
            index=False,
        )

        # Store Proteins with Structure Information
        pd.Series(self.XLMS_proteins_with_structure_info).to_csv(
            os.path.join(self.base_dir, "xlms_proteins_with_structure_info.csv")
        )


    def save_barplot(self):
        """
        Generate and save bar plot of residue distances.

        Reads <stem>_XLMS_Distances_WO_Duplicates_WO_Shared.csv from
        self.base_dir (stored on self.df_bplt), draws one bar per row for the
        "Residue Distance" column plus a dashed red line at
        self.residue_distance_threshold, and saves the figure as
        <stem>_XLMS_Distances_Barplot.jpeg in self.base_dir, where <stem> is
        the stem of self.data_file.
        """
        # The same stem is used for the file that is read and the file that is
        # written, so both stay in step for names containing several dots.
        base_filename = pathlib.Path(self.data_file).stem

        self.df_bplt = pd.read_csv(
            os.path.join(
                self.base_dir,
                base_filename + "_XLMS_Distances_WO_Duplicates_WO_Shared.csv",
            )
        )
        Barplot = self.df_bplt.plot.bar(y="Residue Distance", rot=0)
        plt.axhline(y=self.residue_distance_threshold, color="r", linestyle="dashed")
        Barplot.set_title("Distance_Residue Bar_Plot", fontdict={"fontsize": 12})
        # One bar per cross-link: the x tick labels are just row numbers, so hide them.
        Barplot.axes.get_xaxis().set_visible(False)
        Barplot.set_ylabel("Cα-Cα Distance", fontdict={"fontsize": 10})
        plt.savefig(
            os.path.join(
                self.base_dir,
                base_filename + "_XLMS_Distances_Barplot.jpeg",
            ),
            dpi=600,
            bbox_inches="tight",
        )
        plt.close()
        print("Barplot saved")

    def save_histplot(self):
        """
        Generate and save histogram of residue distances with error handling for small datasets.

        Reads <stem>_XLMS_Final_Output.csv from self.base_dir (stored on
        self.df_hplt), keeps the rows whose "Residue Distance" is numeric and
        saves a histogram colored by "CA-CA Distance status" as
        <stem>_XLMS_distances_Histplot.jpeg in self.base_dir.

        If there is no valid distance a warning is printed and no file is
        written. If anything else fails, the error is printed and a placeholder
        image is saved to the same path instead.
        """
        base_filename = pathlib.Path(self.data_file).stem
        # Resolved before the try-block so the fallback below can always use it,
        # whichever step failed.
        output_plot_path = os.path.join(
            self.base_dir,
            base_filename + "_XLMS_distances_Histplot.jpeg"
        )
        figure = None
        try:
            self.df_hplt = pd.read_csv(
                os.path.join(self.base_dir, base_filename + "_XLMS_Final_Output.csv")
            )
            # Convert 'Residue Distance' to numeric, coercing errors to NaN
            self.df_hplt['Residue Distance'] = pd.to_numeric(self.df_hplt['Residue Distance'], errors='coerce')

            # Drop NaN values and check if we have enough data
            valid_data = self.df_hplt.dropna(subset=['Residue Distance'])
            if len(valid_data) == 0:
                print("Warning: No valid distance data found for histogram")
                return

            figure = plt.figure(figsize=(10, 6))

            # Calculate appropriate number of bins based on data size
            n_bins = min(max(5, len(valid_data) // 2), 140)  # At least 5 bins, at most 140

            Histplot = sns.histplot(
                data=valid_data,
                x="Residue Distance",
                bins=n_bins,  # Adjusted number of bins
                stat="probability",
                hue="CA-CA Distance status",
                binwidth=None,  # Let seaborn determine appropriate binwidth
                palette={"Satisfied": "green", "Violated": "red"}
            )

            # Set labels and style
            Histplot.set(
                ylabel="Cross-links",
                xlabel="Cα-Cα Distance"
            )
            plt.ylabel("Cross-links", fontsize=10)
            plt.xlabel("Cα-Cα Distance", fontsize=10)

            # Save the plot
            plt.savefig(output_plot_path, dpi=300, bbox_inches='tight')
            plt.close(figure)
            figure = None

            print(f"Histogram created with {len(valid_data)} valid data points")
        except Exception as e:
            print(f"Error creating histogram: {str(e)}")
            # Discard the half-drawn figure so it is not left open.
            if figure is not None:
                plt.close(figure)
            # Create a minimal plot if regular histogram fails
            try:
                fallback_figure = plt.figure(figsize=(10, 6))
                plt.text(0.5, 0.5, "Insufficient data for histogram",
                        ha='center', va='center')
                plt.savefig(output_plot_path, dpi=300, bbox_inches='tight')
                plt.close(fallback_figure)
            except Exception as e2:
                print(f"Could not create fallback plot: {str(e2)}")

    def visualize_crosslinks(self, is_manual):
        """
        Generate PyMOL visualization of cross-links.

        Args:
            is_manual (bool): If True, use manually uploaded structures
                            (<base_dir>/Uploaded Structures/<protein>.cif);
                            if False, use AlphaFold structures
                            (<base_dir>/AlphaFold Structures/<protein>/<protein>.cif)

        Creates (below <base_dir>/PyMOL Sessions, or
        <base_dir>/Uploaded Structures/PyMOL Sessions in manual mode):
            - <protein>/<protein>-<n>.pse: one session per cross-link
            - <protein>/<protein>-Consolidated.pse: all cross-links of the protein
            - Color-coded distance violations (green=satisfied, red=violated),
              decided from the CA-CA distance measured in the loaded structure

        Side effects:
            The sessions directory is deleted and recreated on every call.
            In manual mode the current working directory is changed to the
            sessions directory (kept from the original implementation).

        Raises:
            Exception: If self.is_visualization_allowed is falsy.

        TODO:
        Optimize this
        """
        from pymol import cmd
        from pymol.cgo import CYLINDER

        if not self.is_visualization_allowed:
            raise Exception("Error in flow: visualize_alphfold_crosslinks. Contact Development Team.")

        # Everything that differs between the two modes is decided once here.
        if is_manual:
            sessions_dir = os.path.join(self.base_dir, "Uploaded Structures", "PyMOL Sessions")
            proteins_with_structure = self.XLMS_cif_files_protein_names
            res1_column = 'Residue 1 Number (Manual)'
            res2_column = 'Residue 2 Number (Manual)'
            # The label shows the distance that belongs to the displayed structure.
            distance_column = 'Residue Distance (Manual)'
        else:
            sessions_dir = os.path.join(self.base_dir, "PyMOL Sessions")
            proteins_with_structure = self.XLMS_proteins_with_structure_info
            res1_column = 'Residue 1 Position'
            res2_column = 'Residue 2 Position'
            distance_column = 'Residue Distance'

        # Always start from an empty sessions directory.
        try:
            os.mkdir(sessions_dir)
        except FileExistsError:
            shutil.rmtree(sessions_dir)
            os.mkdir(sessions_dir)

        # Read the table from the location output_distances() writes it to
        # (base_dir + file stem), independent of the current working directory.
        processed_input = pd.read_csv(
            os.path.join(
                self.base_dir,
                pathlib.Path(self.data_file).stem + '_XLMS_Distances_WO_Duplicates_WO_Shared.csv',
            )
        )

        if is_manual:
            os.chdir(sessions_dir)

        pymol_columns = [
            'uniprotID',
            'Peptide A',
            'Peptide B',
            'Residue 1 Position',
            'Residue 2 Position',
            'Residue Distance',
        ]
        if self.are_manual_structures_verified:
            pymol_columns.extend([
                'Residue Distance (Manual)',
                'Residue 1 Number (Manual)',
                'Residue 2 Number (Manual)',
            ])
        input_for_pymol = processed_input[pymol_columns]

        colors = {
            'green': [0.0, 1.0, 0.0], # green
            'red': [0.82, 0.0, 0.3]   # dubnium
        }
        radius = 0.5
        selection = 'all'
        atom = 'CA'
        prefix = 'xl'
        threshold = self.residue_distance_threshold

        # protein -> zero-based number of the cross-link processed last
        proteins_processed_counter = {}

        for _, row in input_for_pymol.iterrows():
            protein = row['uniprotID']

            if protein not in proteins_with_structure:
                print(f"Skipped PyMOL Session generation for {protein}. Reason: Structure not found!")
                continue
            if is_manual and protein in self.erroneous_XLMS_cif_files_protein_names:
                print(f"Skipped PyMOL Session generation for {protein}. Reason: Structure has a problem!")
                continue

            protein_dir = os.path.join(sessions_dir, protein)
            if protein in proteins_processed_counter:
                proteins_processed_counter[protein] += 1
            else:
                proteins_processed_counter[protein] = 0
                os.mkdir(protein_dir)
            counter = proteins_processed_counter[protein]

            session_path = os.path.join(protein_dir, f'{protein}-{counter}.pse')
            consolidated_path = os.path.join(protein_dir, f'{protein}-Consolidated.pse')

            cmd.reinitialize()
            if is_manual:
                cmd.load(os.path.join(self.base_dir, "Uploaded Structures", f'{protein}.cif'))
            else:
                cmd.load(os.path.join(self.base_dir, 'AlphaFold Structures', protein, f'{protein}.cif'))
            cmd.spectrum('b', 'rainbow_r')  # approximate AF coloring
            cmd.bg_color('white')

            # The CSV may hand these back as floats (e.g. 12.0); use the integer
            # form everywhere so selections and object names stay well-formed.
            res1 = int(row[res1_column])
            res2 = int(row[res2_column])
            res_distance = row[distance_column]

            selection_1 = f'{selection} and resi {res1} and name {atom}'
            selection_2 = f'{selection} and resi {res2} and name {atom}'

            x1, y1, z1 = cmd.get_coords(selection_1, 1)[0]
            x2, y2, z2 = cmd.get_coords(selection_2, 1)[0]
            cmd.distance(selection_1, selection_2)

            d = np.linalg.norm(np.array([x2, y2, z2]) - np.array([x1, y1, z1]))
            r, g, b = colors['green'] if d <= threshold else colors['red']
            # Same color at both ends of the cylinder
            cylinder = [CYLINDER, x1, y1, z1, x2, y2, z2, radius, r, g, b, r, g, b]

            # Individual session for this cross-link
            cmd.load_cgo(cylinder, f'{prefix}_{res1}_{res2}_{atom}')
            cmd.group(prefix, f'{prefix}_*')
            cmd.label(f'{prefix}_*', str(res_distance))
            cmd.save(session_path)

            if counter == 0:
                # The first cross-link of a protein seeds the consolidated session.
                cmd.save(consolidated_path)
            else:
                # Later cross-links are appended to the existing consolidated session;
                # the counter suffix keeps the object names unique.
                cmd.reinitialize()
                cmd.load(consolidated_path)
                cmd.distance(selection_1, selection_2)
                cmd.load_cgo(cylinder, f'{prefix}_{res1}_{res2}_{atom}_{counter}')
                cmd.group(prefix, f'{prefix}_*')
                cmd.label(f'{prefix}_*', str(res_distance))
                cmd.save(consolidated_path)

            print("PyMOL Session saved in " + session_path)

# Module-level AlphaCrossXL instance, created once when this cell is executed.
# NOTE(review): presumably the shared state object for the widget UI defined below - confirm.
main_obj = AlphaCrossXL()

def alphacrossx_logger(log_type, message):
    """Print one log line of the form ``[<timestamp>] [<LOG_TYPE>]: <message>``.

    Args:
        log_type: Severity/category label; it is upper-cased before printing.
        message: Text (or any printable object) to emit after the label.
    """
    # Placeholder logger: extend here if real logging is ever needed.
    now = datetime.datetime.now()
    stamp = now.strftime("%Y-%m-%d %H:%M:%S")
    level = log_type.upper()
    print("[{}] [{}]: {}".format(stamp, level, message))

def alphacrossxl_main():
    """Build and display the AlphaCross-XL ipywidgets user interface.

    The function displays a persistent header, then one container per wizard
    page (every page except the start page is created with ``display: none``),
    defines the click/observe handlers that toggle page visibility and drive
    the module-level ``main_obj``, binds those handlers to their widgets, and
    finally displays a console-log ``Output`` widget followed by a footer.
    """
    # Persistent Header Defined First
    app_title = 'AlphaCross-XL (Colab Version)'
    app_version_number = 'v1.2'
    app_update_date = 'June 20, 2025'
    app_info = 'Python-based Interactive Tool for Analyzing XL-MS Data-sets and creating useful visualizations.'

    # Header markup: centred title and tagline, then version (left) and update date (right).
    app_header_html = widgets.HTML(value=
    f"""
    <div style='margin:10px'>
    <h1 style='text-align: center;'>{app_title}</h1>
    <h5 style='text-align: center;'>{app_info}</h5>
    <hr>
    <h3><span style='text-align: left;'>Version: {app_version_number}</span><span style='float: right;'>Last Updated on: {app_update_date}</span></h3>
    <hr>
    </div>
    """
    )
    # Aligning Widgets StackOverFlow Reference: https://stackoverflow.com/a/62760915
    def _flex_column_layout(alignment):
        # Visible flex column whose children are aligned per `alignment`
        # (no explicit width is set).
        return widgets.Layout(
            display='flex',
            flex_flow='column',
            align_items=alignment,
        )

    centered_box_layout_visible = _flex_column_layout('center')
    left_aligned_box_layout_visible = _flex_column_layout('flex-start')
    right_aligned_box_layout_visible = _flex_column_layout('flex-end')

    # Loading Bar Stuff
    # taken from https://stackoverflow.com/a/62889861
    # The GIF is cached in main_base_dir; its raw bytes end up in `img`.
    loading_bar_path = os.path.join(main_base_dir, 'loading-bar.gif')
    loading_bar_url = 'https://drive.usercontent.google.com/uc?id=1319Us_Vh57iYmBVPdwg-Sk4SVcfdPzQS'
    try:
        with open(loading_bar_path, 'rb') as f:
            img = f.read()
    except OSError:
        # No readable cached copy: download it, keep the bytes for the widget
        # and write them to disk for later use. Only file-system errors are
        # caught here so that unrelated failures (or a KeyboardInterrupt)
        # are not silently turned into a download attempt.
        img = requests.get(loading_bar_url).content
        with open(loading_bar_path, 'wb') as f:
            f.write(img)



    # Loading-bar image widget, wrapped in a centred column so it can be
    # displayed while a long-running step executes.
    loading_bar_layout = Layout(max_height='40px')
    loading_bar = widgets.Image(value=img, layout=loading_bar_layout)
    centered_loading_bar = widgets.VBox(children=[loading_bar], layout=centered_box_layout_visible)

    # Console-log area. It is created here so handlers can write into it,
    # but it is only displayed later, just before the footer.
    output_widget_layout = Layout(
        display='flex',
        flex_flow='column',
        overflow='scroll visible',
        width='',
        max_height='200px',
    )
    output_widget = widgets.Output(layout=output_widget_layout)

    # Persistent header: displayed once, stays above every page.
    display(app_header_html)

    # Widgets
    ## Page 1 Widgets
    # The start button is created disabled with an "initializing" caption.
    button_start_app = widgets.Button(
        description="AlphaCross-XL is Initializing. Please Wait.",
        disabled=True,
        layout=Layout(width='auto'),
    )
    page_start_app = widgets.VBox(
        children=[button_start_app],
        layout=centered_box_layout_visible,
    )


    ## Page 2 Widgets
    # One single-file upload control per input; `accept` lists the file
    # extensions offered by the upload dialog.
    file_upload_data_widget = widgets.FileUpload(
        description='XL-MS Input File',
        accept='.csv, .xlsx',
        multiple=False,
        layout={'width': 'auto'},
    )
    file_upload_fasta_db_widget = widgets.FileUpload(
        description='FASTA Database',
        accept='.gz, .fasta',
        multiple=False,
        layout={'width': 'auto'},
    )

    # Each upload control sits in a row next to its instruction label.
    data_upload_prompt = widgets.Label(value="Please Input the XL-MS Data Set (Only .CSV/.XLSX Files Allowed): ")
    fasta_upload_prompt = widgets.Label(value="Please Input the compressed FASTA Database (Only .FASTA.GZ Files Allowed): ")
    labelled_file_upload_data_widget = widgets.HBox(
        children=[data_upload_prompt, file_upload_data_widget],
    )
    labelled_file_upload_fasta_db_widget = widgets.HBox(
        children=[fasta_upload_prompt, file_upload_fasta_db_widget],
    )

    button_submit_files = widgets.Button(description="Submit Files")
    centered_button_submit_files = widgets.VBox(
        children=[button_submit_files],
        layout=centered_box_layout_visible,
    )

    # The page starts hidden; the start-button handler reveals it.
    page_input_files = widgets.VBox(
        children=[
            labelled_file_upload_data_widget,
            labelled_file_upload_fasta_db_widget,
            centered_button_submit_files,
        ],
        layout={'display': 'none'},
    )

    ## Page 3 Widgets
    # Confirm / cancel step shown after both files have been selected.
    button_confirm_files = widgets.Button(description="Confirm Files")
    button_cancel_files = widgets.Button(description="Cancel Upload")

    page_confirm_or_cancel_files = widgets.VBox(
        children=[button_confirm_files, button_cancel_files],
        layout={'display': 'none'},
    )

    ## Page 4 Widgets
    def _make_column_dropdown(description):
        # Column-picker dropdown; its options stay a placeholder until the
        # input file's columns are assigned by the confirm-files handler.
        return widgets.Dropdown(
            options=['Options not initialized'],
            description=description,
            disabled=False,
            style={'description_width': 'initial'},
            layout={'width': 'max-content'},
        )

    dropdown_uniprot_id = _make_column_dropdown('Choose UniProt ID Column')
    dropdown_xlink_types = _make_column_dropdown('Choose Cross-Link Type Column')
    dropdown_peptide_a = _make_column_dropdown('Choose Peptide A Column')
    dropdown_peptide_b = _make_column_dropdown('Choose Peptide B Column')
    dropdown_link_site_a = _make_column_dropdown('Choose Link Site A Column')
    dropdown_link_site_b = _make_column_dropdown('Choose Link Site B Column')

    button_confirm_columns = widgets.Button(description="Confirm Input Columns", layout={'width': 'auto'})
    button_go_back_columns = widgets.Button(description="Go Back", layout={'width': 'auto'})
    buttons_input_columns = widgets.HBox(children=[button_confirm_columns, button_go_back_columns])

    # Hidden until the uploaded files have been confirmed.
    page_input_columns = widgets.VBox(
        children=[
            dropdown_uniprot_id,
            dropdown_xlink_types,
            dropdown_peptide_a,
            dropdown_peptide_b,
            dropdown_link_site_a,
            dropdown_link_site_b,
            buttons_input_columns,
        ],
        layout={'display': 'none'},
    )


    ## Page 5 Widgets
    def _make_threshold_input(default, description):
        # Integer input bounded to 1..100 in steps of 1.
        return widgets.BoundedIntText(
            value=default,
            min=1,
            max=100,
            step=1,
            description=description,
            disabled=False,
            style={'description_width': 'initial'},
            layout={'width': 'initial'},
        )

    inttext_threshold_dist = _make_threshold_input(
        20, 'Threshold Distance (in Angstroms, Min - 1, Max - 100):')
    inttext_threshold_plddt = _make_threshold_input(
        80, 'pLDDT Threshold (in %, Min - 1, Max - 100):')

    # Options are filled in by the confirm-columns handler.
    dropdown_xlink_type = widgets.Dropdown(
        options=['Options not initialized'],
        description='Choose Cross-Link Type for Analysis',
        disabled=False,
        style={'description_width': 'initial'},
        layout={'width': 'max-content'},
    )

    button_confirm_analysis_options = widgets.Button(description="Confirm Options", layout={'width': 'auto'})
    button_go_back_analysis_options = widgets.Button(description="Go Back", layout={'width': 'auto'})
    buttons_analysis_options = widgets.HBox(
        children=[button_confirm_analysis_options, button_go_back_analysis_options],
    )

    page_analysis_options = widgets.VBox(
        children=[
            inttext_threshold_dist,
            inttext_threshold_plddt,
            dropdown_xlink_type,
            buttons_analysis_options,
        ],
        layout={'display': 'none'},
    )

    ## Page 6 Widgets
    def _make_yes_no_choice(default, description):
        # Two-option radio group ('Yes' / 'No') with the given pre-selection.
        return widgets.RadioButtons(
            options=['Yes', 'No'],
            value=default,
            description=description,
            disabled=False,
            style={'description_width': 'initial'},
            layout={'width': 'max-content'},
        )

    radiobutton_visualization = _make_yes_no_choice(
        'Yes', 'Do you want to generate PyMOL Visualizations: ')
    radiobutton_manual_comparison = _make_yes_no_choice(
        'No', 'Do you want to upload your own protein structures for comparison: ')

    # Starts disabled; the manual-comparison radio handler toggles it.
    file_upload_protein_structure_widget = widgets.FileUpload(
        description='Structure Files',
        accept='.zip',
        multiple=False,
        disabled=True,
    )
    structure_upload_prompt = widgets.Label(value="Please Input your own Protein Structure Files (.cif only) in a .ZIP Archive: ")
    labelled_file_upload_protein_structure_widget = widgets.HBox(
        children=[structure_upload_prompt, file_upload_protein_structure_widget],
    )

    button_confirm_visualization_options = widgets.Button(description="Confirm Visualization Options", layout={'width': 'auto'})
    button_go_back_visualization_options = widgets.Button(description="Go Back", layout={'width': 'auto'})
    buttons_visualization_options = widgets.HBox(
        children=[button_confirm_visualization_options, button_go_back_visualization_options],
    )

    page_visualization_options = widgets.VBox(
        children=[
            radiobutton_visualization,
            radiobutton_manual_comparison,
            labelled_file_upload_protein_structure_widget,
            buttons_visualization_options,
        ],
        layout={'display': 'none'},
    )

    ## Page 7 Widgets - Processing Confirm
    button_preview = widgets.Button(description="Start Analysis", layout={'width': 'auto'})
    button_go_back_preview = widgets.Button(description="Go Back", layout={'width': 'auto'})
    buttons_preview = widgets.HBox(children=[button_preview, button_go_back_preview])

    # The preview page re-uses the very same input widgets that appear on the
    # earlier pages, followed by the start / go-back button row.
    preview_children = [
        file_upload_data_widget,
        file_upload_fasta_db_widget,
        dropdown_uniprot_id,
        dropdown_xlink_types,
        dropdown_peptide_a,
        dropdown_peptide_b,
        dropdown_link_site_a,
        dropdown_link_site_b,
        inttext_threshold_dist,
        inttext_threshold_plddt,
        dropdown_xlink_type,
        radiobutton_visualization,
        radiobutton_manual_comparison,
        file_upload_protein_structure_widget,
        buttons_preview,
    ]
    page_preview = widgets.VBox(children=preview_children, layout={'display': 'none'})
    ## Page 8 Widgets
    # Placeholder results page: the markup contains only an empty <button>
    # element (without a closing tag) inside a margin wrapper.
    # NOTE(review): no display(page_result) call is visible in this part of
    # the function — confirm whether this widget is still used.
    page_result = widgets.HTML(value=
    """
    <div style='margin:10px'>
    <button>
    </div>
    """
    )


    # Display Sequence
    # Every page is rendered up front, in wizard order (pages 1 to 7); the
    # handlers later switch between them by changing each page's layout.
    wizard_pages_in_order = (
        page_start_app,                # Page 1
        page_input_files,              # Page 2
        page_confirm_or_cancel_files,  # Page 3
        page_input_columns,            # Page 4
        page_analysis_options,         # Page 5
        page_visualization_options,    # Page 6
        page_preview,                  # Page 7
    )
    for wizard_page in wizard_pages_in_order:
        display(wizard_page)

    # ..


    # Widget Handler Functions
    ## Page 1 Widget Handler Functions
    def on_click_button_reset(b):
        """Clear the console log and announce that the tool is resetting.

        Currently a placeholder: only the message is printed; no files are
        removed and no page is switched.
        """
        with output_widget:
            clear_output()
            # TODO: clear all files and return to the start page.
            print('Resetting AlphaCross-XL.')

    def on_click_button_start(b):
        """Hide the start page and reveal the file-upload page."""
        centered_column = {
            'display': 'flex',
            'flex_flow':'column',
            'align_items': 'center',
        }
        with output_widget:
            clear_output()
            page_start_app.layout.display = 'none'
            page_input_files.layout.display = 'block'
            # Re-apply the centred column layout to the submit-button wrapper.
            centered_button_submit_files.layout = centered_column
            print("Started. Awaiting Files.")

    ## Page 2 Widget Handler Functions
    def on_click_button_submit_files(b):
        """Check that both input files were chosen, then show the confirm page.

        When either upload widget is empty the user is asked to try again and
        the upload page stays visible.
        """
        with output_widget:
            clear_output()
            print('Verifying Submitted Files.')

            both_present = file_upload_data_widget.value and file_upload_fasta_db_widget.value
            if not both_present:
                clear_output()
                print("Please upload both input files together. Try again")
                return

            page_input_files.layout.display = 'none'
            page_confirm_or_cancel_files.layout = {
                'display': 'flex',
                'flex_flow':'column',
                'align_items': 'center',
            }

            # Only the file names are needed here; the contents are written
            # to disk by the confirm handler.
            data_filename, _ = next(iter(file_upload_data_widget.value.items()))
            print("Data File: ", data_filename)
            db_filename, _ = next(iter(file_upload_fasta_db_widget.value.items()))
            print("Database File: ", db_filename)

            print("Make sure your data file is of correct format and FASTA Database is of correct species.")
            print("If selected files are correct, press confirm to continue.")

    ## Page 3 Widget Handler Functions
    def on_click_button_confirm_files(b):
        """Save both uploads into the working directory and open column selection.

        The uploaded bytes are written under ``main_base_dir``, their paths
        are handed to ``main_obj`` (which then initializes the input file and
        processes the FASTA database), and every column dropdown is populated
        with the input file's columns.

        Raises:
            Exception: if either file is missing on disk after being written.
        """
        with output_widget:
            clear_output()
            os.chdir(main_base_dir)
            main_obj.base_dir = os.getcwd()
            main_obj.base_dir_parent = main_base_dir_parent
            page_confirm_or_cancel_files.layout.display = 'none'
            print('Files Confirmed. Uploading Files.')

            data_filename, data_content = next(iter(file_upload_data_widget.value.items()))
            print("Data File: ", data_filename)
            db_filename, db_content = next(iter(file_upload_fasta_db_widget.value.items()))
            print("Database File: ",db_filename)

            data_file_path = os.path.join(main_obj.base_dir, data_filename)
            db_path = os.path.join(main_obj.base_dir, db_filename)

            # Write each upload's raw bytes to its destination.
            for destination, upload in ((data_file_path, data_content), (db_path, db_content)):
                with open(destination, 'wb') as handle:
                    handle.write(upload['content'])

            if not all(os.path.exists(path) for path in (data_file_path, db_path)):
                raise Exception("Failed to save files. Contact Developer Team.")
            print("Files saved successfully. Proceeding to Column Selection.")

            main_obj.data_file = data_file_path
            main_obj.fasta_db = db_path
            main_obj.initialize_input_file()
            main_obj.process_fasta()
            input_file_columns = main_obj.get_input_file_columns()
            print("Input File Columns: ", input_file_columns)
            print("Please choose corresponding columns in input file.")

            # Every column picker offers the full list of input-file columns.
            column_pickers = (
                dropdown_uniprot_id,
                dropdown_xlink_types,
                dropdown_peptide_a,
                dropdown_peptide_b,
                dropdown_link_site_a,
                dropdown_link_site_b,
            )
            for picker in column_pickers:
                picker.options = input_file_columns

            page_input_columns.layout = {
                'display': 'flex',
                'flex_flow':'column',
                'align_items': 'center',
            }


    def on_click_button_cancel_files(b):
        """Discard the pending uploads and return to the file-upload page."""
        with output_widget:
            clear_output()
            print('Canceled Operation. Please Start Again.')

            # Easy Fix for Faulty File Counter in ipywidgets 7.7.1:
            # reset the counter and empty the stored value by hand.
            for uploader in (file_upload_data_widget, file_upload_fasta_db_widget):
                uploader._counter = 0
                uploader.value.clear()

            page_confirm_or_cancel_files.layout.display = 'none'
            page_input_files.layout.display = 'block'

    ## Page 4 Widget Handler Functions
    def on_click_button_confirm_columns(b):
        """Validate the chosen column mapping and move on to analysis options.

        The six selections must all be different; otherwise a message is
        printed and the page stays open. On success the mapping is passed to
        ``main_obj`` and the cross-link type dropdown is populated.
        """
        with output_widget:
            clear_output()
            user_chosen_columns_dict = {
                "uniprotID": dropdown_uniprot_id.value,
                "X-link type": dropdown_xlink_types.value,
                "Peptide A": dropdown_peptide_a.value,
                "Residue 1": dropdown_link_site_a.value,
                "Residue 2": dropdown_link_site_b.value,
                "Peptide B": dropdown_peptide_b.value,
            }

            selections = list(user_chosen_columns_dict.values())
            has_repeats = len(set(selections)) != len(selections)
            if has_repeats:
                print("You have chosen the same columns for one particular column type.")
                print("Please choose unique columns for each type.")
                return

            print("Columns are Verified")
            main_obj.set_input_file_columns(user_chosen_columns_dict=user_chosen_columns_dict, reset=False)
            page_input_columns.layout.display = 'none'
            dropdown_xlink_type.options = main_obj.get_input_xlink_types()
            page_analysis_options.layout = {
                'display': 'flex',
                'flex_flow':'column',
                'align_items': 'center',
            }
            print('Input Columns Confirmed. Proceeding to Analysis Options.')
            print("Please choose analysis options.")



    def on_click_button_go_back_columns(b):
        """Abandon column selection: delete the saved inputs and return to upload.

        Removes the stored data file, the FASTA database and its pyfastx
        ``.fxi`` index, re-initializes ``main_obj``, empties both upload
        widgets, restores the placeholder dropdown options and shows the
        file-upload page again.
        """
        placeholder_options = ['Options not initialized']
        with output_widget:
            clear_output()
            # Deleting Submitted Files to Preserve Memory
            stale_paths = (
                main_obj.data_file,
                main_obj.fasta_db,
                main_obj.fasta_db + '.fxi',  # pyfastx index file
            )
            for stale_path in stale_paths:
                try:
                    os.remove(stale_path)
                except FileNotFoundError:
                    # Already gone: the goal of the cleanup is met. Skipping
                    # keeps the handler from aborting midway, which would
                    # leave the page stuck and make every retry fail too.
                    pass

            # Resetting Main Object
            main_obj.__init__()
            print('Canceled Operation. Please Start Again.')

            # Easy Fix for Faulty File Counter in ipywidgets 7.7.1
            for uploader in (file_upload_data_widget, file_upload_fasta_db_widget):
                uploader._counter = 0
                uploader.value.clear()

            for picker in (
                dropdown_uniprot_id,
                dropdown_xlink_types,
                dropdown_peptide_a,
                dropdown_peptide_b,
                dropdown_link_site_a,
                dropdown_link_site_b,
            ):
                picker.options = list(placeholder_options)

            page_input_columns.layout.display = 'none'
            page_input_files.layout.display = 'block'


    ## Page 5 Widget Handler Functions
    def on_click_button_confirm_analysis_options(b):
        """Store the chosen analysis options and open the visualization page."""
        with output_widget:
            clear_output()
            chosen_options = {
                'user_x_link_type_chosen': dropdown_xlink_type.value,
                'user_threshold_dist_chosen': inttext_threshold_dist.value,
                'user_threshold_plddt_chosen': inttext_threshold_plddt.value,
            }
            main_obj.set_input_xlink_type_threshold_dist(**chosen_options)

            page_analysis_options.layout.display = 'none'
            print("Analysis Options Confirmed. Proceeding to Visualization Options.")
            page_visualization_options.layout = {
                'display': 'flex',
                'flex_flow':'column',
                'align_items': 'center',
            }

            print("Please choose visualization options.")

    def on_click_button_go_back_analysis_options(b):
        """Reset the stored column mapping and return to column selection."""
        with output_widget:
            clear_output()

            # Same keys as the confirm-columns mapping, every value None.
            column_roles = (
                "uniprotID",
                "X-link type",
                "Peptide A",
                "Residue 1",
                "Residue 2",
                "Peptide B",
            )
            user_chosen_columns_dict = dict.fromkeys(column_roles)
            main_obj.set_input_file_columns(user_chosen_columns_dict=user_chosen_columns_dict, reset=True)

            page_analysis_options.layout.display = 'none'
            page_input_columns.layout = {
                'display': 'flex',
                'flex_flow':'column',
                'align_items': 'center',
            }
            dropdown_xlink_type.options = ['Options not initialized']



    ## Page 6 Widget Handler Functions
    def on_click_button_confirm_visualization_options(b):
        """Record the visualization choices and show the read-only preview.

        If manual comparison is selected but no archive was uploaded, a
        reminder is printed and nothing else happens. Otherwise both flags
        are stored on ``main_obj``, an uploaded archive is written to the
        working directory and handed to ``main_obj`` for verification, every
        input widget is disabled, and the preview page is revealed.
        """
        with output_widget:
            clear_output()

            manual_requested = radiobutton_manual_comparison.value == 'Yes'
            if manual_requested and file_upload_protein_structure_widget.value == {}:
                print('Please upload the .cif files in a single .zip archive.')
                return

            main_obj.is_visualization_allowed = radiobutton_visualization.value == 'Yes'
            main_obj.is_manual_protein_struct = manual_requested

            if main_obj.is_manual_protein_struct:
                uploaded_archive = next(iter(file_upload_protein_structure_widget.value.items()))
                protein_struct_filename, protein_struct_content = uploaded_archive
                print("Protein Structures Archive Uploaded: ", protein_struct_filename)
                protein_struct_path = os.path.join(main_obj.base_dir, protein_struct_filename)
                main_obj.manual_protein_struct_file = protein_struct_path
                with open(protein_struct_path, 'wb') as archive_file:
                    archive_file.write(protein_struct_content['content'])
                main_obj.verify_and_process_manual_protein_struct_file()

            print("Visualization Options Confirmed. Proceeding to Final Preview.")
            page_visualization_options.layout.display = 'none'

            # The preview shows the inputs read-only, so lock all of them.
            inputs_to_lock = (
                file_upload_data_widget,
                file_upload_fasta_db_widget,
                dropdown_uniprot_id,
                dropdown_xlink_types,
                dropdown_peptide_a,
                dropdown_peptide_b,
                dropdown_link_site_a,
                dropdown_link_site_b,
                inttext_threshold_dist,
                inttext_threshold_plddt,
                dropdown_xlink_type,
                radiobutton_visualization,
                radiobutton_manual_comparison,
                file_upload_protein_structure_widget,
            )
            for input_widget in inputs_to_lock:
                input_widget.disabled = True

            page_preview.layout = {
                'display': 'flex',
                'flex_flow':'column',
                'align_items': 'center',
            }


    def on_click_button_go_back_visualization_options(b):
        """Reset the stored analysis options and return to the analysis page."""
        with output_widget:
            clear_output()

            cleared_options = {
                'user_x_link_type_chosen': None,
                'user_threshold_dist_chosen': None,
                'user_threshold_plddt_chosen': None,
            }
            main_obj.set_input_xlink_type_threshold_dist(reset=True, **cleared_options)

            page_visualization_options.layout.display = 'none'
            page_analysis_options.layout = {
                'display': 'flex',
                'flex_flow':'column',
                'align_items': 'center',
            }

    def on_change_radiobutton_manual_comparison(change):
        """Enable or disable the structure upload to follow the radio choice.

        Args:
            change: Change notification from the radio group; only its
                ``'new'`` entry is read.
        """
        with output_widget:
            if change['new'] != 'Yes':
                # Manual comparison switched off: lock the uploader and drop
                # anything already uploaded.
                file_upload_protein_structure_widget.disabled = True
                file_upload_protein_structure_widget.value.clear()
                file_upload_protein_structure_widget._counter = 0
                return

            file_upload_protein_structure_widget.disabled = False
            print("Please upload your .cif structure files in a single .zip archive.")
            print("Make sure your .cif files are labelled as per format: {UniProt_ID}.cif")
            print("Incorrectly named files will be ignored.")

    ## Page 7 Widget Handler Functions
    def on_click_button_go_back_preview(b):
        """Leave the final preview and return to the visualization-options page.

        Any manually uploaded structures are reset on ``main_obj``, the input
        widgets that were locked for the preview are unlocked again, and the
        visualization-options page is shown.
        """
        with output_widget:
            clear_output()
            # This handler goes *back*; it must not announce that the
            # analysis is about to begin.
            print("Returning to Visualization Options.")
            page_preview.layout.display = 'none'
            if main_obj.is_manual_protein_struct:
                main_obj.verify_and_process_manual_protein_struct_file(reset=True)

            inputs_to_unlock = (
                file_upload_data_widget,
                file_upload_fasta_db_widget,
                dropdown_uniprot_id,
                dropdown_xlink_types,
                dropdown_peptide_a,
                dropdown_peptide_b,
                dropdown_link_site_a,
                dropdown_link_site_b,
                inttext_threshold_dist,
                inttext_threshold_plddt,
                dropdown_xlink_type,
                radiobutton_visualization,
                radiobutton_manual_comparison,
            )
            for input_widget in inputs_to_unlock:
                input_widget.disabled = False

            # The structure uploader is only usable while manual comparison
            # is set to 'Yes' (same rule as the radio-button observer), so
            # restore that state instead of enabling it unconditionally.
            file_upload_protein_structure_widget.disabled = (
                radiobutton_manual_comparison.value != 'Yes'
            )

            page_visualization_options.layout = {
                'display': 'flex',
                'flex_flow':'column',
                'align_items': 'center',
            }

    def on_click_button_preview(b):
        """Run the whole analysis, bundle the results into a zip and download it.

        The ``main_obj`` pipeline methods are called in a fixed order; after
        each stage the console log is replaced by the loading bar plus a
        one-line status. Crosslinks are visualized according to the two
        visualization flags on ``main_obj``. The intermediate folders are then
        zipped and removed, the working directory itself is zipped into the
        parent directory, and the archive is offered via ``files.download``.
        """
        def show_progress(status_line):
            # Replace the console contents with the loading bar and a status line.
            clear_output()
            display(centered_loading_bar)
            print(status_line)

        def archive_and_remove(archive_name, folder_name):
            # Zip a sub-folder of the working directory, then delete the folder.
            folder_path = os.path.join(main_obj.base_dir, folder_name)
            shutil.make_archive(archive_name, 'zip', folder_path)
            shutil.rmtree(folder_path)

        with output_widget:
            clear_output()
            display(centered_loading_bar)
            page_preview.layout.display = 'none'

            print("Confirmed all parameters. Starting Analysis! Please Wait...")

            main_obj.convert_to_xlms_format()
            show_progress("Converted to suitable formats.")

            # will have to modify this for in-house protein centric support
            main_obj.calculate_absolute_chain_pos()
            show_progress("Obtained Absolute Positions of Link-Sites in Protein Chain.")

            main_obj.proteins_from_alphafold()
            show_progress("Downloaded Protein Structures from AlphaFold")

            main_obj.calculate_residue_distance_and_betas_all()
            show_progress("Computed Residue Distance and extracting pLDDT.")

            main_obj.insert_analysis_results_to_raw_input()
            show_progress("Inserted values for duplicates.")

            main_obj.output_distances()
            show_progress("Created Output Files")

            main_obj.save_barplot()
            main_obj.save_histplot()
            show_progress("Created Plots")

            if not main_obj.is_visualization_allowed:
                print("No Crosslinks Visualized.")
            elif main_obj.is_manual_protein_struct:
                main_obj.visualize_crosslinks(is_manual=True)
                main_obj.visualize_crosslinks(is_manual=False)
                show_progress("Visualized Crosslinks (Both AlphaFold and Uploaded Structures)")
            else:
                main_obj.visualize_crosslinks(is_manual=False)
                show_progress("Visualized Crosslinks (AlphaFold Only)")

            print("Analysis Complete!")
            print("Compressing all analysis files into a .zip archive. Please wait...")

            os.chdir(main_obj.base_dir)
            date_string = datetime.date.today().strftime('%Y-%m-%d')
            # Suffix for the archive name: the data file's name up to its first dot.
            uploaded_name = main_obj.data_file.split("/")[-1]
            data_filename = '-' + uploaded_name.split(".")[0]

            if main_obj.are_manual_structures_verified:
                archive_and_remove('acxl-uploaded_structs-data', 'Uploaded Structures')
            if main_obj.is_visualization_allowed:
                archive_and_remove('acxl-alphafold-pymol-sessions', 'PyMOL Sessions')
            archive_and_remove('acxl-alphafold-structures', 'AlphaFold Structures')

            os.remove(os.path.join(main_obj.base_dir, 'loading-bar.gif'))

            # The final archive is created in the parent directory and holds
            # the whole working directory.
            os.chdir(main_obj.base_dir_parent)
            results_archive_name = 'acxl-results-' + date_string + data_filename
            shutil.make_archive(results_archive_name, 'zip', main_obj.base_dir)
            result_file_path = os.path.join(main_obj.base_dir_parent, results_archive_name + '.zip')

            clear_output()
            print("Results are stored in: ",result_file_path)
            for closing_line in (
                "Thank you for using AlphaCross-XL!",
                "Downloading Results!",
                "Note: If the download doesn't start automatically, you can download the file manually.",
                "Note: Click the Folder Icon on the RHS Tool bar to see the directory structure.",
                "This may take upto 5-10 if Cross-Links are visualized!",
                "To run the tool again, run this cell again by using the play button or Ctrl/Cmd + Enter",
            ):
                print(closing_line)
            files.download(result_file_path)

    # Widget-Handler Bindings
    # (button, click handler) pairs, grouped by the page each button is on.
    click_bindings = (
        # Page 1
        (button_start_app, on_click_button_start),
        # Page 2
        (button_submit_files, on_click_button_submit_files),
        # Page 3
        (button_cancel_files, on_click_button_cancel_files),
        (button_confirm_files, on_click_button_confirm_files),
        # Page 4
        (button_confirm_columns, on_click_button_confirm_columns),
        (button_go_back_columns, on_click_button_go_back_columns),
        # Page 5
        (button_confirm_analysis_options, on_click_button_confirm_analysis_options),
        (button_go_back_analysis_options, on_click_button_go_back_analysis_options),
        # Page 6
        (button_confirm_visualization_options, on_click_button_confirm_visualization_options),
        (button_go_back_visualization_options, on_click_button_go_back_visualization_options),
        # Page 7
        (button_go_back_preview, on_click_button_go_back_preview),
        (button_preview, on_click_button_preview),
    )
    for bound_button, click_handler in click_bindings:
        bound_button.on_click(click_handler)

    # Page 6: only the manual-comparison radio group is observed;
    # radiobutton_visualization has no change handler attached.
    radiobutton_manual_comparison.observe(on_change_radiobutton_manual_comparison, names='value')

    ## Page 8




    # Processing Functions and Function Calls
    ## Page 1 Processing Functions and Function Calls
    def update_tool_status_idle():
        """Clear the console log and show the "Idle" status in it."""
        # Both calls run inside the output_widget context manager.
        with output_widget:
            clear_output(wait=False)
            print("Idle")
    update_tool_status_idle()

    # Formatted Output Widget
    # A heading stacked on top of the shared output_widget.
    console_log_heading = widgets.HTML(
        value="<hr><h3 style='text-align: center;'>Console Log (Scroll for Long Outputs)</h3>"
    )
    arranged_output = widgets.VBox(children=[console_log_heading, output_widget])

    # Persistent Footer Defined Last
    app_credits = '© 2024-2025 AlphaCross-XL Development Team.<br>This tool was developed as a collaborative project at Proteomics Lab, IIT Bombay and Wiita Lab, UCSF, with assistance from Sali Lab, UCSF.'
    # The markup is built first so the HTML widget receives a ready-made string.
    footer_markup = f"""
    <div style='margin:0px'>
    <hr>
    <h5 style='text-align: center;'>{app_credits}</h5>
    </div>
    """
    app_footer = widgets.HTML(value=footer_markup)

    # Render the console log followed by the footer.
    display(arranged_output, app_footer)

    # Most Important Function which needs to be run once window is rendered
    def configure_dependencies():
        with output_widget:
            try:
                clear_output()
                print("Configuring Dependencies.")
                import prody, pymol
                import pyfastx as pyfx
                print("Dependencies Configured!")
                print(pymol.get_version_message())
                print("ProDy Version: ", prody.__version__, ", PyFastX Version: ", pyfx.__version__)
                button_start_app.description = "Click Here to Start AlphaCross-XL"
                button_start_app.disabled = False
                print("AlphaCross-XL is Initialized!")
                print("Base Directory: ", main_base_dir)

                #update_tool_status_idle()
            except:
                #print('time taken to run:',t2-t1)
                button_start_app.description = "AlphaCross-XL is Initializing for the First Time. Please Wait upto 10 Minutes."
                clear_output()
                display(centered_loading_bar)
                print("ProDy and PyFastx are being installed! Please be patient. This takes 1-2 Minutes!")
                try:
                    start_time = time.perf_counter()
                    # No need to install specific version of biopython anymore!
                    !pip install pyfastx

                    # Build ProDy from Source
                    # This was done to avoid the issues with Numpy v2
                    !git clone https://github.com/prody/ProDy.git
                    os.chdir(os.path.join(main_base_dir, 'ProDy'))
                    !python setup.py build_ext --inplace --force
                    !pip install -U .
                    shutil.rmtree(os.path.join(main_base_dir, 'ProDy'))
                    # You need to set your cwd again
                    os.chdir(main_base_dir)

                    check_time = time.perf_counter()
                    clear_output()
                    display(centered_loading_bar)
                    print("Time taken to install ProDy, PyFastx: ", str(datetime.timedelta(seconds=int(check_time - start_time))))
                    print("Installing PyMOL. This will take 6-8 Minutes! Do not close the window.")
                    !apt-get install -yq git build-essential python3-dev libglew-dev \
                    libpng-dev libfreetype6-dev libxml2-dev \
                    libmsgpack-dev python3-pyqt5.qtopengl libglm-dev libnetcdf-dev

                    clear_output()
                    display(centered_loading_bar)
                    check_time = time.perf_counter()
                    print("Time Elapsed: ", str(datetime.timedelta(seconds=int(check_time - start_time)))," minutes.")
                    print("Building PyMOL")

                    # Get Latest PyMOL build and build it
                    !git clone https://github.com/schrodinger/pymol-open-source.git
                    !git clone https://github.com/rcsb/mmtf-cpp.git
                    !mv mmtf-cpp/include/mmtf* pymol-open-source/include/
                    os.chdir(os.path.join(main_base_dir, 'pymol-open-source'))
                    !python setup.py install
                    shutil.rmtree(os.path.join(main_base_dir, 'pymol-open-source'))
                    shutil.rmtree(os.path.join(main_base_dir, 'mmtf-cpp'))

                    # All built!
                    os.chdir(main_base_dir)
                    import pymol, prody
                    import pyfastx as pyfx
                    clear_output()
                    check_time = time.perf_counter()
                    print("PyMOL is Installed. Dependency Installation Completed.")
                    print("Total Time Elapsed: ", str(datetime.timedelta(seconds=int(check_time - start_time)))," minutes.")
                    print("Dependencies Configured!")
                    button_start_app.description = "Click Here to Start AlphaCross-XL"
                    button_start_app.disabled = False
                    print("AlphaCross-XL is Initialized!")
                    print(pymol.get_version_message())
                    print("ProDy Version: ", prody.__version__, ", PyFastX Version: ", pyfx.__version__)
                    print("Base Directory: ", main_base_dir)

                except:
                    clear_output()
                    print("Initialization Terminated! This shouldn't happen, unless you manually interrupted the execution.")
                    print("Fatal Error Installing Dependenices")
                    button_start_app.description = "Error Initializing AlphaCross-XL Dependencies. Please contact Development Team"
                    button_start_app.disabled = False
    configure_dependencies()



alphacrossxl_main()


HTML(value="\n    <div style='margin:10px'>\n    <h1 style='text-align: center;'>AlphaCross-XL (Colab Version)…

VBox(children=(Button(description='AlphaCross-XL is Initializing. Please Wait.', disabled=True, layout=Layout(…

VBox(children=(HBox(children=(Label(value='Please Input the XL-MS Data Set (Only .CSV/.XLSX Files Allowed): ')…

VBox(children=(Button(description='Confirm Files', style=ButtonStyle()), Button(description='Cancel Upload', s…

VBox(children=(Dropdown(description='Choose UniProt ID Column', layout=Layout(width='max-content'), options=('…

VBox(children=(BoundedIntText(value=20, description='Threshold Distance (in Angstroms, Min - 1, Max - 100):', …

VBox(children=(RadioButtons(description='Do you want to generate PyMOL Visualizations: ', layout=Layout(width=…

VBox(children=(FileUpload(value={}, accept='.csv, .xlsx', description='XL-MS Input File', layout=Layout(width=…

VBox(children=(HTML(value="<hr><h3 style='text-align: center;'>Console Log (Scroll for Long Outputs)</h3>"), O…

HTML(value="\n    <div style='margin:0px'>\n    <hr>\n    <h5 style='text-align: center;'>© 2024-2025 AlphaCro…