In [8]:
import pandas as pd 
import numpy as np
import irtho

### what we need

Protein fasta, DNA fasta, GFF. Beginning with two references, one input, one output.

1. Load protein fasta + gff and find longest transcripts
2. Subset to only longest transcripts
3. Run Orthofinder
4. Find main orthologs, calculate synteny, find syntenic orthologs
5. Visualise synteny / Orthology in each species
6. Once we have genes, take each gene and each snp, align them, find the new position in the new reference.

Lets locate our reference genomes in the `resources/reference` folder

In [46]:
import glob

vb_refs = [ref.split("reference/")[1].rstrip(".gff") for ref in glob.glob('../resources/reference/*gff')]
np.sort(vb_refs)

array(['AaegyptiLVP_AGWG', 'Aalbopictus_AalbF5', 'AdirusWRAIR2',
       'AfunestusAfunGA1', 'AgambiaePEST', 'AminimusMINIMUS1',
       'AsinensisChina', 'AstephensiUCISS2018',
       'CquinquefasciatusJHB2020', 'LlongipalpisASM'], dtype='<U24')

In [7]:
# genomes = np.array([
#        'AaegyptiLVP_AGWG', 'Aalbopictus_AalbF5', 'AdirusWRAIR2',
#        'AfunestusAfunGA1', 'AgambiaePEST', 'AminimusMINIMUS1',
#        'AsinensisChina', 'AstephensiUCISS2018',
#        'CquinquefasciatusJHB2020', 'LlongipalpisASM']
# )

# for g in genomes:
#     irtho.write_longest_isoforms(g, debug=True, path_to_references="../resources/reference")

## Run Orthofinder

### Synteny analysis

In [47]:
# # Run OrthoFinder on a directory with additional arguments
# input_directory = "../results/proteome/"
# additional_arguments = ["-t", "48", "-o", "../results/orthofinder"]  # Example: Use 4 threads
# irtho.run_orthofinder(input_directory, additional_args=additional_arguments, debug=True)

### Orthofinder results

In [48]:
import os
import pandas as pd

class OrthoFinderResults:
    """
    A class to parse and store relevant information from OrthoFinder output.
    """

    def __init__(self, results_dir, debug=False):
        """
        Initialize the OrthoFinderResults class.

        Args:
            results_dir (str): Path to the OrthoFinder results directory.
            debug (bool): Whether to print debug information.
        """
        self.results_dir = results_dir
        self.debug = debug
        self.hierarchical_orthogroups = None
        self.one_to_one_orthologs = None

        if debug:
            print(f"Initializing OrthoFinderResults for directory: {results_dir}")

        # Load relevant files
        file_path = os.path.join(
            self.results_dir, "Comparative_Genomics_Statistics", "OrthologuesStats_one-to-one.tsv"
        )
        self.stats = pd.read_csv(file_path, sep="\t", index_col=0)
        self.orthogroups = self._load_hierarchical_orthogroups()

    def _load_hierarchical_orthogroups(self):
        """
        Load the hierarchical orthogroups file (N0.tsv).
        """
        file_path = os.path.join(
            self.results_dir, "Phylogenetic_Hierarchical_Orthogroups", "N0.tsv"
        )
        if os.path.exists(file_path):
            if self.debug:
                print(f"Loading hierarchical orthogroups from: {file_path}")
            self.hierarchical_orthogroups = pd.read_csv(file_path, sep="\t")
        else:
            if self.debug:
                print(f"File not found: {file_path}")
            self.hierarchical_orthogroups = None

    def get_orthologs(self, species_a, species_b):
        """
        Get orthologs (one-to-one, one-to-many, and many-to-many) between two species.

        Args:
            species_a (str): Name of the first species.
            species_b (str): Name of the second species.

        Returns:
            pd.DataFrame: A DataFrame containing orthologs between the two species.
                        Includes a column 'ortholog_type' to specify 'one-to-one',
                        'one-to-many', or 'many-to-many'.
        """
        # Define the path to the pairwise orthologs file
        orthologs_file = os.path.join(
            self.results_dir,
            "Orthologues",
            f"Orthologues_{species_a}",
            f"{species_a}__v__{species_b}.tsv",
        )

        # Check if the file exists
        if not os.path.exists(orthologs_file):
            raise FileNotFoundError(
                f"Orthologs file not found: {orthologs_file}. Ensure the species names are correct."
            )

        if self.debug:
            print(f"Loading orthologs from: {orthologs_file}")

        # Load the orthologs file
        orthologs_df = pd.read_csv(orthologs_file, sep="\t")

        # Ensure the required columns are present
        required_columns = [species_a, species_b, "Orthogroup"]
        if not all(col in orthologs_df.columns for col in required_columns):
            raise ValueError(
                f"Orthologs file does not contain the required columns: {required_columns}"
            )

        if self.debug:
            print(f"Orthologs file loaded with {len(orthologs_df)} rows.")

        # Identify one-to-one orthologs
        one_to_one = orthologs_df[
            (orthologs_df[species_a].str.contains(",") == False)  # No commas in species_a
            & (orthologs_df[species_b].str.contains(",") == False)  # No commas in species_b
        ].copy()
        one_to_one["ortholog_type"] = "one-to-one"

        if self.debug:
            print(f"Found {len(one_to_one)} one-to-one orthologs.")

        # Identify one-to-many orthologs
        one_to_many = orthologs_df[
            (orthologs_df[species_a].str.contains(",") == False)  # Single gene in species_a
            & (orthologs_df[species_b].str.contains(","))  # Multiple genes in species_b
        ].copy()
        one_to_many["ortholog_type"] = "one-to-many"

        if self.debug:
            print(f"Found {len(one_to_many)} one-to-many orthologs.")

        # Identify many-to-many orthologs
        many_to_many = orthologs_df[
            (orthologs_df[species_a].str.contains(","))  # Multiple genes in species_a
            & (orthologs_df[species_b].str.contains(","))  # Multiple genes in species_b
        ].copy()
        many_to_many["ortholog_type"] = "many-to-many"

        if self.debug:
            print(f"Found {len(many_to_many)} many-to-many orthologs.")

        # Combine all ortholog types into a single DataFrame
        all_orthologs = pd.concat([one_to_one, one_to_many, many_to_many], ignore_index=True)

        if self.debug:
            print(f"Total orthologs retrieved: {len(all_orthologs)}")

        return all_orthologs.rename(columns={species_a: "gene"})


    def get_hierarchical_orthogroup(self, orthogroup_id):
        """
        Get the genes in a specific hierarchical orthogroup.

        Args:
            orthogroup_id (str): The ID of the hierarchical orthogroup.

        Returns:
            pd.DataFrame: A DataFrame containing the genes in the specified orthogroup.
        """
        if self.hierarchical_orthogroups is None:
            raise ValueError("Hierarchical orthogroups file not loaded.")

        if self.debug:
            print(f"Retrieving hierarchical orthogroup: {orthogroup_id}")

        # Filter for the specified orthogroup
        orthogroup = self.hierarchical_orthogroups[
            self.hierarchical_orthogroups["HOG"] == orthogroup_id
        ]

        return orthogroup

    def list_species(self):
        """
        List all species in the OrthoFinder results.

        Returns:
            list: A list of species names.
        """
        if self.stats is None:
            raise ValueError("One-to-one orthologs statistics file not loaded.")

        return list(self.stats.columns)

    def __repr__(self):
        """
        Provide a pretty string representation of the OrthoFinderResults object.
        """
        repr_str = "OrthoFinderResults Summary\n"
        repr_str += f"Results Directory: {self.results_dir}\n"
        repr_str += "-" * 40 + "\n"

        if self.stats is not None:
            repr_str += f"One-to-One Orthologs Loaded: {self.stats.shape[0]} genes x {self.stats.shape[1]} species\n"
        else:
            repr_str += "One-to-One Orthologs: Not Loaded\n"

        if self.hierarchical_orthogroups is not None:
            repr_str += f"Hierarchical Orthogroups Loaded: {self.hierarchical_orthogroups.shape[0]} orthogroups\n"
        else:
            repr_str += "Hierarchical Orthogroups: Not Loaded\n"

        repr_str += "-" * 40 + "\n"
        repr_str += "Available Methods:\n"
        repr_str += " - list_species(): List all species in the results\n"
        repr_str += " - get_one_to_one_orthologs(species_a, species_b): Retrieve one-to-one orthologs between two species\n"
        repr_str += " - get_hierarchical_orthogroup(orthogroup_id): Retrieve genes in a specific hierarchical orthogroup\n"

        return repr_str

In [49]:
def map_input_genes_to_orthologs(
    input_df,
    reference_species,
    target_species,
    self, 
    debug=False,
):
    """
    Map genes from an input DataFrame to their orthologs in a target species.

    Args:
        input_df (pd.DataFrame): Input DataFrame with a 'gene' column containing gene IDs.
        reference_species (str): Name of the reference species.
        target_species (str): Name of the target species.
        orthofinder_results (OrthoFinderResults): An instance of the OrthoFinderResults class.
        gene_mapping_df (pd.DataFrame): DataFrame with columns ['gene_id', 'transcript_id', 'protein_id'].
        debug (bool): Whether to print debug information.

    Returns:
        pd.DataFrame: A DataFrame with the original input and additional columns:
                      'ortholog_gene' and 'ortholog_type'.
    """
    # Step 1: Validate species in OrthoFinder results
    available_species = self.list_species()
    if reference_species not in available_species or target_species not in available_species:
        raise ValueError(
            f"One or both species ({reference_species}, {target_species}) are not in the OrthoFinder results."
        )

    if debug:
        print(f"Reference species: {reference_species}")
        print(f"Target species: {target_species}")
        print(f"Available species: {available_species}")

    # Step 2: Retrieve orthologs (one-to-one, one-to-many, and many-to-many)
    orthologs = self.get_orthologs(reference_species, target_species)

    if debug:
        print(f"Retrieved {len(orthologs)} orthologs (one-to-one, one-to-many, and many-to-many).")

    # Step 3: Merge the orthologs back into the input DataFrame
    final_df = input_df.merge(orthologs, on="gene", how="left")

    if debug:
        print(f"Final result contains {len(final_df)} rows.")
        print(final_df.head())

    return final_df


In [50]:
# Initialize OrthoFinderResults
results_dir = "../results/orthofinder/Results_Feb03_1"
ortho = OrthoFinderResults(results_dir, debug=True)

# Load input DataFrame
input_file = "../Irtho-genes.tsv"
input_df = pd.read_csv(input_file, sep="\t")

# Define species
reference_species = "AgambiaePEST"
target_species = "CquinquefasciatusJHB2020"

# Map genes to orthologs
targets_df = map_input_genes_to_orthologs(
    input_df,
    reference_species,
    target_species,
    ortho,
    debug=False,
)
targets_df

Initializing OrthoFinderResults for directory: ../results/orthofinder/Results_Feb03_1
Loading hierarchical orthogroups from: ../results/orthofinder/Results_Feb03_1/Phylogenetic_Hierarchical_Orthogroups/N0.tsv
Loading orthologs from: ../results/orthofinder/Results_Feb03_1/Orthologues/Orthologues_AgambiaePEST/AgambiaePEST__v__CquinquefasciatusJHB2020.tsv
Orthologs file loaded with 9535 rows.
Found 8337 one-to-one orthologs.
Found 514 one-to-many orthologs.
Found 156 many-to-many orthologs.
Total orthologs retrieved: 9007


Unnamed: 0,genome,gene,chrom,start,end,type,desc,transcript,codon,Orthogroup,CquinquefasciatusJHB2020,ortholog_type
0,AgambiaePEST,AGAP004707,AgamP4_2L,2422651.0,2422652.0,neural,Vgsc-L995F,AGAP004707-RD,995.0,OG0003301,CQUJHB005371,one-to-one
1,AgambiaePEST,AGAP004707,AgamP4_2L,2422650.0,2422651.0,neural,Vgsc-L995S,AGAP004707-RD,995.0,OG0003301,CQUJHB005371,one-to-one
2,AgambiaePEST,AGAP004707,AgamP4_2L,2391227.0,2391228.0,neural,Vgsc-V402L,AGAP004707-RD,402.0,OG0003301,CQUJHB005371,one-to-one
3,AgambiaePEST,AGAP006048,AgamP4_2L,,,p450,cyp4j5,,,OG0000166,CQUJHB002907,one-to-one
4,AgambiaePEST,AGAP006028,AgamP4_2L,25429234.0,25429235.0,neural,Rdl,AGAP006028-RA,296.0,OG0005465,CQUJHB004999,one-to-one
5,AgambiaePEST,AGAP006028,AgamP4_2L,25429235.0,25429236.0,neural,Rdl,AGAP006028-RA,296.0,OG0005465,CQUJHB004999,one-to-one
6,AgambiaePEST,AGAP006227,AgamP4_2L,,,coe,Coeae1f-E477D,AGAP006227-RA,477.0,OG0001708,CQUJHB000812,one-to-one
7,AgambiaePEST,AGAP006228,AgamP4_2L,,,coe,Coeae2f,,,OG0009784,CQUJHB006176,one-to-one
8,AgambiaePEST,AGAP001356,AgamP4_2R,3492073.0,3492074.0,neural,Ace1-G280S,AGAP001356.R546,280.0,OG0003646,CQUJHB013404,one-to-one
9,AgambiaePEST,AGAP008212,AgamP4_3R,,,p450,cyp6m2,,,,,


### Build protein alignments

- find target genome protein, get ref genome isoform
- make alignment between two, get index of target genome position 
- convert back to genome coordinates
- add to dataframe

In [51]:
import pandas as pd

def parse_attributes(attr_string):
    """Parse GFF attribute string into dictionary."""
    attributes = {}
    for attr in attr_string.split(';'):
        if '=' in attr:
            key, value = attr.split('=', 1)
            attributes[key] = value
    return attributes

def create_gene_mapping(gff_file):
    """
    Create a pandas DataFrame mapping genes to their transcripts and proteins.
    Handles both VectorBase and RefSeq GFF formats.
    
    Args:
        gff_file (str): Path to the GFF file
        
    Returns:
        pandas.DataFrame: DataFrame with columns for gene_id, transcript_id, and protein_id
    """
    gene_transcript_protein_map = []
    seen = set()  # To track unique combinations
    
    with open(gff_file) as f:
        for line in f:
            if line.startswith('#'):
                continue
                
            fields = line.strip().split('\t')
            if len(fields) < 9 or fields[2] != 'CDS':
                continue
            
            source = fields[1]
            attributes = parse_attributes(fields[8])
            
            # Handle VectorBase format
            if source == 'VEuPathDB':
                gene_id = attributes.get('gene_id')
                transcript_id = attributes.get('Parent')
                protein_id = attributes.get('protein_source_id')
                
            # Handle RefSeq format - only process if source is RefSeq
            elif source == 'RefSeq':
                gene_id = attributes.get('gene')  # LOC ID
                transcript_id = attributes.get('Parent')
                if transcript_id and transcript_id.startswith('rna-'):
                    transcript_id = transcript_id[4:]  # Remove 'rna-' prefix
                protein_id = attributes.get('protein_id')
                if protein_id and protein_id.startswith('cds-'):
                    protein_id = protein_id[4:]  # Remove 'cds-' prefix
                    
            else:
                continue  # Skip all other sources
                
            # Skip if any required field is missing
            if not all([gene_id, transcript_id, protein_id]):
                continue
                
            # Create tuple of the mapping
            mapping = (gene_id, transcript_id, protein_id)
            
            # Only add if we haven't seen this combination before
            if mapping not in seen:
                gene_transcript_protein_map.append({
                    'gene_id': gene_id,
                    'transcript_id': transcript_id,
                    'protein_id': protein_id
                })
                seen.add(mapping)
    
    # Convert to DataFrame
    df = pd.DataFrame(gene_transcript_protein_map)
    
    # Sort by gene_id to maintain consistent order
    df = df.sort_values('gene_id').reset_index(drop=True)
    
    return df

In [52]:
from Bio import SeqIO
import pandas as pd

def extract_protein_sequences(ref_transcript_id, target_gene_id, 
                          ref_gene_map, target_gene_map,
                          ref_sequences, target_sequences,
                          debug=False):
    """
    Extract amino acid sequences for a reference transcript and the longest protein 
    sequence from its orthologous target gene.
    
    Args:
        ref_transcript_id (str): Transcript ID from reference genome
        target_gene_id (str): Gene ID from target genome (found through orthology)
        ref_gene_map (pd.DataFrame): Gene mapping DataFrame for reference genome
        target_gene_map (pd.DataFrame): Gene mapping DataFrame for target genome
        ref_sequences (dict): Dictionary of reference protein sequences
        target_sequences (dict): Dictionary of target protein sequences
        debug (bool): Whether to print debug information
        
    Returns:
        tuple: (ref_sequence, target_sequence, target_protein_id)
            - ref_sequence: The amino acid sequence for the reference transcript
            - target_sequence: The longest protein sequence for the target gene
            - target_protein_id: The ID of the longest target protein sequence
    """
    print(ref_transcript_id)
    print(ref_gene_map[
        ref_gene_map['transcript_id'] == ref_transcript_id
    ]['protein_id'])
    # Get reference protein ID from transcript ID
    ref_protein_id = ref_gene_map[
        ref_gene_map['transcript_id'] == ref_transcript_id
    ]['protein_id'].iloc[0]
    
    # Get all target protein IDs for the target gene
    target_protein_ids = target_gene_map[
        target_gene_map['gene_id'] == target_gene_id
    ]['protein_id'].tolist()
    
    if debug:
        print(f"Reference protein ID: {ref_protein_id}")
        print(f"Target protein IDs: {target_protein_ids}")
    
    # Extract reference sequence
    ref_sequence = ref_sequences.get(ref_protein_id)
    if ref_sequence is None:
        raise KeyError(f"Reference protein {ref_protein_id} not found in sequences")
        
    # Find the longest target protein sequence
    longest_target_seq = ""
    longest_target_id = None
    for protein_id in target_protein_ids:
        if protein_id in target_sequences:
            seq = target_sequences[protein_id]
            if len(seq) > len(longest_target_seq):
                longest_target_seq = seq
                longest_target_id = protein_id
    
    if not longest_target_seq:
        raise KeyError(f"No target proteins found in sequences for gene {target_gene_id}")
        
    if debug:
        print(f"Selected longest target protein: {longest_target_id} "
              f"(length: {len(longest_target_seq)})")
        
    return ref_sequence, longest_target_seq, longest_target_id



In [53]:
import warnings

def map_codon_position(ref_codon_num, alignment_str, window_size=10):
    """
    Maps a reference codon number to the corresponding codon number in the target sequence
    using a pairwise protein alignment. If target position is a gap, finds closest non-gap
    position within specified window and raises warning.
    
    Args:
        ref_codon_num (int): Codon number in reference sequence (1-based)
        alignment_str (str): String representation of the pairwise alignment
        window_size (int): Maximum distance to look for non-gap positions
        
    Returns:
        int: Corresponding codon number in target sequence (1-based)
        or None if no suitable position found within window
    """
    # Extract aligned sequences
    lines = alignment_str.split('\n')[::2]  # Skip match lines
    
    # Process the chunked alignment lines
    query_chunks = []
    target_chunks = []
    
    for line in lines:
        if line.startswith('query'):
            query_chunks.append(line.split(None, 2)[2])
        elif line.startswith('target'):
            target_chunks.append(line.split(None, 2)[2])
            
    # Join the chunks
    query_seq = ''.join(query_chunks)   # Reference sequence
    target_seq = ''.join(target_chunks) # Target sequence
    
    if len(query_seq) != len(target_seq):
        raise ValueError("Aligned sequences must be the same length")
    
    # Convert codon number to amino acid position (1-based)
    aa_pos = ref_codon_num
    
    # Count non-gap positions up to our reference position
    ref_gaps = 0
    target_gaps = 0
    aligned_pos = 0
    
    # Find corresponding position in alignment
    for i in range(len(query_seq)):
        if query_seq[i] != '-':
            ref_gaps += 1
        if target_seq[i] != '-':
            target_gaps += 1
            
        if ref_gaps == aa_pos:
            aligned_pos = i
            break
            
    # If we found a valid position
    if ref_gaps == aa_pos:
        # Count non-gap positions in target up to aligned position
        target_pos = target_gaps
        
        # Check if target position is a gap
        if target_seq[aligned_pos] == '-':
            # Look for closest non-gap position within window
            left = max(0, aligned_pos - window_size)
            right = min(len(target_seq), aligned_pos + window_size + 1)
            
            # Count target positions up to left boundary
            left_target_count = sum(1 for x in target_seq[:left] if x != '-')
            
            closest_pos = None
            min_distance = window_size + 1
            best_target_pos = None
            
            # Check positions in both directions
            for i in range(left, right):
                if target_seq[i] != '-':
                    distance = abs(i - aligned_pos)
                    if distance < min_distance:
                        min_distance = distance
                        closest_pos = i
                        # Count non-gaps up to this position for target position
                        best_target_pos = left_target_count + sum(1 for x in target_seq[left:i+1] if x != '-')
            
            if closest_pos is not None:
                warnings.warn(
                    f"Target position was gap, using closest non-gap position {best_target_pos} "
                    f"({min_distance} positions away)",
                    UserWarning
                )
                return best_target_pos
                
            return None
            
        return target_pos
    else:
        raise ValueError(f"Reference codon number {ref_codon_num} is out of range")

In [54]:


# # Create gene mappings
# ref_gene_map = create_gene_mapping('../resources/reference/AgambiaePEST.gff')
# target_gene_map = create_gene_mapping('../resources/reference/CquinquefasciatusJHB2020.gff')

# # Load protein sequences
# ref_sequences = irtho.load_fasta("../resources/reference/AgambiaePEST_AnnotatedProteins.fasta", debug=False)
# target_sequences = irtho.load_fasta("../resources/reference/CquinquefasciatusJHB2020_AnnotatedProteins.fasta", debug=False)

# # Extract sequences
# ref_seq, target_seq, target_id = extract_protein_sequences(
#     ref_transcript_id='AGAP004707-RD',
#     target_gene_id='CQUJHB005371',
#     ref_gene_map=ref_gene_map,
#     target_gene_map=target_gene_map,
#     ref_sequences=ref_sequences,
#     target_sequences=target_sequences,
#     debug=True
# )

# aligner = Align.PairwiseAligner()
# alignment = aligner.align(target_seq, ref_seq)[0]
# alignment_str = str(alignment)

# # Map codon 50 from reference to target
# target_codon = map_codon_position(995, alignment_str)
# print(f"Reference codon 995 maps to target codon {target_codon}")

In [55]:
def get_genomic_position_from_codon(codon_num, gff_path, protein_id, gene_map):
    """
    Convert a codon number to genomic coordinates using GFF file and protein ID.
    
    Args:
        codon_num (int): The codon number (1-based)
        gff_path (str): Path to the GFF file
        protein_id (str): The protein ID to look up
        gene_map (pd.DataFrame): Gene mapping DataFrame containing protein_id to transcript_id mappings
        
    Returns:
        tuple: (chromosome, start, end, strand)
            - chromosome: The chromosome name
            - start: Start position of the codon in genomic coordinates
            - end: End position of the codon in genomic coordinates
            - strand: The strand ('+' or '-')
    """
    # Get transcript ID for this protein
    transcript_id = gene_map[gene_map['protein_id'] == protein_id]['transcript_id'].iloc[0]
    
    # Parse GFF to get CDS entries for this transcript
    cds_regions = []
    
    with open(gff_path) as f:
        for line in f:
            if line.startswith('#'):
                continue
                
            fields = line.strip().split('\t')
            if len(fields) < 9:
                continue
                
            if fields[2] == 'CDS':
                attributes = dict(x.split('=') for x in fields[8].split(';'))
                # Handle both Parent and protein_id attributes
                matches = False
                if attributes.get('Parent') == transcript_id:
                    matches = True
                if attributes.get('protein_id') == protein_id:
                    matches = True
                if matches:
                    cds_regions.append({
                        'chrom': fields[0],
                        'start': int(fields[3]),
                        'end': int(fields[4]),
                        'strand': fields[6],
                        'phase': int(fields[7]) if fields[7] != '.' else 0
                    })
    
    if not cds_regions:
        raise ValueError(f"No CDS regions found for protein {protein_id} (transcript {transcript_id})")
        
    # Sort CDS regions by genomic coordinates
    strand = cds_regions[0]['strand']
    chrom = cds_regions[0]['chrom']
    
    if strand == '+':
        cds_regions.sort(key=lambda x: x['start'])
    else:
        cds_regions.sort(key=lambda x: x['start'], reverse=True)
    
    # Convert codon number to nucleotide position (0-based)
    target_nt = (codon_num - 1) * 3
    
    # Account for phase in first CDS
    first_phase = cds_regions[0]['phase']
    if first_phase > 0:
        target_nt += first_phase
    
    # Find which CDS region contains our position
    current_pos = 0
    for cds in cds_regions:
        cds_length = cds['end'] - cds['start'] + 1
        
        if current_pos + cds_length > target_nt:
            # Found the CDS containing our position
            offset = target_nt - current_pos
            
            if strand == '+':
                codon_start = cds['start'] + offset
                codon_end = codon_start + 2  # +2 because codon is 3 bases
            else:
                codon_end = cds['end'] - offset
                codon_start = codon_end - 2  # -2 because codon is 3 bases
            
            return (chrom, codon_start, codon_end, strand)
            
        current_pos += cds_length
    
    raise ValueError(f"Codon number {codon_num} is outside CDS regions")


In [56]:

# # Get genomic coordinates for the target codon
# chrom, start, end, strand = get_genomic_position_from_codon(
#     target_codon, 
#     "../resources/reference/CquinquefasciatusJHB2020.gff",
#     target_id,
#     target_gene_map
# )

# print(f"Codon {target_codon} is located at {chrom}:{start}-{end} ({strand})")

In [57]:
import pandas as pd
from Bio import Align, SeqIO
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.DEBUG, 
                   format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def load_genome_data(genome_dir, genome_name):
    """Load protein sequences and create gene mapping for a genome."""
    logger.info(f"Loading genome data for {genome_name}")
    
    protein_path = Path(genome_dir) / f"{genome_name}_AnnotatedProteins.fasta"
    logger.debug(f"Reading protein sequences from {protein_path}")
    
    proteins = SeqIO.to_dict(SeqIO.parse(protein_path, "fasta"))
    protein_seqs = {k: str(v.seq) for k, v in proteins.items()}
    logger.debug(f"Loaded {len(protein_seqs)} protein sequences")
    
    gff_path = Path(genome_dir) / f"{genome_name}.gff"
    logger.debug(f"Reading GFF from {gff_path}")
    gene_map = create_gene_mapping(str(gff_path))
    logger.debug(f"Created gene mapping with {len(gene_map)} entries")
    
    return protein_seqs, gene_map

def align_protein_sequences(ref_seq, target_seq, match_score=2, mismatch_score=-1, 
                          open_gap_score=-10, extend_gap_score=-0.5):
    """Perform pairwise alignment of protein sequences."""
    logger.debug("Performing sequence alignment")
    logger.debug(f"Reference sequence length: {len(ref_seq)}")
    logger.debug(f"Target sequence length: {len(target_seq)}")
    
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    aligner.match_score = match_score
    aligner.mismatch_score = mismatch_score
    aligner.open_gap_score = open_gap_score
    aligner.extend_gap_score = extend_gap_score
    
    alignments = aligner.align(ref_seq, target_seq)
    best_alignment = alignments[0]
    logger.debug(f"Alignment score: {best_alignment.score}")
    
    return best_alignment

def locate_orthologous_target(ref_transcript_id, target_ortholog_id, codon_num,
                            ref_proteins, target_proteins, ref_gene_map, target_gene_map, 
                            target_gff_path, alignment_params=None):
    """Locate the orthologous position in the target genome."""
    logger.info(f"Processing ortholog mapping for reference transcript {ref_transcript_id}")
    logger.info(f"Target ortholog: {target_ortholog_id}, Codon: {codon_num}")
    
    result = {
        'target_codon': None,
        'target_chrom': None,
        'target_start': None,
        'target_end': None,
        'target_strand': None
    }
    
    # Check for required inputs
    if pd.isna(codon_num) or pd.isna(target_ortholog_id):
        logger.warning("Missing required input - skipping")
        return result
        
    # Get protein sequences
    logger.debug("Extracting protein sequences")
    ref_seq, target_seq, target_id = extract_protein_sequences(
        ref_transcript_id=ref_transcript_id,
        target_gene_id=target_ortholog_id,
        ref_gene_map=ref_gene_map,
        target_gene_map=target_gene_map,
        ref_sequences=ref_proteins,
        target_sequences=target_proteins
    )
    logger.debug(f"Target protein ID: {target_id}")
    
    # Perform alignment
    alignment_params = alignment_params or {}
    logger.debug(f"Alignment parameters: {alignment_params}")
    alignment = align_protein_sequences(target_seq, ref_seq, **alignment_params)
    
    # Map codon position
    target_codon = map_codon_position(int(codon_num), str(alignment))
    logger.debug(f"Mapped reference codon {codon_num} to target codon {target_codon}")
    
    if target_codon is not None:
        # Get genomic coordinates
        logger.debug("Getting genomic coordinates")
        chrom, start, end, strand = get_genomic_position_from_codon(
            target_codon,
            target_gff_path,
            target_id,
            target_gene_map
        )
        
        result.update({
            'target_codon': target_codon,
            'target_chrom': chrom,
            'target_start': start,
            'target_end': end,
            'target_strand': strand
        })
        logger.info(f"Found target location: {chrom}:{start}-{end} ({strand})")
    else:
        logger.warning("Could not map codon position")
    
    return result

def map_resistance_sites(targets_df, reference_dir, ref_genome, target_genome, 
                        transcript_col='transcript', codon_col='codon'):
    """Map resistance sites from reference to target genome."""
    logger.info(f"Starting resistance site mapping from {ref_genome} to {target_genome}")
    logger.info(f"Processing {len(targets_df)} sites")
    
    # Load genome data
    logger.info("Loading genome data")
    ref_proteins, ref_gene_map = load_genome_data(reference_dir, ref_genome)
    target_proteins, target_gene_map = load_genome_data(reference_dir, target_genome)
    
    # Get target GFF path
    target_gff = Path(reference_dir) / f"{target_genome}.gff"
    logger.debug(f"Target GFF path: {target_gff}")
    
    # Add target columns
    for col in ['target_codon', 'target_chrom', 'target_start', 'target_end', 'target_strand']:
        targets_df[col] = None
    
    # Process each row
    mapped_count = 0
    for idx, row in targets_df.iterrows():
        logger.debug(f"Processing row {idx}")
        result = locate_orthologous_target(
            ref_transcript_id=row[transcript_col],
            target_ortholog_id=row[target_genome],
            codon_num=row[codon_col],
            ref_proteins=ref_proteins,
            target_proteins=target_proteins,
            ref_gene_map=ref_gene_map,
            target_gene_map=target_gene_map,
            target_gff_path=str(target_gff)
        )
        
        # Update DataFrame
        for key, value in result.items():
            targets_df.at[idx, key] = value
        
        if result['target_codon'] is not None:
            mapped_count += 1
    
    logger.info(f"Mapping complete. Successfully mapped {mapped_count}/{len(targets_df)} sites")
    return targets_df

def enable_debug():
    """Enable debug logging"""
    logging.basicConfig(level=logging.DEBUG, 
                       format='%(asctime)s - %(levelname)s - %(message)s')
    
def disable_debug():
    """Disable all logging"""
    logging.getLogger(__name__).setLevel(logging.CRITICAL + 1)

In [58]:
target_genome = "CquinquefasciatusJHB2020"

# split one to many ortholog targets 
split_targets_df = targets_df.copy()
split_targets_df[target_species] = split_targets_df[target_species].str.split(',')
split_targets_df = split_targets_df.explode(target_species).reset_index(drop=True)
split_targets_df[target_genome] = split_targets_df[target_genome].str.replace(" ", "")

In [59]:
# Process sites
updated_df = map_resistance_sites(
    split_targets_df,
    reference_dir="../resources/reference/",
    ref_genome="AgambiaePEST",
    target_genome=target_genome,
)

# Save results
# updated_df.to_csv("resistance_sites_mapped.csv", index=False)

# Print summary
print("\nProcessing complete!")
print(f"Total sites: {len(split_targets_df)}")
print(f"Sites mapped: {updated_df['target_codon'].notna().sum()}")

updated_df

2025-02-04 16:58:46,130 - INFO - Starting resistance site mapping from AgambiaePEST to CquinquefasciatusJHB2020
2025-02-04 16:58:46,132 - INFO - Processing 19 sites
2025-02-04 16:58:46,133 - INFO - Loading genome data
2025-02-04 16:58:46,135 - INFO - Loading genome data for AgambiaePEST
2025-02-04 16:58:46,136 - DEBUG - Reading protein sequences from ../resources/reference/AgambiaePEST_AnnotatedProteins.fasta
2025-02-04 16:58:46,464 - DEBUG - Loaded 15328 protein sequences
2025-02-04 16:58:46,466 - DEBUG - Reading GFF from ../resources/reference/AgambiaePEST.gff
2025-02-04 16:58:46,869 - DEBUG - Created gene mapping with 15328 entries
2025-02-04 16:58:46,890 - INFO - Loading genome data for CquinquefasciatusJHB2020
2025-02-04 16:58:46,891 - DEBUG - Reading protein sequences from ../resources/reference/CquinquefasciatusJHB2020_AnnotatedProteins.fasta
2025-02-04 16:58:47,297 - DEBUG - Loaded 24676 protein sequences
2025-02-04 16:58:47,298 - DEBUG - Reading GFF from ../resources/reference

AGAP004707-RD
4961    AGAP004707-PD
Name: protein_id, dtype: object


2025-02-04 16:58:48,228 - INFO - Found target location: CM027411.1:22646398-22646400 (-)
2025-02-04 16:58:48,230 - DEBUG - Processing row 1
2025-02-04 16:58:48,231 - INFO - Processing ortholog mapping for reference transcript AGAP004707-RD
2025-02-04 16:58:48,232 - INFO - Target ortholog: CQUJHB005371, Codon: 995.0
2025-02-04 16:58:48,233 - DEBUG - Extracting protein sequences
2025-02-04 16:58:48,244 - DEBUG - Target protein ID: CQUJHB005371.P8259
2025-02-04 16:58:48,245 - DEBUG - Alignment parameters: {}
2025-02-04 16:58:48,246 - DEBUG - Performing sequence alignment
2025-02-04 16:58:48,247 - DEBUG - Reference sequence length: 2148
2025-02-04 16:58:48,248 - DEBUG - Target sequence length: 2118
2025-02-04 16:58:48,302 - DEBUG - Alignment score: 3650.0
2025-02-04 16:58:48,305 - DEBUG - Mapped reference codon 995.0 to target codon 1035
2025-02-04 16:58:48,306 - DEBUG - Getting genomic coordinates


AGAP004707-RD
4961    AGAP004707-PD
Name: protein_id, dtype: object


2025-02-04 16:58:48,713 - INFO - Found target location: CM027411.1:22646398-22646400 (-)
2025-02-04 16:58:48,716 - DEBUG - Processing row 2
2025-02-04 16:58:48,717 - INFO - Processing ortholog mapping for reference transcript AGAP004707-RD
2025-02-04 16:58:48,718 - INFO - Target ortholog: CQUJHB005371, Codon: 402.0
2025-02-04 16:58:48,720 - DEBUG - Extracting protein sequences
2025-02-04 16:58:48,732 - DEBUG - Target protein ID: CQUJHB005371.P8259
2025-02-04 16:58:48,733 - DEBUG - Alignment parameters: {}
2025-02-04 16:58:48,734 - DEBUG - Performing sequence alignment
2025-02-04 16:58:48,737 - DEBUG - Reference sequence length: 2148
2025-02-04 16:58:48,738 - DEBUG - Target sequence length: 2118
2025-02-04 16:58:48,787 - DEBUG - Alignment score: 3650.0
2025-02-04 16:58:48,791 - DEBUG - Mapped reference codon 402.0 to target codon 419
2025-02-04 16:58:48,791 - DEBUG - Getting genomic coordinates


AGAP004707-RD
4961    AGAP004707-PD
Name: protein_id, dtype: object


2025-02-04 16:58:49,189 - INFO - Found target location: CM027411.1:22686951-22686953 (-)
2025-02-04 16:58:49,191 - DEBUG - Processing row 3
2025-02-04 16:58:49,192 - INFO - Processing ortholog mapping for reference transcript nan
2025-02-04 16:58:49,193 - INFO - Target ortholog: CQUJHB002907, Codon: nan
2025-02-04 16:58:49,195 - DEBUG - Processing row 4
2025-02-04 16:58:49,197 - INFO - Processing ortholog mapping for reference transcript AGAP006028-RA
2025-02-04 16:58:49,198 - INFO - Target ortholog: CQUJHB004999, Codon: 296.0
2025-02-04 16:58:49,199 - DEBUG - Extracting protein sequences
2025-02-04 16:58:49,211 - DEBUG - Target protein ID: CQUJHB004999.P7685
2025-02-04 16:58:49,212 - DEBUG - Alignment parameters: {}
2025-02-04 16:58:49,213 - DEBUG - Performing sequence alignment
2025-02-04 16:58:49,215 - DEBUG - Reference sequence length: 559
2025-02-04 16:58:49,216 - DEBUG - Target sequence length: 555
2025-02-04 16:58:49,221 - DEBUG - Alignment score: 996.0
2025-02-04 16:58:49,223 -

AGAP006028-RA
6430    AGAP006028-PA
Name: protein_id, dtype: object


2025-02-04 16:58:49,628 - INFO - Found target location: CM027412.1:182141943-182141945 (+)
2025-02-04 16:58:49,630 - DEBUG - Processing row 5
2025-02-04 16:58:49,631 - INFO - Processing ortholog mapping for reference transcript AGAP006028-RA
2025-02-04 16:58:49,631 - INFO - Target ortholog: CQUJHB004999, Codon: 296.0
2025-02-04 16:58:49,632 - DEBUG - Extracting protein sequences
2025-02-04 16:58:49,643 - DEBUG - Target protein ID: CQUJHB004999.P7685
2025-02-04 16:58:49,644 - DEBUG - Alignment parameters: {}
2025-02-04 16:58:49,645 - DEBUG - Performing sequence alignment
2025-02-04 16:58:49,645 - DEBUG - Reference sequence length: 559
2025-02-04 16:58:49,646 - DEBUG - Target sequence length: 555
2025-02-04 16:58:49,650 - DEBUG - Alignment score: 996.0
2025-02-04 16:58:49,651 - DEBUG - Mapped reference codon 296.0 to target codon 296
2025-02-04 16:58:49,652 - DEBUG - Getting genomic coordinates


AGAP006028-RA
6430    AGAP006028-PA
Name: protein_id, dtype: object


2025-02-04 16:58:50,053 - INFO - Found target location: CM027412.1:182141943-182141945 (+)
2025-02-04 16:58:50,055 - DEBUG - Processing row 6
2025-02-04 16:58:50,055 - INFO - Processing ortholog mapping for reference transcript AGAP006227-RA
2025-02-04 16:58:50,056 - INFO - Target ortholog: CQUJHB000812, Codon: 477.0
2025-02-04 16:58:50,057 - DEBUG - Extracting protein sequences
2025-02-04 16:58:50,068 - DEBUG - Target protein ID: CQUJHB000812.P1285
2025-02-04 16:58:50,069 - DEBUG - Alignment parameters: {}
2025-02-04 16:58:50,069 - DEBUG - Performing sequence alignment
2025-02-04 16:58:50,070 - DEBUG - Reference sequence length: 540
2025-02-04 16:58:50,070 - DEBUG - Target sequence length: 542
2025-02-04 16:58:50,075 - DEBUG - Alignment score: 550.0
2025-02-04 16:58:50,076 - DEBUG - Mapped reference codon 477.0 to target codon 476
2025-02-04 16:58:50,077 - DEBUG - Getting genomic coordinates


AGAP006227-RA
6626    AGAP006227-PA
Name: protein_id, dtype: object


2025-02-04 16:58:50,489 - INFO - Found target location: CM027412.1:137361722-137361724 (-)
2025-02-04 16:58:50,490 - DEBUG - Processing row 7
2025-02-04 16:58:50,491 - INFO - Processing ortholog mapping for reference transcript nan
2025-02-04 16:58:50,492 - INFO - Target ortholog: CQUJHB006176, Codon: nan
2025-02-04 16:58:50,494 - DEBUG - Processing row 8
2025-02-04 16:58:50,495 - INFO - Processing ortholog mapping for reference transcript AGAP001356.R546
2025-02-04 16:58:50,496 - INFO - Target ortholog: CQUJHB013404, Codon: 280.0
2025-02-04 16:58:50,496 - DEBUG - Extracting protein sequences
2025-02-04 16:58:50,507 - DEBUG - Target protein ID: CQUJHB013404.P20823
2025-02-04 16:58:50,508 - DEBUG - Alignment parameters: {}
2025-02-04 16:58:50,509 - DEBUG - Performing sequence alignment
2025-02-04 16:58:50,510 - DEBUG - Reference sequence length: 711
2025-02-04 16:58:50,510 - DEBUG - Target sequence length: 737
2025-02-04 16:58:50,520 - DEBUG - Alignment score: 981.0
2025-02-04 16:58:50,

AGAP001356.R546
1437    AGAP001356.P546
Name: protein_id, dtype: object


2025-02-04 16:58:50,923 - INFO - Found target location: CM027412.1:71671322-71671324 (-)
2025-02-04 16:58:50,924 - DEBUG - Processing row 9
2025-02-04 16:58:50,925 - INFO - Processing ortholog mapping for reference transcript nan
2025-02-04 16:58:50,926 - INFO - Target ortholog: nan, Codon: nan
2025-02-04 16:58:50,928 - DEBUG - Processing row 10
2025-02-04 16:58:50,929 - INFO - Processing ortholog mapping for reference transcript nan
2025-02-04 16:58:50,930 - INFO - Target ortholog: nan, Codon: nan
2025-02-04 16:58:50,931 - DEBUG - Processing row 11
2025-02-04 16:58:50,932 - INFO - Processing ortholog mapping for reference transcript AGAP002865-RA
2025-02-04 16:58:50,933 - INFO - Target ortholog: nan, Codon: 88.0
2025-02-04 16:58:50,935 - DEBUG - Processing row 12
2025-02-04 16:58:50,936 - INFO - Processing ortholog mapping for reference transcript AGAP002867-RA
2025-02-04 16:58:50,936 - INFO - Target ortholog: nan, Codon: 236.0
2025-02-04 16:58:50,938 - DEBUG - Processing row 13
2025-

AGAP000818-RA
851    AGAP000818-PA
Name: protein_id, dtype: object


2025-02-04 16:58:51,390 - INFO - Found target location: CM027410.1:75018575-75018577 (-)
2025-02-04 16:58:51,392 - DEBUG - Processing row 18
2025-02-04 16:58:51,393 - INFO - Processing ortholog mapping for reference transcript AGAP000818-RA
2025-02-04 16:58:51,394 - INFO - Target ortholog: CQUJHB016283, Codon: 224.0
2025-02-04 16:58:51,394 - DEBUG - Extracting protein sequences
2025-02-04 16:58:51,406 - DEBUG - Target protein ID: CQUJHB016283.P25194
2025-02-04 16:58:51,406 - DEBUG - Alignment parameters: {}
2025-02-04 16:58:51,407 - DEBUG - Performing sequence alignment
2025-02-04 16:58:51,408 - DEBUG - Reference sequence length: 355
2025-02-04 16:58:51,409 - DEBUG - Target sequence length: 531
2025-02-04 16:58:51,412 - DEBUG - Alignment score: 131.0
2025-02-04 16:58:51,414 - DEBUG - Mapped reference codon 224.0 to target codon None
2025-02-04 16:58:51,416 - INFO - Mapping complete. Successfully mapped 8/19 sites


AGAP000818-RA
851    AGAP000818-PA
Name: protein_id, dtype: object

Processing complete!
Total sites: 19
Sites mapped: 8


Unnamed: 0,genome,gene,chrom,start,end,type,desc,transcript,codon,Orthogroup,CquinquefasciatusJHB2020,ortholog_type,target_codon,target_chrom,target_start,target_end,target_strand
0,AgambiaePEST,AGAP004707,AgamP4_2L,2422651.0,2422652.0,neural,Vgsc-L995F,AGAP004707-RD,995.0,OG0003301,CQUJHB005371,one-to-one,1035.0,CM027411.1,22646398.0,22646400.0,-
1,AgambiaePEST,AGAP004707,AgamP4_2L,2422650.0,2422651.0,neural,Vgsc-L995S,AGAP004707-RD,995.0,OG0003301,CQUJHB005371,one-to-one,1035.0,CM027411.1,22646398.0,22646400.0,-
2,AgambiaePEST,AGAP004707,AgamP4_2L,2391227.0,2391228.0,neural,Vgsc-V402L,AGAP004707-RD,402.0,OG0003301,CQUJHB005371,one-to-one,419.0,CM027411.1,22686951.0,22686953.0,-
3,AgambiaePEST,AGAP006048,AgamP4_2L,,,p450,cyp4j5,,,OG0000166,CQUJHB002907,one-to-one,,,,,
4,AgambiaePEST,AGAP006028,AgamP4_2L,25429234.0,25429235.0,neural,Rdl,AGAP006028-RA,296.0,OG0005465,CQUJHB004999,one-to-one,296.0,CM027412.1,182141943.0,182141945.0,+
5,AgambiaePEST,AGAP006028,AgamP4_2L,25429235.0,25429236.0,neural,Rdl,AGAP006028-RA,296.0,OG0005465,CQUJHB004999,one-to-one,296.0,CM027412.1,182141943.0,182141945.0,+
6,AgambiaePEST,AGAP006227,AgamP4_2L,,,coe,Coeae1f-E477D,AGAP006227-RA,477.0,OG0001708,CQUJHB000812,one-to-one,476.0,CM027412.1,137361722.0,137361724.0,-
7,AgambiaePEST,AGAP006228,AgamP4_2L,,,coe,Coeae2f,,,OG0009784,CQUJHB006176,one-to-one,,,,,
8,AgambiaePEST,AGAP001356,AgamP4_2R,3492073.0,3492074.0,neural,Ace1-G280S,AGAP001356.R546,280.0,OG0003646,CQUJHB013404,one-to-one,256.0,CM027412.1,71671322.0,71671324.0,-
9,AgambiaePEST,AGAP008212,AgamP4_3R,,,p450,cyp6m2,,,,,,,,,,
