In [None]:
"""
Script Name: ChainLAMP_coverage_analysis_v3.4_extension.ipynb
Version: v3.4
Author: Shane Gilligan-Steinberg
Date: 240822

Description:

Additional software: Script Name: ChainLAMP_coverage_analysis_v3.4_extension.ipynb
This can be used to generate sets of input sequences, organized by subtype, for use in the pipeline.
[1] Generate library of sequences to be inputted into the pipeline (from LANL alignment - need to remove gaps and organize by subtype).
Other capability of splitting by year is not available in this version
    [1B] Generate alignments with an additional split by year of the sequences
    [1C] Another option is to gather sequences from GenBank IDs
"""

# Install all necessary libraries
!pip install biopython pandas matplotlib tqdm

In [None]:
"""
[1] Generate library of sequences to be inputted into the pipeline (from LANL alignment - need to remove gaps and organize by subtype).

Author: Shane Gilligan-Steinberg
Date: 240715

Description:
Create subtype files of HIV subtypes from LANL alignments
Subtypes are distinguished using the LANL sequence naming convention (the text before the first '.' in each sequence name).
"""

import os
import csv

def remove_gaps_from_sequences(input_file, min_length=0):
    """
    Remove gaps ('-') from sequences in a FASTA file and group the cleaned sequences by header.

    A record is dropped when its gap-free sequence is empty or shorter than ``min_length``.
    A cleaned sequence that is already stored under the same header is not stored again.
    Lines that appear before the first '>' header are ignored.

    Args:
        input_file (str): Path to the input FASTA file.
        min_length (int): Minimum length (after gap removal) of sequences to be retained.
    Returns:
        Tuple[Dict[str, List[str]], Dict[str, int]]: A pair ``(fasta_records, original_counts)``.
            ``fasta_records`` maps each header (without the leading '>') to the list of unique
            cleaned sequences kept for it. ``original_counts`` maps each header to the number of
            records carrying that header in the input, counted before any filtering.
    """
    fasta_records = {}
    original_counts = {}

    def store_record(name, seq_lines):
        """Count one finished record and keep its cleaned sequence if it passes the filters."""
        if name is None:
            # Nothing has been read yet (no header seen so far).
            return
        # Every record is counted, even if it is filtered out below.
        original_counts[name] = original_counts.get(name, 0) + 1
        cleaned_seq = ''.join(seq_lines).replace('-', '')
        if not cleaned_seq or len(cleaned_seq) < min_length:
            return
        kept = fasta_records.setdefault(name, [])
        if cleaned_seq not in kept:
            kept.append(cleaned_seq)

    current_name = None
    current_seq = []

    with open(input_file, 'r') as f:
        for line in f:
            if line.startswith('>'):
                # A new header closes the previous record.
                store_record(current_name, current_seq)
                current_name = line.strip()[1:]
                current_seq = []
            else:
                current_seq.append(line.strip())

        # The last record has no following header to close it.
        store_record(current_name, current_seq)

    return fasta_records, original_counts

def split_fasta_by_name(input_file, csv_output_file, output_directory, min_length=0, csv_path_prefix=None):
    """
    Split a FASTA file into one gap-free FASTA file per subtype and write a CSV summary.

    The subtype of a sequence is the text before the first '.' in its name. Names whose
    subtype starts with 'A' or 'F' are pooled under that single letter. Subtypes that are
    not in the list of known names are written to ``other.fasta``.

    Each output file is rewritten from scratch on every call, so running the function
    again does not duplicate sequences in the output files.

    Args:
        input_file (str): Path to the input FASTA file.
        csv_output_file (str): Path to the CSV output file. One row is written per output
            file with the columns 'subtypes' (prefix joined with the file name),
            'original_alignments' (records in the input assigned to that file, before
            filtering) and 'cleaned_alignments' (sequences actually written).
        output_directory (str): Directory where the output files will be saved.
        min_length (int): Minimum length of sequences to be retained.
        csv_path_prefix (str | None): Folder name placed in front of each file name in the
            CSV and in the printed summary. Defaults to the last component of
            ``output_directory``.
    """
    possible_names = ['A', 'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', '01_AE', '02_AG']  # List of possible names

    def bucket_for(name):
        """Return the output file stem ('A', 'B', ..., or 'other') for a sequence name."""
        prefix = name.split('.')[0]
        # Slicing (not indexing) keeps an empty prefix from raising IndexError.
        if prefix[:1] in ('A', 'F'):
            prefix = prefix[0]
        return prefix if prefix in possible_names else "other"

    fasta_records, original_counts = remove_gaps_from_sequences(input_file, min_length)

    if csv_path_prefix is None:
        csv_path_prefix = os.path.basename(os.path.normpath(output_directory))

    os.makedirs(output_directory, exist_ok=True)

    # Group the records per output file first so that each file is opened exactly once.
    grouped = {}
    for name, seqs in fasta_records.items():
        grouped.setdefault(bucket_for(name), []).append((name, seqs))

    # Tally the unfiltered record counts with the same classification used for writing,
    # so the 'original' and 'cleaned' columns describe the same set of names.
    original_totals = {}
    for name, count in original_counts.items():
        bucket = bucket_for(name)
        original_totals[bucket] = original_totals.get(bucket, 0) + count

    summary_rows = []
    for bucket, entries in grouped.items():
        file_name = f"{bucket}.fasta"
        cleaned_total = 0
        # 'w' truncates leftovers from an earlier run.
        with open(os.path.join(output_directory, file_name), 'w') as out_file:
            for name, seqs in entries:
                for seq in seqs:
                    out_file.write(f">{name}\n{seq}\n")
                cleaned_total += len(seqs)
        summary_rows.append((os.path.join(csv_path_prefix, file_name), original_totals.get(bucket, 0), cleaned_total))

    # Write the output file names and counts to the specified CSV file
    with open(csv_output_file, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['subtypes', 'original_alignments', 'cleaned_alignments'])  # Write column header
        csv_writer.writerows(summary_rows)

    # Print counts
    print("FASTA sequences have been split into different output files based on name prefix and cleaned of '-' characters.")
    print(f"Output file names and alignment counts have been written to {csv_output_file}.")
    print("Alignment counts per file:")
    for csv_path, original_count, cleaned_count in summary_rows:
        print(f"{csv_path}: original alignments = {original_count}, cleaned alignments = {cleaned_count}")

# Example usage: split the alignment below into per-subtype FASTA files plus a CSV summary.
output_directory = "Alignments/FLT_MCRF_LTR_500-850"
input_file = "Alignments/FLT_MCRF_LTR_500-850/HIV1_FLT_2021_500-850_DNA.fasta"
csv_output_file = "Subtypes_LTR_500-850.csv"
min_length = 300  # Sequences shorter than this after gap removal are not written
split_fasta_by_name(
    input_file=input_file,
    csv_output_file=csv_output_file,
    output_directory=output_directory,
    min_length=min_length,
)


In [None]:
"""
[1B] Generate library of sequences to be inputted into the pipeline (from LANL alignment - need to remove gaps and organize by subtype). Enables a split by year

Author: Shane Gilligan-Steinberg
Date: 240715

Description:
Create subtype files of HIV subtypes from LANL alignments
Subtypes are distinguished using the LANL sequence naming convention (the text before the first '.' in each sequence name).
Also split files by year into 5 year bins
"""

import os
import csv
from collections import defaultdict

def remove_gaps_from_sequences(input_file):
    """
    Read a FASTA file, strip the gap character ('-') from every sequence and group the results by header.

    Records that are empty after gap removal are skipped, and a cleaned sequence is stored
    only once per header. Lines that precede the first '>' header are ignored.

    Args:
        input_file (str): Path to the input FASTA file.
    Returns:
        Dict[str, List[str]]: Header (without the leading '>') mapped to its unique cleaned sequences.
    """
    records = {}

    def commit(header, chunks):
        """Store the finished record, unless there is none or it is empty without gaps."""
        if header is None:
            return
        ungapped = ''.join(chunks).replace('-', '')
        if not ungapped:
            return
        stored = records.setdefault(header, [])
        if ungapped not in stored:
            stored.append(ungapped)

    header = None
    chunks = []

    with open(input_file, 'r') as handle:
        for raw_line in handle:
            if not raw_line.startswith('>'):
                chunks.append(raw_line.strip())
                continue
            # A header line ends the record collected so far and starts the next one.
            commit(header, chunks)
            header = raw_line.strip()[1:]
            chunks = []

        # Flush the record that was still open at end of file.
        commit(header, chunks)

    return records

def get_year_bin(year):
    """
    Return the label of the year bin that contains ``year``.

    The bins are 90-99, 0-4, 5-9, 10-14 and 15-25 (bounds inclusive). Any other value
    is labelled "other".

    Args:
        year (int): Year to be binned.
    Returns:
        str: The bin name.
    """
    # (lower bound, upper bound, label); the ranges do not overlap.
    bins = (
        (90, 99, "90+99"),
        (0, 4, "00+04"),
        (5, 9, "05+09"),
        (10, 14, "10+14"),
        (15, 25, "15+25"),
    )
    for lower, upper, label in bins:
        if lower <= year <= upper:
            return label
    return "other"

def split_fasta_by_year_and_subtype(input_file, csv_output_file, output_directory):
    """
    Split a FASTA file into one gap-free FASTA file per (year bin, subtype) and write a CSV summary.

    Sequence names are split on '.'. The first field is the subtype and the third field is
    the year, which is binned with ``get_year_bin``. Names with fewer than three fields, or
    with a non-numeric year field, go to the "other" year bin. Subtypes outside the list of
    known names are pooled under 'A' when they start with 'A' and under "other" otherwise.

    Each output file is rewritten from scratch on every call, so running the function
    again does not duplicate sequences in the output files.

    Args:
        input_file (str): Path to the input FASTA file.
        csv_output_file (str): Path to the CSV output file (columns: year_bin, subtype, total_sequences).
        output_directory (str): Directory where the output files will be saved.
    """
    possible_names = ['A', 'B', 'C', 'D', 'G', '01_AE', '02_AG']  # List of possible names

    fasta_records = remove_gaps_from_sequences(input_file)

    os.makedirs(output_directory, exist_ok=True)

    # Group the records per (year_bin, subtype) first so that each file is opened exactly once.
    grouped = defaultdict(list)
    for name, seqs in fasta_records.items():
        fields = name.split('.')

        # The year is expected in the third field; a shorter name has no year.
        year_part = fields[2] if len(fields) > 2 else ""
        # isdecimal() only accepts characters that int() can parse.
        year_bin = get_year_bin(int(year_part)) if year_part.isdecimal() else "other"

        subtype = fields[0]
        if subtype not in possible_names:
            # Only unlisted subtypes starting with 'A' are pooled; startswith is safe on an empty field.
            subtype = 'A' if subtype.startswith('A') else "other"

        grouped[(year_bin, subtype)].append((name, seqs))

    summary_rows = []
    for (year_bin, subtype), entries in grouped.items():
        output_file = os.path.join(output_directory, f"{year_bin}_{subtype}.fasta")
        total = 0
        # 'w' truncates leftovers from an earlier run.
        with open(output_file, 'w') as out_file:
            for name, seqs in entries:
                for seq in seqs:
                    out_file.write(f">{name}\n{seq}\n")
                total += len(seqs)
        summary_rows.append((output_file, year_bin, subtype, total))

    # Write the output file names and counts to the specified CSV file
    with open(csv_output_file, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['year_bin', 'subtype', 'total_sequences'])  # Write column header
        for _, year_bin, subtype, total in summary_rows:
            csv_writer.writerow([year_bin, subtype, total])

    # Print counts
    print("FASTA sequences have been split into different output files based on year and subtype, and cleaned of '-' characters.")
    print(f"Output file names and sequence counts have been written to {csv_output_file}.")
    print("Sequence counts per file:")
    for output_file, _, _, total in summary_rows:
        print(f"{output_file}: total sequences = {total}")

# Example usage: split the alignment below by year bin and subtype; the CSV summary
# is written into the same folder as the per-bin FASTA files.
output_directory = "Alignments/FLT_MCRF_Pol/Yearly"
csv_output_file = "Alignments/FLT_MCRF_Pol/Yearly/Yearly_Subtypes_Pol.csv"
input_file = "Alignments/FLT_MCRF_Pol/HIV1_FLT_2021_pol_DNA.fasta"
split_fasta_by_year_and_subtype(
    input_file=input_file,
    csv_output_file=csv_output_file,
    output_directory=output_directory,
)


In [None]:
"""
[1C] Another option is to gather sequences from GenBank IDs

Author: Shane Gilligan-Steinberg
Date: 240216

Description:
This script reads a list of GenBank IDs from a CSV file, fetches the corresponding nucleotide sequences 
from the GenBank database, and writes the sequences to a FASTA file. The GenBank IDs should be listed 
in a column named "GenBank ID" in the CSV file.
This could be used to use a specific list of sequences for analysis. You will have to make:
3. .csv list of subtypes (can use script 230704_Split_FASTA_v2_CRF.py to perform separation from LANL alignments) [~/Targets]
- This will be needed to run this in the main script

Additional Information:
- Make sure to change the `csv_file` name in the script to match the path to your input CSV file.
- The email address specified in the `Entrez.email` field should be updated to your email address for proper usage of the Entrez API.
- Ensure that the CSV file contains a column named "GenBank ID" with the GenBank IDs you wish to fetch.
- The output FASTA file will be saved as "output.fasta" in the same directory as the script.
"""

import pandas as pd
from Bio import Entrez, SeqIO

def fetch_sequences(genbank_ids, email=None):
    """
    Fetch sequences from GenBank given a list of GenBank IDs.

    Each ID is requested separately from the "nucleotide" database in GenBank text format.
    An ID that cannot be fetched or parsed is reported on stdout and skipped, so the
    returned list can be shorter than ``genbank_ids``.

    Args:
        genbank_ids (list): List of GenBank IDs.
        email (str | None): Contact address to store in ``Entrez.email``. When omitted, the
            current value of ``Entrez.email`` is left untouched, so an address configured
            earlier in the session is not overwritten.
    Returns:
        list: List of SeqRecord objects, in the order of the IDs that were fetched successfully.
    """
    if email is not None:
        Entrez.email = email  # Provide your email for Entrez

    sequences = []

    for genbank_id in genbank_ids:
        try:
            handle = Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text")
            try:
                record = SeqIO.read(handle, "genbank")
            finally:
                # Close the handle even when parsing fails.
                handle.close()
            sequences.append(record)
        except Exception as e:
            # Deliberately best-effort: one bad ID must not abort the whole batch.
            print(f"Error fetching sequence for {genbank_id}: {e}")

    return sequences

def write_fasta(sequences, output_filename):
    """
    Write sequences to a FASTA file, one two-line entry ('>id' then the sequence) per record.

    The output file is overwritten if it already exists.

    Args:
        sequences (list): List of SeqRecord objects; the ``id`` and ``seq`` attributes of each are used.
        output_filename (str): Output FASTA file name.
    """
    with open(output_filename, "w") as out_handle:
        out_handle.writelines(
            f">{record.id}\n{record.seq}\n" for record in sequences
        )

# Read CSV file with GenBank IDs
csv_file = "Test_GenBank.csv"
df = pd.read_csv(csv_file)
print(df)

# Extract GenBank IDs from the CSV DataFrame.
# Blank cells are read by pandas as NaN; drop them so they are not sent to GenBank as IDs.
genbank_ids = df["GenBank ID"].dropna().tolist()
print(genbank_ids)

# Fetch sequences from GenBank
print("Fetching from GenBank........")
sequences = fetch_sequences(genbank_ids)

# Write sequences to FASTA file
output_fasta_file = "output.fasta"
write_fasta(sequences, output_fasta_file)

# fetch_sequences skips IDs that fail, so only claim full success when nothing was skipped.
if len(sequences) == len(genbank_ids):
    print("FASTA file generated successfully.")
else:
    print(f"FASTA file generated with {len(sequences)} of {len(genbank_ids)} requested sequences; see the errors above.")