In [1]:
!pip install biopython 

[0mCollecting biopython
  Downloading biopython-1.81-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Installing collected packages: biopython
Successfully installed biopython-1.81
[0m

In [4]:
from Bio import SeqIO

def filter_fasta(input_file, output_file):
    """
    Filters out protein sequences containing 'fragment' in their description from a .fasta file.
    
    Args:
    input_file (str): Path to the input .fasta file.
    output_file (str): Path to save the filtered .fasta file.
    """
    # Load sequences from the input file
    sequences = SeqIO.parse(input_file, 'fasta')
    
    # Filter sequences to exclude those with 'fragment' in the description
    filtered_sequences = [seq for seq in sequences if 'fragment' not in seq.description.lower()]
    
    # Save the filtered sequences to the output file
    SeqIO.write(filtered_sequences, output_file, 'fasta')

    print(f"Filtered file saved as {output_file}")

# Example usage
input_path = 'data/Tdts_from_PIR.fasta'
output_path = 'data/filtered_output.fasta'
filter_fasta(input_path, output_path)

Filtered file saved as data/filtered_output.fasta


In [6]:
from Bio import SeqIO

def count_protein_sequences(fasta_file):
    """
    Counts the number of protein sequences in a .fasta file.
    
    Args:
    fasta_file (str): Path to the .fasta file.
    
    Returns:
    int: Number of protein sequences in the file.
    """
    # Load sequences from the fasta file
    sequences = SeqIO.parse(fasta_file, 'fasta')
    
    # Count the sequences
    count = sum(1 for _ in sequences)
    
    return count

# Example usage
fasta_path = 'data/filtered_output.fasta'
sequence_count = count_protein_sequences(fasta_path)
print(f"Total protein sequences: {sequence_count}")

Total protein sequences: 568


In [5]:
from Bio import SeqIO

def count_protein_sequences(fasta_file):
    """
    Counts the number of protein sequences in a .fasta file.
    
    Args:
    fasta_file (str): Path to the .fasta file.
    
    Returns:
    int: Number of protein sequences in the file.
    """
    # Load sequences from the fasta file
    sequences = SeqIO.parse(fasta_file, 'fasta')
    
    # Count the sequences
    count = sum(1 for _ in sequences)
    
    return count

# Example usage
fasta_path = 'data/Tdts_from_PIR.fasta'
sequence_count = count_protein_sequences(fasta_path)
print(f"Total protein sequences: {sequence_count}")


Total protein sequences: 784


In [11]:
from Bio import SeqIO

def remove_duplicate_sequences(input_file, output_file):
    """
    Removes duplicate protein sequences from a .fasta file and saves the unique sequences.
    
    Args:
    input_file (str): Path to the input .fasta file.
    output_file (str): Path to save the deduplicated .fasta file.
    """
    # Load sequences from the input file
    sequences = SeqIO.parse(input_file, 'fasta')
    
    # Track unique sequences using a set
    unique_sequences = {}
    for seq in sequences:
        if seq.seq not in unique_sequences:
            unique_sequences[seq.seq] = seq
    
    # Save the unique sequences to the output file
    SeqIO.write(unique_sequences.values(), output_file, 'fasta')
    
    print(f"Deduplicated file saved as {output_file}")

# Example usage
input_path = 'data/Tdts_fromPIR_filtered_from_fragments.fasta'
output_path = 'data/unique_Tdts_fromPIR_filtered_from_fragments.fasta'
remove_duplicate_sequences(input_path, output_path)


Deduplicated file saved as data/unique_Tdts_fromPIR_filtered_from_fragments.fasta


In [10]:
from Bio import SeqIO

def count_protein_sequences(fasta_file):
    """
    Counts the number of protein sequences in a .fasta file.
    
    Args:
    fasta_file (str): Path to the .fasta file.
    
    Returns:
    int: Number of protein sequences in the file.
    """
    # Load sequences from the fasta file
    sequences = SeqIO.parse(fasta_file, 'fasta')
    
    # Count the sequences
    count = sum(1 for _ in sequences)
    
    return count

# Example usage
fasta_path = 'data/unique_Tdts_fromPIR_filtered_from_fragments.fasta'
sequence_count = count_protein_sequences(fasta_path)
print(f"Total protein sequences: {sequence_count}")

Total protein sequences: 519


In [3]:
from Bio import SeqIO

def filter_exact_name_sequences(input_file, output_file, target_name):
    """
    Filters sequences in a .fasta file to include only those with a description that matches the target name exactly.
    
    Args:
    input_file (str): Path to the input .fasta file.
    output_file (str): Path to save the filtered .fasta file.
    target_name (str): The exact name to match in sequence descriptions.
    """
    # Load sequences from the input file
    sequences = SeqIO.parse(input_file, 'fasta')
    
    # Filter sequences by exact name match
    filtered_sequences = [seq for seq in sequences if seq.description.strip() == target_name]
    
    # Save the filtered sequences to the output file
    SeqIO.write(filtered_sequences, output_file, 'fasta')
    
    print(f"Filtered file saved as {output_file}")

# Example usage
input_path = 'data/unique_Tdts_fromPIR_filtered_from_fragments.fasta'
output_path = 'data/onlyDNTT_unique_Tdts_fromPIR_filtered_from_fragments.fasta'
target_sequence_name = "DNA nucleotidylexotransferase"
filter_exact_name_sequences(input_path, output_path, target_sequence_name)


Filtered file saved as data/onlyDNTT_unique_Tdts_fromPIR_filtered_from_fragments.fasta


In [3]:
from Bio import SeqIO

def filter_sequences(input_file, output_file):
    """
    Filters sequences based on their names from a .fasta file. Retains sequences containing 'DNA nucleotidylexotransferase'
    and excludes sequences with the words 'putative', 'like', 'isoform'.
    
    Args:
    input_file (str): Path to the input .fasta file.
    output_file (str): Path to save the filtered .fasta file.
    """
    # Load sequences from the input file
    sequences = SeqIO.parse(input_file, 'fasta')
    
    # Define the unwanted keywords
    unwanted_keywords = ['Putative', 'like', 'isoform', 'PROTEIN']
    
    # Filter sequences based on the criteria
    filtered_sequences = [
        seq for seq in sequences if 'DNA nucleotidylexotransferase' in seq.description and
        not any(keyword in seq.description for keyword in unwanted_keywords)
    ]
    
    # Save the filtered sequences to the output file
    SeqIO.write(filtered_sequences, output_file, 'fasta')
    
    print(f"Filtered file saved as {output_file}")

# Example usage
input_path = 'data/unique_Tdts_fromPIR_filtered_from_fragments.fasta'
output_path = 'data/onlyDNTT_unique_Tdts_fromPIR_filtered_from_fragments.fasta'
filter_sequences(input_path, output_path)


Filtered file saved as data/onlyDNTT_unique_Tdts_fromPIR_filtered_from_fragments.fasta


In [4]:
from Bio import SeqIO

def count_protein_sequences(fasta_file):
    """
    Counts the number of protein sequences in a .fasta file.
    
    Args:
    fasta_file (str): Path to the .fasta file.
    
    Returns:
    int: Number of protein sequences in the file.
    """
    # Load sequences from the fasta file
    sequences = SeqIO.parse(fasta_file, 'fasta')
    
    # Count the sequences
    count = sum(1 for _ in sequences)
    
    return count

# Example usage
fasta_path = 'data/onlyDNTT_unique_Tdts_fromPIR_filtered_from_fragments.fasta'
sequence_count = count_protein_sequences(fasta_path)
print(f"Total protein sequences: {sequence_count}")

Total protein sequences: 488


In [5]:
from Bio import SeqIO

def extract_protein_names(input_file, output_file):
    """
    Extracts protein names from a .fasta file and saves them to a text file.
    
    Args:
    input_file (str): Path to the input .fasta file.
    output_file (str): Path to save the protein names.
    """
    # Load sequences from the input file
    sequences = SeqIO.parse(input_file, 'fasta')
    
    # Open a file to write the protein names
    with open(output_file, 'w') as file:
        for seq in sequences:
            # Extract the protein name from the description
            protein_name = seq.description
            file.write(protein_name + '\n')
    
    print(f"Protein names saved to {output_file}")

# Example usage
input_path = 'data/onlyDNTT_unique_Tdts_fromPIR_filtered_from_fragments.fasta'
output_path = 'protein_names.txt'
extract_protein_names(input_path, output_path)


Protein names saved to protein_names.txt


In [6]:
from Bio import SeqIO
import re

def parse_host(description):
    """
    Extracts the host name from the sequence description.
    Expected to be enclosed in square brackets [ ].
    """
    match = re.search(r'\[(.*?)\]', description)
    return match.group(1) if match else None

def filter_unique_sequences_by_host(input_file, output_file):
    """
    Filters sequences to retain only unique sequences per host from a .fasta file.
    
    Args:
    input_file (str): Path to the input .fasta file.
    output_file (str): Path to save the filtered .fasta file.
    """
    sequences = SeqIO.parse(input_file, 'fasta')
    
    # Group sequences by host
    host_dict = {}
    for seq in sequences:
        host = parse_host(seq.description)
        if host:
            if host not in host_dict:
                host_dict[host] = []
            host_dict[host].append(seq)
    
    # Filter unique sequences within each host
    unique_sequences = []
    for host, seqs in host_dict.items():
        seen_seqs = set()
        for seq in seqs:
            if seq.seq not in seen_seqs:
                seen_seqs.add(seq.seq)
                unique_sequences.append(seq)
    
    # Save the unique sequences to the output file
    SeqIO.write(unique_sequences, output_file, 'fasta')
    
    print(f"Unique sequences per host saved to {output_file}")

# Example usage
input_path = 'data/onlyDNTT_unique_Tdts_fromPIR_filtered_from_fragments.fasta'
output_path = 'data/unique_per_host_onlyDNTT_unique_Tdts_fromPIR_filtered_from_fragments.fasta'
filter_unique_sequences_by_host(input_path, output_path)


Unique sequences per host saved to data/unique_per_host_onlyDNTT_unique_Tdts_fromPIR_filtered_from_fragments.fasta


In [7]:
!pip install argparse numpy matplotlib

[0mCollecting argparse
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: argparse
Successfully installed argparse-1.4.0
[0m

In [8]:
#!/usr/bin/env python3
import argparse
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import sys
import modules

In [9]:
if __name__ == '__main__':

    #Set path to current directory
    path = os.getcwd() + 'data/unique_per_host_onlyDNTT_unique_Tdts_fromPIR_filtered_from_fragments.fasta'
    
    #make directory for outputs if it does not exist
    if not os.path.exists('Outputs/'):
        os.makedirs('Outputs/')

    parser = argparse.ArgumentParser()

    parser.add_argument('-i', required=True, metavar = 'Input FASTA file', help='Filename for FASTA alignment.')
    parser.add_argument('-o', metavar = 'Output FASTA filename. If not given will use name of input FASTA file as template to name output files.', default='out.txt', help='Output FASTA filename')
    parser.add_argument('-t', metavar = 'Length filtering threshold', type=float, default=0.3, help='Sequence length filtering threshold value (default: 0.3 for removing sequences that deviate +/- 30% from median sequence length of set, change with -t flag). Must be a value between 0 and 1.')
    parser.add_argument('-f', action='store_true', help='Include flag to prevent saving image of sequence lengths histogram')
    parser.add_argument('-a', action='store_true', help='Include flag to keep gaps in sequences for output FASTA alignment (not recommended for further curation).')

    args = parser.parse_args()

    # show help if no arguments passed
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(1)

    #FIGURE SETTINGS
    mpl.rcParams['axes.titlesize'] = 18
    mpl.rcParams['axes.labelsize'] = 18
    mpl.rcParams['xtick.labelsize'] = 14
    mpl.rcParams['ytick.labelsize'] = 14
    mpl.rcParams['axes.facecolor'] = 'FFFFFF'
    mpl.rcParams['axes.edgecolor'] = '000000'
    mpl.rcParams['axes.linewidth'] = 1.0
    mpl.rcParams['axes.labelweight'] = 'regular'
    mpl.rcParams['xtick.major.pad'] = 3
    mpl.rcParams['ytick.major.pad'] = 3
    plt.rcParams['font.family'] = 'sans-serif'


    print()
    print(f'Reading file: {args.i}')
    print()

    #read FASTA and populate lists for sequences and IDs
    seqs = list()
    names = list()
    
    #flag to strip gaps when cleaning sequence
    strip_gaps = True
    
    try:
        with open(args.i, 'r') as n:
            seqs, ids = modules.read_fasta(n, strip_gaps)
    except FileNotFoundError:
        print(f'Could not find file: {args.i}')
        sys.exit(1)
        
    #If no sequences were added to list, file was not in FASTA format
    if len(seqs) < 1:
        print('Provided file is not in FASTA format.')
        sys.exit(1)

    num_seqs = len(seqs)
    print(f'Number of sequences in initial sequence set: {num_seqs}')
    print()

    #Calculating lengths of all sequences in alignment
    #Note: gaps are removed before calculating sequence lengths
    lengths = modules.calc_lengths(seqs)
    
    threshold = args.t
    if threshold < 0 or threshold > 1:
        print('Sequence filtering threshold must be between 0 and 1')
        sys.exit(1)

    #Calculating statistics for length filtering
    med_length = int(np.median(lengths))
    lower_threshold = int(round(med_length - med_length * threshold))
    upper_threshold = int(round(med_length + med_length * threshold))

    print(f'Median sequence length: {med_length} residues')
    print(f'Sequence length lower boundary: {lower_threshold} residues')
    print(f'Sequence length upper boundary: {upper_threshold} residues')
    print()

    #Create FASTA of sequences that fall within threshold values
    #Gaps are remove gaps from sequences unless user indicates otherwise with -a flag
    out_fasta = list()
    if args.a:
        for i, (j, k, l) in enumerate(zip(lengths, ids, seqs)):
            if j >= lower_threshold and j <= upper_threshold:
                if i != num_seqs - 1:
                    out_fasta.append(f'{k}\n{l}\n')
                else:
                    out_fasta.append(f'{k}\n{l}')
    else:
        for i, (j, k, l) in enumerate(zip(lengths, ids, seqs)):
            if j >= lower_threshold and j <= upper_threshold:
                if i != num_seqs - 1:
                    out_fasta.append(f"{k}\n{l.replace('-','')}\n")
                else:
                    out_fasta.append(f"{k}\n{l.replace('-','')}")
                    

    num_seqs_out = len(out_fasta)
    print(f'Number of sequences in final sequence set: {num_seqs_out}')
    print()

    #Setting path for file outputs
    #If the user does not supply a name for the output, use input file name
    #If user used a file from the Inputs directory, must remove 'Inputs/' from path
    out_file_name = args.o
    if out_file_name == 'out.txt':
        out_file_prefix = os.path.splitext(args.i)[0]
        if 'Inputs' in out_file_prefix:
            out_file_prefix = out_file_prefix.split('/')[1]
    else:
        out_file_prefix = out_file_name.split('.')[0]

    #Save figure if -f flag is not given
    if not args.f:
        fig, ax = plt.subplots()
        ax.hist(lengths, bins='doane', color='b', edgecolor='k', alpha=0.65)
        ax.set_xlabel('Sequence length')
        ax.set_ylabel('Count')
        ax.axvline(med_length - threshold*med_length, color='k', linestyle='dashed')
        ax.axvline(med_length + threshold*med_length, color='k', linestyle='dashed')
        fig.savefig(f'{path}Outputs/{out_file_prefix}_sequence_length_hist.png', bbox_inches = 'tight', dpi = 300)
        plt.close()
    
    #Save length filtered alignment
    with open(f'{path}Outputs/{out_file_prefix}_lengthFiltered.txt', 'w') as f:
        f.writelines(out_fasta)
        
    #Write summary of analysis to file
    with open(f'{path}Outputs/{out_file_prefix}_lengthFiltered_output.txt', 'w', newline='') as f:
        f.write(f'Filtered sequence set: {args.i}\n\n')
        f.write('Parameters used:\n')
        f.write(f'Sequence length filtering threshold: {threshold}\n\n')
        f.write(f'Number of sequences in initial alignment: {num_seqs}\n\n')
        f.write(f'Median sequence length: {med_length} residues\n')
        f.write(f'Sequence length lower boundary: {lower_threshold} residues\n')
        f.write(f'Sequence length upper boundary: {upper_threshold} residues\n\n')
        f.write(f'Number of sequences in final alignment: {num_seqs_out}\n\n')
        f.write(f'Wrote length filtered sequence set to: {out_file_prefix}_lengthFiltered.txt')
        
    print(f'Wrote filtered alignment to: Outputs/{out_file_prefix}_lengthFiltered.txt')

usage: ipykernel_launcher.py [-h] -i Input FASTA file
                             [-o Output FASTA filename. If not given will use name of input FASTA file as template to name output files.]
                             [-t Length filtering threshold] [-f] [-a]
ipykernel_launcher.py: error: the following arguments are required: -i


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
