In [None]:
def parse_fasta(file_path):
    """Read a FASTA file into a dict mapping header text to sequence.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        dict: One entry per distinct header. The key is the stripped header
        line without its leading '>'; the value is every following line,
        stripped, upper-cased and concatenated. If a header occurs more than
        once, the last occurrence's sequence is kept.
    """
    fragments_by_id = {}
    active_fragments = None  # stays None until the first '>' header is seen
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if raw_line.startswith('>'):
                # Open a new record; a repeated header restarts with an empty list.
                active_fragments = fragments_by_id[text[1:]] = []
                continue
            # Lines that precede any header belong to no record and are dropped.
            if active_fragments is not None:
                active_fragments.append(text.upper())
    return {seq_id: ''.join(parts) for seq_id, parts in fragments_by_id.items()}


def validate_sequences(sequences, valid_nucleotides=frozenset('ATCG')):
    """Replace every character outside the allowed alphabet with 'N'.

    Args:
        sequences: Mapping of sequence ID to sequence string.
        valid_nucleotides: Iterable of allowed single characters (a set, a
            string such as "ACGT", a list, ...). Defaults to A, T, C and G.
            Matching is case-sensitive.

    Returns:
        dict: A new mapping with the same keys in the same order. Sequences
        made up only of allowed characters are returned unchanged; in the
        others each disallowed character is replaced by 'N'. The input
        mapping is not modified.

    Side effects:
        Prints one line for each sequence that contains disallowed characters.
    """
    # Normalise once so that any iterable works and membership tests stay O(1).
    allowed = frozenset(valid_nucleotides)
    cleaned_sequences = {}
    for seq_id, seq in sequences.items():
        ambiguous_nucleotides = set(seq) - allowed
        if ambiguous_nucleotides:
            print(f"Sequence {seq_id} contains ambiguous nucleotides: {ambiguous_nucleotides}")
            seq = ''.join(base if base in allowed else 'N' for base in seq)
        cleaned_sequences[seq_id] = seq
    return cleaned_sequences


def save_fasta(sequences, output_path, line_width=80):
    """Write sequences to a FASTA file, wrapping sequence lines at a fixed width.

    Args:
        sequences: Mapping of sequence ID to sequence string.
        output_path: Destination file; it is overwritten if it already exists.
        line_width: Maximum number of characters per sequence line
            (default 80). Must be at least 1.

    Raises:
        ValueError: If line_width is smaller than 1.

    An empty sequence produces a header line with no sequence lines after it.
    """
    # Validate before opening so a bad width cannot truncate an existing file.
    if line_width < 1:
        raise ValueError("line_width must be a positive integer")
    with open(output_path, 'w') as file:
        for seq_id, seq in sequences.items():
            file.write(f">{seq_id}\n")
            for start in range(0, len(seq), line_width):
                file.write(seq[start:start + line_width] + '\n')


# Cleaning pipeline: parse the raw FASTA, replace non-ATCG characters with 'N',
# and write the result to a new file. Both paths are relative, so they resolve
# against the kernel's current working directory.
input_fasta_path = "Gasterosteusaculeatus.fasta"  
output_fasta_path = "cleaned_Gasterosteusaculeatus.fasta"


# sequences maps header -> upper-cased sequence; cleaned_sequences is a new
# dict and leaves sequences untouched.
sequences = parse_fasta(input_fasta_path)
cleaned_sequences = validate_sequences(sequences)


# Overwrites output_fasta_path if it already exists.
save_fasta(cleaned_sequences, output_fasta_path)

print(f"Cleaned sequences saved to {output_fasta_path}")


Cleaned sequences saved to cleaned_Gasterosteusaculeatus.fasta


In [None]:
import os
import subprocess

# Sanity check that the MUSCLE executable is present and can be launched.
muscle_path = "C:/muscle/muscle.exe"
print(f"Checking MUSCLE Path: {os.path.exists(muscle_path)}")  

try:
    # Pass an argument list and no shell: a missing executable then raises
    # FileNotFoundError here. With shell=True the shell itself starts fine and
    # only returns a non-zero exit code, so that handler could never fire.
    subprocess.run([muscle_path, "-version"], check=True)
    print("MUSCLE ran successfully!")
except FileNotFoundError:
    print("MUSCLE executable not found.")
except subprocess.CalledProcessError as e:
    # Raised by check=True when MUSCLE exits with a non-zero status.
    print(f"Error running MUSCLE: {e}")



Checking MUSCLE Path: True
MUSCLE ran successfully!


In [4]:
import subprocess

# Align the cleaned sequences with MUSCLE and write the alignment as FASTA.
muscle_path = "C:/muscle/muscle.exe"
input_fasta_path = "C:/Users/robyr/OneDrive/Desktop/Biology/final proj/cleaned_Gasterosteusaculeatus.fasta"
aligned_fasta_path = "C:/Users/robyr/OneDrive/Desktop/Biology/final proj/aligned_cleaned_Gasterosteusaculeatus.fasta"

try:
    # Argument list without a shell, so the spaces in the paths need no quoting.
    subprocess.run([
        muscle_path,
        "-align", input_fasta_path,
        "-output", aligned_fasta_path
    ], check=True)
    print(f"Aligned sequences saved to {aligned_fasta_path}")
except FileNotFoundError:
    # The executable itself could not be launched (same handling as the
    # MUSCLE availability check).
    print(f"MUSCLE executable not found at {muscle_path}")
except subprocess.CalledProcessError as e:
    # Raised by check=True when MUSCLE exits with a non-zero status.
    print(f"An error occurred during MUSCLE execution: {e}")


Aligned sequences saved to C:/Users/robyr/OneDrive/Desktop/Biology/final proj/aligned_cleaned_Gasterosteusaculeatus.fasta


In [None]:
# import os
# print(f"Input file exists: {os.path.exists(input_fasta_path)}")


Input file exists: True


In [None]:
# def clean_fasta(input_path, output_path):
#     with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
#         for line in infile:
#             if line.startswith('>'):
#                 header = line.split()[0]
#                 outfile.write(f"{header}\n")
#             else:
#                 outfile.write(line.strip().upper() + '\n')


# input_fasta = "C:/Users/robyr/OneDrive/Desktop/Biology/final proj/cleaned_sequences.fasta"
# cleaned_fasta = "C:/Users/robyr/OneDrive/Desktop/Biology/final proj/cleaned_sequences_cleaned.fasta"

# clean_fasta(input_fasta, cleaned_fasta)
# print(f"Cleaned FASTA file saved to {cleaned_fasta}")


Cleaned FASTA file saved to C:/Users/robyr/OneDrive/Desktop/Biology/final proj/cleaned_sequences_cleaned.fasta


In [None]:
# from Bio import SeqIO

# try:
#     fasta_sequences = list(SeqIO.parse(input_fasta_path, "fasta"))
#     print(f"FASTA file contains {len(fasta_sequences)} valid sequences.")
# except Exception as e:
#     print(f"Error in FASTA file: {e}")


FASTA file contains 10 valid sequences.


In [5]:
from Bio import SeqIO

# Verify the alignment: every record in an aligned FASTA should have the same length.
aligned_fasta_path = "C:/Users/robyr/OneDrive/Desktop/Biology/final proj/aligned_cleaned_Gasterosteusaculeatus.fasta"

# len(record.seq) counts every character stored for the record.
sequence_lengths = [len(record.seq) for record in SeqIO.parse(aligned_fasta_path, "fasta")]

if not sequence_lengths:
    # Without this branch an empty file would be reported as "different lengths: set()".
    print(f"No sequences found in {aligned_fasta_path}")
elif len(set(sequence_lengths)) == 1:
    print(f"All sequences are the same length: {sequence_lengths[0]} bases.")
else:
    print(f"Sequences have different lengths: {set(sequence_lengths)}")


All sequences are the same length: 3714 bases.
