In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
import re

In [3]:
# Export the protein product of every CDS in the GenBank file as FASTA.
#
# The annotated /translation qualifier is preferred over re-translating the DNA:
# it already accounts for the genome's genetic code, alternative start codons
# (e.g. GTG/TTG read as Met) and recoded stops, and it is the same sequence that
# mutate_aa_sequence() reads later, so both FASTA files share one reference.
# CDS features without a /translation fall back to translating the extracted
# DNA with the feature's own transl_table and codon_start.
input_file = "NC_000913_3.gb"
output_file = "42C_AA.fasta"

with open(output_file, "w") as fasta_output:
    for record in SeqIO.parse(input_file, "genbank"):
        for feature in record.features:
            # Only coding sequences have a protein product
            if feature.type != "CDS":
                continue

            qualifiers = feature.qualifiers
            if "translation" in qualifiers:
                protein_seq = qualifiers["translation"][0]
            else:
                # Fallback: translate the CDS ourselves, honouring the annotated
                # genetic code and reading-frame offset (both default to 1).
                table = int(qualifiers.get("transl_table", ["1"])[0])
                codon_start = int(qualifiers.get("codon_start", ["1"])[0])
                dna_seq = feature.extract(record.seq)[codon_start - 1:]
                protein_seq = dna_seq.translate(table=table, to_stop=True)

            # Header fields; placeholders are used when a qualifier is missing
            protein_id = qualifiers.get("protein_id", ["unknown_protein"])[0]
            gene_name = qualifiers.get("gene", ["unknown_gene"])[0]

            # Write the protein sequence in FASTA format
            fasta_output.write(f">{protein_id} {gene_name}\n{protein_seq}\n")

print(f"Translation and conversion complete: {output_file}")




Translation and conversion complete: 42C_AA.fasta


In [4]:
# Load the two raw mutation tables in one pass; each CSV becomes its own DataFrame.
Tenallion_data, fourtytwoC_data = (
    pd.read_csv(csv_path) for csv_path in ('42C_Tenallion.csv', '42C.csv')
)

In [5]:
def clean_mutation_data(mutation_data):
    """Reduce a raw mutation table to gene / amino-acid-substitution pairs.

    Parameters
    ----------
    mutation_data : pandas.DataFrame
        Must contain the columns 'Gene (Scrollable)' and 'Details'.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame in which 'Details' holds only the leading
        substitution token (uppercase letter, digits, uppercase letter,
        e.g. ``A123T``). Rows where no such token starts the 'Details' text,
        or where either column is missing, are removed. The input frame is
        not modified and the original index labels are kept.
    """
    wanted = mutation_data.loc[:, ['Gene (Scrollable)', 'Details']]

    # Keep just the substitution at the very start of the free-text details;
    # non-matching rows become NaN and are dropped below.
    substitution = wanted['Details'].str.extract(r'(^[A-Z]\d+[A-Z])')[0]

    return wanted.assign(Details=substitution).dropna()

In [6]:
def _parse_substitution(mutation):
    """Split a substitution label such as ``"A123T"`` into its parts.

    Returns ``(original_aa, index, new_aa)`` with ``index`` being the 0-based
    residue index, or ``None`` when the value is not a string, is too short,
    or carries a non-numeric or non-positive position.
    """
    if not isinstance(mutation, str) or len(mutation) < 3:
        return None
    try:
        index = int(mutation[1:-1]) - 1  # 1-based label -> 0-based index
    except ValueError:
        return None
    if index < 0:
        # A label position of 0 (or below) would otherwise wrap around to the
        # end of the sequence through negative indexing.
        return None
    return mutation[0], index, mutation[-1]


def mutate_aa_sequence(genbank_file, mutations_df):
    """Build single-substitution protein variants from a GenBank annotation.

    Parameters
    ----------
    genbank_file : str
        Path of the GenBank file; parsed with ``SeqIO.parse(..., "genbank")``.
    mutations_df : pandas.DataFrame
        Needs the columns 'Gene (Scrollable)' (gene name) and 'Details'
        (substitution label such as ``A123T``).

    Returns
    -------
    list of (str, str)
        ``(gene_name, mutated_sequence)`` tuples in GenBank feature order.
        Every distinct substitution listed for a gene yields its own
        single-mutant sequence. A substitution is skipped when the label
        cannot be parsed, the position lies outside the protein, or the
        annotated residue at that position differs from the label's
        original residue.
    """
    # Index the table once instead of filtering the DataFrame for every CDS.
    # Repeated (gene, substitution) pairs are collapsed so that overlapping
    # input tables do not produce duplicate sequences.
    substitutions_by_gene = {}
    gene_column = mutations_df["Gene (Scrollable)"]
    details_column = mutations_df["Details"]
    for gene, details in zip(gene_column, details_column):
        parsed = _parse_substitution(details)
        if parsed is None:
            continue
        known = substitutions_by_gene.setdefault(gene, [])
        if parsed not in known:
            known.append(parsed)

    mutated_sequences = []  # List to store results

    for record in SeqIO.parse(genbank_file, "genbank"):
        for feature in record.features:
            # Only named coding sequences can be matched against the table
            if feature.type != "CDS" or "gene" not in feature.qualifiers:
                continue

            gene_name = feature.qualifiers["gene"][0]
            # CDS entries without a /translation yield "" and never match
            protein_seq = feature.qualifiers.get("translation", [""])[0]

            for original_aa, index, new_aa in substitutions_by_gene.get(gene_name, ()):
                # Apply only when the reference residue agrees with the label
                if index < len(protein_seq) and protein_seq[index] == original_aa:
                    mutated_seq = protein_seq[:index] + new_aa + protein_seq[index + 1:]
                    mutated_sequences.append((gene_name, mutated_seq))

    return mutated_sequences


In [7]:
# Reduce both raw mutation tables to (gene, substitution) pairs.
Tenallion_data, fourty2C_data = (
    clean_mutation_data(raw_table) for raw_table in (Tenallion_data, fourtytwoC_data)
)

# Stack the two cleaned tables into one mutation list (original row labels kept).
eColi_data = pd.concat((Tenallion_data, fourty2C_data))

# Apply the substitutions to the annotated proteins of the reference genome.
genbank_file = "NC_000913_3.gb"
mutated_sequences_list = mutate_aa_sequence(genbank_file, eColi_data)

# Sanity check: compare the first mutated sequence against a hand-checked
# reference and report where (if anywhere) the two differ.

# The reference sequence to compare against
reference_sequence = "MKPTTISLLQKYKQEKKRFATITAYDYSFAKLFADEGLNVMLVGDSLGMTVQGHDSTLPVTVADIAYHTAAVRRGAPNCLLLADLPFMAYATPEQAFENAATVMRAGANMVKIEGGEWLVETVQMLTERAVPVCGHLGLTPQSVNIFGGYKVQGRGDEAGDQLLSDALALEAAGAQLLVLECVPVELAKRITEALAIPVIGIGAGNVTDGQILVMHDAFGITGGHIPKFAKNFLAETGDIRAAVRQYMAEVESGVYPGEEHSFH"

if not mutated_sequences_list:
    # Nothing was mutated, so there is no first entry to inspect
    print("No mutated sequences were generated.")
else:
    # The mutated sequence
    mutated_sequence = mutated_sequences_list[0][1]

    # Check if the lengths are different first: a position-wise comparison is
    # only meaningful for sequences of equal length
    if len(mutated_sequence) != len(reference_sequence):
        print("The sequences are of different lengths.")
    else:
        # Collect (0-based index, mutated residue, reference residue) for every difference
        mismatches = [
            (i, mut_char, ref_char)
            for i, (mut_char, ref_char) in enumerate(zip(mutated_sequence, reference_sequence))
            if mut_char != ref_char
        ]

        # Display mismatches
        if mismatches:
            print("Mismatches found at the following positions (index, mutated, reference):")
            for position, mut_char, ref_char in mismatches:
                print(f"Position {position}: {mut_char} != {ref_char}")
        else:
            print("The sequences match exactly.")


The sequences are of different lengths.


In [8]:
#print(eColi_data, '\n')

#print(mutated_sequences_list)

In [10]:
# Dump the (identifier, sequence) pairs as a FASTA file for finetuning.

# Destination of the mutated-sequence FASTA
output_file = "42C_mutated_seqs.fasta"

with open(output_file, "w") as fasta_file:
    for identifier, sequence in mutated_sequences_list:
        # One record = header line followed by the sequence line
        fasta_file.writelines((f">{identifier}\n", f"{sequence}\n"))

print(f"FASTA file '{output_file}' created successfully!")

FASTA file '42C_mutated_seqs.fasta' created successfully!
