In [1]:
#Converts a 3 base codon into an amino acid
#DNA codon -> three-letter amino acid name ('STOP' for the three stop codons).
#Built once at definition time instead of on every call.
CODON_TABLE = {
    'TTT': 'Phe', 'TTC': 'Phe', 'TTA': 'Leu', 'TTG': 'Leu',
    'CTT': 'Leu', 'CTC': 'Leu', 'CTA': 'Leu', 'CTG': 'Leu',
    'ATT': 'Ile', 'ATC': 'Ile', 'ATA': 'Ile', 'ATG': 'Met',
    'GTT': 'Val', 'GTC': 'Val', 'GTA': 'Val', 'GTG': 'Val',
    'TCT': 'Ser', 'TCC': 'Ser', 'TCA': 'Ser', 'TCG': 'Ser',
    'CCT': 'Pro', 'CCC': 'Pro', 'CCA': 'Pro', 'CCG': 'Pro',
    'ACT': 'Thr', 'ACC': 'Thr', 'ACA': 'Thr', 'ACG': 'Thr',
    'GCT': 'Ala', 'GCC': 'Ala', 'GCA': 'Ala', 'GCG': 'Ala',
    'TAT': 'Tyr', 'TAC': 'Tyr', 'TAA': 'STOP', 'TAG': 'STOP',
    'CAT': 'His', 'CAC': 'His', 'CAA': 'Gln', 'CAG': 'Gln',
    'AAT': 'Asn', 'AAC': 'Asn', 'AAA': 'Lys', 'AAG': 'Lys',
    'GAT': 'Asp', 'GAC': 'Asp', 'GAA': 'Glu', 'GAG': 'Glu',
    'TGT': 'Cys', 'TGC': 'Cys', 'TGA': 'STOP', 'TGG': 'Trp',
    'CGT': 'Arg', 'CGC': 'Arg', 'CGA': 'Arg', 'CGG': 'Arg',
    'AGT': 'Ser', 'AGC': 'Ser', 'AGA': 'Arg', 'AGG': 'Arg',
    'GGT': 'Gly', 'GGC': 'Gly', 'GGA': 'Gly', 'GGG': 'Gly',
}


def codon_to_aa(codon_amino):
    """Look up the amino acid encoded by a DNA codon.

    Args:
        codon_amino: Codon string to look up. Matching is exact, so the
            codon must be upper-case DNA letters (e.g. 'ATG') to be found.

    Returns:
        The three-letter amino acid name, or 'STOP' for TAA/TAG/TGA.
        If the codon is not in the table, the input is returned unchanged.
    """
    # Direct hash lookup; the default keeps the pass-through for unknown codons.
    return CODON_TABLE.get(codon_amino, codon_amino)

In [2]:
#Open reading frame
class ORF:
    """One open reading frame: its index, size, position and codon list."""

    def __init__(self, orf_num, orf_bps, bp_start, bp_stop, orf_seq):
        """Store the values describing this reading frame.

        Args:
            orf_num: Number identifying this ORF.
            orf_bps: Number of base pairs covered by the ORF.
            bp_start: Position where the ORF starts.
            bp_stop: Position where the ORF ends.
            orf_seq: List of codons making up the ORF.
        """
        self.orf_num = orf_num
        self.orf_bps = orf_bps
        self.bp_start = bp_start
        self.bp_stop = bp_stop
        self.orf_seq = orf_seq

    def print_attributes(self):
        """Print the ORF number, size and start/stop positions, then a blank line."""
        report = (
            "Orf attributes:",
            f"The orf number is: {self.orf_num}",
            f"Basepairs: {self.orf_bps}",
            f"Starting at {self.bp_start}, ending at {self.bp_stop}\n",
        )
        print(*report, sep="\n")

    def transcribe(self):
        """Build a Protein from this ORF's codons.

        Despite the method name, the work done here is codon-to-amino-acid
        conversion: each codon is passed through codon_to_aa.

        Returns:
            A Protein carrying this ORF's number, the amino acid count and
            the list of amino acids.
        """
        residues = [codon_to_aa(triplet) for triplet in self.orf_seq]
        return Protein(self.orf_num, len(residues), residues)

In [3]:
class Protein:
    """A protein: an identifying number plus its amino acid sequence."""

    # Amino acids reported by aa_summary, in print order. 'STOP' is
    # deliberately absent, so stop markers are never counted.
    AA_NAMES = ('Phe', 'Leu', 'Ile', 'Met', 'Val', 'Ser', 'Pro', 'Thr', 'Ala', 'Tyr',
                'His', 'Gln', 'Asn', 'Lys', 'Asp', 'Glu', 'Cys', 'Trp', 'Arg', 'Gly')

    #initialization
    def __init__(self, prot_num, aa_count, aa_seq):
        """Store the protein's number, amino acid count and sequence.

        Args:
            prot_num: Number identifying the protein (also used as the
                output file name by write_protein).
            aa_count: Number of amino acids, as supplied by the caller.
            aa_seq: List of amino acid names.
        """
        self.prot_num = prot_num
        self.aa_count = aa_count
        self.aa_seq = aa_seq

    #print variables
    def print_attributes(self):
        """Print the protein number, amino acid count and sequence on one line."""
        print(self.prot_num, self.aa_count, self.aa_seq)

    def print_aa(self):
        """Print the amino acid sequence with a descriptive prefix."""
        print(f"The amino acids sequence for this protein is {self.aa_seq}")

    #gives summary of how many of each aa in protein
    def aa_summary(self):
        """Print the stored amino acid count and a per-amino-acid tally.

        Entries of aa_seq that are not one of AA_NAMES (e.g. 'STOP' or an
        unrecognised codon passed through unchanged) are ignored.
        """
        sum_dict = dict.fromkeys(self.AA_NAMES, 0)
        for aa in self.aa_seq:
            # Direct membership test instead of scanning every key per residue.
            if aa in sum_dict:
                sum_dict[aa] += 1

        print(f"This protein has {self.aa_count} amino acids.")
        for key, value in sum_dict.items():
            print(f"{key:>30}:{value}")

    #writes protein into a file
    def write_protein(self):
        """Write the string form of aa_seq to a file named str(prot_num).

        The file is created in the current working directory and any
        existing file with that name is overwritten.
        """
        # Context manager guarantees the handle is closed even if write() raises.
        with open(str(self.prot_num), "w") as out_file:
            out_file.write(str(self.aa_seq))

In [4]:
class Sequence:
    """A nucleic acid sequence with its metadata and the ORFs found in it."""

    # Codons that open / close an open reading frame (DNA alphabet).
    START_CODON = "ATG"
    STOP_CODONS = ("TGA", "TAA", "TAG")

    def __init__(self, reference, source, sub_date, acc_date, coding_type, na_seq):
        """Store the sequence metadata and start with an empty ORF list.

        Args:
            reference: Reference field of the sequence record.
            source: Source field of the sequence record.
            sub_date: Stored as given (presumably a submission date - confirm).
            acc_date: Stored as given (presumably an accession date - confirm).
            coding_type: "RNA" triggers U->T conversion in find_orfs; any
                other value leaves the sequence untouched.
            na_seq: The nucleic acid sequence as a string.
        """
        self.reference = reference
        self.source = source
        self.sub_date = sub_date
        self.acc_date = acc_date
        self.coding_type = coding_type
        self.na_seq = na_seq
        self.orf_list = []

    #Number of basepairs in a sequence
    def basepairs(self):
        """Return the length of the stored sequence."""
        return len(self.na_seq)

    def print_attributes(self):
        """Print every stored attribute, including the ORF list, on one line."""
        print(self.reference,self.source,self.sub_date,self.acc_date,self.coding_type,self.na_seq,self.orf_list)

    #Finds the different orfs in the sequence, converts into ORF object, and stores in a list
    def find_orfs(self):
        """Scan the sequence codon by codon and store the ORFs in orf_list.

        The scan reads non-overlapping triplets starting at index 0 only (a
        single reading frame). An ORF opens at the first ATG seen while no
        ORF is open and closes at the next TGA/TAA/TAG. The stored codon
        list includes the start codon but not the stop codon, while the
        reported positions are 1-based and include the stop codon. An ORF
        still open when the sequence ends is discarded.

        Side effects:
            Prints coding_type, rewrites na_seq with U replaced by T when
            coding_type is "RNA", and replaces orf_list with the ORFs found,
            so calling the method again does not duplicate entries.
        """
        print(self.coding_type)
        if self.coding_type == "RNA": #Turn RNA into DNA
            self.na_seq = self.na_seq.replace("U","T")

        # Start from a clean list so re-running the scan is idempotent.
        self.orf_list = []

        orf_seq = []      # codons of the ORF currently being read
        bp_start = 0      # 1-based position of the current ORF's first base
        started = False   # True while inside an ORF

        # Step through whole codons; the bound skips a trailing partial codon.
        for i in range(0, len(self.na_seq) - 2, 3):
            triple_base = self.na_seq[i:i+3]

            if not started:
                if triple_base == self.START_CODON: #starting the orf reading
                    started = True
                    bp_start = i + 1
                    orf_seq = [triple_base]
            elif triple_base in self.STOP_CODONS: #ending the orf
                started = False
                bp_stop = i + 3 # 1-based position of the stop codon's last base
                orf_bps = bp_stop - bp_start + 1
                orf_num = len(self.orf_list) + 1

                an_orf = ORF(orf_num,orf_bps,bp_start,bp_stop,orf_seq)#create orf object
                self.orf_list.append(an_orf)

                orf_seq = []#renewing the sequence list for the next orf
            else:
                # Inside an ORF: keep the codon (this includes internal ATGs).
                orf_seq.append(triple_base)

In [5]:
#Main code
#Reads the first six lines of the sequence file, in the order the Sequence
#constructor expects: reference, source, sub_date, acc_date, coding_type, na_seq.
#The context manager closes the file, and the descriptive names avoid
#shadowing the built-in input() and rebinding the file handle.
with open("COVID-19 Genome.txt","r") as genome_file:
    header_lines = [genome_file.readline().strip() for _ in range(6)]
reference, source, sub_date, acc_date, coding_type, na_seq = header_lines
seq = Sequence(reference, source, sub_date, acc_date, coding_type, na_seq)

print(seq.basepairs())
seq.find_orfs()

print(f"{len(seq.orf_list)} total proteins in orf_list\n")

for orf in seq.orf_list:
    orf.print_attributes()
    #orf.transcribe().print_aa()

#Convert the second ORF found (index 1) to a protein and write it to a file
#named after its ORF number.
seq.orf_list[1].transcribe().write_protein()

29902
DNA
11 total proteins in orf_list

Orf attributes:
The orf number is: 1
Basepairs: 21291
Starting at 265, ending at 21555

Orf attributes:
The orf number is: 2
Basepairs: 3822
Starting at 21562, ending at 25383

Orf attributes:
The orf number is: 3
Basepairs: 831
Starting at 25390, ending at 26220

Orf attributes:
The orf number is: 4
Basepairs: 228
Starting at 26245, ending at 26472

Orf attributes:
The orf number is: 5
Basepairs: 669
Starting at 26521, ending at 27189

Orf attributes:
The orf number is: 6
Basepairs: 186
Starting at 27202, ending at 27387

Orf attributes:
The orf number is: 7
Basepairs: 366
Starting at 27394, ending at 27759

Orf attributes:
The orf number is: 8
Basepairs: 123
Starting at 27766, ending at 27888

Orf attributes:
The orf number is: 9
Basepairs: 366
Starting at 27892, ending at 28257

Orf attributes:
The orf number is: 10
Basepairs: 1260
Starting at 28273, ending at 29532

Orf attributes:
The orf number is: 11
Basepairs: 117
Starting at 29557, endi