__PyScript 1:__ Nucleotide count and GC content

In [5]:
#gc() takes any nucleotide sequence as input and returns its ATGC count and GC content
def gc(seq):
    '''Return the per-base counts and the GC content (in percent) of a sequence.

    The input is upper-cased first, so matching is case-insensitive. Only the
    characters A, T, G, C and U are accepted.

    Returns:
        For valid input, a 2-tuple (counts, gc_percent): counts is a dict that
        maps every base present in the sequence to its number of occurrences
        (in order of first appearance) and gc_percent is rounded to 4 decimal
        places. Bases that do not occur are simply absent from counts.
        An empty sequence gives ({}, 0.0).
        For input holding any other character, the 2-tuple
        ('incorrect input,', 'please recheck input!').
    '''
    import re
    from collections import Counter

    #case typed to upper irrespective of input case
    seq_up = seq.upper()
    #checking for any contaminant base, i.e., anything except ATGCU
    if re.search('[^ATGCU]', seq_up):
        return 'incorrect input,', 'please recheck input!'

    #single pass over the sequence; dict() keeps the plain-dict return type
    nt_count = dict(Counter(seq_up))
    total = len(seq_up)
    if not total:
        #nothing to divide by: report an empty count and 0% GC
        return nt_count, 0.0
    #.get() so that sequences lacking G or C do not raise KeyError
    gc_content = ((nt_count.get('G', 0) + nt_count.get('C', 0)) / total) * 100
    return nt_count, round(gc_content, 4)

In [2]:
#testing the function
#short valid sequence: unpack the (counts, GC%) pair returned by gc()
m,n = gc('atgc')
print(m,n)

#longer valid sequence
x,y = gc('attgcggcatgacaggtctgcagtaaagtcctg')
print(x,y)

#'v' is not an accepted base, so the two message strings are unpacked and printed instead
s,t = gc('attgcvccagttcca')
print(s,t)

{'A': 1, 'T': 1, 'G': 1, 'C': 1} 50.0
{'A': 8, 'T': 8, 'G': 10, 'C': 7} 51.5152
incorrect input, please recheck input!


__PyScript 2:__ Transcription

In [3]:
#transcription() takes a coding strand as input and converts it to the transcribed RNA seq
def transcription(seq):
    '''Transcribe a coding strand into RNA by swapping every T for a U.

    The input is upper-cased before use. If it holds any character other than
    A, T, G, C or U, the string 'incorrect input, please recheck!' is returned
    instead of a sequence.
    '''
    import re
    strand = seq.upper()
    if re.search('[^ATGCU]', strand) is not None:
        return 'incorrect input, please recheck!'
    #the coding strand reads like its RNA apart from T standing in for U
    return 'U'.join(strand.split('T'))

In [4]:
#testing the function
#valid coding strand
dna = 'attgcgt'
mrna = transcription(dna)
print(mrna)

#contains 'x', which is not an accepted base, so the error message is printed
dna1 =  'attgcgcgxtggacccagacatgcacgtca'
mrna1 = transcription(dna1)
print(mrna1)

AUUGCGU
incorrect input, please recheck!


__PyScript 3:__ Translation

In [5]:
#translation() converts an mRNA to its corresponding protein
def translation(seq):
    '''Translate an RNA sequence into its single-letter protein sequence.

    The input is upper-cased and read codon by codon starting at the first
    base. Translation ends at the first stop codon (UAA, UAG or UGA) or at the
    end of the sequence, whichever comes first. A trailing fragment of one or
    two bases that does not form a whole codon is ignored.

    Returns:
        The protein string (possibly empty), or the string
        'incorrect input, please recheck!' when the input holds any character
        other than A, G, C or U.
    '''
    import re
    rna = seq.upper()
    if re.search('[^AGCU]', rna):
        return 'incorrect input, please recheck!'

    codons = {
        "UUU": "F", "CUU": "L", "AUU": "I", "GUU": "V",
        "UUC": "F", "CUC": "L", "AUC": "I", "GUC": "V",
        "UUA": "L", "CUA": "L", "AUA": "I", "GUA": "V",
        "UUG": "L", "CUG": "L", "AUG": "M", "GUG": "V",
        "UCU": "S", "CCU": "P", "ACU": "T", "GCU": "A",
        "UCC": "S", "CCC": "P", "ACC": "T", "GCC": "A",
        "UCA": "S", "CCA": "P", "ACA": "T", "GCA": "A",
        "UCG": "S", "CCG": "P", "ACG": "T", "GCG": "A",
        "UAU": "Y", "CAU": "H", "AAU": "N", "GAU": "D",
        "UAC": "Y", "CAC": "H", "AAC": "N", "GAC": "D",
        "UAA": "Stop", "CAA": "Q", "AAA": "K", "GAA": "E",
        "UAG": "Stop", "CAG": "Q", "AAG": "K", "GAG": "E",
        "UGU": "C", "CGU": "R", "AGU": "S", "GGU": "G",
        "UGC": "C", "CGC": "R", "AGC": "S", "GGC": "G",
        "UGA": "Stop", "CGA": "R", "AGA": "R", "GGA": "G",
        "UGG": "W", "CGG": "R", "AGG": "R", "GGG": "G",
    }

    aa = []
    #stopping at len(rna) - 2 guarantees every slice is a full 3-base codon,
    #so a leftover 1- or 2-base tail is never looked up in the table
    for i in range(0, len(rna) - 2, 3):
        residue = codons[rna[i:i + 3]]
        if residue == 'Stop':
            break
        aa.append(residue)
    return ''.join(aa)

In [6]:
#testing the function
#valid mRNA without a stop codon
mrna = 'gguggugguuguugcgua'
prot = translation(mrna)
print(prot)

#in-frame UAA stop codon: the codon after it is not translated
rna1 = 'gguggugguuguugcUAAgua'
prot1 = translation(rna1)
print(prot1)

#contains 'T', which is not an accepted RNA base, so the error message is printed
rna2 = 'gguggugguuguuggTgua'
prot2 = translation(rna2)
print(prot2)

GGGCCV
GGGCC
incorrect input, please recheck!


__PyScript 4:__ Protein mass calculator

In [7]:

#prot_mass() calculates the input peptide's molecular mass
def prot_mass(prot):
    '''Sum the per-residue masses of a single-letter coded peptide.

    The peptide is upper-cased first. Each residue is looked up in the mass
    table below and the values are added up; the total is rounded to 5
    decimal places. If the peptide holds a character outside the 20 letters
    of the table, the string 'incorrect input, please recheck!' is returned.
    '''
    import re
    #one mass per residue letter (presumably monoisotopic residue masses - TODO confirm)
    residue_mass = {
        'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259,
        'F': 147.06841, 'G': 57.02146, 'H': 137.05891, 'I': 113.08406,
        'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293,
        'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203,
        'T': 101.04768, 'V': 99.06841, 'W': 186.07931, 'Y': 163.06333,
    }

    peptide = prot.upper()
    if re.search('[^ACDEFGHIKLMNPQRSTVWY]', peptide):
        return 'incorrect input, please recheck!'

    total = sum(residue_mass[residue] for residue in peptide)
    return round(total, 5)

In [8]:
#testing the function
#valid peptide
peptide = 'GGGCCV'
mass = prot_mass(peptide)
print(mass)

#contains 'x', which is not in the residue table, so the error message is printed
peptide1 = 'GGGCCxV'
mass = prot_mass(peptide1)
print(mass)

476.15117
incorrect input, please recheck!


__PyScript 5:__ Inferring possible mRNA number from protein sequence

In [9]:
#prot2mrna() calculates the number of possible mRNAs that could encode a peptide sequence
def prot2mrna(prot):
    '''Return how many different mRNA sequences could encode the given protein.

    The protein is upper-cased first, so the input is case-insensitive. The
    result is the product of the number of codons available for each residue,
    multiplied by the number of stop codons (the mRNA is taken to end with one
    stop codon). An empty protein therefore gives 3.

    Returns:
        The count as an int, or the string 'incorrect input, please recheck!'
        when the input holds a character outside ACDEFGHIKLMNPQRSTVWY.
    '''
    import re
    from collections import Counter

    protein = prot.upper()
    if re.search('[^ACDEFGHIKLMNPQRSTVWY]', protein):
        return 'incorrect input, please recheck!'

    codons = {
        "UUU": "F", "CUU": "L", "AUU": "I", "GUU": "V",
        "UUC": "F", "CUC": "L", "AUC": "I", "GUC": "V",
        "UUA": "L", "CUA": "L", "AUA": "I", "GUA": "V",
        "UUG": "L", "CUG": "L", "AUG": "M", "GUG": "V",
        "UCU": "S", "CCU": "P", "ACU": "T", "GCU": "A",
        "UCC": "S", "CCC": "P", "ACC": "T", "GCC": "A",
        "UCA": "S", "CCA": "P", "ACA": "T", "GCA": "A",
        "UCG": "S", "CCG": "P", "ACG": "T", "GCG": "A",
        "UAU": "Y", "CAU": "H", "AAU": "N", "GAU": "D",
        "UAC": "Y", "CAC": "H", "AAC": "N", "GAC": "D",
        "UAA": "Stop", "CAA": "Q", "AAA": "K", "GAA": "E",
        "UAG": "Stop", "CAG": "Q", "AAG": "K", "GAG": "E",
        "UGU": "C", "CGU": "R", "AGU": "S", "GGU": "G",
        "UGC": "C", "CGC": "R", "AGC": "S", "GGC": "G",
        "UGA": "Stop", "CGA": "R", "AGA": "R", "GGA": "G",
        "UGG": "W", "CGG": "R", "AGG": "R", "GGG": "G",
    }

    #number of codons that code for each amino acid (and for 'Stop')
    degeneracy = Counter(codons.values())

    #start from the stop codon degeneracy
    mrna_count = degeneracy['Stop']
    #iterate the validated, upper-cased sequence so lowercase input is counted correctly
    for aa in protein:
        mrna_count *= degeneracy[aa]
    return mrna_count

In [10]:
#testing the function
#valid peptide
peptide2 = 'GGGCCV'
inferred_mrna = prot2mrna(peptide2)
print(inferred_mrna)

#contains 'X', which is not an accepted residue letter, so the error message is printed
peptide3 = 'GGGCXCV'
inferred_mrna1 = prot2mrna(peptide3)
print(inferred_mrna1)

3072
incorrect input, please recheck!


__PyScript 6:__ Inferring gene location from spliced mRNA

In [11]:
#gene_tracker() tracks the probable position of exons on DNA starting from spliced mRNA
def gene_tracker(seq,m):
    '''Locate the characters of a motif, in order, inside a DNA sequence.

    Both arguments are upper-cased first, so they are case-insensitive. The
    DNA is scanned left to right: each motif character is matched at its first
    occurrence after the previous match. A motif character that cannot be
    found in the remaining DNA is skipped and contributes no position.

    Args:
        seq: the DNA sequence to search in (A, T, G, C only).
        m: the motif whose characters are searched for (A, T, G, C only).

    Returns:
        The 1-based positions of the matched characters joined by single
        spaces, or the string 'incorrect input, please recheck!' when either
        argument holds a character other than A, T, G or C.
    '''
    import re
    dna = seq.upper()
    motif = m.upper()
    #both inputs must consist of DNA bases only; the upper-cased motif is
    #checked so that a lowercase motif is accepted just like lowercase DNA
    if re.search('[^ATGC]', dna) or re.search('[^ATGC]', motif):
        return 'incorrect input, please recheck!'

    indices = []
    search_from = 0
    for base in motif:
        pos = dna.find(base, search_from)
        if pos == -1:
            #not present in the rest of the DNA: skip this motif character
            continue
        indices.append(pos + 1)
        #next motif character must lie strictly after this match
        search_from = pos + 1
    #mapping int to str because join only accepts str items
    return ' '.join(map(str, indices))
        

In [12]:
#testing the function
#valid DNA and motif
s = 'ACGTCGACGTGCG'
t = 'GTA'
ind = gene_tracker(s,t)
print(ind)

#DNA contains 'x', so the error message is printed
s1 = 'ACGxTCGACGTGCG'
t1 = 'GTA'
ind1 = gene_tracker(s1,t1)
print(ind1)

#motif contains 'x', so the error message is printed
s2 = 'ACGTCGACGTGCG'
t2 = 'GTxA'
ind2 = gene_tracker(s2,t2)
print(ind2)

3 4 7
incorrect input, please recheck!
incorrect input, please recheck!


__PyScript 7:__ ORF finder

In [13]:
#ORF finder finds ORFs from any given DNA string
def ORF_finder(seq):
    '''Find open reading frames on both strands of a DNA sequence.

    The input is a plain DNA string (A, T, G, C only; case-insensitive). Every
    ATG on the given strand and on its reverse complement is used as a start,
    and the sequence is translated codon by codon from there. Only reading
    frames that are closed by a stop codon count as ORFs; a frame that runs
    off the end of the sequence is discarded.

    Returns:
        The distinct protein sequences, one per line, in the order they are
        found (given strand first, then the reverse complement), or the
        string 'incorrect input, please recheck!' for invalid input. When no
        ORF is found the result is an empty string.
    '''
    import re
    dna = seq.upper()
    if re.search('[^ATGC]', dna):
        return 'incorrect input, please recheck!'

    dna_codon = {
        'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
        'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
        'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
        'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
        'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
        'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
        'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
        'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
        'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
        'TAC': 'Y', 'TAT': 'Y', 'TAA': 'Stop', 'TAG': 'Stop',
        'TGC': 'C', 'TGT': 'C', 'TGA': 'Stop', 'TGG': 'W',
    }

    cbp = {'A': 'T', 'G': 'C', 'T': 'A', 'C': 'G'}
    #creating and orienting complementary strand to the supplied seq
    complmnt_dna = ''.join(cbp[base] for base in reversed(dna))

    found = _orf_proteins(dna, dna_codon) + _orf_proteins(complmnt_dna, dna_codon)
    #dict.fromkeys drops duplicates but, unlike a set, keeps a stable order
    return '\n'.join(dict.fromkeys(found))


def _orf_proteins(strand, codon_table):
    '''Return the proteins of all stop-terminated ORFs on one strand, in start order.'''
    import re
    proteins = []
    #the lookahead pattern also finds overlapping start codons
    for hit in re.finditer('(?=ATG)', strand):
        residues = []
        #len(strand) - 2 keeps every slice a full 3-base codon
        for j in range(hit.start(), len(strand) - 2, 3):
            residue = codon_table[strand[j:j + 3]]
            if residue == 'Stop':
                #a stop codon, not the end of the sequence, must close the ORF
                proteins.append(''.join(residues))
                break
            residues.append(residue)
    return proteins

In [14]:
#testing the function
#valid DNA sequence
s = 'GCTCTGGCATTGCAATTTGCCCATCCGGTCAGTTGAATAGGTTAAGCCGCAGTCTTTATCATAATGACGATGGGACCTGATGCAATATCAATGAGCAACC'
p = ORF_finder(s)
print(p)

#same sequence with a stray 'b' inserted, so the error message is printed
s = 'GCTCTGGCATTGCAATTTGCCCATCCGGTCAGTTGAATAbGGTTAAGCCGCAGTCTTTATCATAATGACGATGGGACCTGATGCAATATCAATGAGCAACC'
p = ORF_finder(s)
print(p)


MIKTAA
MQYQ
incorrect input, please recheck!


In [107]:
import random as r
#random 1000-base sequence; the weights make G and C six times as likely as A and T
x = ''.join(r.choices('ATGC',[1,1,6,6],k=1000))
#prints the (counts, GC%) tuple returned by gc()
print(gc(x))

({'G': 428, 'C': 433, 'T': 70, 'A': 69}, 86.1)
