# Handles and fetches DNA data from NCBI

In [4]:
!pip install Bio

Collecting Bio
  Downloading bio-0.1.4-py3-none-any.whl (44 kB)
[K     |████████████████████████████████| 44 kB 6.1 MB/s eta 0:00:011
[?25hCollecting biopython
  Downloading biopython-1.78-cp38-cp38-macosx_10_9_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 5.5 MB/s eta 0:00:01
Installing collected packages: biopython, Bio
Successfully installed Bio-0.1.4 biopython-1.78
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Handles and fetches DNA data from NCBI

# NCBI api
def get_sequence(GI,save='No'): # Example: GI=166706892

    # Imports from biopython
    from Bio import Entrez

    # Can be any mail, doesn't really matter
    Entrez.email ="BioPython@gmail.com"

    # Sends a request for GI
    request = Entrez.epost("nucleotide",
                           id=[str(GI)])

    # Reads the results, which contains everything you need to fetch it
    result = Entrez.read(request)

    # Sets parameters for feching data
    webEnv = result["WebEnv"]
    queryKey = result["QueryKey"]

    # Fetches genes
    handle = Entrez.efetch(db="nucleotide",
                           retmode="xml",
                           webenv=webEnv,
                           query_key=queryKey)

    #Needs to have Entrez.parse as list
    for r in Entrez.parse(handle):

        #Prints data
        print(">GI = ",
                GI,
                " ",
                r["GBSeq_primary-accession"],
                " ",
                r["GBSeq_definition"])

    #Handles sequence
    sequence = r["GBSeq_sequence"].upper() # Make into higher case
    
    # If you want the sequence as a list instead of string. When commented, the script returns a long string
    """sequence = list(sequence) # Make into list"""

    # If save is "Yes" or "yes", save as a file
    if (save is 'Yes' or save is 'yes'):
        with open(str(GI)+' DNA_sequence'+'.txt','w') as f: # PS: Directory not set, saves in local work directory
            for i in sequence: # Should not matter if list or string
                f.write(sequence)
    
    #Done
    return sequence

  if (save is 'Yes' or save is 'yes'):
  if (save is 'Yes' or save is 'yes'):


In [2]:
new_seq = get_sequence("166706892")

>GI =  166706892   NG_007370   Homo sapiens signal transducer and activator of transcription 3 (STAT3), RefSeqGene (LRG_112) on chromosome 17


In [3]:
new_seq

'GTAGAGACAAGGTTTCACCACGTTGGCCAGGCTGGTTTCAAACTCCTGACCTCAAGTGATCCGCCTGCCTCAGCCTCCCAAAGTGCTAGGACTACAGGATGAGCCACCACGTCAGGCCTCAAGACTTATTTATTAATATTTAATTAATTCAGTCTTAGGTGAGCTCTAGGCTAGATGATAGATGCTGAGATTATGAAGACAGGGCTTAAGGGGTTCAAGTTTCCAGGAACCCACAGTTTAGGGGAAGATGTTTGTTAGCACGTGGCCAGAGAGTCAGCCAGACCTGATTTTCATTCTGAGTTTCCACCTAGTTTCTAGCTGTGAGACCAGTCTTGAGCCAGTCTTTCACTTTCTCAGTCTTCGTTTCCTCAATTGTAAAATAAGAATTATAATATTACCCTCACTGGGTTCTGGTGAGGATCAAATGAGATAATCTATGTAAAATGCTTAGCACAGTTCCTGGAATATAGTAGTAATTACTTAATCAATGTTAATTATTGTTTTTTTTCAGGAAGTAGGCCAATATGTGGATTTATTAGAATTCAAGGTAGAGAATGTATTCTAGATGGGGAAACAACCTGCCGAAGGAGTCTGGTAGGAGATTAACTTTGCCAGTAAGAATAGAAGGAAGCCTAGAGGAGGTGATGTTGCAACAGGGCTTTGAAGAATGAGTAAGAGTTCTCAGGTAGGGACTGAGGAAAGAGATTCCAGGTAGAAGGAAGACTATCCACAGAGGCATGAAGTTGCAGAAGGTGTAGACATGTTTGGAGGAAGGTAACACATTTAGTAGGGTTTGTGGGACTAGGTTAGAAAAGGGGGGATTCAGGTCAGCCTTTGTGGAATTTATCTTTTTTTTGAGACAGAGTCTCACTGTATTACCCAGGCTGGATTGCAATGGTGCAATCTTAGCTCACTGCAACCTTTGTCTCCTGGGTTCAAGCGATTCTTCTGCCTCACCCTACCGAGTACCTGGGATTACAGGTGCCCGCCAACACGCTT

# Basic DNA analysis

In [4]:
def ATGC_content(DNA):
    """returns ATGC contents in the form of a dict"""
    ATGCDict = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
    for nucleotide in DNA:
        try:
            ATGCDict[nucleotide] += 1
        except KeyError:
            pass
    return ATGCDict

In [5]:
ATGC_content(new_seq)

{'A': 21344, 'T': 23458, 'G': 19354, 'C': 18015}

In [6]:
def complementary(DNA):
    """Given a DNA sequence, will return a complementary DNA sequence"""
    cDNA = []
    complementary = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    for nucleotide in DNA:
        try:
            cDNA.append(complementary[nucleotide])
        except KeyError:
            cDNA.append('X')
    
    # Return cDNA as string instead of list. Remove if list is prefered
    cDNA = ''.join(cDNA)
    
    return cDNA

In [7]:
complementary(new_seq)

'CATCTCTGTTCCAAAGTGGTGCAACCGGTCCGACCAAAGTTTGAGGACTGGAGTTCACTAGGCGGACGGAGTCGGAGGGTTTCACGATCCTGATGTCCTACTCGGTGGTGCAGTCCGGAGTTCTGAATAAATAATTATAAATTAATTAAGTCAGAATCCACTCGAGATCCGATCTACTATCTACGACTCTAATACTTCTGTCCCGAATTCCCCAAGTTCAAAGGTCCTTGGGTGTCAAATCCCCTTCTACAAACAATCGTGCACCGGTCTCTCAGTCGGTCTGGACTAAAAGTAAGACTCAAAGGTGGATCAAAGATCGACACTCTGGTCAGAACTCGGTCAGAAAGTGAAAGAGTCAGAAGCAAAGGAGTTAACATTTTATTCTTAATATTATAATGGGAGTGACCCAAGACCACTCCTAGTTTACTCTATTAGATACATTTTACGAATCGTGTCAAGGACCTTATATCATCATTAATGAATTAGTTACAATTAATAACAAAAAAAAGTCCTTCATCCGGTTATACACCTAAATAATCTTAAGTTCCATCTCTTACATAAGATCTACCCCTTTGTTGGACGGCTTCCTCAGACCATCCTCTAATTGAAACGGTCATTCTTATCTTCCTTCGGATCTCCTCCACTACAACGTTGTCCCGAAACTTCTTACTCATTCTCAAGAGTCCATCCCTGACTCCTTTCTCTAAGGTCCATCTTCCTTCTGATAGGTGTCTCCGTACTTCAACGTCTTCCACATCTGTACAAACCTCCTTCCATTGTGTAAATCATCCCAAACACCCTGATCCAATCTTTTCCCCCCTAAGTCCAGTCGGAAACACCTTAAATAGAAAAAAAACTCTGTCTCAGAGTGACATAATGGGTCCGACCTAACGTTACCACGTTAGAATCGAGTGACGTTGGAAACAGAGGACCCAAGTTCGCTAAGAAGACGGAGTGGGATGGCTCATGGACCCTAATGTCCACGGGCGGTTGTGCGAA

In [8]:
# Function that determines what RNA sequence is made from a DNA sequence
def DNA2RNA(DNA_sequence): #Needs to be a string
    """Theory: When a gene is read, only one strand is read and transcribed to mRNA.
    The mRNA that is transcribed is the complimentary of the read strand, the anti-sense strand.
    However, you have to take in account for the direction of transcription and translation of the strand (5' to 3').
    The the sense strand (coding strand) has the same sequence as the mRNA but with U instead of T.
    Simplified solution: The coding strand (the gene) is the same as the mRNA sequence, but with U instead of T."""    
    
    # Simply replacing T with U
    mRNA_sequence = DNA_sequence.replace('T','U')
    
    return mRNA_sequence

In [9]:
def hamming_differences(DNA1, DNA2):
    """Returns info on positional/nucleotide differences in 2 DNA strands in
    the form of a list"""
    hDiff = []

    if len(DNA1) > len(DNA2):
        for i in range(len(DNA1)):
            DNA2 += 'X'
    else:
        for i in range(len(DNA2)):
            DNA1 += 'X'
    for n1, n2 in list(zip(DNA1, DNA2)):
        if n1 != n2:
            locDiff = list(zip(DNA1, DNA2)).index((n1, n2))
            nucDiff = n1 + n2
            print(str(locDiff) + nucDiff)
            hDiff.append(str(locDiff) + nucDiff)
    return hDiff

In [13]:
fasta_file = input('Type your fasta file name: ')

CODONS = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'
    }

def dna2AA(fastaFile,codons):
    AAsequence = ''
    with open(fastaFile) as fileObj:
        for line in fileObj:
            line.strip()
        cont = fileObj.read()
        sequenceSearch=re.compile(r'')                                                #need to complete
        content = sequenceSearch.search(cont)
        print(content.group())
        for i in range(0,len(content.group()),3):
            seq = cont[i:i+3]
            AAsequence += codons[seq]
    return AAsequence

Type your fasta file name: yes


In [10]:
def check3frames(DNA):
    """Helper function for orf"""
    proteins = set()
    for frame in range(3):
        protein = ""
        while frame <= len(DNA)-3:
            codon = DNA[frame:frame+3]
            if protein == "" and codon != "ATG":
                frame += 3
                continue
            protein += CODONS[codon]
            frame += 3
        while "M" in protein:
            start = protein.index("M")
            try:
                stop = protein.index("_",start)
            except ValueError:
                break
            proteins.add(protein[start:stop])
            protein = protein[start+1:]
    return proteins

In [11]:
def orf(DNA):
    """Given a DNA string, returns every distinct candidate protein string that can be translated from open reading frames (ORFs) of the DNA."""
    return check3frames(DNA) | check3frames(complementary(DNA))

In [14]:
orf(new_seq)

{'MDL',
 'MVRGQELGLESSLSTIMRTHL',
 'MSKGP',
 'MNMEESNNGSLSAEFKHLVCGRSSPSQRASGCRRGLGRASQDTCLFLFLQTLREQRCGNGGRANCDVSFVGDERQLGCFP',
 'MFFNRLVTPNTGIRVDESRDSILLTNSGPQPPMSLDINGDVRSDPLSLSGTEFFISDKNFN',
 'MNPSRRYIRLLGHYTICIKKRLTLKTVSILKPNYEH',
 'MWSLSAIKGVYGMTCVVSPV',
 'MLTRSTDRKMTLTVTC',
 'MCNTYMCISIFKYVKYNPSKKKEKNLSPRVTQWVRSHVNTSVPSDLGTGGPESVRRVESEESSTLMSVRGGVDRLKI',
 'MVACACNPSYSGG',
 'MQIRMCSLRPMIRGCGGGWLEGEKGNVLCCFVPLPSFLSSFLLLLLLFLDKCLLVPAASFCLFL',
 'MCGLYSISVGCCCLRLLIIQCKDCWEVLPALP',
 'MVQPVLSEAEKWSTRRVEPEDFTTLMSELGDTGRIRSKIFS',
 'MSTLGNAGRDNKEKKERKRKKKNSTSKREQRVRPHVTALEPSGVGGGGLKFTKRTDSEGSSTLMSVHGGVGRSKHKNNLYPQRGTTSPTRA',
 'MFLMFFNRLVTPNTGIRVDESRDSILLTNSGPQPPMSLDINGDVRSDPLSLSGTEFFISDKNFN',
 'MNKILYILLFFTKSSKSSVYFILTA',
 'MSVLGGTGWLQFKRSSV',
 'MTSRRFQHALLVGTVMDLARFKGREQRPPPLREWGIMREFRAMLSNHHKCLLNLRFCLKSS',
 'MKIKGDILLIIS',
 'MELSLARMGETGGEAGWERFLMKGLALDTLSLTCILDIQVEILKRQLAISV',
 'MDTTLKPI',
 'MNKHDYLDWSQLTPKFRFVV',
 'MEFHFCCSGWSAWRSLGSPQPPPPGFRRFSFLSLPSSWDYRHAPPSSANFVFLVEMGFSVSVRLVSNS',