#### Validate DNA Sequences

In [136]:
# Create a module 

In [1]:
%%writefile DNAToolkit.py

Nucleotides = ["A", "C", "G", "T"]

# check the sequence to make sure it is a DNA string
def validateSeq (dna_seq):
    tmpseq = dna_seq.upper()
    for nuc in tmpseq:
        if nuc not in Nucleotides:
            return False
    return tmpseq


#def countNucFrequency(seq):
    tmpFreqDict = {"A": 0, "C": 0, "G": 0, "T": 0}
    for nuc in seq:
        tmpFreqDict[nuc] += 1
    return tmpFreqDict

import collections

def countNucFrequency(seq):
    return dict(collections.Counter(seq))

Overwriting DNAToolkit.py


In [138]:
from DNAToolkit import *

In [139]:
rndDNAStr = "ATTTCGT"

In [140]:
print(validateSeq(rndDNAStr))

ATTTCGT


In [141]:
rndDNAStr = "ATTTCGTX"

In [142]:
print(validateSeq(rndDNAStr))

False


In [143]:
import random

In [144]:
# Creating a random DNA sequence for testing
randDNAstr = ''.join(random.choice(Nucleotides)
                    for nuc in range(50))


In [145]:
DNAStr = validateSeq(randDNAstr)

In [146]:
print(validateSeq(DNAStr))

TTCGCGAGTAACGTAGGCGAGGGTCATTAGAAATATGTCAATATTCTGGA


In [147]:
print(countNucFrequency(DNAStr))

{'A': 15, 'C': 7, 'G': 14, 'T': 14}


In [5]:
# ChatGPT mathod 1 to validate DNA seq
def is_valid_dna_sequence(sequence):
    """
    Check if a given sequence is a valid DNA sequence.

    Args:
    sequence (str): The sequence to be validated.

    Returns:
    bool: True if the sequence is a valid DNA sequence, False otherwise.
    """
    # Define a set of valid DNA bases
    valid_bases = set("ATCG")

    # Check if the sequence contains only valid bases and is not empty
    return all(base in valid_bases for base in sequence)

In [6]:
# Example usage:
dna_sequence = "ATGCTAGCTA"
result = is_valid_dna_sequence(dna_sequence)
print(result)  # Output: True


True


In [7]:

invalid_sequence = "ATGCTATkA"
result = is_valid_dna_sequence(invalid_sequence)
print(result)  # Output: False

False


In [8]:
# ChatGPT method 2 to validate DNA seq 
import re

def is_valid_dna_sequence(sequence):
    """
    Check if a given sequence is a valid DNA sequence using regular expressions.

    Args:
    sequence (str): The sequence to be validated.

    Returns:
    bool: True if the sequence is a valid DNA sequence, False otherwise.
    """
    # Define a regular expression pattern for valid DNA sequences
    pattern = re.compile(r"^[ATCG]+$")

    # Use the regular expression to match the sequence
    return bool(pattern.match(sequence))


In [9]:
# Example usage:
dna_sequence = "ATGCTAGCTA"
result = is_valid_dna_sequence(dna_sequence)
print(result)  # Output: True

True


In [10]:
invalid_sequence = "ATGCTAGZTA"
result = is_valid_dna_sequence(invalid_sequence)
print(result)  # Output: False

False


#### Find out the position of invalid bases in DNA sequence

In [11]:
import re

def find_invalid_bases(sequence):
    """
    Find the positions of invalid bases in a DNA sequence using regular expressions.

    Args:
    sequence (str): The sequence to be checked for invalid bases.

    Returns:
    list: A list of tuples containing the position and the invalid base.
    """
    # Define a regular expression pattern for invalid bases
    pattern = re.compile(r"[^ATCG]")

    # Find all matches of the pattern in the sequence
    invalid_matches = pattern.finditer(sequence)

    # Extract positions and invalid bases
    invalid_positions = [(match.start(), match.group()) for match in invalid_matches]

    return invalid_positions


In [12]:
# Example usage:
dna_sequence = "AxTGlCTAGZTA"
invalid_positions = find_invalid_bases(dna_sequence)
print(invalid_positions)

[(1, 'x'), (4, 'l'), (9, 'Z')]


#### Counting DNA Nucleotides - ROSALIND 1

In [10]:
def countNucFrequency(seq):
    tmpFreqDict = {"A": 0, "C": 0, "G": 0, "T": 0}
    for nuc in seq:
        tmpFreqDict[nuc] += 1
    return tmpFreqDict


DNAstring = "GTTAGCTCCGAATGTGATCTGGATAGCCCGCTGGTATATACGTCTATCCTACACCTTTGAGTTTGATCGAATCCCGACGTTTCAGAAGACGAGCATGATATATGAGCTTCTAGCTTCTCCCGCAGGCGAGTATTGTGGGTATGCCTGACGGGTCCTCGAGAGTCGTCGGTCATATAGTGATGGCATCAGCGTATGGCAAACCCTTATAACTCATTTAGCCGCGAGTACGAAAGGCGAAAACGCCTGATTGTTGGCCAGCGACTTGCCGCATTAGTAGCTGTAGTGATGAGTTCTCCTGTTGAAAGTCCAGTCTGTGTGTCTAACTGTTCTGTAACAGTACTTACAACGTATCGTGGGGGCATTGAAGTCAGAAATTGAAAATCTTTTCCTAGGCTTATTAAATTGGGTTGCGGTCACAAAAACGACGCTATTTGTGGTCGTATGTGTCGAATATCAGAGGTAATCGAAGTCCACGAAACGCGGGGGAGTCAAGGTCCTTCGGTCATCGTCGTTCTAGTTGCACGCTCTTAAATGAGTAGGGCGTACGTTGGTCTAGATTTGGATGTCGATGCAATCGATGGTAAACACTTTAGACTCGCTGGAGAGGCTATACTCACGCGCGCTCCGGCGGTTCAAGCCCAGGGCAAGCATACACGCGTCGGATACCGCTACGCCAGTCGAGTAACTCACCGCACGACGGCAGGAGGATGTGCCTGGTTGTATGGTAGCTGGAATATGATGAGAAAAAGATAGTTTCTGGCCATAAGGCTCCCATTGCTACATCGCGAGCACCGATGCCGCTCATCCGTAATGCAGTATTTCTCTTTTTGACTTAGGTGATAAATCTAGACGGCTTACTAAGCGGGGGGGTTCACCCTTAATCAATAAGTTTCCTCTAGCATGCCCATTTGCCGTGCTTGTTGCGTGATCTTATTTATAGGGTATACCAGGCACTGACCCCGGCAGGGGCATATCGCTGCAGATT"
result = countNucFrequency(DNAstring)
print(' '.join([str(val) for key, val in result.items()]))

234 221 261 269


In [6]:
def count_nucleotides(dna_sequence):
    # Initialize counters for each nucleotide
    count_a = 0
    count_c = 0
    count_g = 0
    count_t = 0

    # Iterate through the sequence
    for nucleotide in dna_sequence:
        if nucleotide == 'A':
            count_a += 1
        elif nucleotide == 'T':
            count_t += 1
        elif nucleotide == 'C':
            count_c += 1
        elif nucleotide == 'G':
            count_g += 1

    # Create a dictionary to store the counts
    nucleotide_counts = {
        'A': count_a,
        'C': count_c,
        'G': count_g,
        'T': count_t 
    }

    return nucleotide_counts

# Example usage:
dna_sequence = "GTTAGCTCCGAATGTGATCTGGATAGCCCGCTGGTATATACGTCTATCCTACACCTTTGAGTTTGATCGAATCCCGACGTTTCAGAAGACGAGCATGATATATGAGCTTCTAGCTTCTCCCGCAGGCGAGTATTGTGGGTATGCCTGACGGGTCCTCGAGAGTCGTCGGTCATATAGTGATGGCATCAGCGTATGGCAAACCCTTATAACTCATTTAGCCGCGAGTACGAAAGGCGAAAACGCCTGATTGTTGGCCAGCGACTTGCCGCATTAGTAGCTGTAGTGATGAGTTCTCCTGTTGAAAGTCCAGTCTGTGTGTCTAACTGTTCTGTAACAGTACTTACAACGTATCGTGGGGGCATTGAAGTCAGAAATTGAAAATCTTTTCCTAGGCTTATTAAATTGGGTTGCGGTCACAAAAACGACGCTATTTGTGGTCGTATGTGTCGAATATCAGAGGTAATCGAAGTCCACGAAACGCGGGGGAGTCAAGGTCCTTCGGTCATCGTCGTTCTAGTTGCACGCTCTTAAATGAGTAGGGCGTACGTTGGTCTAGATTTGGATGTCGATGCAATCGATGGTAAACACTTTAGACTCGCTGGAGAGGCTATACTCACGCGCGCTCCGGCGGTTCAAGCCCAGGGCAAGCATACACGCGTCGGATACCGCTACGCCAGTCGAGTAACTCACCGCACGACGGCAGGAGGATGTGCCTGGTTGTATGGTAGCTGGAATATGATGAGAAAAAGATAGTTTCTGGCCATAAGGCTCCCATTGCTACATCGCGAGCACCGATGCCGCTCATCCGTAATGCAGTATTTCTCTTTTTGACTTAGGTGATAAATCTAGACGGCTTACTAAGCGGGGGGGTTCACCCTTAATCAATAAGTTTCCTCTAGCATGCCCATTTGCCGTGCTTGTTGCGTGATCTTATTTATAGGGTATACCAGGCACTGACCCCGGCAGGGGCATATCGCTGCAGATT"
counts = count_nucleotides(dna_sequence)
print(' '.join([str(val) for key, val in counts.items()]))
# Output: {'A': 4, 'T': 3, 'C': 2, 'G': 1}


234 221 261 269


In [9]:
from collections import Counter

def count_nucleotides(dna_sequence):
    # Use Counter to count nucleotides
    nucleotide_counts = Counter(dna_sequence)
    
    # Create a dictionary from the Counter object
    nucleotide_counts_dict = dict(nucleotide_counts)
    
    return nucleotide_counts_dict

# Example usage:
dna_sequence = "GTTAGCTCCGAATGTGATCTGGATAGCCCGCTGGTATATACGTCTATCCTACACCTTTGAGTTTGATCGAATCCCGACGTTTCAGAAGACGAGCATGATATATGAGCTTCTAGCTTCTCCCGCAGGCGAGTATTGTGGGTATGCCTGACGGGTCCTCGAGAGTCGTCGGTCATATAGTGATGGCATCAGCGTATGGCAAACCCTTATAACTCATTTAGCCGCGAGTACGAAAGGCGAAAACGCCTGATTGTTGGCCAGCGACTTGCCGCATTAGTAGCTGTAGTGATGAGTTCTCCTGTTGAAAGTCCAGTCTGTGTGTCTAACTGTTCTGTAACAGTACTTACAACGTATCGTGGGGGCATTGAAGTCAGAAATTGAAAATCTTTTCCTAGGCTTATTAAATTGGGTTGCGGTCACAAAAACGACGCTATTTGTGGTCGTATGTGTCGAATATCAGAGGTAATCGAAGTCCACGAAACGCGGGGGAGTCAAGGTCCTTCGGTCATCGTCGTTCTAGTTGCACGCTCTTAAATGAGTAGGGCGTACGTTGGTCTAGATTTGGATGTCGATGCAATCGATGGTAAACACTTTAGACTCGCTGGAGAGGCTATACTCACGCGCGCTCCGGCGGTTCAAGCCCAGGGCAAGCATACACGCGTCGGATACCGCTACGCCAGTCGAGTAACTCACCGCACGACGGCAGGAGGATGTGCCTGGTTGTATGGTAGCTGGAATATGATGAGAAAAAGATAGTTTCTGGCCATAAGGCTCCCATTGCTACATCGCGAGCACCGATGCCGCTCATCCGTAATGCAGTATTTCTCTTTTTGACTTAGGTGATAAATCTAGACGGCTTACTAAGCGGGGGGGTTCACCCTTAATCAATAAGTTTCCTCTAGCATGCCCATTTGCCGTGCTTGTTGCGTGATCTTATTTATAGGGTATACCAGGCACTGACCCCGGCAGGGGCATATCGCTGCAGATT"
counts = count_nucleotides(dna_sequence)
print(' '.join([str(val) for key, val in counts.items()]))
# Output: {'A': 4, 'T': 3, 'G': 1, 'C': 2}


261 269 234 221


#### Transcribing DNA into RNA - ROSALIND 2

In [12]:
dna_seq = "CTCCGAAGCCGGCGGCTGAACCTCTAATTTTTTCTTATTATGTGGGTTCGAGACGTCCGCTCAGGCCCAGACTATATAGTTGTCAGGTTACAAAGTGGATTGGCATGCTCACACGGATAACGCGGAAGGATTCACAGTATGGTACTTGGTAGTCTAACGCGTTTGCTCCCGGCCGCCCCGGCCGACGCATTCGGCTATCTTAGCCTCGTCAGTCTCCACTCTATTAACGTGTCAGAAGAGAGCTCTACGACTTCCATTTGTCGGCCCATCTCCGGGCACGTCGATTTCACGTACGCTGCCACTTAACAGTAGTAAGATCATATGAAGCGTGAAAAATGTCTTGCTTTTTGGATCTGAGAGTAACGCATACTACGCATCTATTTTGATCGCTACGGGCACCAACATTTCGGGGGTAGCATTGACGTGAAACGCCCGCCCCGTATTCCCACGAGCTGGGCTGTGCTACGTCACCACAAAGAAGTCATAGATGCGAACTAACTCAAATCGTGGTTGGACTCAAATAAGGCAGAGGAGATCGGTCACGAGTCTCTTTATGACCCTACACATTTCGAATAAACCAAGAGACGATGGGTCGCGTGACGCAGGTTAGCGCGTTTAGCACATGATGCGCCCTTGCATGCCTAATATGTGTCTCCCGTTTTGTTTCCCCTGAACTTGCAATGTCGACATTTGCGTAACCCCCATCTCCTCTGTAGTAGTCCGAGACCTTATACGGCACTCCCGAACATAGCTCTGTAGTCGTAAAGCGTGCGCATCCTTGCTAAATGACGCCCCACACAGGTAAGTATCTGGAGCAAAGCCAAGGTACGCCTCAAAATTACCGTTCGAGTGGTATGTAATTTGAAGTGGGACATACGCAAAAAATCGCGTAAAACACGA"
rna_seq = dna_seq.replace('T', 'U')
print("RNA Sequence:", rna_seq)

RNA Sequence: CUCCGAAGCCGGCGGCUGAACCUCUAAUUUUUUCUUAUUAUGUGGGUUCGAGACGUCCGCUCAGGCCCAGACUAUAUAGUUGUCAGGUUACAAAGUGGAUUGGCAUGCUCACACGGAUAACGCGGAAGGAUUCACAGUAUGGUACUUGGUAGUCUAACGCGUUUGCUCCCGGCCGCCCCGGCCGACGCAUUCGGCUAUCUUAGCCUCGUCAGUCUCCACUCUAUUAACGUGUCAGAAGAGAGCUCUACGACUUCCAUUUGUCGGCCCAUCUCCGGGCACGUCGAUUUCACGUACGCUGCCACUUAACAGUAGUAAGAUCAUAUGAAGCGUGAAAAAUGUCUUGCUUUUUGGAUCUGAGAGUAACGCAUACUACGCAUCUAUUUUGAUCGCUACGGGCACCAACAUUUCGGGGGUAGCAUUGACGUGAAACGCCCGCCCCGUAUUCCCACGAGCUGGGCUGUGCUACGUCACCACAAAGAAGUCAUAGAUGCGAACUAACUCAAAUCGUGGUUGGACUCAAAUAAGGCAGAGGAGAUCGGUCACGAGUCUCUUUAUGACCCUACACAUUUCGAAUAAACCAAGAGACGAUGGGUCGCGUGACGCAGGUUAGCGCGUUUAGCACAUGAUGCGCCCUUGCAUGCCUAAUAUGUGUCUCCCGUUUUGUUUCCCCUGAACUUGCAAUGUCGACAUUUGCGUAACCCCCAUCUCCUCUGUAGUAGUCCGAGACCUUAUACGGCACUCCCGAACAUAGCUCUGUAGUCGUAAAGCGUGCGCAUCCUUGCUAAAUGACGCCCCACACAGGUAAGUAUCUGGAGCAAAGCCAAGGUACGCCUCAAAAUUACCGUUCGAGUGGUAUGUAAUUUGAAGUGGGACAUACGCAAAAAAUCGCGUAAAACACGA


#### Complementing a Strand of DNA- ROSALIND 3

In [21]:
# Complement the DNA sequence
dna_seq = "CAGGAGTATACGGTATAGTCAGGTCGAGAGGGGAACATATTGGGCCTCTCCGCTGCTATAGATAACCCCAACCCGTCATTAACGCGCTTACGGTTCTCTATTGCGGGTTCTGGTAGTTGCGCTCATCCAAAACCAGGACATCGTATACTCAACTCATATGTATACTATTTGGTTGCGCACGCCGATAGGCGCCAATTAATTTTGAGACTAGGTGTTAAGACGATGGGTTGGCACCAGGCCTTTCCCATACGTGCGGGTGGTGGGTTTCTCTGTTAGTCATTGGCCCGCAGCGAACAATGTACTATGAGCGATAACCGGTTTTGGAATTCGGCCTGCCCGTTCCATTGAGCTTTAAAGATTATCCCCCGCGGGGTCGAAAGGAAGAAAGGTGTTTAGACCCTCAACATCAAAAGCCCTCGACAGTGAGAGGGAGGGAGAGCACTTACAGTCCAAACGATAGAGTCTCGGGACGAAGTATTGGAATTATCACGTTTCCACATATGTGAGCGAAAAGAAGTCCACGTGCGGTATAACTTGTCGCGAACCAAGCATCCGAGTTGTCTTGATCCCGGGTTCAAAGGACTATATCCCCGTCAAAGGCACCGGATCACTGGCGCTCGTTGACCGTAAAGCTCTATCTGTTGGAGATAAGGATTCTCCAGACCATATCGGCGATTTCAGGGCGCTGCTTTCTGCACTCAGTCCTAGTGTCATCTGGCATCTCCTTTGGTCCCAAGCCAATCGATCCCTGTAGACAATCCCAAGTCCACCAGACCGTTTTATCCTGGACCATAGTGGACTATTGTGGGGTGAGGCACACCAGACATTCACCTATAGAGCATGCGCACGGCGAGT"
complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
complement_seq = ''.join(complement_dict[base] for base in dna_seq)
#print("Complement Sequence:", complement_seq)

# Reverse the DNA sequence
reverse_seq = complement_seq[::-1]
print("Reverse Sequence:", reverse_seq)

Reverse Sequence: ACTCGCCGTGCGCATGCTCTATAGGTGAATGTCTGGTGTGCCTCACCCCACAATAGTCCACTATGGTCCAGGATAAAACGGTCTGGTGGACTTGGGATTGTCTACAGGGATCGATTGGCTTGGGACCAAAGGAGATGCCAGATGACACTAGGACTGAGTGCAGAAAGCAGCGCCCTGAAATCGCCGATATGGTCTGGAGAATCCTTATCTCCAACAGATAGAGCTTTACGGTCAACGAGCGCCAGTGATCCGGTGCCTTTGACGGGGATATAGTCCTTTGAACCCGGGATCAAGACAACTCGGATGCTTGGTTCGCGACAAGTTATACCGCACGTGGACTTCTTTTCGCTCACATATGTGGAAACGTGATAATTCCAATACTTCGTCCCGAGACTCTATCGTTTGGACTGTAAGTGCTCTCCCTCCCTCTCACTGTCGAGGGCTTTTGATGTTGAGGGTCTAAACACCTTTCTTCCTTTCGACCCCGCGGGGGATAATCTTTAAAGCTCAATGGAACGGGCAGGCCGAATTCCAAAACCGGTTATCGCTCATAGTACATTGTTCGCTGCGGGCCAATGACTAACAGAGAAACCCACCACCCGCACGTATGGGAAAGGCCTGGTGCCAACCCATCGTCTTAACACCTAGTCTCAAAATTAATTGGCGCCTATCGGCGTGCGCAACCAAATAGTATACATATGAGTTGAGTATACGATGTCCTGGTTTTGGATGAGCGCAACTACCAGAACCCGCAATAGAGAACCGTAAGCGCGTTAATGACGGGTTGGGGTTATCTATAGCAGCGGAGAGGCCCAATATGTTCCCCTCTCGACCTGACTATACCGTATACTCCTG
