In [1]:
### Chapter 7

In [3]:
## Within chapter exercises

# Import regular expressions module
import re

# Signature of the search function in the re module:
#   re.search(pattern, string)

# Escape sequences in ordinary string literals:
#   \n = newline
#   \t = tab

# Prefixing the opening quotation mark with "r" (raw) keeps the
# backslash sequences as written instead of interpreting them
raw_text = r"\t\n"
print(raw_text)

\t\n


In [4]:
## Searching for a pattern in a string
# re.search returns a match object (truthy) when the pattern occurs
# anywhere in the string, and None (falsy) when it does not
dna = "ATCGCGAATTCAC"
if re.compile(r"GAATTC").search(dna) is not None:
    print("restriction site found!")


restriction site found!


In [5]:
## Searching for two patterns in a string
dna = "ATCGCGAATTCAC"
# Report a hit as soon as either literal site occurs in the sequence
if any(re.search(site, dna) for site in (r"GGACC", r"GGTCC")):
    print("restriction site found!")

## Simplified version
# The alternation (A|T) accepts either A or T as the third character
dna = "ATCGCGAATTCAC"
if re.search(r"GG(A|T)CC", dna) is not None:
    print("restriction site found!")

In [None]:
## Character groups
# Expanded version: alternation spelling out every allowed base
dna = "ATCGCGAATTCAC"
if re.search(r"GC(A|T|G|C)GC", dna) is not None:
    print("restriction site found!")

## Simplified version
# A bracketed character class lists all allowed characters at once
dna = "ATCGCGAATTCAC"
if re.compile(r"GC[ATGC]GC").search(dna):
    print("restriction site found!")

## Include any character
# A period matches any single character except a newline (e.g., A or %)
dna = "ATCGCGAATTCAC"
if re.search(r"GC.GC", dna) is not None:
    print("restriction site found!")

## Exclude certain characters (e.g., . and :)
# A leading ^ inside the brackets negates the character class
dna = "ATCGCGAATTCAC"
if re.compile(r"GC[^.:]GC").search(dna):
    print("restriction site found!")

In [None]:
## Quantifiers
# Describes variation in the number of times
# a section of a pattern is repeated

# ? after character "A?"     = character is optional
# ? after characters "(AA)?" = characters are optional

# {#} after character        = character is repeated # times
# {#,#} after character      = character is repeated between # and # times

## Positions
# ^ before character        = matches the start of a string
# $ after character         = matches the end of a string

In [8]:
## Extracting the part of the string that matched
# Example 1
dna = "ATGACGTACGTACGACTG"
# GA, then exactly three bases, then AC
m = re.compile(r"GA[ATGC]{3}AC").search(dna)
# Group 0 is the text of the entire match
print(m.group(0))

GACGTAC


In [10]:
## Extracting multiple parts of a string that matched
# Example 2
dna = "ATGACGTACGTACGACTG"
# Each pair of parentheses captures one part of the match
m = re.compile(r"GA([ATGC]{3})AC([ATGC]{2})AC").search(dna)
# Group 0 is the entire match
print("entire match: ", m.group(0), sep="")
# Captured parts are numbered from 1, left to right
print("first bit: ", m.group(1), sep="")
print("second bit: ", m.group(2), sep="")

entire match: GACGTACGTAC
first bit: CGT
second bit: GT


In [12]:
## Getting the position of a match
# Example 1
dna = "ATGACGTACGTACGACTG"
m = re.compile(r"GA([ATGC]{3})AC([ATGC]{2})AC").search(dna)
# span() returns the (start, end) pair of the entire match;
# the end index is one past the last matched character
print("start: " + str(m.span()[0]))
print("end: " + str(m.span()[1]))

start: 2
end: 13


In [14]:
## Getting the position of each part of a match
# Example 2
dna = "ATGACGTACGTACGACTG"
m = re.compile(r"GA([ATGC]{3})AC([ATGC]{2})AC").search(dna)
# Group 0 is the entire match
print("start: " + str(m.start(0)))
print("end: " + str(m.end(0)))
# span(n) returns the (start, end) pair of captured group n
print("group one start: " + str(m.span(1)[0]))
print("group one end: " + str(m.span(1)[1]))
print("group two start: " + str(m.span(2)[0]))
print("group two end: " + str(m.span(2)[1]))

start: 2
end: 13
group one start: 4
group one end: 7
group two start: 9
group two end: 11


In [15]:
## Splitting a string using a regular expression
# Split the sequence wherever there is an ambiguous base,
# i.e. any character that is not A, T, G or C
dna = "ACTNGCATRGCTACGTYACGATSCGAWTCG"
# Signature: re.split(pattern, string)
runs = re.compile(r"[^ATGC]").split(dna)
print(runs)

['ACT', 'GCAT', 'GCTACGT', 'ACGAT', 'CGA', 'TCG']


In [16]:
## Finding multiple matches
# re.findall = returns a list of all matches of a pattern in a string
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
# Find every run of A/T bases that is between 4 and 100 characters long
runs = re.compile(r"[AT]{4,100}").findall(dna)
print(runs)

['ATTATAT', 'AAATTATA']


In [18]:
## Finding multiple matches
# re.finditer = returns an iterator of match objects
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
runs = re.compile(r"[AT]{3,100}").finditer(dna)
# Report the start and end position of every match
for match in runs:
    run_start, run_end = match.span()
    print("AT rich region from " + str(run_start) + " to " + str(run_end))

AT rich region from 5 to 12
AT rich region from 18 to 26


In [27]:
### End of chapter exercises
## Accession names

# Accession names shared by the exercises below
names = [
    "xkn59438",
    "yhdck2",
    "eihd39d9",
    "chdsye847",
    "hedle3455",
    "xjhd53e",
    "45da",
    "de37dp",
]

# 1. Contains 5
for accession in filter(lambda name: re.search(r"5", name), names):
    print(accession)

xkn59438
hedle3455
xjhd53e
45da


In [31]:
# 2. Contains d or e
for accession in names:
    # Skip names in which neither letter occurs
    if re.search(r"d|e", accession) is None:
        continue
    print(accession)

yhdck2
eihd39d9
chdsye847
hedle3455
xjhd53e
45da
de37dp


In [32]:
# 3. Contains d and e in order
# .* allows any number of characters (including none) between d and e
for accession in [name for name in names if re.search(r"d.*e", name)]:
    print(accession)

chdsye847
hedle3455
xjhd53e
de37dp


In [33]:
# 4. Contains d and e in order, with a single letter between them
# A period would accept ANY character between d and e (e.g. the digit
# in "d3e"), so the middle position is restricted to letters explicitly
for pattern in names:
    if re.search(r"d[A-Za-z]e", pattern):
        print(pattern)

hedle3455


In [35]:
# 5. Contains d and e in any order
# Either "d ... e" or "e ... d" is enough
for accession in names:
    if any(re.search(ordering, accession) for ordering in (r"d.*e", r"e.*d")):
        print(accession)

eihd39d9
chdsye847
hedle3455
xjhd53e
de37dp


In [36]:
# 6. Starts with x or y
# ^ anchors the alternation to the beginning of the name
for accession in filter(lambda name: re.search(r"^(x|y)", name), names):
    print(accession)

xkn59438
yhdck2
xjhd53e


In [44]:
# 7. Starts with x or y and ends with e
# ^ anchors the first character, $ anchors the final e
for accession in filter(re.compile(r"^(x|y).*e$").search, names):
    print(accession)

xjhd53e


In [48]:
# 8. Contains three digits in a row
# Inside a character class a comma is an ordinary member, so a
# comma-separated list of digits would also accept "," itself (e.g. "1,2");
# the range 0-9 matches digits only
for pattern in names:
    if re.search(r"[0-9]{3}", pattern):
        print(pattern)

xkn59438
chdsye847
hedle3455


In [47]:
# 9. Ends with d followed by either a, r, or p
# Members of a character class are listed without separators; a comma
# inside the brackets would itself be accepted as the final character
for pattern in names:
    if re.search(r"d[arp]$", pattern):
        print(pattern)

45da
de37dp


In [79]:
## Double digest
# Read the sequence from disk; the with-block closes the file handle
# as soon as the text has been read
with open("chp_07_dna.txt") as dna_file:
    dna = dna_file.read().rstrip("\n")

## Restriction enzymes (asterisks indicate cut site)
# abci  = ANT*AAT = A[ATCG]TAAT   (cut falls 3 bases after the match start)
# abcii = GCRW*TG = GC[AG][AT]TG  (cut falls 4 bases after the match start)
enzymes = [
    (r"A[ATCG]TAAT", 3),
    (r"GC[AG][AT]TG", 4),
]

# Length of the dna sequence; also the end point of the last fragment
dna_length = len(dna)

# Fragment boundaries: both ends of the sequence plus every cut position.
# NOTE: re.finditer yields non-overlapping matches only, so a site that
# overlaps the previous match of the same enzyme is not reported.
cuts = [0, dna_length]
for site_pattern, cut_offset in enzymes:
    for site in re.finditer(site_pattern, dna):
        cuts.append(site.start() + cut_offset)

# Sort the boundaries and drop duplicates so that every boundary is
# listed once and no zero-length fragment can appear
sorted_cuts = sorted(set(cuts))

# Print the size of each fragment between consecutive boundaries
for previous_position, current_position in zip(sorted_cuts, sorted_cuts[1:]):
    fragment_size = current_position - previous_position
    print("Fragment size = " + str(fragment_size))

Fragment size = 488
Fragment size = 655
Fragment size = 434
Fragment size = 51
Fragment size = 384
