In [2]:
#Regular Expressions
# Modules in Python - each collection of specialized tools (really just a collection of specialized functions and data types) is called a module
# we need to explicitly load each module of specialized tools that we want to use inside our program
# To load a module use the 'import' statement
# regular expressions module = 're'
import re
#re.search(pattern, string)


In [3]:
# Raw strings
# Regular expressions use a lot of special characters, many of them written with a backslash.
# In an ordinary string, \n means 'start a new line' and \t means 'insert a tab'.
# Putting the letter 'r' immediately before the opening quote makes the string 'raw':
# backslashes are kept as literal characters instead of being treated as escape sequences.
# 'r' stands for 'raw', and it goes outside the quotes.
raw_text = r"\t\n"
print(raw_text)

\t\n


In [4]:
# Searching for a pattern in a string
# re.search returns a match object when the pattern is found and None otherwise,
# so its result can be used directly as a true/false test.
# Here we look for the EcoRI restriction site (GAATTC):
dna = "ATCGCGAATTCAC"
ecori_hit = re.search(r"GAATTC", dna)
if ecori_hit is not None:
    print("restriction site found!")

restriction site found!


In [7]:
# Alternation
# The AvaII restriction site has two different motifs: GGACC and GGTCC.
dna = "ATCGCGAATTCAC"

# First approach: test for each motif separately and combine the results.
avaii_motifs = (r"GGACC", r"GGTCC")
if any(re.search(motif, dna) for motif in avaii_motifs):
    print("restriction site found!")

# Second approach: capture the variation in the AvaII site with a single regular expression.
# This uses the alternation feature of regular expressions: inside the parentheses we write
# the alternatives separated by a pipe character, which means either/or.
either_motif = re.search(r"GG(A|T)CC", dna)
if either_motif is not None:
    print("restriction site found!")

In [8]:
# Character groups
dna = "ATCGCGAATTCAC"

# Alternation can spell out every base that is allowed in the middle position...
spelled_out = re.search(r"GC(A|T|G|C)GC", dna)
if spelled_out is not None:
    print("restriction site found!")

# ...but square brackets express the same set of alternatives more compactly.
bracketed = re.search(r"GC[ATGC]GC", dna)
if bracketed is not None:
    print("restriction site found!")

# If we want a position in a pattern to match any character in the input, we can use a period
# (by default a period matches any character except a newline).
# However, a period will also match characters which aren't DNA bases.
# Sometimes it is easier to specify which characters we don't want,
# by putting a caret ^ at the start of a character group.

In [9]:
# Quantifiers - let us describe variation in the number of times a section of a pattern is repeated
# a question mark ? immediately following a character means that the character is optional: it can be there zero or one times
# to apply the question mark to multiple characters, we can put those characters in parentheses
# a plus sign + immediately following a character or group means that the character or group must be present but can be repeated any number of times - one or more
# an asterisk * immediately following a character or group means that the character or group is optional but can also be repeated - zero or more times
# If we want to specify a specific number of repeats, we can use curly brackets {}
# a single number gives an exact count: GA{5}T will match GAAAAAT but not GAAT
# a pair of numbers gives an inclusive range: GA{2,4}T will match GAAT, GAAAT, and GAAAAT


In [10]:
# Positions
# Some special characters represent positions in a string
# a caret ^ matches the start of a string, and a $ matches the end of a string
# ^AAA, means the string starts with AAA, and GGG$ means the string ends with GGG

In [11]:
#Combining
#We can use different quantifiers together with alternations and character groups to specify very flexible patterns
# example - identify full-length eukaryotic messenger RNA sequences:

#^ATG[ATGC]{30,1000}A{5,10}$
# re.search will identify a pattern occurring 'anywhere' in the string
# re.match will only identify a pattern if it matches at the 'start' of the string; use re.fullmatch to require that the pattern matches the 'entire' string

In [12]:
# Extracting the part of the string that matched
# Often we want to know not only whether a pattern matched, but also which part of the input it matched.
# To do that, keep the object returned by the search and call its 'group' method,
# which returns the portion of the input string that matched the pattern.
dna = "ATGACGTACGTACGACTG"
motif = re.compile(r"GA[ATGC]{3}AC")
# keep the match object in the variable m
m = motif.search(dna)
# group 0 is the whole match, the same as calling group() with no argument
matched_text = m.group(0)
print(matched_text)

GACGTAC


In [13]:
dna = "ATGACGTACGTACGACTG"
# Parentheses in the pattern create numbered capture groups;
# keep the match object in the variable m so each group can be read back.
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
# group(0) is the entire match; group(1) and group(2) are the two captured pieces.
# Passing several group numbers at once returns them together as a tuple.
labels = ("entire match: ", "first bit: ", "second bit: ")
for label, piece in zip(labels, m.group(0, 1, 2)):
    print(label + piece)

entire match: GACGTACGTAC
first bit: CGT
second bit: GT


In [14]:
# Getting the position of a match
# Remember that we start counting from zero, so in this case the match
# starting at the 3rd base has a start position of 2.
dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
# span() returns the (start, end) pair in one call; end is the index just past the match
match_start, match_end = m.span()
print("start: " + str(match_start))
print("end: " + str(match_end))

start: 2
end: 13


In [15]:
# We can get the start and end position of individual groups by supplying
# the group number as the argument (here via span, which returns both at once).
dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)

# whole match
whole_start, whole_end = m.span()
print("start: " + str(whole_start))
print("end: " + str(whole_end))

# first capture group
group_one_start, group_one_end = m.span(1)
print("group one start: " + str(group_one_start))
print("group one end: " + str(group_one_end))

# second capture group
group_two_start, group_two_end = m.span(2)
print("group two start: " + str(group_two_start))
print("group two end: " + str(group_two_end))

start: 2
end: 13
group one start: 4
group one end: 7
group two start: 9
group two end: 11


In [16]:
# Splitting a string using a regular expression
# Extract all runs of contiguous unambiguous bases by splitting the sequence
# wherever a character that is not A, T, G or C appears.
dna = "ACTNGCATRGCTACGTYACGATSCGAWTCG"
ambiguous_base = re.compile(r"[^ATGC]")
runs = ambiguous_base.split(dna)
print(runs)

['ACT', 'GCAT', 'GCTACGT', 'ACGAT', 'CGA', 'TCG']


In [17]:
# Finding multiple matches
# findall returns a list of all matches of a pattern in a string;
# here the pattern describes runs of at least four A/T bases.
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
at_rich = re.compile(r"[AT]{4,100}")
runs = at_rich.findall(dna)
print(runs)

['ATTATAT', 'AAATTATA']


In [18]:
# re.findall (used above) does not return match objects - it returns a list of strings.
# To do more complex things with each match, use re.finditer, which returns a
# sequence (an iterator) of match objects that we process one at a time in a loop.
# Fix: the sequence is assigned to 'dna' (it was mistyped as 'na'), so this cell
# searches its own sequence instead of whatever 'dna' an earlier cell left behind.
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
runs = re.finditer(r"[AT]{3,100}", dna)
for match in runs:
    # start() is the index of the first matched base; end() is the index just past the last one
    run_start = match.start()
    run_end = match.end()
    print("AT rich region from " + str(run_start) + " to " + str(run_end))

AT rich region from 5 to 12
AT rich region from 18 to 26
