In [1]:
#Regular Expressions

import re #module that deals with regular expressions

#How to search for various regular expressions using this module
#re.search(pattern, string)

#re.search() returns a match object (which counts as true) if the 
#pattern appears somewhere in the string, and None (which counts as false) if it does not.

#The first argument is the pattern that you want to search 
#for, and the second argument is the string that you want 
#to search in.

#Example, searching for a restriction enzyme cut site in a DNA sequence:
# Sequence to scan for the GAATTC cut site.
dna = "ATCGCGAATTCAC"
# re.search() gives back a match object on success and None otherwise.
site_hit = re.search(r"GAATTC", dna)
if site_hit is not None:
    print("restriction site found!")

restriction site found!


In [34]:
#What if a restriction enzyme cut site has two different sequences?

dna = "ATCGCGAATTCAC"
# The two alternative recognition sequences, tried in order.
candidate_sites = (r"GGACC", r"GGTCC")
# any() stops at the first pattern that matches, like a chained "or".
if any(re.search(site, dna) for site in candidate_sites):
    print("restriction site found!")
else:
    # Neither sequence occurs in this dna string.
    print("WHY THO")

WHY THO


In [32]:
#OR we can use alternation!

#To represent a number of different alternatives,
#we write the alternatives inside parentheses 
#separated by a pipe character. 

#2 different characters allowed: GG(A|T)CC

dna = "ATCGCGGACCAC"
# (A|T) accepts either base at the third position of the site.
alternation_hit = re.search(r"GG(A|T)CC", dna)
if alternation_hit is not None:
    print("restriction site found!")


restriction site found!


In [27]:
#Multiple options:
#GC(A|T|G|C)GC
#OR
#GC[ATGC]GC

dna = "ATCGCGAATTCAC"
# [ATGC] is a character class: any single one of the four bases.
any_base_site = re.compile(r"GC[ATGC]GC")
# This particular sequence contains no GC-N-GC stretch, so nothing is printed.
if any_base_site.search(dna) is not None:
    print("restriction site found!")
    
#Different types of quantifiers   
#GAT?C - one or zero T will find GATC and/or GAC
#GGGA+TTT - one or more A's at the 4th position
#GGGA*T - zero or more A's at the 4th position
#GA{5}T - exactly 5 A's at the 2nd position
#GA{2,4}T - A range of 2 to 4 A's at the 2nd position
#A{3,} - 3 or more A's 
#G{,7} - 0 to 7 G's

#Position
# ^ = start of string
#$ = end of string

In [None]:
#Challenge 1
#What are all the different ways you can 
#describe the following sequence
#using regular expressions

#AvaII enzyme = GGACC or GGTCC

#Results: 
#1)GG(A|T)CC
#2)G{2}(A|T)C{2}
#3)GG[AT]CC
#4)(GGACC|GGTCC)
#5)(GGA|GGT)CC

#There are 5...or more total ways to describe this in a regex.


In [None]:
#Challenge 2
#Write a regular expression that can match
#all of the following conditions

#Reading the pattern from left to right, it will match:

#an AUG start codon at the beginning of the sequence
#followed by between 30 and 1000 bases which can be A, U, G or C
#followed by a poly-A tail of between 5 and 10 bases at the end of the sequence

In [2]:
#Often we want to find out not only if a regular expression matched
#our terms, but we want to print out the matching sequence

#Remember the re.search function can be used as a true or false test,
#but more accurately it returns a match object (the result of the regular expression)

import re

dna = "ATCGGGACCAC"
# Collapse the match object (or None) into a plain boolean first.
site_present = bool(re.search(r"GGACC", dna) or re.search(r"GGTCC", dna))
# Then pick the message to show based on that flag.
print("restriction site found!" if site_present else "WHY THO")


restriction site found!


In [5]:
#We can use the group method of the match object to get the text that matched the regular expression

dna = "GGACCATCGGGGTCCC"
# Try the first sequence; fall back to the second only when it is absent.
match = re.search(r"GGACC", dna)
if match is None:
    match = re.search(r"GGTCC", dna)

if match is not None:
    # group(0) is the whole stretch of text that matched the pattern.
    matching_sequence = match.group(0)
    print("restriction site found:")
    print(matching_sequence)
    
    #What happens if the sequence is found twice in your dna sequence?


restriction site found:
GGACC


In [12]:
#What if we want to find multiple patterns?

#using a period in a regular expression = any character

scientific_name = "Homo sapiens"
# Two capture groups separated by a single space: genus first, species second.
match = re.search("(.+) (.+)", scientific_name)

if match is not None:
    # groups() returns every captured group, in order, as a tuple.
    genus, species = match.groups()
    print("genus is ", genus, ",species is ", species, sep="")

genus is Homo,species is sapiens


In [7]:
#Getting match positions

dna = "GGACCATCGGGTCCAC"
m = re.search(r"GGTCC", dna)

if m is not None:
    # start() is the zero-based index where the match begins.
    site_start = m.start()
    print("restriction enzyme site found!", "at position " + str(site_start), sep="\n")
    
    #How do you think you find the end?

restriction enzyme site found!
at position 9


In [21]:
#Multiple Matches

# To process multiple matches, we need to switch to re.finditer(),
# which returns an iterator of match objects that we can process in a loop:

#Getting match positions

dna = "GGACCATCGGGACCAC"
# finditer() yields one match object per non-overlapping occurrence.
matches = re.finditer(r"GGACC", dna)

for m in matches:
    site_start = str(m.start())
    print("restriction enzyme site found!")
    print("at position " + site_start)
    
    
    #How do you think you find the end?
    #What if I want to print out the matching sequence too?

restriction enzyme site found!
at position 0
restriction enzyme site found!
at position 9


In [9]:
#Getting multiple matches as strings

dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"

# Runs of six or more consecutive A/T bases.
matches = re.finditer(r"[AT]{6,}", dna)

# Pull the matched text out of every match object in one pass.
result = [hit.group() for hit in matches]

print(result)

['ATTATAT', 'AAATTATA']


In [22]:
#Or use the findall function - this function takes the pattern and the string as arguments, and 
#returns the matches as a list of strings
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
# Compile the pattern once, then ask it for every matching substring.
at_rich_run = re.compile(r"[AT]{6,}")
result = at_rich_run.findall(dna)
print(result)

['ATTATAT', 'AAATTATA']


In [10]:
#Splitting a string using a regular expression
#use the split function, the first argument is the pattern and the 
#second argument is the string to be split
#notice the bits of string that matched the pattern are excluded 
#from the output. What if we want to 
#add it back?

dna = "GGACCATCGGGACCAC"
# Splitting on the site drops the site itself; a match at the very start
# leaves an empty string as the first piece.
cut_site = re.compile(r"GGACC")
runs = cut_site.split(dna)
print(runs)

['', 'ATCG', 'AC']


In [20]:
#Challenge

#Go to ensembl
#Select Human
#Search for BRCA1
#Select first link - BRCA1 Human Gene
#Click Show Transcript Table
#Select following transcript - BRCA1-203 (Click the Transcript ID value - ENST00000357654.7)
#On the left hand side under Sequence select cDNA
#Scroll down a bit, under the transcript table select Download Sequence Button
#choose FASTA Format
#Deselect all (under settings)
#Select Exons
#Select Download 

import re

# Digest each line of dna.txt at the ACGCGT site and print the fragments.
#
# re.split() removes the matched site from its output, so the bases are added
# back by hand: the cut is placed after the first base (A^CGCGT), meaning the
# upstream fragment regains its trailing "A" and the downstream fragment
# regains its leading "CGCGT".

# Open with a context manager so the file handle is always closed.
with open("dna.txt", "r") as file:
    for line in file:
        mluI = re.split(r"ACGCGT", line)
        # Note: len(mluI) is the number of fragments (sites found + 1).
        # The label is left unchanged so the printed output stays the same.
        print ("Number of cuts:"+str(len(mluI)))
        for sequence in mluI:
            print ("SEQUENCE:"+str(len(sequence)))

        # Restore the bases lost at each cut. This works for any number of
        # sites on the line: with no site the line is printed unchanged
        # (indexing mluI[1] directly would raise IndexError), and with
        # several sites every fragment is printed, not only the first two.
        last_index = len(mluI) - 1
        for index, sequence in enumerate(mluI):
            leading = "CGCGT" if index > 0 else ""
            trailing = "A" if index < last_index else ""
            print(leading + sequence + trailing)

Number of cuts:2
SEQUENCE:482
SEQUENCE:1525
ATGGCAATAACCCCCCGTTTCTACTTCTAGAGGAGAAAAGTATTGACATGAGCGCTCCCGGCACAAGGGCCAAAGAAGTCTCCAATTTCTTATTTCCGAATGACATGCGTCTCCTTGCGGGTAAATCACCGACCGCAATTCATAGAAGCCTGGGGGAACAGATAGGTCTAATTAGCTTAAGAGAGTAAATCCTGGGATCATTCAGTAGTAACCATAAACTTACGCTGGGGCTTCTTCGGCGGATTTTTACAGTTACCAACCAGGAGATTTGAAGTAAATCAGTTGAGGATTTAGCCGCGCTATCCGGTAATCTCCAAATTAAAACATACCGTTCCATGAAGGCTAGAATTACTTACCGGCCTTTTCCATGCCTGCGCTATACCCCCCCACTCTCCCGCTTATCCGTCCGAGCGGAGGCAGTGCGATCCTCCGTTAAGATATTCTTACGTGTGACGTAGCTATGTATTTTGCAGAGCTGGCGAA
CGCGTTGAACACTTCACAGATGGTAGGGATTCGGGTAAAGGGCGTATAATTGGGGACTAACATAGGCGTAGACTACGATGGCGCCAACTCAATCGCAGCTCGAGCGCCCTGAATAACGTACTCATCTCAACTCATTCTCGGCAATCTACCGAGCGACTCGATTATCAACGGCTGTCTAGCAGTTCTAATCTTTTGCCAGCATCGTAATAGCCTCCAAGAGATTGATGATAGCTATCGGCACAGAACTGAGACGGCGCCGATGGATAGCGGACTTTCGGTCAACCACAATTCCCCACGGGACAGGTCCTGCGGTGCGCATCACTCTGAATGTACAAGCAACCCAAGTGGGCCGAGCCTGGACTCAGCTGGTTCCTGCGTGAGCTCGAGACTCGGGATGACAGCTCTTTAAACATAGAGCGGGGGCGTCGAACGGTCGAGAAAGTCATAGTACCTCGGGTACCAACTTACTCAG