In [43]:
import re

# What is r"" (a Python "raw string")?
# A raw string is simply one of the ways to write a string literal in Python source code.
# Inside a raw string (r"...") a backslash is kept literally instead of starting an
# escape sequence.

print(r"foo" == "foo")
print("foo\nbar")
print(r"foo\nbar")
print("-----------------------------")


s = "qwerty"
# Backslash + w is not a recognised Python escape sequence, so writing it with a single
# backslash in a normal literal only works by accident and triggers an invalid-escape
# warning on current Python versions. The backslash is therefore doubled in the
# non-raw spelling; both literals below are the same four-character string.
print(re.findall("[\\w]", s))
print(re.findall(r"[\w]", s))
print("-----------------------------")
# What happened: in both spellings the regex engine received the four characters
# "[", backslash, "w", "]". It is the regex engine (not Python) that gives
# backslash-w its "word character" meaning. The raw string simply lets us type the
# backslash once.

# The subject string contains one real backslash: q, \, w, e, r, t, y.
s = "q\\werty"
print(re.findall(r"[\\w]", s))
print(re.findall(r"[\\\w]", s))
print(re.findall(r"[\\\\w]", s))
print(re.findall(r"[\\\\\w]", s))
# see tutorial :) -> Further we will use r"" syntax with re module.

True
foo
bar
foo\nbar
-----------------------------
['q', 'w', 'e', 'r', 't', 'y']
['q', 'w', 'e', 'r', 't', 'y']
-----------------------------
['\\', 'w']
['q', '\\', 'w', 'e', 'r', 't', 'y']
['\\', 'w']
['q', '\\', 'w', 'e', 'r', 't', 'y']


In [None]:
'''
Lecture 1 
Python RegEx

A RegEx, or Regular Expression, is a sequence of characters that forms a search pattern.
RegEx can be used to check if a string contains the specified search pattern.
'''

import re 

# The four `re` functions covered in this lecture:
# findall   returns a list containing all matches
# split     returns a list where the string has been split at each match
# search    returns a Match object if there is a match anywhere in the string
# sub       replaces one or many matches with a string

# Sample text searched by the cells below.
txt = "Python applications in bioinformatics"

In [44]:
# findall: returns a list containing all matches (an empty list when there are none)

x1 = re.findall(r"ai", txt)
x2 = re.findall(r"io", txt)

print(x1)
print(x2)
print(len(x1), len(x2))

[]
['io', 'io']
0 2


In [45]:
# Greedy \w+ runs to the end of each word and backtracks to the last "ti" in it,
# so every match ends in "ti".
re.findall(r"\w+ti", txt)

['applicati', 'bioinformati']

In [46]:
# \b is a word boundary, so this only finds an "s" that ends a word.
re.findall(r"s\b", txt)

['s', 's']

In [47]:
print(txt)

Python applications in bioinformatics


In [48]:
# split  Returns a list where the string has been split at each match
# The separator here is a word-final "s"; txt itself ends with one, so the last
# element of the result is an empty string.

re.split(r"s\b", txt)

['Python application', ' in bioinformatic', '']

In [49]:
print(txt)

Python applications in bioinformatics


In [50]:
# search    Returns a Match object if there is a match anywhere in the string,
#           or None when nothing matches.

#             The Match object has properties and methods used to retrieve information about the search
#             and the result:
#                 .start()  returns the index where the match begins
#                 .end()    returns the index just past the end of the match
#                 .span()   returns a tuple containing the start and end positions of the match
#                 .string   returns the string passed into the function
#                 .group()  returns the part of the string where there was a match

# Match object:
print(txt)
print(re.search(r"^Python.*bioinformatics$", txt)) 
print(re.search(r"t", txt))
print(re.search(r"w", txt))

Python applications in bioinformatics
<re.Match object; span=(0, 37), match='Python applications in bioinformatics'>
<re.Match object; span=(2, 3), match='t'>
None


In [51]:
# Matching is case-sensitive by default and txt contains no capital "S",
# so search returns None.
x = re.search(r"\bS\w+", txt)
print(x)

None


In [53]:
# r"\bP" (boundary before the P) would match at the start of txt, whereas
# r"P\b" needs a boundary right after the P - but the "P" in txt is followed by "y".
# x = re.search(r"\bP", txt)
x = re.search(r"P\b", txt) # None

print(x)

None


In [None]:
# How this `None` can be used in an `if` statement:

In [54]:
# A failed search returns None, which is falsy, so nothing is printed when the
# site is absent from the sequence.
dna = "ATCGCGAATTCAC"
if re.search(r"ATACA", dna) is not None:
    print("restriction site found!")

In [55]:
# Inside a character class "|" is a literal pipe, not alternation: [A|T] would
# also accept "AT|CA". [AT] matches exactly one A or one T, which is what is meant.
dna = "ATCGCGAATTCAC"
site_pattern = r"AT[AT]CA"
if re.search(site_pattern, dna):
    print("restriction site found!")

restriction site found!


In [56]:
# [^ATGC] matches the first character that is not one of the four listed bases.
dna = "CGATNCGGAACGATC"
m = re.search(r"[^ATGC]", dna)

if m is not None:
    ambig = m.group()
    print("ambiguous base found!")
    print(f"the base is {ambig}")

ambiguous base found!
the base is N


In [57]:
print(txt)

Python applications in bioinformatics


In [58]:
# Find the first word starting with "P", then show each accessor of the Match
# object in turn: span, start, end, matched text, and the searched string.
x = re.search(r"\bP\w+", txt)
for detail in (x.span(), x.start(), x.end(), x.group(), x.string):
    print(detail)

(0, 6)
0
6
Python
Python applications in bioinformatics


In [59]:
# ^\D? matches at most one non-digit character at the very start of the string.
x = re.search(r"^\D?", txt)
print(x.span(), x.start(), x.end(), x.group(), x.string, sep="\n")

(0, 1)
0
1
P
Python applications in bioinformatics


In [60]:
# search returns None when nothing matches, and None has none of the Match
# attributes. The resulting AttributeError is caught and displayed so that the
# notebook still runs from top to bottom.
x = re.search(r"fffffff", txt)
print(x)
try:
    print(x.string)
except AttributeError as err:
    print(f"AttributeError: {err}")

None


AttributeError: 'NoneType' object has no attribute 'string'

In [None]:
# A few more exercises:

In [61]:
print(txt)

Python applications in bioinformatics


In [62]:
# \S+? lazily takes as few non-space characters as possible, (.) captures one
# character and \1 requires that same character again, i.e. a doubled letter.
# \d* then allows optional trailing digits.
x = re.search(r"\S+?(.)\1\d*", txt)
print(x.group())

app


In [65]:
# Same backreference search as in the previous cell: lazy prefix, then a doubled character.
x = re.search(r"\S+?(.)\1\d*", txt)
print(x.group())

app


In [66]:
x = re.search(r"\bP\w+", txt)
print(x.group())

Python


In [67]:
# Greedy: \w+ consumes as many word characters as it can after "ti".
x = re.search(r"ti\w+", txt)
print(x.group())

tions


In [68]:
# Lazy: \w+? stops after the minimum of one word character following "ti".
x = re.search(r"ti\w+?", txt)
print(x.group())

tio


In [69]:
# Another way to call these functions:
#
# A pattern can be compiled (converted to an internal representation) to
# speed up the search. This step is not mandatory but recommended for large
# amounts of text.
#
# Below, findall is called first with a plain pattern string and then through
# the compiled version of the same pattern.

greeting = "Hello world, hello Python!"
print(re.findall(r"[Hh]ello", greeting))

rgx = re.compile(r"[Hh]ello")
print(rgx)
print(rgx.findall(greeting))  # compiled patterns have all the methods available

['Hello', 'hello']
re.compile('[Hh]ello')
['Hello', 'hello']


In [70]:
# sub       Replaces one or many matches with a string
# Here every run of three or more consecutive "GC" repeats is replaced by the
# empty string, i.e. removed.
re.sub(r"(GC){3,}", r"", "ATGCGCGCTA")

'ATTA'

In [71]:
# Same substitution as a compiled pattern: strip runs of three or more "GC".
seq = "ATGCGCGCTA"
regex = re.compile(r"(GC){3,}")
stripped = regex.sub(r"", seq)
print("Before:", seq)
print("After:", stripped)

Before: ATGCGCGCTA
After: ATTA


In [73]:
# A group can be given a name with (?P<name>...) and then fetched by that name
# from the Match object.

rgx = re.compile(r"(?P<TBX>TATA..).*(?P<CGisland>(GC){3,})")
seq = "ATATAAGATGCGCGCGCTTATGCGCGCA"
result = rgx.search(seq)
for label in ("TBX", "CGisland"):
    print(result.group(label))

TATAAG
GCGCGC


In [74]:
# Named groups are still numbered by position, so the same text is available by index.
print(result.group(1))
print(result.group(2))

TATAAG
GCGCGC


In [None]:
# LOOKAROUNDS

In [76]:
f = "foobarbarfoo"

In [77]:
# positive lookahead
# (?=bar) requires "bar" to follow, but does not include it in the match.

x = re.search(r"bar(?=bar)", f)
print(x)

<re.Match object; span=(3, 6), match='bar'>


In [78]:
# negative lookahead
# (?!bar) requires that "bar" does NOT follow the matched text.

x = re.search(r"bar(?!bar)", f)
print(x)

<re.Match object; span=(6, 9), match='bar'>


In [79]:
# positive lookbehind
# (?<=foo) requires "foo" immediately before the match, without including it.

x = re.search(r"(?<=foo)bar", f)
print(x)

<re.Match object; span=(3, 6), match='bar'>


In [80]:
# negative lookbehind
# (?<!foo) requires that "foo" does NOT come immediately before the match.

x = re.search(r"(?<!foo)bar", f)
print(x)

<re.Match object; span=(6, 9), match='bar'>
