### Regex

In [2]:
import re

In [3]:
def find(regex, text, option="match"):
    """Run one method of a compiled pattern against ``text`` and print the result.

    Parameters
    ----------
    regex : str
        Pattern source handed to ``re.compile``.
    text : str
        String passed as the only argument to the selected pattern method.
    option : str, default "match"
        Name of the compiled-pattern method to call, e.g. ``"match"``,
        ``"search"``, ``"fullmatch"``, ``"findall"`` or ``"finditer"``.

    Returns
    -------
    None
        The result is printed, never returned.

    Raises
    ------
    ValueError
        If ``option`` does not name a callable attribute of the compiled pattern.
    """
    pattern = re.compile(regex)
    method = getattr(pattern, option, None)
    if not callable(method):
        # Fail with a readable message instead of "'NoneType' object is not callable".
        raise ValueError(f"unsupported pattern method: {option!r}")

    result = method(text)
    if hasattr(result, "group"):
        # A match object: show only the text that matched.
        print(result.group())
    elif option == "finditer":
        # finditer yields match objects lazily; consume it so the matches are visible.
        print([found.group() for found in result])
    else:
        # Lists (findall, split) and None (no match) are printed unchanged.
        print(result)

In [10]:
regex = 'abc'
text = 'abc babcd'

# search reports only the first occurrence, even though 'abc' appears twice in text
find(regex, text, 'search')

abc


In [11]:
# Same regex and text as the previous cell; findall lists every occurrence
find(regex, text, 'findall')

['abc', 'abc']


In [14]:
# [0-9] matches exactly one digit, so 13 comes back as two separate matches
regex = '[0-9]'
text = 'Try1ng t0 f1nd numbers here 4ever 13'
find(regex, text, 'findall')

['1', '0', '1', '4', '1', '3']


In [15]:
# + means "one or more", so consecutive digits such as 13 are returned as one match
regex = '[0-9]+'
text = 'Try1ng t0 f1nd numbers here 4ever 13'
find(regex, text, 'findall')

['1', '0', '1', '4', '13']


In [20]:
# A leading ^ inside [...] negates the class: match runs of characters that are NOT digits
regex = '[^0-9]+'
text = 'Try1ng t0 not f1nd numbers here 4ever 13'
find(regex, text, 'findall')

['Try', 'ng t', ' not f', 'nd numbers here ', 'ever ']


In [22]:
# No ^ here, so the class is positive: match runs of the lowercase letters a through h
regex = '[a-h]+'
text = 'Try1ng t0 not f1nd numbers here 4ever 13'
find(regex, text, 'findall')

['g', 'f', 'd', 'be', 'he', 'e', 'e', 'e']


#### Metacharacters

In [26]:
# Inside [...] the $ metacharacter is treated as a literal dollar sign
regex = '[0-9$]+'
text = 'The price is $200'
find(regex, text, 'findall')

['$200']


In [38]:
# [a-zA-Z] does not include '_', so word_together is split into two matches
regex = '[a-zA-Z]+'
text = 'The price is $200 word_together'
find(regex, text, 'findall')

['The', 'price', 'is', 'word', 'together']


In [37]:
# Fix: add '_' to the character class so word_together stays in one piece
regex = '[a-z_A-Z]+'
text = 'The price is $200 word_together'
find(regex, text, 'findall')

['The', 'price', 'is', 'word_together']


In [32]:
# What about matching literal [ and ]? Unescaped, the first ] closes the class,
# so this pattern means "one of a-z or [" followed by one or more literal ] --
# nothing in the text has that shape, hence the empty result.
regex = '[a-z[]]+'
text = 'The price is [$200] word_together'
find(regex, text, 'findall')

[]


In [36]:
# Fix: escape the brackets with \ so they become literal members of the class.
# The pattern is written as a raw string so the backslashes reach re untouched;
# in a normal string \[ and \] are invalid escape sequences and trigger a warning.
regex = r'[a-zA-Z\[\]]+'
text = 'The price is [$200] word_together'
find(regex, text, 'findall')

['The', 'price', 'is', '[', ']', 'word', 'together']


#### Special sequences
* **\d** $\rightarrow$ any digit $\rightarrow$ [0-9].

* **\D** $\rightarrow$ any non-digit character $\rightarrow$ [^0-9].

* **\s** $\rightarrow$ any whitespace character $\rightarrow$ [ \t\n\r\f\v].

* **\S** $\rightarrow$ any non-whitespace character $\rightarrow$ [^ \t\n\r\f\v].

* **\w** $\rightarrow$ any word character (letter, digit or underscore) $\rightarrow$ roughly [a-zA-Zà-úÁ-Ú0-9_].

* **\W** $\rightarrow$ any non-word character $\rightarrow$ roughly [^a-zA-Zà-úÁ-Ú0-9_].

In [40]:
# \w covers letters, digits and underscore: word_together stays whole,
# while [, $ and ] are not word characters and are dropped.
# Raw string: \w is an invalid escape sequence in a normal string literal.
regex = r'[\w]+'
text = 'The price is [$200] word_together'
find(regex, text, 'findall')

['The', 'price', 'is', '200', 'word_together']


In [41]:
# \S matches any non-whitespace character, so each space-separated token
# (including [$200]) is returned intact.
# Raw string: \S is an invalid escape sequence in a normal string literal.
regex = r'[\S]+'
text = 'The price is [$200] word_together'
find(regex, text, 'findall')

['The', 'price', 'is', '[$200]', 'word_together']


In [42]:
# r'...' is a raw string: the backslash reaches re unchanged, and \. matches a literal dot
regex=r'\.'
text = 'Put this email email@email.com in this website.com'
find(regex, text, 'findall')

['.', '.']


In [55]:
# Word characters, then an optional @, then a literal '.com'.
# The @ is only allowed directly before '.com', so the address is matched as
# 'email.com' rather than 'email@email.com'.
# NOTE(review): if the full address was intended, r'(?:\w+@)?\w+\.com' would capture it -- confirm.
regex=r'\w+@?\.com'
text = 'Put this email email@email.com in this website.com'
find(regex, text, 'findall')

['email.com', 'website.com']


- \b: matches at a word boundary (the beginning or the end of a word); \B matches at any position that is not a word boundary

In [66]:
# \b is a zero-width word boundary: 'ta' must be followed by the end of a word
regex=r'ta\b'
text = 'Roberta'
find(regex, text, 'findall')

['ta']


In [67]:
# \B is the opposite of \b: 'ta' must NOT be followed by a word boundary,
# so only the 'ta' in 'tampa' matches, not the one that ends 'Roberta'
regex=r'ta\B'
text = 'tampa e Roberta'
find(regex, text, 'findall')

['ta']


- {m}: exactly m copies of the preceding element

In [68]:
# Spelled out digit by digit: five digits, a hyphen, four digits
regex = r'\d\d\d\d\d-\d\d\d\d'
text = 'Phones 99999-9999, 91234-5678'
find(regex, text, 'findall')

['99999-9999', '91234-5678']


In [69]:
# {m} repeats the preceding token exactly m times: same pattern as above, written shorter
regex = r'\d{5}-\d{4}'
text = 'Phones 99999-9999, 91234-5678'
find(regex, text, 'findall')

['99999-9999', '91234-5678']


In [70]:
# [-.]? accepts a hyphen or a dot as separator, and ? makes the separator optional
regex = r'\d{5}[-.]?\d{4}'
text = 'Phones 99999-9999, 91234.5678'
find(regex, text, 'findall')

['99999-9999', '91234.5678']


In [72]:
# [^r] matches any single character except r, so 'rato' is skipped
regex = r'[^r]ato'
text = 'rato gato pato'
find(regex,text, 'findall')

['gato', 'pato']


#### String replacement

In [7]:
# Sample input: one URL per line (the string itself starts with a newline).
text = """
https://testing.com
http://www.mysite.com
https://www.site.org
https://brazillianwebsite.com.br
"""
# Groups: 1 = optional 'www.', 2 = site name, 3 = com/net/org, 4 = optional extra suffix.
regex = r'https?://(www\.)?(\w+)\.(com|net|org)(\.\w+)?'
p = re.compile(regex)

# Walk the matches one at a time and show only the site name (group 2).
matches = p.finditer(text)
for m in matches:
    print(m[2])

testing
mysite
site
brazillianwebsite


In [13]:
# Rewrite every URL as https://<name>.<tld>, reusing groups 2 and 3 of each match.
# NOTE(review): group 4 (e.g. '.br') is not part of the replacement, so
# '...com.br' is rewritten to '...com' -- confirm this is intended.
normalized_sites = p.sub(r'https://\2.\3', text)
print(normalized_sites)


https://testing.com
https://mysite.com
https://site.org
https://brazillianwebsite.com



#### Match

In [16]:
# Here the text begins directly with a URL (no leading newline).
text = """https://testing.com
http://www.mysite.com
https://www.site.org
https://brazillianwebsite.com.br
"""
regex = r'https?://(www\.)?(\w+)\.(com|net|org)(\.\w+)?'
p = re.compile(regex)

# match() only tries the pattern at the very start of the string.
match = p.match(text)
# Show the matched text, or None when nothing matched.
print(match.group() if match else match)

https://testing.com


In [18]:
# Same URLs, but the text now starts with a newline.
text = """
https://testing.com
http://www.mysite.com
https://www.site.org
https://brazillianwebsite.com.br
"""
regex = r'https?://(www\.)?(\w+)\.(com|net|org)(\.\w+)?'
p = re.compile(regex)

# match() is anchored at position 0, where there is a newline, so it finds nothing.
match = p.match(text)
# Show the matched text, or None when nothing matched.
print(match.group() if match else match)

None


#### Findall

In [19]:
regex = r'https?://(www\.)?(\w+)\.(com|net|org)(\.\w+)?'
text = """
https://testing.com
http://www.mysite.com
https://www.site.org
https://brazillianwebsite.com.br
"""
p = re.compile(regex)
# Because the pattern has capture groups, findall returns one tuple of group
# values per match; optional groups that did not take part show up as ''.
matches_list = p.findall(text)
print(matches_list)


[('', 'testing', 'com', ''), ('www.', 'mysite', 'com', ''), ('www.', 'site', 'org', ''), ('', 'brazillianwebsite', 'com', '.br')]


#### finditer

In [20]:
regex = r'https?://(www\.)?(\w+)\.(com|net|org)(\.\w+)?'
text = """
https://testing.com
http://www.mysite.com
https://www.site.org
https://brazillianwebsite.com.br
"""
p = re.compile(regex)
# finditer returns an iterator of match objects rather than a list;
# the following cells pull matches from it one at a time with next().
m = p.finditer(text)

In [21]:
# Displaying the iterator shows only its type; no match is consumed yet
m

<callable_iterator at 0x109eaa880>

In [23]:
# next() advances the iterator and returns the first match object
next(m)

<re.Match object; span=(1, 20), match='https://testing.com'>

In [24]:
# Calling next() again continues from where the previous call stopped
next(m)

<re.Match object; span=(21, 42), match='http://www.mysite.com'>

In [25]:
# .group() extracts just the matched text from the next match object
next(m).group()

'https://www.site.org'