# practice for regular expressions

This notebook is for practicing the regular expressions

```python
import re as regex

#compile a regular expression
pattern = regex.compile('some-pattern')

#find iteratively over the text and get matches
matches = pattern.finditer('some-string-to-search-for-some-pattern')
```

# some important symbols
```
.       - Any Character Except New Line
\d      - Digit (0-9)
\D      - Not a Digit (0-9)
\w      - Word Character (a-z, A-Z, 0-9, _)
\W      - Not a Word Character
\s      - Whitespace (space, tab, newline)
\S      - Not Whitespace (space, tab, newline)

\b      - Word Boundary
\B      - Not a Word Boundary
^       - Beginning of a String
$       - End of a String

[]      - Matches Characters in brackets
[^ ]    - Matches Characters NOT in brackets
|       - Either Or
( )     - Group

Quantifiers:
*       - 0 or More
+       - 1 or More
?       - 0 or One
{3}     - Exact Number
{3,4}   - Range of Numbers (Minimum, Maximum)
```

In [1]:
import re as regex

In [51]:
practice_string = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
123abc

Hello HelloHello

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

utexas.edu

321-555-4321
123.555.1234

daniel-mitchell@utexas.edu
test-email@gradingo.in
space-reserach@isro.gov.in
email1@isro.in.
invalid?email1@isro
invalid#email1@isro.in
invalid%email1@isro.co.uk

http://gradingo.com
https://gradingo.com
http://www.gradingo.com
https://www.gradingo.com
http://some-large.domain-name.co.in
https://some-large.domain-name.co.in

http://another.some-large.domain-name.co.in
https://another.some-large.domain-name.co.in.
https://another.some-large.domain-name.co.in#

Mr. Johnson
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

In [53]:
search_pattern_string = '[\w.+-]+@[\w.+-]+\.[\w]+'
pattern = regex.compile(search_pattern_string)
matches = pattern.finditer(practice_string)

In [54]:
for match in matches:
    print(match)

<re.Match object; span=(197, 223), match='daniel-mitchell@utexas.edu'>
<re.Match object; span=(224, 246), match='test-email@gradingo.in'>
<re.Match object; span=(247, 273), match='space-reserach@isro.gov.in'>
<re.Match object; span=(274, 288), match='email1@isro.in'>
<re.Match object; span=(318, 332), match='email1@isro.in'>
<re.Match object; span=(341, 358), match='email1@isro.co.uk'>


In [109]:
search_pattern_string = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,5}'
pattern = regex.compile(search_pattern_string)
matches = pattern.finditer(practice_string)
for match in matches:
    print(match)

<re.Match object; span=(197, 223), match='daniel-mitchell@utexas.edu'>
<re.Match object; span=(224, 246), match='test-email@gradingo.in'>
<re.Match object; span=(247, 272), match='space-reserach@isro.gov.i'>
<re.Match object; span=(274, 289), match='email1@isro.in.'>
<re.Match object; span=(318, 332), match='email1@isro.in'>
<re.Match object; span=(341, 358), match='email1@isro.co.uk'>


In [88]:
search_pattern_string = r'https?://([\.\w+-]+)+'
pattern = regex.compile(search_pattern_string)
matches = pattern.finditer(practice_string)
for match in matches:
    print(match)
    print(match.group(0))
    print(match.group(1))

<re.Match object; span=(360, 379), match='http://gradingo.com'>
http://gradingo.com
gradingo.com
<re.Match object; span=(380, 400), match='https://gradingo.com'>
https://gradingo.com
gradingo.com
<re.Match object; span=(401, 424), match='http://www.gradingo.com'>
http://www.gradingo.com
www.gradingo.com
<re.Match object; span=(425, 449), match='https://www.gradingo.com'>
https://www.gradingo.com
www.gradingo.com
<re.Match object; span=(450, 485), match='http://some-large.domain-name.co.in'>
http://some-large.domain-name.co.in
some-large.domain-name.co.in
<re.Match object; span=(486, 522), match='https://some-large.domain-name.co.in'>
https://some-large.domain-name.co.in
some-large.domain-name.co.in
<re.Match object; span=(524, 567), match='http://another.some-large.domain-name.co.in'>
http://another.some-large.domain-name.co.in
another.some-large.domain-name.co.in
<re.Match object; span=(568, 613), match='https://another.some-large.domain-name.co.in.'>
https://another.some-large.domain

In [92]:
# groups
# 0: protocol
# 1: host name
# 3. TLD (.com, .org, .co)
search_pattern_string = r'(https)+://([\.\w+-]+)+\.(\w+)'
pattern = regex.compile(search_pattern_string)
matches = pattern.finditer(practice_string)
for match in matches:
    print(f'Match Object: {match}')
    print(f'Full: {match.group(0)}')
    print(f'Protocol: {match.group(1)}')
    print(f'Host: {match.group(2)}')
    print(f'TLD: {match.group(3)}')

Match Object: <re.Match object; span=(380, 400), match='https://gradingo.com'>
Full: https://gradingo.com
Protocol: https
Host: gradingo
TLD: com
Match Object: <re.Match object; span=(425, 449), match='https://www.gradingo.com'>
Full: https://www.gradingo.com
Protocol: https
Host: www.gradingo
TLD: com
Match Object: <re.Match object; span=(486, 522), match='https://some-large.domain-name.co.in'>
Full: https://some-large.domain-name.co.in
Protocol: https
Host: some-large.domain-name.co
TLD: in
Match Object: <re.Match object; span=(568, 612), match='https://another.some-large.domain-name.co.in'>
Full: https://another.some-large.domain-name.co.in
Protocol: https
Host: another.some-large.domain-name.co
TLD: in
Match Object: <re.Match object; span=(614, 658), match='https://another.some-large.domain-name.co.in'>
Full: https://another.some-large.domain-name.co.in
Protocol: https
Host: another.some-large.domain-name.co
TLD: in


In [106]:
search_pattern_string = r'(https)+://([\.\w+-]+)+\.(\w+)'
pattern = regex.compile(search_pattern_string)
matches = pattern.finditer(practice_string)
for match in matches:
    print('******************************************************')
    print('##########################')
    print(practice_string[match.start():match.end()])
    print(match.span(0))
    print(practice_string[match.span(0)[0]:match.span(0)[1]])
    print('##########################')
    print(match.span(1))
    print(practice_string[match.span(1)[0]:match.span(1)[1]])
    print('##########################')
    print(match.span(2))
    print(practice_string[match.span(2)[0]:match.span(2)[1]])
    print('##########################')
    print(match.span(3))
    print(practice_string[match.span(3)[0]:match.span(3)[1]])
    print('##########################')
    print(f'Host+TLD: {match.group(2)}.{match.group(3)}')
    print(f'Host+TLD: {practice_string[match.span(2)[0]:match.span(2)[1]]}.{practice_string[match.span(3)[0]:match.span(3)[1]]}')
    print('##########################')

******************************************************
##########################
https://gradingo.com
(380, 400)
https://gradingo.com
##########################
(380, 385)
https
##########################
(388, 396)
gradingo
##########################
(397, 400)
com
##########################
Host+TLD: gradingo.com
Host+TLD: gradingo.com
##########################
******************************************************
##########################
https://www.gradingo.com
(425, 449)
https://www.gradingo.com
##########################
(425, 430)
https
##########################
(433, 445)
www.gradingo
##########################
(446, 449)
com
##########################
Host+TLD: www.gradingo.com
Host+TLD: www.gradingo.com
##########################
******************************************************
##########################
https://some-large.domain-name.co.in
(486, 522)
https://some-large.domain-name.co.in
##########################
(486, 491)
https
##########################
(494, 