In [None]:
# Regex quick reference, shown by printing it.
# The r prefix keeps every backslash literal: in a normal string '\b' would be
# turned into a backspace character, and '\d', '\w', '\s' ... are invalid
# escape sequences that newer Python versions warn about.
regex_cheatsheet = r'''
.    - Any character except new line
\d   - Any numeral
\D   - Any non-numeral
\w   - Any word character (a-z, A-Z, 0-9 and _ )
\W   - Any non-word character
\s   - Whitespace (space, tab, newline)
\S   - Non white-space

\b   - Word boundary
\B   - Not a Word boundary
^    - Beginning of a String
$    - End of String

[]   - Matches characters in brackets
[^ ] - Matches characters NOT in brackets
|    - Either or
(  ) - Group

*    - 0 or More
+    - 1 or More
?    - 0 or 1
{3}  - Exact number
{1,5}- Range of Numbers (Min, Max)
'''
print(regex_cheatsheet)

In [1]:
import re

# Sample text that the pattern cells search.
# Raw string: the punctuation line contains a lone backslash, which a normal
# string literal only keeps by way of an invalid escape sequence (a warning on
# newer Python versions). The resulting text is unchanged.
srchTxt = r'''abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
ABC123abc123

hello Hello HellohelloHello

. ~ ! @ # $ % ^ & * ( ) { } [ ] - + / \ | " " , ' ' _

http://www.utexas.edu
https://www.google.com
IN.REDIFF.COM
PRINT.KODAK.IO
http://craigslist.com
www.yahoo.co.uk


123-456-7890
123.456.7890

abc-def@utexas.edu
johndoe@epcindia.in
_janedoe_@design.io

Mr. John Smith
Mr Smith
Ms Davis
Mrs. Robinson
Mr.T'''

In [None]:
# Literal search for 'abc'. The r'' prefix makes the pattern a raw string, so
# any backslash in it reaches the regex engine unchanged.
pattern = re.compile(r'abc')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# An unescaped '.' matches any character except a newline, so it is escaped
# as '\.' here to find literal periods only.
pattern = re.compile(r'\.')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# '\d' matches a single digit.
pattern = re.compile(r'\d')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# '\D' matches any single character that is not a digit.
pattern = re.compile(r'\D')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# '\w' matches one word character: a letter, a digit or an underscore.
pattern = re.compile(r'\w')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# A word character immediately followed by a digit.
# finditer yields non-overlapping matches, scanning left to right.
pattern = re.compile(r'\w\d')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# A digit immediately followed by a whitespace character (space, tab, newline).
pattern = re.compile(r'\d\s')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

### Word boundaries, anchors and character sets

In [None]:
# '\b' is a zero-width word boundary: the position between a word character
# (\w) and a non-word character, or the start/end of the string.
# Putting '\b' on both sides matches 'Hello' only as a whole word.

pattern = re.compile(r'\bHello\b')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# A digit immediately followed by a word boundary, i.e. a digit that is not
# followed by another word character.
pattern = re.compile(r'\d\b')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# '\B' is the opposite of '\b': it matches only where there is NO word boundary.
# 'Hello\B' therefore finds 'Hello' only when another word character follows it.

pattern = re.compile(r'Hello\B')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# '^' anchors the match at the beginning of the string. No re.MULTILINE flag is
# passed, so srchTxt counts as one string and only its very start can match.
pattern = re.compile(r'^\w')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# '$' anchors the match at the end of the string and is written at the end of
# the pattern. Without re.MULTILINE only the end of srchTxt as a whole counts.
pattern = re.compile(r'\w$')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# A character set [...] matches exactly one character from the set:
# here an ASCII letter that sits right after a word boundary.
pattern = re.compile(r'\b[a-zA-Z]')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# Inside [...] a '-' placed last is a literal hyphen, not a range.
# Matches one ASCII letter, digit or hyphen that sits at a word boundary.
pattern = re.compile(r'\b[a-zA-Z0-9-]')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# '^' as the first character inside [...] negates the set: a lowercase letter
# followed by any character that is neither an uppercase letter nor a digit.
pattern = re.compile(r'[a-z][^A-Z0-9]')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

### Character Groups

In [None]:
# Parentheses form a group and '|' inside it means OR:
# match either 'abc' or 'Hello'.
pattern = re.compile(r'(abc|Hello)')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# Whole-word 'hello' or 'Hello': the set [hH] accepts either case for the
# first letter, and '\b' on both sides rejects occurrences inside longer words.
pattern = re.compile(r'\b[hH]ello\b')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

### Quantifiers

In [None]:
# '+' means one or more: each match is a maximal run of lowercase ASCII letters.
pattern = re.compile(r'[a-z]+')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# '?' makes the preceding item optional and '*' allows zero or more:
# 'Mr', an optional '.', an optional whitespace character, then one capital
# letter followed by any number of lowercase letters.
pattern = re.compile(r'Mr\.?\s?[A-Z][a-z]*')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# Three dot-separated runs of ASCII letters; '{2,3}' lets the last run match
# 2 to 3 letters (host names such as www.google.com).
pattern = re.compile(r'[a-zA-Z]+\.[a-zA-Z]+\.[a-zA-Z]{2,3}')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

In [None]:
# E-mail-like text: one or more non-whitespace characters, '@', a run of
# letters, a literal '.', then 2 to 3 letters.
pattern = re.compile(r'[\S]+\@[a-zA-Z]+\.[a-zA-Z]{2,3}')
matches = re.finditer(pattern, srchTxt)
for m in matches:
    print(m)

### Accessing info in a Match object

In [17]:
# The previous pattern contained '(.a-zA-Z)', which demands the literal text
# 'a-zA-Z' and therefore never matched anything in srchTxt.
# Grouped e-mail pattern instead:
#   group 1 - local part (non-whitespace characters before '@')
#   group 2 - domain name
#   group 3 - dot plus a 2-3 letter suffix
#   group 4 - optional extra dot plus 2 letters (None when absent)
pattern = re.compile(r'(\S+)@([a-zA-Z]+)(\.[a-zA-Z]{2,3})(\.[a-zA-Z]{2})?')
matches = pattern.finditer(srchTxt)
for m in matches:
    start, end = m.span(0)          # group 0 is the whole match
    print('Span : ', m.span(0))
    print('Group : ',m.group(0))
    print('Groups : ', m.groups())  # tuple of the numbered groups 1..4
    # Slicing the searched text with the span reproduces group(0).
    print(srchTxt[start:end],"\n")
    

In [4]:
# Two alternatives joined by '|':
#   1) optional scheme + three dot-separated labels (www.google.com,
#      http://www.utexas.edu), plus an optional extra '.xx' (www.yahoo.co.uk)
#   2) mandatory scheme + two or three labels (http://craigslist.com)
# The dot of the optional trailing '.xx' is escaped; unescaped it would match
# ANY character, e.g. swallow ' is' after 'www.google.com is'.
pattern = re.compile(
    r'((https?://)?[a-zA-Z]+\.[a-zA-Z]+\.[a-zA-Z]{2,3}(\.[a-zA-Z]{2})?)'
    r'|(https?://([a-zA-Z]+\.)?[a-zA-Z]+\.[a-zA-Z]{2,3}(\.[a-zA-Z]{2})?)'
)
matches = pattern.finditer(srchTxt)
for m in matches:
    print(m)


<re.Match object; span=(163, 184), match='http://www.utexas.edu'>
<re.Match object; span=(185, 207), match='https://www.google.com'>
<re.Match object; span=(208, 221), match='IN.REDIFF.COM'>
<re.Match object; span=(222, 236), match='PRINT.KODAK.IO'>
<re.Match object; span=(237, 258), match='http://craigslist.com'>
<re.Match object; span=(259, 274), match='www.yahoo.co.uk'>


In [9]:
srchTxt1='''
www.google.com
http://in.rediff.com
PRINT.KODAK.IO
https://BIQ.DESIGN.IO
www.yahoo.co.uk
http://cmcell.tn.gov.in
'''
# Groups: 1 = optional scheme, 2 = first label with its dot, 3 = second label
# with its dot, 4 = 2-3 letter suffix, 5 = optional extra '.xx' suffix.
# The dot in group 5 is escaped; unescaped it would match any character
# (e.g. the ' is' in 'www.google.com is').
pattern = re.compile(r'(https?://)?([a-zA-Z]+\.)([a-zA-Z]+\.)([a-zA-Z]{2,3})(\.[a-zA-Z]{2})?')
matches = pattern.finditer(srchTxt1)
for m in matches:
    print(m.group(0))
    # groups() returns groups 1..5 (None for an unmatched optional group);
    # a three-space separator reproduces the original column layout.
    print(*m.groups(), sep="   ")

www.google.com
None   www.   google.   com   None
http://in.rediff.com
http://   in.   rediff.   com   None
PRINT.KODAK.IO
None   PRINT.   KODAK.   IO   None
https://BIQ.DESIGN.IO
https://   BIQ.   DESIGN.   IO   None
www.yahoo.co.uk
None   www.   yahoo.   co   .uk
http://cmcell.tn.gov.in
http://   cmcell.   tn.   gov   .in


In [2]:
# Sample serial strings to experiment with.
sr = ['L122323A','L123123C','S34553a','S12321312s','L123123123-a','S123123_b']
sr1 = []   # placeholder result list, not filled yet
sr2 = []   # placeholder result list, not filled yet

# A stray bare 'def' used to sit here and made the whole cell a SyntaxError.
# TODO: decide how the serials are split between sr1 and sr2; note that lists
# grow with .append() (there is no list.push()).
for s in sr:
    print(s)

L122323A
L123123C
S34553a
S12321312s
L123123123-a
S123123_b
