In [1]:
import re


"""
Regular expressions are text matching patterns described with a formal syntax.

You'll often hear regular expressions referred to as 'regex' or 'regexp' in conversation. Regular expressions can include a variety of rules, from finding repetition, to text-matching, and much more.

As you advance in Python you'll see that a lot of your parsing problems can be solved with regular expressions (they're also a common interview question!).
If you're familiar with Perl, you'll notice that the syntax for regular expressions is very similar in Python. We will be using the re module with Python for this lecture.
"""
# Literal search terms and the sentence they are looked up in.
patterns = ['term1', 'term2']
text = "This is a string that contains term1 and not the other term"

# re.search returns a match object on success and None otherwise; printing the
# match object shows the matched span and text.
print(re.search('hello', 'hello world'))
print("----------------------------------------------------------------------")

# Report whether each term occurs anywhere in the sentence.
for pattern in patterns:
    print("Searching for '{0}' in '{1}'".format(pattern, text))

    if re.search(pattern, text) is None:
        print("\nThe match was not found \n")
    else:
        print("\nThe match was found \n")

# A match object also exposes where the hit begins and ends within the text.
mat = re.search(patterns[0], text)
print(mat, mat.start(), mat.end(), sep="\n")
print("---------------------------------------------------------------------------")

# re.split cuts the phrase at every occurrence of the separator pattern.
split_term = "@"
phrase = "What is your email address, is it ravi@gmail.com"
print(re.split(split_term, phrase))


<_sre.SRE_Match object; span=(0, 5), match='hello'>
----------------------------------------------------------------------
Searching for 'term1' in 'This is a string that contains term1 and not the other term'

The match was found 

Searching for 'term2' in 'This is a string that contains term1 and not the other term'

The match was not found 

<_sre.SRE_Match object; span=(31, 36), match='term1'>
31
36
---------------------------------------------------------------------------
['What is your email address, is it ravi', 'gmail.com']


In [2]:
import re

"""
Pattern re Syntax
This will be the bulk of this lecture on using re with Python. Regular expressions support a huge variety of patterns beyond simply finding where a single string occurred.

We can use metacharacters along with re to find specific types of patterns.

Since we will be testing multiple re syntax forms, let's create a function that will print out results given a list of various regular expressions and a phrase to parse:
"""
# re.findall gathers every non-overlapping occurrence of the pattern into a list.
print(re.findall('match', 'We are having a match and we do not want to lose that match'))

# Three identical divider lines separate this result from the demos that follow.
print(*["-----------------------------------------------------------------------------------------------------------"] * 3, sep="\n")

def multi_find(patterns, phrase):
    """Print the re.findall results of each pattern applied to ``phrase``.

    For every entry in ``patterns`` three lines are written to stdout: a
    header showing the pattern's repr, the list returned by ``re.findall``,
    and a row of asterisks as a divider.

    Args:
        patterns: iterable of regular-expression patterns to try in order.
        phrase: the text that each pattern is matched against.

    Returns:
        None; the results are only printed.
    """
    for regex in patterns:
        # The header is printed before searching, so it still appears if the
        # pattern turns out to be invalid and re.findall raises.
        print("This is the pattern : %r" % regex)
        matches = re.findall(regex, phrase)
        print(matches)
        print("**************************************")


"""
Repetition Syntax
There are five ways to express repetition in a pattern:

1.) A pattern followed by the meta-character * is repeated zero or more times.

2.) Replace the * with + and the pattern must appear at least once.

3.) Using ? means the pattern appears zero or one time.

4.) For a specific number of occurrences, use {m} after the pattern, where m is replaced with the number of times the pattern should repeat.

5.) Use {m,n} where m is the minimum number of repetitions and n is the maximum. Leaving out n ({m,}) means the value appears at least m times, with no maximum.

"""
# Sample text made up only of runs of 's', 'd' and '.'.
test_phrase = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'

# Repetition: each quantifier applies to the 'd' immediately before it.
test_patterns = [
    'sd*',      # an s, then any number of d's (possibly none)
    'sd+',      # an s, then at least one d
    'sd?',      # an s, then an optional single d
    'sd{3}',    # an s, then exactly three d's
    'sd{2,3}',  # an s, then two or three d's
]

multi_find(test_patterns, test_phrase)

# Character sets: square brackets match any single one of the listed characters.
test_patterns = [
    '[sd]',    # a single s or d
    's[sd]+',  # an s, then one or more characters that are each s or d
]

multi_find(test_patterns, test_phrase)

# A leading ^ inside the brackets negates the set, so this collects the runs of
# text lying between the punctuation marks.
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'
print(re.findall('[^!.?]+', test_phrase))

['match', 'match']
-----------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------
This is the pattern : 'sd*'
['sd', 'sd', 's', 's', 'sddd', 'sddd', 'sddd', 'sd', 's', 's', 's', 's', 's', 's', 'sdddd']
**************************************
This is the pattern : 'sd+'
['sd', 'sd', 'sddd', 'sddd', 'sddd', 'sd', 'sdddd']
**************************************
This is the pattern : 'sd?'
['sd', 'sd', 's', 's', 'sd', 'sd', 'sd', 'sd', 's', 's', 's', 's', 's', 's', 'sd']
**************************************
This is the pattern : 'sd{3}'
['sddd', 'sddd', 'sddd', 'sddd']
**************************************
This is the pattern : 'sd{2,3}'
['sddd', 'sddd', 'sddd', 'sddd']
**************************************
This is the pattern : '

In [3]:
import re
def multi_find(patterns, phrase):
    """Print the re.findall results of each pattern applied to ``phrase``.

    For every entry in ``patterns`` three lines are written to stdout: a
    header showing the pattern's repr, the list returned by ``re.findall``,
    and a row of asterisks as a divider.

    Args:
        patterns: iterable of regular-expression patterns to try in order.
        phrase: the text that each pattern is matched against.

    Returns:
        None; the results are only printed.
    """
    for regex in patterns:
        # The header is printed before searching, so it still appears if the
        # pattern turns out to be invalid and re.findall raises.
        print("This is the pattern : %r" % regex)
        matches = re.findall(regex, phrase)
        print(matches)
        print("**************************************")

# Two short sentences, each starting with a capital letter.
test_phrase = 'This is an example sentence. Lets see if we can find some letters.'

# Character ranges: a hyphen inside the brackets spans every character between
# its two endpoints.
test_patterns = [
    '[a-z]+',       # runs of lowercase letters
    '[A-Z]+',       # runs of uppercase letters
    '[a-zA-Z]+',    # runs of letters of either case
    '[A-Z][a-z]+',  # one uppercase letter followed by lowercase letters
]

multi_find(test_patterns, test_phrase)

"""
You can use special escape codes to find specific types of patterns in your data, such as digits, non-digits, whitespace, and more.

Code Meaning

\d a digit

\D a non-digit

\s whitespace (tab, space, newline, etc.)

\S non-whitespace

\w alphanumeric

\W non-alphanumeric

Escapes are indicated by prefixing the character with a backslash (\\). Unfortunately, a backslash must itself be escaped in normal Python strings,

and that results in expressions that are difficult to read. Using raw strings, created by prefixing the literal value with r,

for creating regular expressions eliminates this problem and maintains readability.
"""
# Text mixing words, a number, spaces and a '#' symbol.
test_phrase = 'This is a string with some numbers 1233 and a symbol #hashtag'

# Raw strings keep each backslash intact so the regex engine sees the escape code.
test_patterns = [
    r'\d+',  # run of digits
    r'\D+',  # run of non-digits
    r'\s+',  # run of whitespace
    r'\S+',  # run of non-whitespace
    r'\w+',  # run of word characters (letters, digits, underscore)
    r'\W+',  # run of non-word characters
]

multi_find(test_patterns, test_phrase)

This is the pattern : '[a-z]+'
['his', 'is', 'an', 'example', 'sentence', 'ets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']
**************************************
This is the pattern : '[A-Z]+'
['T', 'L']
**************************************
This is the pattern : '[a-zA-Z]+'
['This', 'is', 'an', 'example', 'sentence', 'Lets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']
**************************************
This is the pattern : '[A-Z][a-z]+'
['This', 'Lets']
**************************************
This is the pattern : '\\d+'
['1233']
**************************************
This is the pattern : '\\D+'
['This is a string with some numbers ', ' and a symbol #hashtag']
**************************************
This is the pattern : '\\s+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
**************************************
This is the pattern : '\\S+'
['This', 'is', 'a', 'string', 'with', 'some', 'numbers', '1233', 'and', 'a', 'symbol', '#hashtag']
*************