# Objects and Methods

In [None]:
import re
help(re)
# https://docs.python.org/3/library/re.html

In [None]:
dir(re)

In [None]:
import re
# Sample text searched by the regex examples; the adjacent literals below are
# joined by the parser into one single-line string (it contains no newlines).
string = (
    "The Euro STOXX 600 index, which tracks all stock markets across Europe "
    "including the FTSE, fell by 11.48% - the worst day since it launched in "
    "1998. The panic selling prompted by the coronavirus has wiped £2.7tn off "
    "the value of STOXX 600 shares since its all-time peak on 19 February."
)

In [None]:
# Raw string literal (r"..."): backslashes are kept exactly as typed, so the
# \t and \n below are NOT turned into a tab and a newline.
path=r"\User\temp\newFile"
print(path)

In [None]:
# COMPILE - build a reusable pattern object once, then call its methods
s = r"\d{4}"
t = re.compile(s)
print(type(t), t)
# Call the compiled object's own findall() instead of handing the object back
# to the module-level re.findall(); the returned list is the same.
result = t.findall(string)
result

In [None]:
# SEARCH without compiling first - the pattern string is passed straight to re.search()
s=r"\d{4}"
result = re.search(s, string)
print(type(result))
print(result)

In [None]:
# SEARCH - Finds the first occurrence and stops
# Returns - On success - Match object
#           On failure - None
s=r"\w{2}"
re.search(s, string)

In [None]:
# FINDALL - Finds all the non-overlapping occurrences and returns them as a list
s=r"\w{2}"
res = re.findall(s, string)
print(type(res))

In [None]:
# MATCH - like search, but only matches at the very start of the string.
# Returns a Match object on success, None otherwise.
# search() scans forward through the text, so it can find a 4-character word
# run later in the string. Printed explicitly: a notebook cell only echoes its
# last expression, so a bare call here would show nothing.
print(re.search(r"\w{4}", string))
# match() is anchored at index 0; the text starts with "The " (3 word
# characters, then a space), so \w{4} cannot match there.
res = re.match(r"\w{4}", string)
print(type(res), res)
# A 2-character word run does exist at the start, so this one succeeds.
re.match(r"\w{2}", string)


In [None]:
# FULLMATCH - the pattern must match the entire string, else it fails and returns None
re.fullmatch(r"\w{2}", string)

In [None]:
len(string)

In [None]:
# . (dot) - Matches any character in the text, apart from newline
# 285 is the length of `string`, so the pattern has to consume all of it
re.fullmatch(r".{285}", string)

In [None]:
# \w does not match spaces or punctuation, so the same length with \w cannot cover the text
re.fullmatch(r"\w{285}", string)

In [None]:
# Split
## String split - str.split() with a fixed, literal separator

result = string.split(' ')
print(type(result), result)

In [None]:
# re.split() - split wherever the pattern matches (here: any single whitespace character)
res = re.split(r"\s", string)
print(type(res), res)

In [None]:
# The separator can be any pattern: here each pair of adjacent digits acts as
# the delimiter and is dropped from the resulting list
res = re.split(r"\d{2}", string)
print(type(res), res)

In [None]:
# sub() - Replace the text matched by a pattern with a provided string.
# count limits how many matches are replaced (here only the first 2 runs of
# two or more capitals). It is passed by keyword: positional count/flags are
# deprecated since Python 3.13, and a bare 4th argument is easily mistaken
# for flags.
res = re.sub(r"[A-Z]{2,}", "INDEX", string, count=2)
print(type(res), res)

In [None]:
# subn() - like sub(), but returns a tuple: (modified string, number of replacements made)
res = re.subn(r"[A-Z]{2,}", "INDEX", string)
print(type(res), res)

In [None]:
# Group, Groups
# groups() - tuple of all captured subgroups (the parenthesised parts of the pattern)
# group() ==>  group(0) - the entire match
# group(1) - first captured subgroup
# group(2) - second captured subgroup

res = re.search(r".+\s(.+ex).+(\d\d\s.+).", string)
res, res.groups(), res.group(1), res.group(2)

In [None]:
# group() with no argument is the same as group(0): the whole matched text
res.group(), res.group(0)

In [None]:
# Passing several group numbers returns a tuple with one item per requested group
res.group(1, 2)

In [None]:
# SPAN, START, END
# start(n) / end(n) - indices in the searched string where group n begins / ends
# (end is exclusive, so string[start:end] is the group text)
res = re.search(r".+\s(.+ex).+(\d\d\s.+).", string)
res.start(1), res.end(1)

In [None]:
# Slicing the original string with the start/end indices of group 1 gives back the group text
string[19:24]

In [None]:
res.start(2), res.end(2)

In [None]:
# Slicing the original string with the start/end indices of group 2 gives back the group text
string[273:284]

In [None]:
# span(n) - the (start(n), end(n)) pair as a single tuple
res.span(1), res.span(2)

## Optional flags

In [None]:
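# re.I - IGNORECASE: case-insensitive matching
# re.M - MULTILINE: ^ and $ also match at the start/end of every line
# re.S - DOTALL: . also matches a newline
# re.X - VERBOSE: whitespace and # comments inside the pattern are ignored
# Combine several flags with |, e.g. flags = re.I | re.M | re.S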

In [None]:
import re

string2 = '''The Euro STOXX 600 index, which tracks all stock markets across Europe including the FTSE, 
fell by 11.48% – the worst day since it launched in 1998. 
The panic selling prompted by the coronavirus 
has wiped £2.7tn off the value of STOXX 600 shares since its all-time peak on 19 February.'''

# No flags: matching is case-sensitive, so only lowercase "the" is found
re.findall(r"the", string)

In [None]:
# re.I ignores case, so "The" is found as well as "the"
re.findall(r"the", string, re.I)

In [None]:
# Without re.M, ^ matches only at the very beginning of the whole string
re.findall(r"^The", string2)

In [None]:
# With re.M, ^ also matches immediately after each newline (start of every line)
re.findall(r"^The", string2, re.M)

In [None]:
# `string` contains no newline, so the greedy .+ swallows it in a single match
re.findall(r".+", string)

In [None]:
# . stops at a newline, so the multi-line string2 yields one match per non-empty line
res = re.findall(r".+", string2)
len(res), res

In [None]:
# With re.S the dot matches newlines too, so the whole of string2 is a single match
res = re.findall(r".+", string2, re.S)
len(res), res

In [None]:
# The two-group pattern written compactly on one line
res = re.search(r".+\s(.+ex).+(\d\d\s.+).", string)
res.groups()

In [None]:
# Steps to turn a compact pattern into a verbose (commented) one:
# 1. Change the pattern string to a multiline (triple-quoted) string
# 2. Add the flag re.X
# 3. Break the pattern down into parts and add a comment to each

res = re.search(r'''.+\s #Beginning of the string, that we don't want
                (.+ex)  # Any word ending with 'ex'
                .+ # Every character in between the patterns/groups we are interested
                (\d\d\s.+) # To capture the date at the end, 19 February
                . # Last character, probably a punctuation''', string, re.X)
res.groups()

## Meta characters

In [None]:
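# . - any character other than newline (newline too when re.S is set)
# ^ - Beginning of the string; with re.M it also matches at the beginning of every line
# $ - End of the string (or just before a trailing newline); with re.M it also matches at the end of every line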

In [None]:
# * - 0 or more occurrences
re.findall(r"\d\d\d*", string) # Greedy pattern: takes as many digits as it can

In [None]:
# + - 1 or more occurrences
re.findall(r"\d\d\d+", string) # Greedy pattern: takes as many digits as it can

In [None]:
# ? - 0 or 1 occurrences
re.findall(r"\d\d\d?", string) # Greedy as well: the optional digit is taken whenever one is present (?? is the non-greedy form)

In [None]:
# Greedy vs. non-greedy (lazy) quantifiers: a ? placed after * or + makes the
# quantifier stop at the shortest possible match instead of the longest.
res1, res2, res3, res4 = (
    re.findall(pattern, string)
    for pattern in (r"\d\d\d*", r"\d\d\d*?", r"\d\d\d+", r"\d\d\d+?")
)

# One result list per line, in the order: *  *?  +  +?
print(res1, res2, res3, res4, sep='\n')

In [None]:
# ? (greedy: take the optional digit if present) against ?? (lazy: skip it)
res1, res2 = re.findall(r"\d\d\d?", string), re.findall(r"\d\d\d??", string)
print(res1, res2, sep='\n')

In [None]:
# Backslash \
# 1. denotes a special sequence (e.g. \d, \w, \s)
# 2. escapes special characters so that they match literally

res = re.findall(r".", string)  # unescaped dot: matches every character
print(res)
res = re.findall(r"\.", string)  # escaped dot: matches only a literal "."
print(res)

In [None]:
# Square brackets [] - character set: matches any ONE of the listed characters
# [a-z]  [A-Z] [0-9] [a-zA-Z] [aeiou]
res = re.findall(r"[aeiou]", string)
print(res)
print(set(res))  # the distinct vowels that occur in the text

In [None]:
# Whitespace characters
# space
# line-feed / newline (\n)
# tab (\t)
# vertical tab (\v)
# formfeed (\f)
# carriage return (\r)

res = re.findall(r"[ \n\t\v\f\r]", string2)
print(res)

# A leading ^ inside the brackets negates the set: everything that is NOT one of these
res = re.findall(r"[^ \n\t\v\f\r]", string2)
print(res)

In [None]:
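# {4} - exactly 4 repetitions    {4,} - 4 or more    {4,8} - between 4 and 8
# NOTE: no space is allowed inside the braces - "{4, 8}" is not treated as a quantifier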

In [None]:
# PIPES (alternation)
# for given match patterns: A, B, C
# if we want the match to succeed when any one of the three matches
# A|B|C   (tried left to right; spaces around | would become part of the pattern unless re.X is used)

In [None]:
# Beginning and end of the string, rather than of a line (not affected by re.M)
# \A - Beginning        \Z - End

In [None]:
# Word - collection of consecutive word characters (letters, digits and underscore)
# \b - word boundary
'''
This is
a
string
'''

# \B - Opposite of a word boundary

In [None]:
# \d - digit/number
# \D - Opposite of \d

In [None]:
# \w - word character: alphanumeric or underscore
# \W - anything that is not a word character

In [None]:
# \s - Whitespace (space, tab, newline, ...)
# \S - Not whitespace