In [26]:
import re

https://github.com/nikhilkumarsingh/RegEx-In-Python/blob/master/01.%20Introduction.ipynb

In [27]:
# Topics - findall, search, split, sub, finditer

match()             Determine if the RE matches at the beginning of the string.                             
search()         	Scan through a string, looking for any location where this RE matches.                  
findall()           Finds all substrings where the RE matches and returns them as a list.
finditer()          Finds all substrings where the RE matches and returns them as an iterator of Match objects.

In [28]:
# Metacharacters

[]  Represent a character class
^   Matches the beginning
$   Matches the end
.   Matches any character except newline
?   Matches zero or one occurrence.
|   Means OR (matches any one of the alternatives separated by it).
*   Any number of occurrences (including 0 occurrences)
+   One or more occurrences
{}  Indicate number of occurrences of a preceding RE to match.
()  Enclose a group of REs

In [33]:
\d   Matches any decimal digit, this is equivalent to the set class [0-9].
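\D   Matches any non-digit character, this is equivalent to the class [^0-9].
\s   Matches any whitespace character, this is equivalent to the class [ \t\n\r\f\v].
\S   Matches any non-whitespace character, this is equivalent to the class [^ \t\n\r\f\v].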
\w   Matches any alphanumeric character, this is equivalent to the class [a-zA-Z0-9_].
\W   Matches any non-alphanumeric character [^a-zA-Z0-9_].

In [36]:
from re import split

# Raw string literals pass the backslash to the regex engine untouched.
# In an ordinary literal, \W and \d are invalid escape sequences and make
# Python emit a warning (slated to become an error).
print(split(r'\W+', 'Words, words , Words'))
print(split(r'\W+', "Word's words Words"))
print(split(r'\W+', 'On 12th Jan 2016, at 11:02 AM'))
print(split(r'\d+', 'On 12th Jan 2016, at 11:02 AM'))

['Words', 'words', 'Words']
['Word', 's', 'words', 'Words']
['On', '12th', 'Jan', '2016', 'at', '11', '02', 'AM']
['On ', 'th Jan ', ', at ', ':', ' AM']


## match(string[, pos[, endpos]])

A match is checked only at the beginning (by default).                                  

Checking starts from pos index of the string. (default is 0)                                    

Checking is done until endpos index of string. endpos is set as a very large integer (by default).            

Returns None if no match found.                                                           

If a match is found, a Match object is returned, containing information about the match: where it starts and ends, the substring it matched, and more.

In [39]:
# Compile the literal pattern once so its methods can be reused below.
pattern = re.compile("hello")
# match() only succeeds when the pattern matches starting at index 0.
match = pattern.match("hello world")
print(match)

<re.Match object; span=(0, 5), match='hello'>


In [40]:
# pos=4 moves the starting point: the match is attempted at index 4, not 0.
pattern.match("say hello", pos=4) 

<re.Match object; span=(4, 9), match='hello'>

## search(string[, pos[, endpos]])

A match is checked throughout the string.

Same behaviour of pos and endpos as the match() function.                                      

Returns None if no match found.                                                         

If a match is found, a Match object is returned.                                                   


In [41]:
# Literal pattern used by the search() and findall() examples that follow.
pattern = re.compile("hello")

In [44]:
# search() scans forward and returns the first match, wherever it starts.
pattern.search('say hello')

<re.Match object; span=(4, 9), match='hello'>

## findall(string[, pos[, endpos]])

Finds all non-overlapping substrings where the match is found, and returns them as a list.          

Same behaviour of pos and endpos as the match() and search() function.                     

In [45]:
# findall() returns the matched substrings themselves, not Match objects.
pattern.findall("say hello hello")

['hello', 'hello']

In [52]:
text = "I went to him at 11 A.M. on 4th July 1886"

# Raw strings keep the backslash intact for the regex engine and avoid the
# invalid-escape-sequence warning that '\d' in a normal literal triggers.
p = re.compile(r'\d')    # one match per single digit
print(p.findall(text))

p = re.compile(r'\d+')   # one match per run of consecutive digits
print(p.findall(text))

['1', '1', '4', '1', '8', '8', '6']
['11', '4', '1886']


In [53]:
text = "He said * in some_lang."

# Raw strings keep the backslash intact for the regex engine and avoid the
# invalid-escape-sequence warning that '\w' in a normal literal triggers.
p = re.compile(r'\w')    # each word character (letters, digits, underscore)
print(p.findall(text))

p = re.compile(r'\w+')   # each run of word characters
print(p.findall(text))

p = re.compile(r'\W')    # each character that is NOT a word character
print(p.findall(text))

['H', 'e', 's', 'a', 'i', 'd', 'i', 'n', 's', 'o', 'm', 'e', '_', 'l', 'a', 'n', 'g']
['He', 'said', 'in', 'some_lang']
[' ', ' ', '*', ' ', ' ', '.']


In [55]:
import re
text = "Aye, Mr sad Girlwomsen amend"
# Character class: matches one lowercase letter in the range a-e.
# It is case-sensitive, so the leading "A" is not matched.
p = re.compile('[a-e]')
print(p.findall(text))

['e', 'a', 'd', 'e', 'a', 'e', 'd']


In [54]:
text = '''because 9956178018 CONCATENATE may not be available in future versions of Excel.(995)-613-72'''
# Raw string: \( \) match literal parentheses and \d{n} matches exactly n digits.
# Without the r prefix these are invalid escape sequences in a Python literal.
pattern = r'\(\d{3}\)-\d{3}-\d{2}'
matches = re.findall(pattern, text)
print(matches)
# Extend the first pattern with an alternative: a bare run of 10 digits.
pattern2 = pattern + r'|\d{10}'
matches2 = re.findall(pattern2, text)
print(matches2)

['(995)-613-72']
['9956178018', '(995)-613-72']


## finditer(string[, pos[, endpos]])

Finds all non-overlapping substrings where the match is found, and returns them as an iterator of Match objects (not a list).

Same behaviour of pos and endpos as the match() and search() function.                            



In [46]:
# Recompile here: an earlier cell rebinds `pattern` to a plain string, which
# has no finditer() method, so this cell would fail on a top-to-bottom run.
pattern = re.compile("hello")
# finditer() returns an iterator of Match objects; it is consumed once.
matches = pattern.finditer("say hello hello")

In [47]:
# Each item is a Match object; span() gives its (start, end) indices.
for match in matches:
    print(match.span())


(4, 9)
(10, 15)


In [48]:
txt = "This book costs $15."
# Left unescaped on purpose: "$" is read as a metacharacter, not a dollar sign.
pattern = re.compile("$15")

In [49]:
# Evaluates to None (no match), so the cell displays no output.
pattern.search(txt)

### No match found. Why?
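$ is a metacharacter with a special meaning for the regex engine: it matches at the end of the string, so the pattern $15 asks for "15" after the end and can never match. Here, we want to treat it as a literal dollar sign.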

In order to treat a metacharacter like a literal, you need to escape it using \ character.     



In [50]:
# \$ matches a literal dollar sign. The raw string keeps the backslash for the
# regex engine; "\$" in a normal literal is an invalid escape sequence.
pattern = re.compile(r"\$15")

In [51]:
# With the dollar sign escaped, search() finds the literal "$15".
pattern.search(txt)


<re.Match object; span=(16, 19), match='$15'>

In [56]:
import re
text = "Aye, Mr and sad Girlwomsen and amend"
# Alternative patterns to experiment with; enable one at a time.
#p = re.compile('and')
#p = re.compile('[a-e]')
#p = re.compile('^Aye')
#p = re.compile('end$')
#p = re.compile('an*')
#p = re.compile('an+')
#p = re.compile('an{1}')
#p = re.compile('(an){1}')
# Alternation: "and" at the very end of the text, OR "sen" anywhere.
p = re.compile('and$|sen') #either or case
#p = re.compile('.')
matches = p.finditer(text)

# The text ends in "amend", so 'and$' cannot match; only "sen" is reported.
for match in matches:
    print(match)

<re.Match object; span=(23, 26), match='sen'>


In [3]:
import re
mydata = ''' My name is Rishabh and how are you my sir ramesh'''
# Plain literal pattern; the r prefix is harmless here and a good habit.
patt = re.compile(r'sir')
matches = patt.finditer(mydata)
# Print every Match object (span and matched text) found in mydata.
for match in matches:
    print(match)

<re.Match object; span=(39, 42), match='sir'>


## Day -3

In [4]:
# Sample text containing both spellings: "licence" and "license".
txt = """
Yesterday, I was driving my car without a driving licence. The traffic police stopped me and asked me for my 
license. I told them that I forgot my licence at home. 
"""

In [5]:
# [cs] accepts either letter in that position, covering both spellings.
pattern = re.compile("licen[cs]e")

In [6]:
# List every occurrence of either spelling, in order of appearance.
pattern.findall(txt)

['licence', 'license', 'licence']

### Let us consider an example in which we want to retrieve all the years from the given text.

In [7]:
# Sample text with several four-digit years and one single-digit number ("3").
txt = """
The first season of Indian Premiere League (IPL) was played in 2008. 
The second season was played in 2009 in South Africa. 
Last season was played in 2018 and won by Chennai Super Kings (CSK).
CSK won the title in 2010 and 2011 as well.
Mumbai Indians (MI) has also won the title 3 times in 2013, 2015 and 2017.
"""

In [8]:
# Four consecutive digits whose first digit is 1-9 (no leading zero).
pattern = re.compile("[1-9][0-9][0-9][0-9]")

In [9]:
# Collect every four-digit match; the lone "3" is too short to match.
pattern.findall(txt)

['2008', '2009', '2018', '2010', '2011', '2013', '2015', '2017']

In [None]:
# [^A-Z0-9] means the character must NOT be in A-Z or 0-9 (a leading ^ negates the class).

In [10]:
# Same year pattern written with the \d shorthand. The raw string keeps the
# backslashes for the regex engine instead of relying on invalid escapes.
pattern = re.compile(r"[1-9]\d\d\d")

In [11]:
# Collect every four-digit match found in txt.
pattern.findall(txt)

['2008', '2009', '2018', '2010', '2011', '2013', '2015', '2017']

Consider a scenario where you want to find all occurrences of and, or, the in a given text.

One way is to write and execute 3 separate regular expressions. Using alternation (the | operator), it can be done in a single regular expression!

In [12]:
# Sample sentence containing "the", "and" (twice) and "or".
txt = """
the most common conjunctions are and, or and but.
"""


In [13]:
# Alternation: match any one of the three words.
# There are no word boundaries, so these would also match inside longer words.
pattern = re.compile("and|or|the")
pattern.findall(txt)

['the', 'and', 'or', 'and']

In [14]:
# Consider one more example now in which we want to search the substrings What is and Who is.
# Sample text: one line starting with "What is", one starting with "Who is".
txt = """
What is your name?
Who is that guy?
"""

In [15]:
# "|" has the lowest precedence, so "What|Who is" means "What" OR "Who is".
# Group the alternatives (non-capturing) so both share the " is" suffix and
# findall() still returns the whole matched text.
pattern = re.compile(r"(?:What|Who) is")
pattern.findall(txt)

['What is', 'Who is']

## Quantifiers
Quantifiers are the mechanisms to define how a character, metacharacter, or character set can be repeated.

Ex.1 Find all the matches for dog and dogs in the given text.

In [16]:
# Sample text containing both "dog" and "dogs".
txt = """
I have 2 dogs. One dog is 1 year old and other one is 2 years old. Both dogs are very cute! 
"""

In [17]:
# "s?" makes the trailing s optional, so both "dog" and "dogs" match.
pattern = re.compile("dogs?")
pattern.findall(txt)

['dogs', 'dog', 'dogs']

Ex.2 Find all filenames starting with file and ending with .txt in the given text.

In [18]:
# Candidate file names, one per line; some should match and some should not.
txt = """
file1.txt
file_one.txt
file.txt
fil.txt
file.xml
file-1.txt
"""

In [20]:
# "file", then any number of word characters or hyphens, then a literal ".txt".
# Raw string: \w and \. must reach the regex engine with their backslashes;
# in a normal literal they are invalid escape sequences.
pattern = re.compile(r"file[\w-]*\.txt")
pattern.findall(txt)

['file1.txt', 'file_one.txt', 'file.txt', 'file-1.txt']

Ex3. Find all filenames starting with file followed by 1 or more digits and ending with .txt in the given text.

In [21]:
# Candidate file names, one per line; only some have digits before ".txt".
txt = """
file1.txt
file_one.txt
file09.txt
fil.txt
file23.xml
file.txt
"""

In [23]:
# "file", one or more digits, then a literal ".txt".
# Raw string so \d and \. are passed to the regex engine unchanged.
pattern = re.compile(r"file\d+\.txt")
pattern.findall(txt)

['file1.txt', 'file09.txt']

Ex4. Find years in the given text.

In [24]:
# Sample text with several four-digit years and one single-digit number ("3").
txt = """
The first season of Indian Premiere League (IPL) was played in 2008. 
The second season was played in 2009 in South Africa. 
Last season was played in 2018 and won by Chennai Super Kings (CSK).
CSK won the title in 2010 and 2011 as well.
Mumbai Indians (MI) has also won the title 3 times in 2013, 2015 and 2017.
"""

In [25]:
# Four consecutive digits, spelled out one \d at a time.
# Raw string so the backslashes are passed to the regex engine unchanged.
pattern = re.compile(r"\d\d\d\d")
pattern.findall(txt)

['2008', '2009', '2018', '2010', '2011', '2013', '2015', '2017']

In [26]:
# {4} is the quantifier form of the pattern above: exactly four digits.
# Raw string so \d is passed to the regex engine unchanged.
pattern = re.compile(r"\d{4}")
pattern.findall(txt)

['2008', '2009', '2018', '2010', '2011', '2013', '2015', '2017']

Ex5. In the given text, extract all numbers that have 4 or more digits.

In [28]:
# One number per line, with lengths ranging from 2 to 6 digits.
txt = """
123143
432
5657
4435
54
65111
"""

In [29]:
# {4,} has no upper bound: a run of at least four digits.
# Raw string so \d is passed to the regex engine unchanged.
pattern = re.compile(r"\d{4,}")
pattern.findall(txt)

['123143', '5657', '4435', '65111']

Ex6. Match telephone numbers written in any of these forms: 555-555-5555, 555 555 5555, 5555555555

In [30]:
# The same phone number written with hyphens, with spaces, and with no separator.
txt = """
555-555-5555
555 555 5555
5555555555
"""

In [32]:
# 3 digits, optional separator, 3 digits, optional separator, 4 digits.
# [-\s]? allows one hyphen or whitespace character, or nothing at all.
# Raw string so \d and \s are passed to the regex engine unchanged.
pattern = re.compile(r"\d{3}[-\s]?\d{3}[-\s]?\d{4}")
pattern.findall(txt)

['555-555-5555', '555 555 5555', '5555555555']

# Part 7 has completed

1. Write a Python program to check that a string contains only a certain set of characters (in this case a-z, A-Z and 0-9).

In [25]:
text = 'Rishabhvishwakarma013@gmail.com'

# Compile a pattern that matches any character OUTSIDE the allowed set
# (a-z, A-Z, 0-9). search() returns the first such character as a Match
# object, or None when every character in the string is allowed.
d1 = re.compile(r'[^a-zA-Z0-9]')
var1 = d1.search(text)
# True only when the string consists solely of allowed characters.
var1 is None

False