# Regex

In [1]:
import re

In [2]:
test_string = 'My name is Praneesh. I am a Data Scientis.' 
regex_pattern = 'Praneesh'

In [3]:
match = re.findall(regex_pattern, test_string)
print("Number of matches :", len(match))

Number of matches : 1


In [4]:
pattern = re.compile(r'data')

In [5]:
text_pattern = '''Data analysis is a process of inspecting, cleansing, transforming, and modeling data with the goal of discovering 
                  useful information, informing conclusions, and supporting decision-making. Data analysis has multiple facets
                  and approaches, encompassing diverse techniques under a variety of names, while being used in different business, 
                  science, and social science domains.'''

matches = pattern.finditer(text_pattern)

In [6]:
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(80, 84), match='data'>


** Meta characters that needs to be escaped**


** . ^ $ + * ? { } [ ] \ | ( ) **

- '.'  -->> finds any character except a new line 
- '\d' -->> finds digits between 0-9
- '\D' -->> finds "not a digit 0-9"
- '\w' -->> finds word character (a-z, A-Z, 0-9, _ )
- '\W' -->> finds "not a word character"
- '\s' -->> finds whitespaces, tabs, newline
- '\S' -->> finds not white spaces, tabs or new line


In [7]:
pattern = re.compile(r'\s')
matches = pattern.finditer(text_pattern)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(4, 5), match=' '>
<_sre.SRE_Match object; span=(13, 14), match=' '>
<_sre.SRE_Match object; span=(16, 17), match=' '>
<_sre.SRE_Match object; span=(18, 19), match=' '>
<_sre.SRE_Match object; span=(26, 27), match=' '>
<_sre.SRE_Match object; span=(29, 30), match=' '>
<_sre.SRE_Match object; span=(41, 42), match=' '>
<_sre.SRE_Match object; span=(52, 53), match=' '>
<_sre.SRE_Match object; span=(66, 67), match=' '>
<_sre.SRE_Match object; span=(70, 71), match=' '>
<_sre.SRE_Match object; span=(79, 80), match=' '>
<_sre.SRE_Match object; span=(84, 85), match=' '>
<_sre.SRE_Match object; span=(89, 90), match=' '>
<_sre.SRE_Match object; span=(93, 94), match=' '>
<_sre.SRE_Match object; span=(98, 99), match=' '>
<_sre.SRE_Match object; span=(101, 102), match=' '>
<_sre.SRE_Match object; span=(113, 114), match=' '>
<_sre.SRE_Match object; span=(114, 115), match='\n'>
<_sre.SRE_Match object; span=(115, 116), match=' '>
<_sre.SRE_Match object; span=(116, 117), ma

## Anchors 

They dont match patterns, but allow more detailed string search


- '\b' : Word boundry
- '\B' : Not a word boundry
-  ^   : Begining of a string
-  $   : End of a string

In [9]:
new_text_pattern = '''
                      Ha HaHa
                      Sentence : Start a sentence and then stop it'
                      '''

In [10]:
pattern = re.compile(r'\BHa')
matches = pattern.finditer(new_text_pattern)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(28, 30), match='Ha'>


### Matching Numbers

In [140]:
phone_num = """
              2-324-242-1098
              3-301-123-1098
              1.201.765.1987
              1-301-682-1437
              1.201.502.9928
            """

In [124]:
pattern = re.compile(r'\d[*.]\d\d\d[*.]\d\d\d[*.]\d\d\d\d')
matches = pattern.finditer(phone_num)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(73, 87), match='1.201.765.1987'>
<_sre.SRE_Match object; span=(131, 145), match='1.201.502.9928'>


- find out the US phone numbers with area code '301' or '201'

In [145]:
pattern = re.compile(r'1[-.][23]00[-.]\d\d\d[-.]\d\d\d')
matches = pattern.finditer(phone_num)

for match in matches:
    print(matches)

### Character Negation

In [136]:
# find all words starting with anything except 'b'

string = '''
            cat
            bat
            mat
            spat
            bloc
            
            Sentence : Blob was going to block the black gate with a basketball
            
            '''

pattern = re.compile(r'[^b]at')
matches = pattern.finditer(string)
li = []
for match in matches:
    li.append(match)
print(li)

[<_sre.SRE_Match object; span=(13, 16), match='cat'>, <_sre.SRE_Match object; span=(45, 48), match='mat'>, <_sre.SRE_Match object; span=(62, 65), match='pat'>, <_sre.SRE_Match object; span=(153, 156), match='gat'>]


### Quantifiers -  to match multiple charcters

- '*' : macthes 0 or more
- '+' : matches 1 or more
- '?' : matches 0 or 1
- {3} : exact number of digits to match
- {3,7}: range of numbers


In [143]:
# find phone numbers using quantifiers

phone_num = """ 2-324-242-1098, 3-301-123-1098, 1.201.765.1987, 1-301-682-1437, 1.201.502.9928"""

pattern = re.compile(r'\d-\d\d\d-\d\d\d-\d\d\d\d')
match = pattern.finditer(phone_num)
for match in matches:
    print(match)

In [188]:
# Match all prefix of names from a phone book

with open('regex.txt', 'r') as f:
    contents = f.read()
    
    pattern = re.compile(r'Mr\.')
    matches = pattern.finditer(contents)
    
    for match in matches:
        print(match)

<_sre.SRE_Match object; span=(141, 144), match='Mr.'>
<_sre.SRE_Match object; span=(215, 218), match='Mr.'>


In [160]:
pattern = re.compile(r'Mr\.?')
matches = pattern.finditer(contents)
    
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(0, 2), match='Mr'>
<_sre.SRE_Match object; span=(28, 31), match='Mr.'>
<_sre.SRE_Match object; span=(86, 88), match='Mr'>
<_sre.SRE_Match object; span=(115, 118), match='Mr.'>


In [169]:
pattern = re.compile(r'Mr\.\s[A-Z]\w*')
matches = pattern.finditer(contents)
    
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(28, 37), match='Mr. Smith'>
<_sre.SRE_Match object; span=(115, 120), match='Mr. T'>


In [187]:
# Find all names with prefix Mr, Mrs or Ms

pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')
matches = pattern.finditer(contents)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(0, 10), match='Mr Schafer'>
<_sre.SRE_Match object; span=(28, 37), match='Mr. Smith'>
<_sre.SRE_Match object; span=(57, 66), match='Ms. Davis'>
<_sre.SRE_Match object; span=(86, 99), match='Mrs. Robinson'>
<_sre.SRE_Match object; span=(115, 120), match='Mr. T'>


In [197]:
# find out the emails of Mr Schafer

pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z]+\.(com|edu)')
matches = pattern.finditer(contents)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(61, 84), match='coreyMSchafer@gmail.com'>
<_sre.SRE_Match object; span=(85, 106), match='corey.schafer@umd.edu'>
<_sre.SRE_Match object; span=(107, 140), match='corey-321-schafer@datasociety.com'>


In [217]:
urls = '''
       https://github.com/jalajthanaki/credit-risk-modelling/blob/master/Credit%20Risk%20Analysis.ipynb   
       http://pandas.pydata.org/
       http://google.com
       https://www.nasa.gov
       '''

pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)

for match in matches:
    print(match.group(0))

https://github.com
http://pandas.pydata
http://google.com
https://www.nasa.gov


In [11]:
A = [[1, 4, 5, 12], 
    [-5, 8, 9, 0],
    [-6, 7, 11, 19]]

In [17]:
A[1][1]

8

In [45]:
a = [1,4,8,9,3,87,2,2,4,6,88,9,9,9,1092]


In [48]:
def max_number(a,max_num = 0):
    for i in range(len(a)):
        if len(a) == 0:
            max_num = 0  
        for j in range(len(a)):
            if i > j : 
                max_num = i
            else :
                max_num = j
    return max_num

In [49]:
max_number(a)

14

In [51]:
a = [12322,4,8,9,3,87,2,2,4,6,88,9,9,9,1092]
max= 0
for i in a:
    if i > max:
        max=i
print(max)

12322
