### 2.1 - Splitting Strings

In [6]:
import re

line = "asds asdasd; sadasd, wewe, wewe        weewqe"

# A delimiter is one ';', ',' or whitespace character followed by any amount
# of extra whitespace; re.split() cuts the text at every such delimiter.
# The pattern is handed to re.split() as a plain string here; a pattern built
# with re.compile() offers the same operation through its own .split() method.
exp = r'[;,\s]\s*'
fields = re.split(exp, line)
print(fields)

['asds', 'asdasd', 'sadasd', 'wewe', 'wewe', 'weewqe']


In [8]:
# Wrapping the delimiter alternatives in a capture group makes re.split()
# return the matched delimiter characters in between the fields as well.
exp = r'(;|,|\s)\s*'
pieces = re.split(exp, line)
print(pieces)

['asds', ' ', 'asdasd', ';', 'sadasd', ',', 'wewe', ',', 'wewe', ' ', 'weewqe']


### 2.2 Matching text at the start or end of a string

In [9]:
import os


def get_files(loc, suffix=".ipynb"):
    """Print the name of every entry in directory `loc` that ends with `suffix`.

    Parameters
    ----------
    loc : path of the directory to scan. Only its immediate entries are
        examined, in whatever order os.listdir() yields them.
    suffix : a string, or a tuple of strings, as accepted by str.endswith().
        Defaults to ".ipynb", so existing calls behave exactly as before.

    Nothing is returned; matching names are written with print().
    """
    for _file in os.listdir(loc):
        if _file.endswith(suffix):
            print(_file)


get_files(os.getcwd())

strings_and_text.ipynb
data_structures.ipynb


### 2.3 Matching strings using shell wildcard patterns

In [12]:
from fnmatch import fnmatch, fnmatchcase

# Shell-style wildcards: '*' stands for any run of characters.
for filename, wildcard in (('foo.txt', '*.txt'), ('DataStructure.txt', 'Data*.txt')):
    print(fnmatch(filename, wildcard))

names  = ["file1.txt", "file2.txt", "nfile3.txt"]

# Keep only the names that fit the wildcard pattern.
print(list(filter(lambda name: fnmatch(name, "file*.txt"), names)))

True
True
['file1.txt', 'file2.txt']


Note: whether `fnmatch` is case-sensitive depends on the platform (it follows the operating system's filename rules), so it is better to use `fnmatchcase` when you need identical results on every system.

### 2.4 Matching and searching for text patterns

In [17]:
text = "A quick brown fox jumps over the lazy dog"

# Literal checks need nothing more than the built-in string operations.
print(text == "A quick brown fox jumps over the lazy dog")  # exact match
print(text.startswith("A quick brown"))                     # prefix test
print(text.endswith("lazy dog"))                            # suffix test
print(text.find("brown"))   # index of the first occurrence

# Anything less literal calls for a regular expression.
text1 = '11/27/2012'
text2 = 'Nov 27, 2012'

# digits/digits/digits
print('Valid date' if re.match(r'\d+/\d+/\d+', text1) else 'Invalid date')

# Capitalised three-letter month, then day, comma and year
print('Valid date' if re.match(r'[A-Z][a-z]{2}\s+\d+,\s+\d+', text2) else 'Invalid date')

# Same pattern against a lower-case month: the leading [A-Z] rejects it
print('Valid date' if re.match(r'[A-Z][a-z]{2}\s+\d+,\s+\d+', 'nov 13, 2012') else 'Invalid date')

True
True
True
8
Valid date
Valid date
Invalid date


In [21]:
# Compile once when the same pattern is going to be matched many times.
valid_date = re.compile(r'[A-Z][a-z]{2}\s+\d+,\s+\d+')


def validate_date(date_str):
    """Classify `date_str` against the `valid_date` pattern.

    Returns the tuple (date_str, verdict), where verdict is "is_valid" when
    the whole string fits `valid_date` and "in_valid" otherwise.
    """
    # fullmatch() anchors at both ends; match() anchors only at the start and
    # would accept strings with trailing junk such as 'Nov 27, 2012xyz'.
    verdict = "is_valid" if valid_date.fullmatch(date_str) else "in_valid"
    return date_str, verdict


print(validate_date('nov 13, 2012'))
print(validate_date('Nov 27, 2012'))

('nov 13, 2012', 'in_valid')
('Nov 27, 2012', 'is_valid')


In [24]:
# Parenthesised sub-patterns are capture groups: group(0) is the whole match,
# group(1) to group(3) are the three numeric fields in order.
datecap = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datecap.match('12/09/2012')
print(*m.group(0, 1, 2, 3))

12/09/2012 12 09 2012


In [30]:
text = "I will on vacation from 12/10/2024 - 25/10/2024"

datecap = re.compile(r'(\d+)/(\d+)/(\d+)')

# finditer() walks over every date found in the text; groups() yields the
# three captured fields of each hit (the same tuples findall() would return).
for hit in datecap.finditer(text):
    date = hit.groups()
    day, month, year = date
    print(f'{day} - {month} - {year}')

12 - 10 - 2024
25 - 10 - 2024


### 2.5 Searching and Replacing text

In [31]:
text = "A quick brown fox jumps over the lazy dog"

# str.replace() substitutes a literal substring and hands back a new string;
# `text` itself is left untouched.
swapped = text.replace("dog", "cat")
print(swapped)

A quick brown fox jumps over the lazy cat


### 2.6 Specifying a regular expression for the shortest match

In [34]:
text = '"no" and "yes" are opposites.'

# Goal: pull out the two quoted words, no and yes.

# Greedy: .* grabs as much as it can, so the match runs from the first quote
# all the way to the last one.
greedy_hits = re.findall(r'\"(.*)\"', text)
print(greedy_hits)

# Non-greedy: .*? stops at the nearest closing quote, giving the shortest match.
lazy_hits = re.findall(r'\"(.*?)\"', text)
print(lazy_hits)

['no" and "yes']
['no', 'yes']
