### 2.1. Splitting Strings on Any of Multiple Delimiters

In [1]:
line = 'asdf fjdk; afed, fjek,asdf, foo'
import re
re.split(r'[;,\s]\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [2]:
re.split(r'(;|,|\s)\s*', line)

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

In [3]:
re.split(r'(?:,|;|\s)\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

### 2.2. Matching Text at the Start or End of a String

In [4]:
filename = 'spam.txt'
filename.endswith('.txt')

True

In [5]:
filename.startswith('tt')

False

In [7]:
files = [ 'Makefile', 'foo.c', 'bar.py', 'spam.c', 'spam.h' ]
[name for name in files if name.endswith(('.c','.h'))]

['foo.c', 'spam.c', 'spam.h']

In [10]:
any(name.endswith('.cc') for name in files)

False

In [15]:
choice = ['http', 'https']
url = 'http://www.baidu.com'
# str.startswith() accepts a single str or a tuple of str, so convert the list first.
url.startswith(tuple(choice))
# Passing the list itself is rejected with TypeError. Catch it and show the
# message so the demonstration does not abort a "Run All" of the notebook.
try:
    url.startswith(choice)
except TypeError as exc:
    print('TypeError: {}'.format(exc))

TypeError: startswith first arg must be str or a tuple of str, not list

#### The startswith() and endswith() methods provide a very convenient way to perform basic prefix and suffix checking.

### 2.3. Matching Strings Using Shell Wildcard Patterns

In [16]:
from fnmatch import fnmatch, fnmatchcase
fnmatch('foo.txt', '*.txt')

True

In [18]:
fnmatch('foo.txt', '?oo.txt')

True

In [19]:
fnmatch('Dat45.csv', 'Dat[0-9]*')

True

In [20]:
fnmatchcase('foo.txt','*.TXT')

False

In [21]:
# On Windows fnmatch() compares case-insensitively, so this matches (use fnmatchcase() for an exact-case comparison)
fnmatch('foo.txt', '*.TXT')

True

In [22]:
addresses = [
'5412 N CLARK ST',
'1060 W ADDISON ST',
'1039 W GRANVILLE AVE',
'2122 N CLARK ST',
'4802 N BROADWAY',
]
[addr for addr in addresses if fnmatch(addr, '* ST')]

['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']

In [43]:
[addr for addr in addresses if fnmatch(addr, '[0-9]* ST')]

['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']

### 2.4. Matching and Searching for Text Patterns

In [45]:
text = 'yeah, but no, but yeah, but no, but yeah'
text =='yeah'

False

In [46]:
text.find('no')

10

In [50]:
text1 = '11/27/2017'
text2 = 'Nov 27, 2017'
import re
# Every field needs at least one digit; the trailing '+' lets the year field
# consume all of its digits instead of stopping after the first one.
date_pat = re.compile(r'\d+/\d+/\d+')
# match() anchors at the start of the string, so only text1 is recognised.
for candidate in (text1, text2):
    print('yes' if date_pat.match(candidate) else 'no')

yes
no


In [52]:
pattern = re.compile(r'\d+/\d+/\d+')
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
pattern.findall(text)

['11/27/2012', '3/13/2013']

### 2.5. Searching and Replacing Text

In [53]:
text = 'yeah, but no, but yeah, but no, but yeah'
text.replace('yeah', 'yep')

'yep, but no, but yep, but no, but yep'

In [54]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
import re
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [55]:
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
datepat.sub(r'\3-\1-\2', text)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [57]:
from calendar import month_abbr
def change_date(m):
    """Substitution callback that rewrites a month/day/year match as 'day Mon year'.

    ``m`` is a match object whose groups 1, 2 and 3 hold the month number,
    the day and the year. The match object is printed before the
    replacement text is returned.
    """
    print(m)
    month, day, year = m.group(1, 2, 3)
    # month_abbr is indexed by month number (11 -> 'Nov').
    return '{} {} {}'.format(day, month_abbr[int(month)], year)
datepat.sub(change_date, text)

<_sre.SRE_Match object; span=(9, 19), match='11/27/2012'>
<_sre.SRE_Match object; span=(34, 43), match='3/13/2013'>


'Today is 27 Nov 2012. PyCon starts 13 Mar 2013.'

### 2.6. Searching and Replacing Case-Insensitive Text

In [60]:
text = 'UPPER PYTHON, lower python, Mixed Python'
re.findall('python', text, flags=re.IGNORECASE)

['PYTHON', 'python', 'Python']

In [61]:
re.sub('python', 'snake', text, flags= re.IGNORECASE)

'UPPER snake, lower snake, Mixed snake'

In [67]:
def matchcase(word):
    """Build a replacement callback for ``re.sub`` that copies the case of each match.

    The returned function receives a match object and returns ``word``
    upper-cased, lower-cased or capitalized, depending on whether the
    matched text is all upper case, all lower case, or starts with an
    upper-case character. Any other text, including an empty match, yields
    ``word`` unchanged.
    """
    print(word)  # trace: runs once, when the callback is created

    def replace(m):
        print(m)  # trace: shows every match object handed to the callback
        matched = m.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        # Slice rather than index so an empty match cannot raise IndexError.
        if matched[:1].isupper():
            return word.capitalize()
        return word

    return replace
re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)

snake
<_sre.SRE_Match object; span=(6, 12), match='PYTHON'>
<_sre.SRE_Match object; span=(20, 26), match='python'>
<_sre.SRE_Match object; span=(34, 40), match='Python'>


'UPPER SNAKE, lower snake, Mixed Snake'

### 2.7. Specifying a Regular Expression for the Shortest Match

In [68]:
str_pat= re.compile(r'\"(.*)\"')
text1 = 'Computer say "no."'
str_pat.findall(text1)

['no.']

In [69]:
text2 = 'Computer says "no." Phone says "yes."'
str_pat.findall(text2)

['no." Phone says "yes.']

In [70]:
str_pat = re.compile(r'\"(.*?)\"')
str_pat.findall(text2)

['no.', 'yes.']

### 2.8. Writing a Regular Expression for Multiline Patterns

In [71]:
comment = re.compile(r'/\*(.*?)\*/')
text1 = '/* this is a comment */'
text2 = '''/* this is a
    multiline comment */
    '''
comment.findall(text1)

[' this is a comment ']

In [72]:
comment.findall(text2)

[]

In [73]:
comment = re.compile(r'/\*((?:.|\n)*?)\*/')
comment.findall(text2)

[' this is a\n    multiline comment ']

### 2.9. Normalizing Unicode Text to a Standard Representation

In [74]:
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'
print(s1)
print(s2)

Spicy Jalapeño
Spicy Jalapeño


In [75]:
import unicodedata
t1 = unicodedata.normalize('NFC', s1)

In [76]:
t1

'Spicy Jalapeño'

### 2.10. Working with Unicode Characters in Regular Expressions

In [77]:
import re
# Raw string: in a plain literal '\d' is an invalid escape sequence and
# triggers a warning on current Python versions.
num = re.compile(r'\d+')
num.match('123')

<_sre.SRE_Match object; span=(0, 3), match='123'>

In [78]:
num.match('\u0661\u0662\u0663')

<_sre.SRE_Match object; span=(0, 3), match='١٢٣'>

### 2.11. Stripping Unwanted Characters from Strings

In [83]:
s = ' hello world \n'
print(s.strip())
print(s.rstrip())
print(s.lstrip())
# character stripping
s = '------ hello world====='
print(s.rstrip('='))
print(s.lstrip('-'))
print(s.strip('-='))
print(s.strip('-=d'))

hello world
 hello world
hello world 

------ hello world
 hello world=====
 hello world
 hello worl


### 2.12. Sanitizing and Cleaning Up Text

### 2.13. Aligning Text Strings

In [84]:
text = 'Hello World'
text.center(20)

'    Hello World     '

In [85]:
text.rjust(20)

'         Hello World'

In [86]:
text.ljust(20)

'Hello World         '

In [87]:
text.ljust(5)
# if the requested width is smaller than the string's length, ljust() returns the string unchanged

'Hello World'

In [88]:
text.center(20, '*')

'****Hello World*****'

### 2.14. Combining and Concatenating Strings

In [89]:
parts = ['Is', 'Chicago', 'Not', 'Chicago?']
' '.join(parts)

'Is Chicago Not Chicago?'

In [91]:
data = ['ACME', 50, 91.1]
','.join(str(x) for x in data)

'ACME,50,91.1'

In [93]:
def sample():
    """Generator yielding the words 'Is', 'Chicago', 'Not', 'Chicago?' in order."""
    yield from ('Is', 'Chicago', 'Not', 'Chicago?')

In [100]:
s = sample()
for i in s:
    print(i)

Is
Chicago
Not
Chicago?


### 2.15. Interpolating Variables in Strings

In [107]:
s = '{name} has {n} messages.'
# Explicit keyword arguments. Only the last expression of a cell is displayed,
# so this intermediate result is checked with an assertion instead of being discarded.
assert s.format(name='Ben', n=3) == 'Ben has 3 messages.'
name = 'Ben'
n = 10
# format_map() takes the field values from a mapping; vars() supplies the
# current namespace, so {name} and {n} resolve to the variables above.
s.format_map(vars())

'Ben has 10 messages.'

In [108]:
class Info:
    """Simple record with two attributes, ``name`` and ``n``."""

    def __init__(self, name, n):
        # Kept as plain instance attributes so vars(instance) exposes them as a mapping.
        self.name, self.n = name, n
a = Info('benkwok', 100)
s.format_map(vars(a))

'benkwok has 100 messages.'

In [113]:
class safesub(dict):
    """dict subclass for ``str.format_map`` that leaves unknown fields in place."""

    def __missing__(self, key):
        # Called by dict lookup when ``key`` is absent: hand back the original
        # ``{key}`` placeholder instead of raising KeyError.
        return ''.join(('{', key, '}'))
# Bind n and delete it again: after these two lines the name n is guaranteed
# to be absent from the namespace, whatever its previous state.
n = 1
del n
# vars() no longer contains 'n', so the {n} field is resolved by
# safesub.__missing__, which returns the placeholder text itself.
s.format_map(safesub(vars()))

'Ben has {n} messages.'

### 2.16. Reformatting Text to a Fixed Number of Columns

In [115]:
import textwrap
s = ("Look into my eyes, look into my eyes, the eyes, the eyes, "
     "the eyes, not around the eyes, don't look around the eyes, "
     "look into my eyes, you're under.")
# Wrap at 70 columns here; the 20-column variant has its own cell below.
print(textwrap.fill(s, 70))

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.


In [116]:
print(textwrap.fill(s, 20))

Look into my eyes,
look into my eyes,
the eyes, the eyes,
the eyes, not around
the eyes, don't look
around the eyes,
look into my eyes,
you're under.


In [120]:
print(textwrap.fill(s, 40, initial_indent='     '))

     Look into my eyes, look into my
eyes, the eyes, the eyes, the eyes, not
around the eyes, don't look around the
eyes, look into my eyes, you're under.


In [119]:
print(textwrap.fill(s, 40, subsequent_indent='     '))

Look into my eyes, look into my eyes,
     the eyes, the eyes, the eyes, not
     around the eyes, don't look around
     the eyes, look into my eyes, you're
     under.


### 2.17. Handling HTML and XML Entities in Text

In [121]:
s = 'Elements are written as "<tag>text</tag>".'
import html
print(s)

Elements are written as "<tag>text</tag>".


In [123]:
print(html.escape(s))

Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.


In [124]:
print(html.escape(s, quote=False))

Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".


In [126]:
s = 'Spicy &quot;Jalape&#241;o&quot.'
from html.parser import HTMLParser  # import kept: cells outside this view may still use it
# html.unescape() is a module-level function, so no HTMLParser instance is
# needed. It also decodes the final '&quot' even though its ';' is missing.
html.unescape(s)

'Spicy "Jalapeño".'

In [127]:
t = 'The prompt is &gt;&gt;&gt;'
from xml.sax.saxutils import unescape
unescape(t)

'The prompt is >>>'

### 2.18. Tokenizing Text