# 2 - Strings and Text

## Splitting strings on any of multiple delimiters

In [1]:
s = "This is a string"
s

'This is a string'

In [2]:
s.split(" ")  # using just a basic split

['This', 'is', 'a', 'string']

In [3]:
s = "12312a666b777c"
s

'12312a666b777c'

In [6]:
import re

re.split("[a-z]", s)  # or we can split using regex

['12312', '666', '777', '']

In [7]:
[int(x) for x in re.split("[a-z]", s) if len(x) > 0]

[12312, 666, 777]

## Matching text at the start or end of a string

In [8]:
s = "do be do be do"
s

'do be do be do'

In [9]:
s.startswith("do")

True

In [10]:
s.endswith("do")

True

In [13]:
"be" in s, "do" in s

(True, True)

To test against several alternatives at once, pass them to `startswith` or `endswith` as a tuple of strings; a list raises a `TypeError`.

In [15]:
choices = ["be", "do"]

s.startswith(choices)  # must be str or a tuple of str, not list

TypeError: startswith first arg must be str or a tuple of str, not list

In [16]:
s.startswith(tuple(choices))

True

## Matching strings using shell wildcard patterns
Examples of such wildcard patterns are `*.py` and `Dat[0-9]*.csv`.

In [2]:
from fnmatch import fnmatch, fnmatchcase

fnmatch

<function fnmatch.fnmatch(name, pat)>

In [3]:
fnmatch('foo.txt', '*.txt')

True

In [4]:
fnmatch('foo.txt', 'foo*')

True

In [5]:
fnmatch('Foo.txt', 'foo*')

True

In [6]:
fnmatchcase('Foo.txt', 'foo*')

False

In [7]:
fnmatch('Dat45.csv', 'Dat[0-9]*')

True

In [8]:
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
[name for name in names if fnmatch(name, 'Dat*.csv')]

['Dat1.csv', 'Dat2.csv']

In [9]:
addresses = [
 '5412 N CLARK ST',
 '1060 W ADDISON ST',
 '1039 W GRANVILLE AVE',
 '2122 N CLARK ST',
 '4802 N BROADWAY',
]

[addr for addr in addresses if fnmatchcase(addr, '* ST')]

['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']

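fnmatch sits somewhere between the functionality of simple string methods and the full power of regular expressions. Note that `fnmatch` follows the case-sensitivity rules of the underlying operating system (the `True` result for `'Foo.txt'` above comes from a case-insensitive platform such as Windows; on Linux or macOS it would be `False`), whereas `fnmatchcase` always compares case-sensitively.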

## Matching and Searching for Text Patterns

In [10]:
s = "This is some string to match with"
s

'This is some string to match with'

In [12]:
s.find("to")

20

In [13]:
s[s.find("to"):]

'to match with'

In [15]:
s.find("z") # there are no z's in the string

-1

We can also do regular expression matching.

In [19]:
import re

some_date = "11/27/2012"
sre = re.match(r"\d+/\d+/\d+", some_date)
sre

<_sre.SRE_Match object; span=(0, 10), match='11/27/2012'>

In [24]:
bool(sre)

True

If you’re going to perform a lot of matches using the same pattern, it usually pays to precompile the regular expression pattern into a pattern object first. 

In [38]:
datepat = re.compile(r"\d+/\d+/\d+")
datepat

re.compile(r'\d+/\d+/\d+', re.UNICODE)

In [39]:
some_date1 = "11/27/2012"
some_date2 = "Nov 27, 2012"
some_date1, some_date2

('11/27/2012', 'Nov 27, 2012')

In [41]:
match1 = datepat.match(some_date1)
match2 = datepat.match(some_date2)

In [42]:
match1

<_sre.SRE_Match object; span=(0, 10), match='11/27/2012'>

In [44]:
match2 is None

True

In [45]:
bool(match1), bool(match2)

(True, False)

In [46]:
if match1:
    print("We have a match for date1")

We have a match for date1


We can also use findall to get all occurrences.

In [48]:
text = "Today is 11/27/2012. PyCon starts 3/13/2013."
datepat.findall(text)

['11/27/2012', '3/13/2013']

See this tutorial for [more on Python regex](https://www.tutorialspoint.com/python/python_reg_expressions.htm).

## Searching and Replacing Text

In [49]:
s = "Let's eat, Grandma!"
s

"Let's eat, Grandma!"

In [50]:
s.replace(",", "")

"Let's eat Grandma!"

We can also use sub from the re module.

In [53]:
import re

text = "Today is 11/27/2012. PyCon starts 3/13/2013."
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

Consider compiling patterns if you are going to use them several times.

In [55]:
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
datepat

re.compile(r'(\d+)/(\d+)/(\d+)', re.UNICODE)

In [56]:
datepat.sub(r'\3-\1-\2', text)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

## Searching and Replacing Case-Insensitive Text

In [57]:
text = 'UPPER PYTHON, lower python, Mixed Python'
re.findall('python', text, flags=re.IGNORECASE)

['PYTHON', 'python', 'Python']

In [58]:
re.sub('python', 'snake', text, flags=re.IGNORECASE)

'UPPER snake, lower snake, Mixed snake'

In [59]:
def matchcase(word):
    """Build a ``re.sub`` callback that inserts ``word`` in the case of each match.

    Args:
        word: Replacement text.

    Returns:
        A function that takes a match object and returns ``word``
        upper-cased, lower-cased or capitalized so that it mirrors the
        matched text, or ``word`` unchanged when the matched text fits none
        of those cases (including an empty match).
    """
    def replace(m):
        matched = m.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        # Slice instead of indexing: with a pattern that can match the empty
        # string (e.g. 'x*'), matched[0] would raise IndexError.
        if matched[:1].isupper():
            return word.capitalize()
        return word
    return replace

re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)

'UPPER SNAKE, lower snake, Mixed Snake'

## Specifying a Regular Expression for the Shortest Match
The * operator in a regular expression is greedy...

In [60]:
str_pat = re.compile(r'\"(.*)\"')
text1 = 'Computer says "no."'
str_pat.findall(text1)

['no.']

In [61]:
text2 = 'Computer says "no." Phone says "yes."'
str_pat.findall(text2)

['no." Phone says "yes.']

To fix this, add the ? modifier after the * operator in the pattern.

In [62]:
str_pat = re.compile(r'\"(.*?)\"')
str_pat.findall(text2)

['no.', 'yes.']

## Writing a Regular Expression for Multiline Patterns

In [63]:
comment = re.compile(r'/\*(.*?)\*/')
comment

re.compile(r'/\*(.*?)\*/', re.UNICODE)

In [65]:
text1 = '/* this is a comment */'

text2 = '''/* this is a
              multiline comment */ '''

In [67]:
comment.findall(text1)

[' this is a comment ']

In [68]:
comment.findall(text2)  # mwah, mwah, mwah...

[]

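By default `.` does not match a newline, so the pattern must allow `\n` explicitly (compiling the original pattern with the `re.DOTALL` flag is an alternative).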

In [69]:
comment = re.compile(r'/\*((?:.|\n)*?)\*/')
comment

re.compile(r'/\*((?:.|\n)*?)\*/', re.UNICODE)

In [70]:
comment.findall(text2)

[' this is a\n              multiline comment ']

## Normalizing Unicode Text to a Standard Representation
In Unicode, certain characters can be represented by more than one valid sequence of code points. 

In [71]:
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'

In [72]:
s1

'Spicy Jalapeño'

In [73]:
s2

'Spicy Jalapeño'

In [74]:
s1 == s2

False

In [75]:
len(s1), len(s2)

(14, 15)

Having multiple representations is a problem for programs that compare strings

In [76]:
import unicodedata

n1 = unicodedata.normalize("NFC", s1)
n2 = unicodedata.normalize("NFC", s2)

In [77]:
n1

'Spicy Jalapeño'

In [78]:
n2

'Spicy Jalapeño'

In [79]:
n1 == n2

True

See the [unicodedata](https://docs.python.org/3/library/unicodedata.html) module. 

## Working with Unicode Characters in Regular Expressions
See the alternative [regex](https://pypi.org/project/regex/) module which builds on what is available from the standard library. 

## Stripping Unwanted Characters from Strings

In [80]:
" hello world ".strip()

'hello world'

In [81]:
" hello world ".rstrip()

' hello world'

In [82]:
" hello world ".lstrip()

'hello world '

In [83]:
"---hello world===".strip("-")

'hello world==='

In [84]:
"---hello world---".lstrip("-")

'hello world---'

##  Sanitizing and Cleaning Up Text

In [89]:
# from https://www.tutorialspoint.com/python3/string_maketrans.htm

intab = "aeiou"
outtab = "12345"
trantab = str.maketrans(intab, outtab)
trantab

{97: 49, 101: 50, 105: 51, 111: 52, 117: 53}

In [90]:
s = "this is string example....wow!!!"
print (s.translate(trantab))

th3s 3s str3ng 2x1mpl2....w4w!!!


In [92]:
ins = "abc"
out = "123"

trans = str.maketrans(ins, out)
s = "a Python munched on the mouse..."
s.translate(trans)

'1 Python mun3hed on the mouse...'

## Aligning Text Strings
Use ljust, rjust and center...

In [93]:
text = 'Hello World'

In [101]:
text.ljust(20)

'Hello World         '

In [102]:
text.rjust(20)

'         Hello World'

In [103]:
text.center(20)

'    Hello World     '

In [104]:
text.center(20, "*")

'****Hello World*****'

We can also use format...

In [109]:
format(text, "<20")

'Hello World         '

In [110]:
format(text, ">20")

'         Hello World'

In [111]:
"{:>10s} {:>10s}".format("Hello", "World")

'     Hello      World'

## Combining and Concatenating Strings
The most important thing to remember is that using the + operator to join a large number of strings is grossly inefficient.

In [1]:
some_list = ["Here", "are", "some", "strings"]
some_list

['Here', 'are', 'some', 'strings']

In [3]:
" ".join(some_list)

'Here are some strings'

In [4]:
"{} {}".format("str1", "str2")

'str1 str2'

In [8]:
"str1" + " str2"

'str1 str2'

In [9]:
"str1" " str2"

'str1 str2'

Time a join() vs the + operator...

In [14]:
import string

# list() on a string already yields one element per character, so no
# identity comprehension is needed.
some_list = list(string.ascii_lowercase)
"".join(some_list)

'abcdefghijklmnopqrstuvwxyz'

In [22]:
result = "".join(some_list)
result

'abcdefghijklmnopqrstuvwxyz'

In [23]:
result = ""
for i in some_list:
    result += i
result

'abcdefghijklmnopqrstuvwxyz'

In [24]:
%%timeit
result = "".join(some_list)

253 ns ± 15.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [25]:
%%timeit
result = ""
for i in some_list:
    result += i

1.28 µs ± 41 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


## Interpolating variables in strings

In [34]:
s = "{name} has {n} messages"

In [35]:
s.format(name="Guido", n="37")

'Guido has 37 messages'

In [36]:
[k for k,v in vars().items()][:15]

['__name__',
 '__doc__',
 '__package__',
 '__loader__',
 '__spec__',
 '__builtin__',
 '__builtins__',
 '_ih',
 '_oh',
 '_dh',
 'In',
 'Out',
 'get_ipython',
 'exit',
 'quit']

In [37]:
name = "Guido"
n = 37

In [39]:
"name" in vars(), "n" in vars()

(True, True)

In [40]:
s

'{name} has {n} messages'

In [41]:
s.format_map(vars())  # ooh

'Guido has 37 messages'

In [43]:
class Info:
    """Simple holder for two attributes, ``name`` and ``n``."""

    def __init__(self, name, n):
        # Stored as ordinary instance attributes, so vars(instance) exposes
        # them as a {'name': ..., 'n': ...} mapping.
        self.name, self.n = name, n

a = Info("Guido", 37)
a

<__main__.Info at 0x24b795637f0>

In [44]:
vars(a)

{'name': 'Guido', 'n': 37}

In [45]:
s.format_map(vars(a))

'Guido has 37 messages'

However, a downside of format and format_map is that they do not deal gracefully with missing values.

In [46]:
class safesub(dict):
    """dict that renders a missing key back as its ``{key}`` placeholder."""

    def __missing__(self, key):
        # Invoked by dict lookup when ``key`` is absent; the value is only
        # returned, never stored in the mapping.
        return "".join(("{", key, "}"))


In [47]:
del n

In [49]:
s.format_map(vars())

KeyError: 'n'

In [50]:
s.format_map(safesub(vars()))

'Guido has {n} messages'

## Reformatting text to a fixed number of columns
Use the textwrap module...

In [1]:
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."

s

"Look into my eyes, look into my eyes, the eyes, the eyes, the eyes, not around the eyes, don't look around the eyes, look into my eyes, you're under."

In [2]:
import textwrap

print(textwrap.fill(s, 70))

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.


In [3]:
print(textwrap.fill(s, 40))

Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.


In [4]:
print(textwrap.fill(s, 40, initial_indent="    "))

    Look into my eyes, look into my
eyes, the eyes, the eyes, the eyes, not
around the eyes, don't look around the
eyes, look into my eyes, you're under.


In [5]:
print(textwrap.fill(s, 40, subsequent_indent="    "))

Look into my eyes, look into my eyes,
    the eyes, the eyes, the eyes, not
    around the eyes, don't look around
    the eyes, look into my eyes, you're
    under.


In [6]:
import os  # kept in case other cells rely on it
import shutil

# shutil.get_terminal_size() is the high-level wrapper: when no terminal is
# attached (common for a notebook kernel) it falls back to the COLUMNS
# environment variable or a default size, whereas os.get_terminal_size()
# may raise OSError in that situation.
shutil.get_terminal_size().columns

120

## Handling HTML and XML Entities in Text

In [1]:
s = 'Elements are written as "<tag>text</tag>".'
s

'Elements are written as "<tag>text</tag>".'

In [2]:
import html

html.escape(s)

'Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.'

In [3]:
html.escape(s, quote=False)

'Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".'

## Tokenizing Text

In [7]:
import re

NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))
master_pat

re.compile(r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)|(?P<NUM>\d+)|(?P<PLUS>\+)|(?P<TIMES>\*)|(?P<EQ>=)|(?P<WS>\s+)',
re.UNICODE)

In [8]:
from collections import namedtuple

Token = namedtuple('Token', 'type value')


def generate_tokens(pat, text):
    """Yield a ``Token`` for each successive match of ``pat`` in ``text``.

    Each token carries the name of the last matched named group and the
    matched text. Every match attempt starts where the previous match ended;
    generation stops, without raising, at the first position where ``pat``
    does not match (or at the end of ``text``).
    """
    next_match = pat.scanner(text).match
    found = next_match()
    while found is not None:
        yield Token(found.lastgroup, found.group())
        found = next_match()


for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)


Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')


## Performing Text Operations on Byte Strings
Byte strings already support most of the same built-in operations as text strings.

In [10]:
data = b'Hello World'
data

b'Hello World'

In [11]:
data[:5]

b'Hello'

In [12]:
data.startswith(b'Hello'), data.endswith(b'World')

(True, True)

In [13]:
data.replace(b'Hello', b'Hello Small')

b'Hello Small World'

Regular-expression patterns must also be given as bytes when they are applied to a byte string.

In [14]:
import re

data = b'FOO:BAR,SPAM'
data

b'FOO:BAR,SPAM'

In [15]:
re.split('[:,]',data)

TypeError: cannot use a string pattern on a bytes-like object

In [16]:
re.split(b'[:,]',data)

[b'FOO', b'BAR', b'SPAM']

***