In [18]:
import string

In [19]:
# The string module ships ready-made character-class constants.
# (Single-argument print(...) behaves the same on Python 2 and Python 3.)
print(string.ascii_letters)
# repr() makes the otherwise invisible whitespace characters readable.
print(repr(string.whitespace))
print(string.punctuation)

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
'\t\n\x0b\x0c\r '
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [20]:
# Note the 'u' at the beginning of the unicode string!
# Unicode can poison your script when you write to a file,
# especially if it is mixed with ascii. Beware of
# utf-16 and utf-32 encoded text...

unicodeStr = u'A wild m—dash!'
asciiStr = 'This is an ascii string. It is the default in Python.'

print(unicodeStr)
print(asciiStr)

# Encode to plain ASCII bytes before writing. Any character ASCII cannot
# represent is replaced by an XML character reference (the em dash becomes
# &#8212;); passing 'ignore' instead would silently drop such characters.
# NOTE: from here on unicodeStr holds an encoded byte string, not unicode text.
unicodeStr = unicodeStr.encode('ascii', 'xmlcharrefreplace') # or 'ignore'

# Binary mode: the data is already encoded bytes, so write it verbatim.
# (A text-mode "w" handle rejects bytes on Python 3.)
with open("foo.txt", "wb") as outfile:
    outfile.write(unicodeStr)

# Decode for display so the same text is printed on Python 2 and Python 3
# (printing the raw bytes on Python 3 would show a b'...' repr instead).
print(unicodeStr.decode('ascii'))

A wild m—dash!
This is an ascii string. It is the default in Python.
A wild m&#8212;dash!


In [21]:
# Tokenization is the process of splitting
# natural language strings into lists of words...

sentence = "Sarah went to the store to buy some milk or something."
# split() with no arguments breaks on runs of whitespace only, so the
# trailing period stays attached to the last word ('something.').
tokens = sentence.split()
print(tokens)

['Sarah', 'went', 'to', 'the', 'store', 'to', 'buy', 'some', 'milk', 'or', 'something.']


In [22]:
# Time for some basic regex...
import re

# \w+ keeps only runs of word characters, so punctuation and spaces vanish.
print(re.findall(r"\w+", sentence))
# Adding the \W+ alternative also returns the runs of non-word characters
# (spaces, punctuation) as their own items, so nothing from the input is lost.
print(re.findall(r"\w+|\W+", sentence))

['Sarah', 'went', 'to', 'the', 'store', 'to', 'buy', 'some', 'milk', 'or', 'something']
['Sarah', ' ', 'went', ' ', 'to', ' ', 'the', ' ', 'store', ' ', 'to', ' ', 'buy', ' ', 'some', ' ', 'milk', ' ', 'or', ' ', 'something', '.']


In [23]:
# Apostrophes need no escaping inside a double-quoted literal, so they are
# written plainly and consistently here; the string value is unchanged.
sentence = "Sarah's gone to buy some 'milk' or something. 'Don't forget the_bread,' yelled Susan."

# \w does not match an apostrophe, so contractions and possessives are cut
# in two ("Sarah", "s"), while the underscore in "the_bread" is kept.
print(re.findall(r"\w+", sentence))
print(re.findall(r"\w+|\W+", sentence))

['Sarah', 's', 'gone', 'to', 'buy', 'some', 'milk', 'or', 'something', 'Don', 't', 'forget', 'the_bread', 'yelled', 'Susan']
['Sarah', "'", 's', ' ', 'gone', ' ', 'to', ' ', 'buy', ' ', 'some', " '", 'milk', "' ", 'or', ' ', 'something', ". '", 'Don', "'", 't', ' ', 'forget', ' ', 'the_bread', ",' ", 'yelled', ' ', 'Susan', '.']


In [31]:
# Allow apostrophes inside a token so contractions and possessives stay whole.
# Inside [...] a '+' is a literal plus sign rather than a quantifier, so it is
# left out of the class; the quantifier belongs after the closing bracket.
print(re.findall(r"\b[\w']+\b", sentence))

# Now done with splitting instead... but note the empty string as the final token!
# A separator is either a run of 2+ non-alphanumeric characters, or a run of
# characters that are neither alphanumeric nor an apostrophe. Unlike \w, these
# classes do not include '_', so "the_bread" comes apart here. The sentence
# ends with a separator ('.'), which is why split() yields a trailing ''.
tokens = re.split(r"[^a-zA-Z0-9]{2,}|[^a-zA-Z0-9']+", sentence)

print(tokens)

["Sarah's", 'gone', 'to', 'buy', 'some', 'milk', 'or', 'something', "Don't", 'forget', 'the_bread', 'yelled', 'Susan']
["Sarah's", 'gone', 'to', 'buy', 'some', 'milk', 'or', 'something', "Don't", 'forget', 'the', 'bread', 'yelled', 'Susan', '']


In [25]:
# List comprehensions!

# The trailing `if t` skips falsy tokens such as the empty string.
capsLockTokens = [t.upper() for t in tokens if t]

print(capsLockTokens)

["SARAH'S", 'GONE', 'TO', 'BUY', 'SOME', 'MILK', 'OR', 'SOMETHING', "DON'T", 'FORGET', 'THE', 'BREAD', 'YELLED', 'SUSAN']


In [26]:
# Dict comprehensions!

# enumerate() yields (index, token) pairs directly, so there is no need to
# index back into the list; the trailing `if t` drops empty tokens.
tokenDict = {i: t for i, t in enumerate(tokens) if t}

print(tokenDict)

# But usually, this is better...
# (with no filter, though, it keeps empty tokens as well)
print(dict(enumerate(tokens)))

# If / else expressions in list or dict comprehensions can
# also look like this...

print([t.upper() if t else "FOO!" for t in tokens])

{0: "Sarah's", 1: 'gone', 2: 'to', 3: 'buy', 4: 'some', 5: 'milk', 6: 'or', 7: 'something', 8: "Don't", 9: 'forget', 10: 'the', 11: 'bread', 12: 'yelled', 13: 'Susan'}
{0: "Sarah's", 1: 'gone', 2: 'to', 3: 'buy', 4: 'some', 5: 'milk', 6: 'or', 7: 'something', 8: "Don't", 9: 'forget', 10: 'the', 11: 'bread', 12: 'yelled', 13: 'Susan', 14: ''}
["SARAH'S", 'GONE', 'TO', 'BUY', 'SOME', 'MILK', 'OR', 'SOMETHING', "DON'T", 'FORGET', 'THE', 'BREAD', 'YELLED', 'SUSAN', 'FOO!']


In [27]:
# Sorting... finally a use for the lambda function

# In-place sort; uppercase letters order before lowercase ones.
tokens.sort()

print(tokens)

# Lowercase every token and sort again. sorted() always returns a new list,
# whereas map() returns a one-shot iterator on Python 3 that has no .sort().
tokens = sorted(t.lower() for t in tokens)

print(tokens)

# key= takes any one-argument function, so a lambda fits here...
print(sorted(tokens, key=lambda x: len(x)))

# ...but when the lambda only forwards to an existing function, that
# function can be passed directly: key=len is equivalent.
print(max(tokens, key=len))

# The shortest token is the empty string, so this prints a blank line.
print(min(tokens, key=len))

['', "Don't", "Sarah's", 'Susan', 'bread', 'buy', 'forget', 'gone', 'milk', 'or', 'some', 'something', 'the', 'to', 'yelled']
['', 'bread', 'buy', "don't", 'forget', 'gone', 'milk', 'or', "sarah's", 'some', 'something', 'susan', 'the', 'to', 'yelled']
['', 'or', 'to', 'buy', 'the', 'gone', 'milk', 'some', 'bread', "don't", 'susan', 'forget', 'yelled', "sarah's", 'something']
something

