In [2]:
import re

In [3]:
# Sample input string that the re_phone patterns in the following cells are run against.
phone = '123-456-7891'

In [4]:
# Most explicit form: one [0-9] character class per digit, grouped 3-3-4 with literal hyphens.
re_phone = re.compile("[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]")

In [11]:
re_phone.findall(phone)

['123-456-7891']

In [12]:
# Same 3-3-4 shape using the \d digit shorthand. The pattern is a raw string so
# that "\d" reaches the regex engine untouched; in a normal string literal "\d"
# is an invalid escape sequence and triggers a warning on recent Pythons.
# Note: for str patterns \d also matches non-ASCII decimal digits unless
# re.ASCII is passed.
re_phone = re.compile(r"\d\d\d-\d\d\d-\d\d\d\d")

In [13]:
re_phone.findall(phone)

['123-456-7891']

In [14]:
# Same pattern again, with {n} repetition counts instead of repeating \d.
# Raw string: keeps "\d" out of Python's own string-escape handling.
re_phone = re.compile(r'\d{3}-\d{3}-\d{4}')

In [15]:
re_phone.findall(phone)

['123-456-7891']

In [17]:
# Loosest variant: three hyphen-separated runs of one or more digits, so the
# group lengths are no longer checked. Raw string keeps "\d" intact.
re_phone = re.compile(r'\d+-\d+-\d+')
re_phone.findall(phone)

['123-456-7891']

In [18]:
# One punctuation mark per match, captured so it can be re-emitted via \1.
# The hyphen is placed last in the character class so it is a literal "-";
# written as "!-\-" it formed a range from "!" to "-" and silently matched
# extra characters such as # $ % & * +.
re_punc = re.compile(r"""(["'().,;:/_?!-])""")

In [21]:
# Sample sentence containing contractions, a possessive, a double hyphen and quoted text.
text = "I don't know who Kara's new friend is-- is it 'Mr. Toad'?"
# Surround every punctuation match with a space on each side (\1 is the captured mark).
re_punc.sub(r" \1 ", text)

"I don ' t know who Kara ' s new friend is -  -  is it  ' Mr .  Toad '  ? "

In [42]:
re_punc.findall("Hello ''().,__?-")

["'", "'", '(', ')', '.', ',', '_', '_', '?', '-']

In [44]:
# Regexes used by simple_toks. All patterns are raw strings so regex escapes are
# not first interpreted (and warned about) as Python string escapes.
re_punc = re.compile(r"""(["'().,;:/_?!—-])""") # add spaces around punctuation
# After punctuation spacing "don't" looks like "don ' t"; the pattern consumes the
# following space, or matches at end of string so a final contraction is caught too.
re_apos = re.compile(r"n ' t(?: |$)")  # n't
re_bpos = re.compile(r" ' s(?: |$)")   # 's
re_mult_space = re.compile(r"  *") # replace multiple spaces with just one

def simple_toks(sent):
    """Split a sentence into lowercase word and punctuation tokens.

    Punctuation marks matched by ``re_punc`` become separate tokens, except
    that the contraction ``n't`` and the possessive ``'s`` are re-joined into
    single tokens ("don't" -> "do", "n't"; "Kara's" -> "kara", "'s").
    Other apostrophe forms such as "I've" are left split ("i", "'", "ve").

    Args:
        sent: the sentence to tokenize, as a str.

    Returns:
        A list of lowercase token strings; an empty list for empty or
        whitespace-only input.
    """
    # Lowercase up front so the case-sensitive contraction patterns also match
    # upper-case input such as "DON'T".
    sent = re_punc.sub(r" \1 ", sent.lower())
    sent = re_apos.sub(r" n't ", sent)
    sent = re_bpos.sub(r" 's ", sent)
    sent = re_mult_space.sub(' ', sent)
    return sent.split()

In [45]:
' '.join(simple_toks(text))

"i do n't know who kara 's new friend is - - is it ' mr . toad ' ?"

In [50]:
# Replay the punctuation-spacing substitution on its own to inspect the intermediate string.
text2 = re_punc.sub(r" \1 ", text); text2

"I don ' t know who Kara ' s new friend is -  -  is it  ' Mr .  Toad '  ? "

In [51]:
# Next substitution: re-join the spaced-out "n ' t" into a single " n't " token.
text3 = re_apos.sub(r" n't ", text2); text3

"I do n't know who Kara ' s new friend is -  -  is it  ' Mr .  Toad '  ? "

In [52]:
# Next substitution: re-join the spaced-out " ' s" into a single " 's " token.
text4 = re_bpos.sub(r" 's ", text3); text4

"I do n't know who Kara 's new friend is -  -  is it  ' Mr .  Toad '  ? "

In [53]:
# Small sample corpus: six raw sentences, tokenized and turned into ids in the cells below.
sentences = ['All this happened, more or less.',
             'The war parts, anyway, are pretty much true.',
             "One guy I knew really was shot for taking a teapot that wasn't his.",
             'Another guy I knew really did threaten to have his personal enemies killed by hired gunmen after the war.',
             'And so on.',
             "I've changed all their names."]

In [54]:
# Tokenize every sentence of the corpus: one list of tokens per sentence.
tokens = [simple_toks(sentence) for sentence in sentences]

In [55]:
tokens

[['all', 'this', 'happened', ',', 'more', 'or', 'less', '.'],
 ['the',
  'war',
  'parts',
  ',',
  'anyway',
  ',',
  'are',
  'pretty',
  'much',
  'true',
  '.'],
 ['one',
  'guy',
  'i',
  'knew',
  'really',
  'was',
  'shot',
  'for',
  'taking',
  'a',
  'teapot',
  'that',
  'was',
  "n't",
  'his',
  '.'],
 ['another',
  'guy',
  'i',
  'knew',
  'really',
  'did',
  'threaten',
  'to',
  'have',
  'his',
  'personal',
  'enemies',
  'killed',
  'by',
  'hired',
  'gunmen',
  'after',
  'the',
  'war',
  '.'],
 ['and', 'so', 'on', '.'],
 ['i', "'", 've', 'changed', 'all', 'their', 'names', '.']]

In [56]:
import collections

In [59]:
PAD = 0  # vocabulary index at which the "<PAD>" entry is inserted
SOS = 1  # vocabulary index at which the "<SOS>" entry is inserted


def toks2ids(sentences):
    """Build a frequency-ordered vocabulary and encode token lists as ids.

    Args:
        sentences: an iterable of tokenized sentences, each an iterable of
            hashable tokens. One-shot iterables (generators, ``map`` objects)
            are accepted.

    Returns:
        A 4-tuple ``(ids, vocab, w2id, voc_cnt)``:
        ids -- one list of integer ids per input sentence;
        vocab -- list of tokens where the position is the id: "<PAD>" at
            ``PAD``, "<SOS>" at ``SOS``, then corpus tokens by descending
            count (ties keep first-seen order);
        w2id -- dict mapping each vocab entry to its id;
        voc_cnt -- ``collections.Counter`` of corpus token counts (the two
            special entries are not counted).
    """
    # The data is walked twice (counting, then encoding). Materialize it first
    # so a one-shot iterable is not exhausted by the counting pass, which
    # would silently produce an empty ids list.
    sentences = [list(sent) for sent in sentences]
    voc_cnt = collections.Counter(t for sent in sentences for t in sent)
    vocab = sorted(voc_cnt, key=voc_cnt.get, reverse=True) # words in frequency order
    # Reserve the lowest ids for the special entries; corpus tokens shift up.
    vocab.insert(PAD, "<PAD>")
    vocab.insert(SOS, "<SOS>")
    w2id = {w:i for i,w in enumerate(vocab)}
    ids = [[w2id[t] for t in sent] for sent in sentences]
    return ids, vocab, w2id, voc_cnt

In [60]:
# Encode the tokenized corpus; also keep the vocabulary, the token->id map and the raw counts.
ids, vocab, w2id, voc_cnt = toks2ids(tokens)

In [63]:
vocab[:3]

['<PAD>', '<SOS>', '.']

In [65]:
w2id["<PAD>"], w2id["<SOS>"], w2id["."]

(0, 1, 2)

In [67]:
voc_cnt["."]

6

In [68]:
ids

[[5, 13, 14, 3, 15, 16, 17, 2],
 [6, 7, 18, 3, 19, 3, 20, 21, 22, 23, 2],
 [24, 8, 4, 9, 10, 11, 25, 26, 27, 28, 29, 30, 11, 31, 12, 2],
 [32, 8, 4, 9, 10, 33, 34, 35, 36, 12, 37, 38, 39, 40, 41, 42, 43, 6, 7, 2],
 [44, 45, 46, 2],
 [4, 47, 48, 49, 5, 50, 51, 2]]

In [78]:
" ".join([vocab[v] for v in ids[0]])

'all this happened , more or less .'

In [77]:
" ".join(tokens[0])

'all this happened , more or less .'

In [79]:
vocab

['<PAD>',
 '<SOS>',
 '.',
 ',',
 'i',
 'all',
 'the',
 'war',
 'guy',
 'knew',
 'really',
 'was',
 'his',
 'this',
 'happened',
 'more',
 'or',
 'less',
 'parts',
 'anyway',
 'are',
 'pretty',
 'much',
 'true',
 'one',
 'shot',
 'for',
 'taking',
 'a',
 'teapot',
 'that',
 "n't",
 'another',
 'did',
 'threaten',
 'to',
 'have',
 'personal',
 'enemies',
 'killed',
 'by',
 'hired',
 'gunmen',
 'after',
 'and',
 'so',
 'on',
 "'",
 've',
 'changed',
 'their',
 'names']

In [80]:
w2id

{'<PAD>': 0,
 '<SOS>': 1,
 '.': 2,
 ',': 3,
 'i': 4,
 'all': 5,
 'the': 6,
 'war': 7,
 'guy': 8,
 'knew': 9,
 'really': 10,
 'was': 11,
 'his': 12,
 'this': 13,
 'happened': 14,
 'more': 15,
 'or': 16,
 'less': 17,
 'parts': 18,
 'anyway': 19,
 'are': 20,
 'pretty': 21,
 'much': 22,
 'true': 23,
 'one': 24,
 'shot': 25,
 'for': 26,
 'taking': 27,
 'a': 28,
 'teapot': 29,
 'that': 30,
 "n't": 31,
 'another': 32,
 'did': 33,
 'threaten': 34,
 'to': 35,
 'have': 36,
 'personal': 37,
 'enemies': 38,
 'killed': 39,
 'by': 40,
 'hired': 41,
 'gunmen': 42,
 'after': 43,
 'and': 44,
 'so': 45,
 'on': 46,
 "'": 47,
 've': 48,
 'changed': 49,
 'their': 50,
 'names': 51}

In [82]:
# Regexes operate on Unicode code points, so emoji can be matched like any other character.
message = "😒🎦 🤢🍕"
# Alternation: match either of the two emoji.
re_frown = re.compile(r"😒|🤢")
# Replace every match with the smiling emoji; the other characters are left as they are.
re_frown.sub(r"😊", message)

'😊🎦 😊🍕'