# Regex

| symbol | description |
| --- | ------------ |
| . | any char |
| ? | the preceding char is optional (appears 0 or 1 times) |
| * | the preceding char appears __0 or more times__ |
| + | the preceding char appears __at least once__ |
| ^ | the match must begin at the start of the string |
| \$ | the match must end at the end of the string |
| \{n\} | the preceding char repeats exactly `n` times |
| \{n1,n2\} | the preceding char repeats at least `n1` times and at most `n2` times (no space after the comma) |
| \[abc\] | matches any one of the chars inside the brackets (ranges allowed, e.g. a-z) |
| a\|b | a or b |

## regex using backslash (\\)

|symbol|description|
|---|-----|
| \\\\ | a literal backslash |
| \\d | any digit ([0-9]) |
| \\D | any char __except__ a digit ([^0-9]) |
| \\s | any whitespace char |
| \\S | any char __except__ whitespace |
| \\w | any letter, digit, or underscore ([a-zA-Z0-9_]) |
| \\W | any char __except__ a letter, digit, or underscore ([^a-zA-Z0-9_]) |


In [28]:
import re

# re.match

In [36]:
# Pattern: the literal "ab" followed by exactly one more character.
check = 'ab.'

# re.match only tries the pattern at the start of the string, so a candidate
# succeeds only when it begins with "ab" and has at least one char after it.
for candidate in ('abc', 'ab2', 'c', 'ab'):
    print(re.match(check, candidate))

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 3), match='ab2'>
None
None


# re.compile

In [37]:
import time

# Baseline: hand the pattern *string* to re.match on every iteration.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump or be coarse.
normal_s_time = time.perf_counter()
r = 'ab.'
for i in range(1000):
    re.match(r, 'abc')
elapsed = time.perf_counter() - normal_s_time
print('time it took: ', elapsed)

time it took:  0.0006031990051269531


In [39]:
# Same loop, but the pattern is compiled once and the compiled object's
# match() is called directly. The timer starts before re.compile so the
# one-off compilation cost is included in the measurement.
compile_s_time = time.perf_counter()
r = re.compile('ab.')
for i in range(1000):
    r.match('abc')
print('compile time: ', time.perf_counter() - compile_s_time)

# Recorded sample output (the exact figure varies from run to run):
# compile time:  0.0002760887145996094


# re.search

In [48]:
# Pattern: "a" optionally followed by "b".
check = 'ab?'

# Both functions take the pattern first and the text to scan second.
print(re.match(check, 'ab'))
print(re.search(check, 'a'))

# With leading text in front of the hit, match() fails because it only tries
# position 0, while search() scans forward and finds "ab" at index 4.
text = 'kkk ab'
print(re.match(check, text))
print(re.search(check, text))

# Output:
# <re.Match object; span=(0, 2), match='ab'>
# <re.Match object; span=(0, 1), match='a'>
# None
# <re.Match object; span=(4, 6), match='ab'>


# re.split

In [50]:
# (pattern, text) pairs: each text is cut wherever its pattern matches.
# Adjacent matches (e.g. the run "2320") produce empty strings in the result.
split_cases = (
    (' ', 'aaa bbbb cccc'),
    ('c', 'abc abbc abcbab'),
    ('[0-9]', 's1abc2bc2320ja1j23l4'),
)

for pattern, text in split_cases:
    r = re.compile(pattern)
    print(r.split(text))

['aaa', 'bbbb', 'cccc']
['ab', ' abb', ' ab', 'bab']
['s', 'abc', 'bc', '', '', '', 'ja', 'j', '', 'l', '']


# re.sub

In [52]:
# re.sub(pattern, repl, string): every match of `pattern` inside `string`
# is replaced by `repl`.

# Replace each lowercase letter with "1".
lowercase_masked = re.sub('[a-z]', '1', 'abcdefg')
print(lowercase_masked)

# Replace everything that is NOT a lowercase letter (here: the space) with "1".
non_lowercase_masked = re.sub('[^a-z]', '1', 'abc defg')
print(non_lowercase_masked)

# Output:
# 1111111
# abc1defg


# re.findall

In [61]:
# Patterns are written as raw strings so that "\d", "\W" and "\w" reach the
# regex engine untouched; in a normal string literal they are invalid escape
# sequences and trigger a warning on current Python versions.

# Every digit in the text.
digits = re.findall(r'[\d]', '1ab 2cd')
print(digits)

# Every character that is not a letter, digit or underscore.
non_word = re.findall(r'[\W]', '!abc@#@#Fg')
print(non_word)

# Every word character, one per list entry (spaces are skipped).
word_chars = re.findall(r'\w', 'james james james hwang')
print(word_chars)

['1', '2']
['!', '@', '#', '@', '#']
['j', 'a', 'm', 'e', 's', 'j', 'a', 'm', 'e', 's', 'j', 'a', 'm', 'e', 's', 'h', 'w', 'a', 'n', 'g']


# re.finditer

In [63]:
# finditer returns a lazy iterator of Match objects instead of a list of
# strings. The pattern is a raw string so "\d" is not treated as an
# (invalid) string escape.
iter1 = re.finditer(r'[\d]', '1ab 2cd 3ef')
print(iter1)
# Each Match carries the matched text and its span in the input.
for i in iter1:
    print(i)

<callable_iterator object at 0x104addf40>
<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(4, 5), match='2'>
<re.Match object; span=(8, 9), match='3'>


# Tokenization

In [65]:
s = 'time is gold'
# str.split() with no argument already returns the list of
# whitespace-separated words, so no comprehension is needed.
token = s.split()
token

['time', 'is', 'gold']

use `nltk`

In [67]:
import nltk

In [68]:
# Download the 'punkt' tokenizer package into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/siro/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# word tokenize

use `nltk`'s `word_tokenize`

In [70]:
from nltk.tokenize import word_tokenize

# Tokenize the sentence held in `s` with NLTK instead of str.split().
tokens = word_tokenize(s)
tokens

['time', 'is', 'gold']

# sentence tokenize

`nltk`'s `sent_tokenize`

In [72]:
sentences = 'The world is a beautiful place.\nI want pizza.'
print(sentences)

# Naive sentence tokenisation: one sentence per line. split() already
# returns a list, so it is used directly.
tokens = sentences.split('\n')
tokens

The world is a beautiful place.
I want pizza.


['The world is a beautiful place.', 'I want pizza.']

In [73]:
from nltk.tokenize import sent_tokenize
# Split `sentences` into sentences with NLTK. On this input the recorded
# result is the same as the manual split on '\n'.
tokens = sent_tokenize(sentences)
tokens

['The world is a beautiful place.', 'I want pizza.']

In [76]:
from nltk.tokenize import RegexpTokenizer

s = 'Where there\'s a will, there\'s a way'

# Each token is a run of word characters, so punctuation and apostrophes act
# as separators ("there's" becomes "there" and "s"). The pattern is a raw
# string so "\w" is not treated as an (invalid) string escape.
tokenizer = RegexpTokenizer(r'[\w]+')

tokens = tokenizer.tokenize(s)
tokens

['Where', 'there', 's', 'a', 'will', 'there', 's', 'a', 'way']

In [79]:
# With gaps=True the pattern describes the separators rather than the tokens,
# so the text is cut on runs of whitespace and punctuation stays attached
# ("will," keeps its comma). The pattern is a raw string so "\s" is not
# treated as an (invalid) string escape.
tokenizer2 = RegexpTokenizer(r'[\s]+', gaps=True)
tokens = tokenizer2.tokenize(s)
tokens

['Where', "there's", 'a', 'will,', "there's", 'a', 'way']

In [80]:
# NOTE(review): keras.preprocessing.text is reported as deprecated in recent
# Keras/TensorFlow releases — confirm it exists in the installed version.
from keras.preprocessing.text import text_to_word_sequence

s = 'Where there\'s a will, there\'s a way'

# In the recorded output the words are lower-cased, the comma is dropped and
# the apostrophes are kept.
text_to_word_sequence(s)

['where', "there's", 'a', 'will', "there's", 'a', 'way']

In [85]:
from nltk import ngrams
s = 'There is no royal road to learning'
# Build 2-grams over the whitespace-split words: each tuple pairs a word
# with the word that follows it.
bigram = list(ngrams(s.split(), 2))
bigram

[('There', 'is'),
 ('is', 'no'),
 ('no', 'royal'),
 ('royal', 'road'),
 ('road', 'to'),
 ('to', 'learning')]

In [86]:
# Tokenize a sample sentence; the final period becomes its own token.
words = word_tokenize("Think like man of action and act like man of thought.")
words

['Think',
 'like',
 'man',
 'of',
 'action',
 'and',
 'act',
 'like',
 'man',
 'of',
 'thought',
 '.']

In [87]:
# Download the 'averaged_perceptron_tagger' package into nltk_data.
nltk.download('averaged_perceptron_tagger')

# Tag every token in `words`; the result is a list of (token, tag) pairs.
nltk.pos_tag(words)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/siro/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('Think', 'VBP'),
 ('like', 'IN'),
 ('man', 'NN'),
 ('of', 'IN'),
 ('action', 'NN'),
 ('and', 'CC'),
 ('act', 'NN'),
 ('like', 'IN'),
 ('man', 'NN'),
 ('of', 'IN'),
 ('thought', 'NN'),
 ('.', '.')]

In [88]:
# Tokenize and tag in a single expression.
# NOTE(review): the recorded output tags 'gathers' as NNS although it is the
# verb of this sentence — the tagger's labels are not guaranteed correct.
nltk.pos_tag(word_tokenize("A rolling stone gathers no moss."))

[('A', 'DT'),
 ('rolling', 'VBG'),
 ('stone', 'NN'),
 ('gathers', 'NNS'),
 ('no', 'DT'),
 ('moss', 'NN'),
 ('.', '.')]

In [89]:
# Download the 'stopwords' corpus into the local nltk_data directory.
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/siro/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [91]:
# Load the English stop-word list and preview its first ten entries.
stop_words = stopwords.words('english')
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [93]:
s = "If you do not walk today, you will have to run tomorrow."

words = word_tokenize(s)

# The stop-word entries are lowercase (e.g. 'i', 'me', 'my'), so each token
# is lower-cased before the lookup; a case-sensitive test lets a capitalised
# stop word such as "If" slip through. A set makes each lookup O(1).
stop_set = set(stop_words)
non_stop = [w for w in words if w.lower() not in stop_set]
non_stop

# Output:
# ['walk', 'today', ',', 'run', 'tomorrow', '.']

# Spell check

In [1]:
from autocorrect import Speller

ModuleNotFoundError: No module named 'autocorrect'

In [99]:
# Create a spell checker for English ('en').
spell = Speller('en')

# Three different misspellings; the recorded run corrects each to "people".
print(spell('peoplle'))
print(spell('poeple'))
print(spell('peopae'))

people
people
people


In [100]:
# Tokenize the misspelled sentence, correct each token on its own, then
# re-join with single spaces (so the final period ends up space-separated).
s = word_tokenize('Earlly biird catchees the womm.')
corrected = [spell(word) for word in s]
ss = ' '.join(corrected)
ss

'Early bird catches the worm .'