# NLP Basics: Learning how to use regular expressions

### Using regular expressions in Python

Python's `re` package is the most commonly used regex resource. More details can be found [here](https://docs.python.org/3/library/re.html).

In [1]:
import re

# Clean sentence: words separated by exactly one space
re_test = 'This is a made up string to test 2 different regex methods'
# Same sentence with runs of several spaces between some words
re_test_messy = 'This      is a made up     string to test 2    different regex methods'
# Same sentence with punctuation/special characters in place of spaces
re_test_messier = 'This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods'

### Splitting a sentence into a list of words

In [2]:
# Split on each single whitespace character (\s).
# The pattern is a raw string so the backslash reaches the regex engine
# unchanged instead of being treated as an (invalid) string escape.
re.split(r'\s', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [3]:
# Split on runs of one or more whitespace characters (\s+), so repeated
# spaces do not produce empty strings in the result.
# Raw string: keeps the backslash intact for the regex engine.
re.split(r'\s+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [4]:
# Split on runs of one or more non-word characters (\W+), i.e. anything
# that is not a letter, digit or underscore (punctuation, symbols, spaces).
# Raw string: keeps the backslash intact for the regex engine.
re.split(r'\W+', re_test_messier)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [5]:
# Find every run of one or more word characters (\w+); each run is one word.
# Raw string: keeps the backslash intact for the regex engine.
re.findall(r'\w+', re_test_messier)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [6]:
# Find every run of one or more non-whitespace characters (\S+).
# Raw string: keeps the backslash intact for the regex engine.
re.findall(r'\S+', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

## Replacing a specific string

In [7]:
pep8_test = 'I try to follow PEP8 guidelines'  # correct spelling
pep7_test = 'I try to follow PEP7 guidelines'  # typo 1: wrong digit
peep8_test = 'I try to follow PEEP8 guidelines'  # typo 2: extra letter

### Finding the specific string and typos

In [15]:
# Find runs of one or more capital letters immediately followed by one or more digits
re.findall('[A-Z]+[0-9]+', pep8_test)

['PEP8']

In [16]:
# Same pattern (capital letters followed by digits) applied to the wrong-digit variant
re.findall('[A-Z]+[0-9]+', pep7_test)

['PEP7']

In [17]:
# Same pattern (capital letters followed by digits) applied to the extra-letter variant
re.findall('[A-Z]+[0-9]+', peep8_test)

['PEEP8']

### Replacing the detected string

In [19]:
# Replace every capital-letters-plus-digits match with 'PEP8 Python Style Guide'
re.sub('[A-Z]+[0-9]+', 'PEP8 Python Style Guide', pep8_test)

'I try to follow PEP8 Python Style Guide guidelines'

In [20]:
# Same substitution on the wrong-digit variant: the pattern matches any
# capital-letters-plus-digits run, so the typo is replaced as well
re.sub('[A-Z]+[0-9]+', 'PEP8 Python Style Guide', pep7_test)

'I try to follow PEP8 Python Style Guide guidelines'

In [21]:
# Same substitution on the extra-letter variant: the pattern matches any
# capital-letters-plus-digits run, so the typo is replaced as well
re.sub('[A-Z]+[0-9]+', 'PEP8 Python Style Guide', peep8_test)

'I try to follow PEP8 Python Style Guide guidelines'

### Other examples of regex methods

- re.search()
- re.match()
- re.fullmatch()
- re.finditer()
- re.escape()