# Chapter 2: Corpus Processing Tools
Author: Pierre Nugues

We use the `regex` module for its better Unicode support.

In [None]:
import regex as re

## Matching one occurrence

#### A first match with `re.search()`

In [None]:
# Sample sentence with two substrings matching ab*c ('ac' in both cases)
line = 'The aerial acceleration alerted the ace pilot'
# re.search() returns the first match only, or None when there is no match
match = re.search('ab*c', line)
match      # <regex.Match object; span=(11, 13), match='ac'>

#### Getting the match value

In [None]:
match.group()  # 'ac': the matched substring

## Getting all the matches

#### The list of all the strings

In [None]:
# findall() returns the matched strings, not match objects
match_list = re.findall('ab*c', line)   # ['ac', 'ac']
match_list

#### The match groups (the objects)

In [None]:
# finditer() returns an iterator over the match objects
match_iter = re.finditer('ab*c', line)
# Building the list consumes the iterator; the spans are (11, 13) and (36, 38)
list(match_iter)

## Interactive match

#### Using the shell (does not work with the notebooks)

In [None]:
import sys

# Echo every line read from standard input that contains a match.
# The loop variable rebinds the global name `line`.
for line in sys.stdin:
    if re.search('ab*c', line):    # m/ab*c/
        # `line` still ends with its newline, hence end=''
        print('-> ' + line, end='')

#### Using IPython ipywidgets

In [None]:
# https://github.com/ipython/ipywidgets
import ipywidgets as widgets
from IPython.display import display

# The input box
# (this rebinds the global name `text` to the widget)
text = widgets.Text()
display(text)

def handle_submit(sender):
    """Print the submitted text when it matches ab*c, then clear the box.

    The `sender` argument is not used: the text is read from the
    global `text` widget.
    """
    if re.search('ab*c', text.value):
        print('->', text.value)
    # Empty the box for the next input
    text.value = ''

# Hitting return fires handle_submit
# NOTE(review): on_submit() is reported as deprecated in ipywidgets 8 in
# favor of observe() with continuous_update=False -- confirm against the
# installed version
text.on_submit(handle_submit)

## Nonprintable characters and modifiers
#### Start of a line

We create a list of multiple strings with `split()`

In [None]:
# Alternative input source: text = sys.stdin.read()
# The literal ends with a newline, so split('\n') yields a final empty string
text = """Sing, O goddess, the anger of Achilles 
son of Peleus, that brought countless ills upon the Achaeans.
""".split('\n')
text

Because the text ends with a newline, `split('\n')` produces a trailing empty string. We strip the string before we split it.

In [None]:
# Alternative input source: text = sys.stdin.read()
# strip() removes the final newline, so split('\n') returns exactly two lines
text = """Sing, O goddess, the anger of Achilles 
son of Peleus, that brought countless ills upon the Achaeans.
""".strip().split('\n')
text

In [None]:
# Report the lines that start with a lowercase "s"
for line in text:
    match = re.search('^s', line)  # m/^s/
    if match is None:
        # No "s" at the start of this line
        continue
    print('-> ' + match.group())

#### The case-insensitive modifier

We did not match `S`. We can make the regex case-insensitive.

In [None]:
# Report the lines that start with "s" or "S"
for line in text:
    match = re.search('^s', line, re.IGNORECASE)  # m/^s/i
    if match is None:
        # Neither "s" nor "S" at the start of this line
        continue
    print('-> ' + match.group())

#### Case insensitive and multiline

The start anchor `^` corresponds to the unique start of a string. With the multiline modifier `re.M`, a `\n` also defines a start position.

In [None]:
# One single string holding two lines; the second line starts with "of"
text = """Sing, O goddess, the anger of Achilles son
of Peleus, that brought countless ills upon the Achaeans.
""".strip()
text

In [None]:
# First "s" or "S" found at the start of the string or right after a newline
match = re.search('^s', text, re.IGNORECASE | re.MULTILINE)  # m/^s/im
if match is not None:
    print('-> ' + match.group())

In [None]:
# First "o" or "O" found at the start of the string or right after a newline
match = re.search('^o', text, re.IGNORECASE | re.MULTILINE)  # m/^o/im
if match is not None:
    print('-> ' + match.group())

#### Getting all the matches

In [None]:
# One single string holding two lines; the second line starts with "son"
text = """Sing, O goddess, the anger of Achilles 
son of Peleus, that brought countless ills upon the Achaeans.
""".strip()
text

In [None]:
# All the line-initial "s" or "S" in the string
match_list = re.findall('^s', text, re.I | re.M)
print(match_list)  # ['S', 's']

#### Getting all the matches with `finditer()`

In [None]:
# Despite its name, match_list is an iterator here, not a list:
# displaying it shows the iterator object, not the matches
match_list = re.finditer('^s', text, re.I | re.M)
match_list

In [None]:
# Consume the iterator and print each match object
for match in match_list:
    print(match)

## Substitution

#### Global replacement: `s/regex/replacement/g`

In [None]:
# The two lines of the text as a list of strings
text = """Sing, O goddess, the anger of Achilles 
son of Peleus, that brought countless ills upon the Achaeans.
""".strip().split('\n')
text

In [None]:
# For each line containing a match, print it before and after substitution
for line in text:
    if re.search('es+', line):
        print("Old: " + line)
        # Replaces all the occurrences
        line = re.sub('es+', 'ES', line)
        print("New: " + line)
# s/es+/ES/g

 
#### Just one replacement: `s/regex/replacement/`

In [None]:
# The two lines of the text as a list of strings
text = """Sing, O goddess, the anger of Achilles 
son of Peleus, that brought countless ills upon the Achaeans.
""".strip().split('\n')
text

In [None]:
# For each line containing a match, print it before and after substitution
for line in text:
    if re.search('es+', line):
        print("Old: " + line)
        # Replaces only the first occurrence in the line.
        # count is passed by keyword: the standard `re` module deprecates
        # passing it positionally, and the keyword form works everywhere.
        line = re.sub('es+', 'ES', line, count=1)
        print("New: " + line)
# s/es+/ES/

## Backreferences

In [None]:
line = 'abbbcdeeef'
# (.)\1\1 matches a character followed by two copies of itself;
# the first such sequence in the line is 'bbb'
match = re.search(r'(.)\1\1', line)
match.group(1)                # 'b'

#### Substitutions `s/(.)\1\1/***/g`

In [None]:
# Every run of three identical characters ('bbb', 'eee') becomes '***'
re.sub(r'(.)\1\1', '***', 'abbbcdeeef')  # 'a***cd***f'

#### Multiple backreferences `m/\$ *([0-9]+)\.?([0-9]*)/`

In [None]:
# Sample sentence containing an amount in dollars and cents
price = "We'll buy it for $72.40"

In [None]:
# Raw string: \$ and \. are regex escapes, not Python string escapes.
# Group 1 captures the dollars, group 2 the cents.
match = re.search(r'\$ *([0-9]+)\.?([0-9]*)', price)
match.group()  # '$72.40' The entire match

In [None]:
match.group(1)  # '72' The first group

In [None]:
match.group(2)  # '40' The second group

#### Substitutions `s/\$ *([0-9]+)\.?([0-9]*)/\1 dollars and \2 cents/g`

In [None]:
# Both the pattern and the replacement are raw strings: \$ and \. are regex
# escapes, and \1, \2 in the replacement refer to the captured groups
re.sub(r'\$ *([0-9]+)\.?([0-9]*)',
       r'\1 dollars and \2 cents', price)
   # "We'll buy it for 72 dollars and 40 cents"

#### Why `r`

In [None]:
'\1'  # '\x01': in a regular string, \1 is an octal escape (one character)

In [None]:
'\141'  # 'a': octal 141 is 97, the code point of "a"

In [None]:
r'\1'  # '\\1': two characters, a backslash followed by "1"

In [None]:
r'\141'  # '\\141': four characters, a backslash followed by "141"

## Matching objects

In [None]:
# Sample sentence containing an amount in dollars and cents
price = "We'll buy it for $72.40"

In [None]:
# Raw string: \$ and \. are regex escapes, not Python string escapes.
# Group 1 captures the dollars, group 2 the cents.
match = re.search(r'\$ *([0-9]+)\.?([0-9]*)', price)
match

#### Input

In [None]:
match.string            # "We'll buy it for $72.40": the searched string

#### Groups

In [None]:
match.groups()          # ('72', '40')

In [None]:
match.group(0)          # '$72.40': group 0 is the entire match

In [None]:
match.group(1)          # '72'

In [None]:
match.group(2)          # '40'

#### Match objects: The indices

In [None]:
match.start(0)          # 17: index of '$' in the string

In [None]:
match.end(0)            # 23: index just past the last character of the match

In [None]:
match.start(1)          # 18: index of the '7' of '72'

In [None]:
match.end(1)            # 20: index just past the '2' of '72'

#### Example

In [None]:
# A three-line string; strip() only trims the two ends, so the second and
# third lines keep their two leading spaces
line = """Tell me, O muse, of that ingenious hero
  who travelled far and wide after he had sacked
  the famous town of Troy.""".strip()
line

In [None]:
# re.S lets . match newlines too; the greedy .* extends to the last comma
# of the string, here the one after 'muse' (span (7, 16))
match = re.search(',.*,', line, re.S)
match

In [None]:
line[0:match.start()]             # 'Tell me': the text before the match

In [None]:
line[match.start():match.end()]   # ', O muse,': the match itself

In [None]:
# The text after the match; it starts with the space that follows the comma
line[match.end():]   # ' of that ingenious hero
         #  who travelled far and wide after he had sacked
         #  the famous town of Troy.'

## Concordances: `.{0,15}Nils Holgersson.{0,15}`

In [None]:
# The string to look up
pattern = 'Nils Holgersson'
# Maximum number of context characters shown on each side of the pattern
width = 15

We build a regex from these parameters: `.{0,width}pattern.{0,width}`

In [None]:
# Doubled braces are literal braces in the template;
# {0} is the context width and {1} the pattern
'.{{0,{0}}}{1}.{{0,{0}}}'.format(width, pattern)

In [None]:
file_name = '../../corpus/Selma.txt'
# The context manager closes the file as soon as it has been read
# NOTE(review): no encoding is given, so the platform default applies;
# consider passing encoding='utf-8' if the corpus is UTF-8 -- confirm
with open(file_name) as corpus_file:
    text = corpus_file.read()
# Display the first 50 characters as a sanity check
text[:50]

In [None]:
# Spaces in the pattern match any run of whitespace (tabs, newlines...).
# str.replace() inserts \s+ literally; used as an re.sub() replacement
# template, '\\s+' contains the escape \s, which the standard `re` module
# rejects as a bad escape.
pattern = pattern.replace(' ', r'\s+')
# Replaces every run of whitespace, newlines included, with one space
# so that . can span what used to be line breaks
text = re.sub(r'\s+', ' ', text)
# Up to `width` characters of context on each side of the pattern;
# doubled braces are literal braces in the format template
concordance = ('(.{{0,{width}}}{pattern}.{{0,{width}}})'
               .format(pattern=pattern, width=width))
# Print each occurrence of the pattern with its left and right context
for match in re.finditer(concordance, text):
    print(match.group(1))