# Regular Expressions

https://docs.python.org/2/library/re.html

re.search(pattern, string, flags=0)<br>
re.match(pattern, string, flags=0)<br>

search and match return a MatchObject or None.  A zero-length match is still a MatchObject.

https://docs.python.org/2/library/re.html#re.search<br>
https://docs.python.org/2/library/re.html#search-vs-match

re.match() checks for a match only at the beginning of the string, while re.search() checks for a match anywhere in the string.

re.compile(pattern, flags=0)

https://docs.python.org/2/library/re.html#re.compile

prog = re.compile(pattern)<br>
result = prog.match(string)

flags can be re.I (re.IGNORECASE), re.M (re.MULTILINE), re.S (re.DOTALL), re.X (re.VERBOSE), re.DEBUG

re.findall(pattern, string, flags=0)<br>
re.finditer(pattern, string, flags=0)

re.sub(pattern, repl, string, count=0, flags=0)

In [1]:
import re

In [2]:
# re.match only tries the pattern at the start of the string; here the
# pattern matches the opening tag.
match = re.match(r'<[a-z0-9]+>', "<h1>Heading</h1>")
# Show the type and repr of the object re.match returned.
type(match), match

(_sre.SRE_Match, <_sre.SRE_Match at 0x104f8a648>)

https://docs.python.org/2/library/re.html#re.MatchObject

In [3]:
# Function-call form of print: with a single argument it prints the same text
# under Python 2 and is also valid syntax on Python 3.
print(match)

<_sre.SRE_Match object at 0x104f8a648>


In [4]:
# With no argument, group() returns the entire matched text.
match.group()

'<h1>'

In [5]:
# group(0) is the whole match, the same value group() returns.
match.group(0)

'<h1>'

In [6]:
# Greedy '.*' extends as far as possible, up to the last '>' in the string.
re.match(r'<.*>', "<h1>Heading</h1>").group()

'<h1>Heading</h1>'

In [7]:
# Non-greedy '.*?' stops at the first '>' that lets the pattern succeed.
re.match(r'<.*?>', "<h1>Heading</h1>").group()

'<h1>'

In [8]:
# Two capturing groups: the two words before "fox" and the single word
# before "dog".
match = re.match(
    r'(\w+ \w+) fox .* (\w+) dog',
    "Quick brown fox jumps over the lazy dog.",
)

In [9]:
# Whole match; the input's trailing '.' is excluded because the pattern ends at 'dog'.
match.group(0)

'Quick brown fox jumps over the lazy dog'

In [10]:
# groups() returns a tuple of the captured subgroups (the whole match is not included).
match.groups()

('Quick brown', 'lazy')

In [11]:
# Group 0 is the whole match; groups 1 and 2 are the two parenthesized captures.
match.group(0), match.group(1), match.group(2)

('Quick brown fox jumps over the lazy dog', 'Quick brown', 'lazy')

In [12]:
# Asking for a group number the pattern does not define raises IndexError.
# Catch it and show the message so the demonstration does not stop a
# top-to-bottom run of the notebook.
try:
    match.group(43)
except IndexError as error:
    print("IndexError: %s" % error)

IndexError: no such group

In [13]:
# lastindex is the index of the last capturing group that matched (2 for this pattern).
match.lastindex

2

In [14]:
# Collect every group, from 0 (the whole match) through the last matched group.
[match.group(index) for index in range(match.lastindex + 1)]

['Quick brown fox jumps over the lazy dog', 'Quick brown', 'lazy']

In [15]:
# start(n) and end(n) are the slice bounds of group n within the searched string.
match.start(1), match.end(1)

(0, 11)

In [16]:
# match.string is the string that was passed to re.match; slicing it with
# start(1)/end(1) reproduces group(1).
match.string[match.start(1):match.end(1)]

'Quick brown'

In [17]:
# re.match only tries the start of the string, and the string does not start
# with '</', so this returns None. Test the result before calling .group().
match = re.match(r'</.*?>', "<h1>Heading</h1>")
# print() with a single argument behaves the same on Python 2 and Python 3.
if match:
    print(match.group())
else:
    print("No match!")

No match!


In [18]:
# re.search scans the whole string, so it finds the closing tag that
# re.match (start of string only) could not.
re.search(r'</.*?>', "<h1>Heading</h1>").group()

'</h1>'

In [19]:
import requests

# Download the page whose HTML the following cells search with regular expressions.
# timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(r'https://www.google.com/finance?q=NASDAQ%3AINTC', timeout=30)
# Fail here on an HTTP error status instead of silently parsing an error page.
response.raise_for_status()
page = response.text

In [20]:
# re.M / re.S are the short aliases of re.MULTILINE / re.DOTALL; DOTALL lets
# '.' match newlines so the pattern can span several lines of HTML.
flags = re.M | re.S
# The lazy '.*?' stops at the first '</div>', which closes the nested date div.
match = re.search(r'<div class=event>.*?</div>', page, flags=flags)
match.group()

u'<div class=event>\n<div class="bld date">May 20, 2016</div>'

In [21]:
# Consume the date div and its own '</div>' explicitly, so the final
# '</div>' in the pattern matches a later closing tag.
match = re.search(
    r'<div class=event>.*?<div class="bld date">.*?</div>.*?</div>',
    page,
    flags=flags,
)
match.group()

u'<div class=event>\n<div class="bld date">May 20, 2016</div>\n<div>Intel Corp Annual Shareholders Meeting (Estimated)\n- <span class=time>11:30AM EDT</span> -\n<a onclick="_GF_click(\'\', \'add_cal_evnt\', \'INTC\', \'\')" href= "\n  //www.google.com/calendar/event?ei=iLTtVqnDAcm3igKV-YL4DA&hl=en&action=TEMPLATE&text=Intel+Corp+Annual+Shareholders+Meeting+(Estimated)&dates=20160520T153000Z/20160520T163000Z&sprop=finance"  class=time>\n<img alt="Add to calendar" src="/finance/s/m4C5KaofCbA/images/cleardot.gif"\n                width=16 height=16 class=SP_calendar id=calendar>\n</a>\n</div>'

In [22]:
# Also consume the second inner '<div>...</div>', so the final '</div>'
# in the pattern is the one that closes the event div.
match = re.search(
    r'<div class=event>.*?<div class="bld date">.*?</div>.*?<div>.*?</div>.*?</div>',
    page,
    flags=flags,
)
match.group()

u'<div class=event>\n<div class="bld date">May 20, 2016</div>\n<div>Intel Corp Annual Shareholders Meeting (Estimated)\n- <span class=time>11:30AM EDT</span> -\n<a onclick="_GF_click(\'\', \'add_cal_evnt\', \'INTC\', \'\')" href= "\n  //www.google.com/calendar/event?ei=iLTtVqnDAcm3igKV-YL4DA&hl=en&action=TEMPLATE&text=Intel+Corp+Annual+Shareholders+Meeting+(Estimated)&dates=20160520T153000Z/20160520T163000Z&sprop=finance"  class=time>\n<img alt="Add to calendar" src="/finance/s/m4C5KaofCbA/images/cleardot.gif"\n                width=16 height=16 class=SP_calendar id=calendar>\n</a>\n</div>\n</div>'

In [23]:
# Add verbose mode (re.X is the short alias of re.VERBOSE) to the existing flags.
flags = flags | re.X

In [24]:
# Same pattern as the previous cell, laid out over several lines. In verbose
# mode whitespace inside the pattern is ignored, so the literal spaces that
# must match are escaped as '\ '.
eventregex = re.compile(r"""<div\ class=event>.*?
                              <div\ class="bld\ date">.*?</div>.*?
                              <div>.*?</div>.*?
                            </div>""", flags)
# A compiled pattern exposes search() as a method; flags were fixed at compile time.
match = eventregex.search(page)
match.group()

u'<div class=event>\n<div class="bld date">May 20, 2016</div>\n<div>Intel Corp Annual Shareholders Meeting (Estimated)\n- <span class=time>11:30AM EDT</span> -\n<a onclick="_GF_click(\'\', \'add_cal_evnt\', \'INTC\', \'\')" href= "\n  //www.google.com/calendar/event?ei=iLTtVqnDAcm3igKV-YL4DA&hl=en&action=TEMPLATE&text=Intel+Corp+Annual+Shareholders+Meeting+(Estimated)&dates=20160520T153000Z/20160520T163000Z&sprop=finance"  class=time>\n<img alt="Add to calendar" src="/finance/s/m4C5KaofCbA/images/cleardot.gif"\n                width=16 height=16 class=SP_calendar id=calendar>\n</a>\n</div>\n</div>'

In [25]:
# Printing (rather than showing the repr) renders the embedded newlines.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(match.group())

<div class=event>
<div class="bld date">May 20, 2016</div>
<div>Intel Corp Annual Shareholders Meeting (Estimated)
- <span class=time>11:30AM EDT</span> -
<a onclick="_GF_click('', 'add_cal_evnt', 'INTC', '')" href= "
  //www.google.com/calendar/event?ei=iLTtVqnDAcm3igKV-YL4DA&hl=en&action=TEMPLATE&text=Intel+Corp+Annual+Shareholders+Meeting+(Estimated)&dates=20160520T153000Z/20160520T163000Z&sprop=finance"  class=time>
<img alt="Add to calendar" src="/finance/s/m4C5KaofCbA/images/cleardot.gif"
                width=16 height=16 class=SP_calendar id=calendar>
</a>
</div>
</div>


In [26]:
# The pattern has no capturing groups, so findall returns a list with the
# full text of every non-overlapping match.
events = eventregex.findall(page)
events

[u'<div class=event>\n<div class="bld date">May 20, 2016</div>\n<div>Intel Corp Annual Shareholders Meeting (Estimated)\n- <span class=time>11:30AM EDT</span> -\n<a onclick="_GF_click(\'\', \'add_cal_evnt\', \'INTC\', \'\')" href= "\n  //www.google.com/calendar/event?ei=iLTtVqnDAcm3igKV-YL4DA&hl=en&action=TEMPLATE&text=Intel+Corp+Annual+Shareholders+Meeting+(Estimated)&dates=20160520T153000Z/20160520T163000Z&sprop=finance"  class=time>\n<img alt="Add to calendar" src="/finance/s/m4C5KaofCbA/images/cleardot.gif"\n                width=16 height=16 class=SP_calendar id=calendar>\n</a>\n</div>\n</div>',
 u'<div class=event>\n<div class="bld date">Apr 19, 2016</div>\n<div>Q1 2016 Intel Corp Earnings Call\n- <span class=time>5:00PM EDT</span> -\n<a onclick="_GF_click(\'\', \'add_cal_evnt\', \'INTC\', \'\')" href= "\n  //www.google.com/calendar/event?ei=iLTtVqnDAcm3igKV-YL4DA&hl=en&action=TEMPLATE&text=Q1+2016+Intel+Corp+Earnings+Call&dates=20160419T210000Z/20160419T220000Z&sprop=finance"  c

In [27]:
# Remove every <img ...> tag from each event.
# No flags argument: the fourth positional parameter of re.sub is `count`,
# so passing `flags` there only capped the number of replacements. No flag is
# needed here because '[^>]*' already matches newlines.
events = [ re.sub(r'<img[^>]*>', '', event) for event in events ]
events

[u'<div class=event>\n<div class="bld date">May 20, 2016</div>\n<div>Intel Corp Annual Shareholders Meeting (Estimated)\n- <span class=time>11:30AM EDT</span> -\n<a onclick="_GF_click(\'\', \'add_cal_evnt\', \'INTC\', \'\')" href= "\n  //www.google.com/calendar/event?ei=iLTtVqnDAcm3igKV-YL4DA&hl=en&action=TEMPLATE&text=Intel+Corp+Annual+Shareholders+Meeting+(Estimated)&dates=20160520T153000Z/20160520T163000Z&sprop=finance"  class=time>\n\n</a>\n</div>\n</div>',
 u'<div class=event>\n<div class="bld date">Apr 19, 2016</div>\n<div>Q1 2016 Intel Corp Earnings Call\n- <span class=time>5:00PM EDT</span> -\n<a onclick="_GF_click(\'\', \'add_cal_evnt\', \'INTC\', \'\')" href= "\n  //www.google.com/calendar/event?ei=iLTtVqnDAcm3igKV-YL4DA&hl=en&action=TEMPLATE&text=Q1+2016+Intel+Corp+Earnings+Call&dates=20160419T210000Z/20160419T220000Z&sprop=finance"  class=time>\n\n</a>\n</div>\n</div>',
 u'<div class=event>\n<div class="bld date">Apr 19, 2016</div>\n<div>Q1 2016 Intel Corp Earnings Release\

In [28]:
# Remove every <a ...>...</a> element whose content holds no nested tag.
# No flags argument: the fourth positional parameter of re.sub is `count`,
# so passing `flags` there only capped the number of replacements. The negated
# character classes already match newlines, so no flag is needed.
events = [ re.sub(r'<a[^>]*>[^<]*</a>', '', event) for event in events ]
events

[u'<div class=event>\n<div class="bld date">May 20, 2016</div>\n<div>Intel Corp Annual Shareholders Meeting (Estimated)\n- <span class=time>11:30AM EDT</span> -\n\n</div>\n</div>',
 u'<div class=event>\n<div class="bld date">Apr 19, 2016</div>\n<div>Q1 2016 Intel Corp Earnings Call\n- <span class=time>5:00PM EDT</span> -\n\n</div>\n</div>',
 u'<div class=event>\n<div class="bld date">Apr 19, 2016</div>\n<div>Q1 2016 Intel Corp Earnings Release\n- <span class=time>4:00PM EDT</span> -\n\n</div>\n</div>',
 u'<div class=event>\n<div class="bld date">Mar 1, 2016</div>\n<div>Intel Corp at Morgan Stanley Technology, Media &amp; Telecom Conference\n</div>\n</div>',
 u'<div class=event>\n<div class="bld date">Feb 22, 2016</div>\n<div>Intel Corp at GSM Association Mobile World Congress\n</div>\n</div>',
 u'<div class=event>\n<div class="bld date">Jan 19, 2016</div>\n<div>Intel Corp Webcast Announcing 6th Generation Intel Core vPro Processors -\n\n</div>\n</div>',
 u'<div class=event>\n<div class

In [29]:
# Normalize whitespace: newlines become spaces, runs of spaces collapse to
# one, and spaces between adjacent tags are dropped.
# No flags argument: the fourth positional parameter of re.sub is `count`,
# so passing `flags` there only capped the number of replacements. These
# patterns need no flags, and re.VERBOSE in particular would make the regex
# engine ignore the literal spaces in ' +' and '> +<'.
events = [ re.sub(r'\n', ' ', event) for event in events ]
events = [ re.sub(r' +', ' ', event) for event in events ]
events = [ re.sub(r'> +<', '><', event) for event in events ]
events

[u'<div class=event><div class="bld date">May 20, 2016</div><div>Intel Corp Annual Shareholders Meeting (Estimated) - <span class=time>11:30AM EDT</span> - </div></div>',
 u'<div class=event><div class="bld date">Apr 19, 2016</div><div>Q1 2016 Intel Corp Earnings Call - <span class=time>5:00PM EDT</span> - </div></div>',
 u'<div class=event><div class="bld date">Apr 19, 2016</div><div>Q1 2016 Intel Corp Earnings Release - <span class=time>4:00PM EDT</span> - </div></div>',
 u'<div class=event><div class="bld date">Mar 1, 2016</div><div>Intel Corp at Morgan Stanley Technology, Media &amp; Telecom Conference </div></div>',
 u'<div class=event><div class="bld date">Feb 22, 2016</div><div>Intel Corp at GSM Association Mobile World Congress </div></div>',
 u'<div class=event><div class="bld date">Jan 19, 2016</div><div>Intel Corp Webcast Announcing 6th Generation Intel Core vPro Processors - </div></div>',
 u'<div class=event><div class="bld date">Jan 14, 2016</div><div>Q4 2015 Intel Corp E

In [30]:
# Replace every remaining tag with a space, then collapse the runs of spaces
# this leaves behind.
# No flags argument: the fourth positional parameter of re.sub is `count`,
# so passing `flags` there only capped the number of replacements. Newlines
# were already replaced by spaces, so '.' needs no DOTALL, and re.VERBOSE
# would make the regex engine ignore the literal space in ' +'.
events = [ re.sub(r'<.*?>', ' ', event) for event in events ]
events = [ re.sub(r' +', ' ', event) for event in events ]
events

[u' May 20, 2016 Intel Corp Annual Shareholders Meeting (Estimated) - 11:30AM EDT - ',
 u' Apr 19, 2016 Q1 2016 Intel Corp Earnings Call - 5:00PM EDT - ',
 u' Apr 19, 2016 Q1 2016 Intel Corp Earnings Release - 4:00PM EDT - ',
 u' Mar 1, 2016 Intel Corp at Morgan Stanley Technology, Media &amp; Telecom Conference ',
 u' Feb 22, 2016 Intel Corp at GSM Association Mobile World Congress ',
 u' Jan 19, 2016 Intel Corp Webcast Announcing 6th Generation Intel Core vPro Processors - ',
 u' Jan 14, 2016 Q4 2015 Intel Corp Earnings Call ',
 u' Jan 14, 2016 Q4 2015 Intel Corp Earnings Release ',
 u' Jan 6, 2016 Intel Corp at JPMorgan Technology Forum ']