# Parsing Web Pages Using Regex

In [1]:
import requests, re
import webbrowser
from pprint import pprint

In [2]:
# Sample page of randomly generated addresses that the rest of the notebook parses.
url = "http://www.summet.com/dmsi/html/codesamples/addresses.html"

In [3]:
# Open the page in a browser so its layout can be inspected by eye before
# writing patterns against it; the call returns True when a browser was launched.
webbrowser.open(url)

True

##### Download the page content using requests

In [4]:
# Fetch the page. requests applies no timeout by default, so an unresponsive
# server would block this cell forever; bound the wait explicitly.
resp = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx reply instead of regex-parsing an error page.
resp.raise_for_status()

##### Extracting the HTML content from the response object

In [5]:
# The response body as a single string; every regex below searches this.
html = resp.text
# pprint breaks the long string into one quoted chunk per line, which is
# easier to scan than a single print of the raw text.
pprint(html)

('<html>\n'
 '<head>\n'
 '\t<title>Sample Addresses!</title>\n'
 '<script async '
 'src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-2760663110461940"\n'
 '     crossorigin="anonymous"></script>\n'
 '\n'
 '</head>\n'
 '<body>\n'
 '<h1> A page full of sample addresses for your parsing enjoyment!</h1>\n'
 '<h2> (All data is random....)</h2>\n'
 '<ul>\n'
 '\n'
 '<script async '
 'src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-2760663110461940"\n'
 '     crossorigin="anonymous"></script>\n'
 '<!-- summet.com-dmsi-addresses -->\n'
 '<ins class="adsbygoogle"\n'
 '     style="display:block"\n'
 '     data-ad-client="ca-pub-2760663110461940"\n'
 '     data-ad-slot="5472586510"\n'
 '     data-ad-format="auto"\n'
 '     data-full-width-responsive="true"></ins>\n'
 '<script>\n'
 '     (adsbygoogle = window.adsbygoogle || []).push({});\n'
 '</script>\n'
 '\n'
 '<li>Cecilia Chapman<br/>711-2880 Nulla St.<br/>Mankato Mississippi '
 

##### Extracting titles

In [6]:
# Match the whole <title> element, opening and closing tags included.
title_pattern = "<title>.*</title>"
title = re.findall(title_pattern, html)
title

['<title>Sample Addresses!</title>']

##### "." matches any single character except a newline, and "*" means zero or more repetitions of it.

##### If we don't want the title tags

In [7]:
# With a capture group in the pattern, findall returns only the captured
# text (what sits between the tags) rather than the full match.
title = re.findall(pattern="<title>(.*)</title>", string=html)
title

['Sample Addresses!']

##### The parentheses form a capture group: they mark the part of the match we want to extract once the whole pattern has matched.

###### Extract all the contents from h1 tag

In [8]:
# Capture the text inside the <h1> element only.
h1_pattern = "<h1>(.*)</h1>"
h_data = re.findall(h1_pattern, html)
h_data

[' A page full of sample addresses for your parsing enjoyment!']

##### The page contains several headers, so let's extract all of them.

In [9]:
# The "." after "h" stands in for the heading level, so one pattern covers
# <h1>, <h2>, ... (it would also accept any other single character there).
any_header_pattern = "<h.>(.*)</h.>"
h_data = re.findall(any_header_pattern, html)
h_data

[' A page full of sample addresses for your parsing enjoyment!',
 ' (All data is random....)']

###### Combine the different building blocks available in regex to set up our pattern matches.
###### Let's extract the text of each li tag up to the line break that follows it, capturing only two runs of letters separated by a space.
###### '+' means the character class inside "[]" must be repeated one or more times.

In [10]:
# One capture group spanning "First Last": two letter-only words separated by
# a single space, sitting between <li> and the first <br/>.
full_name_pattern = "<li>([A-Za-z]+ [A-Za-z]+)<br/>"
names = re.findall(full_name_pattern, html)
names

['Cecilia Chapman',
 'Iris Watson',
 'Celeste Slater',
 'Theodore Lowe',
 'Calista Wise',
 'Kyla Olsen',
 'Forrest Ray',
 'Hiroko Potter',
 'Nyssa Vazquez',
 'Lawrence Moreno',
 'Ina Moran',
 'Aaron Hawkins',
 'Hedy Greene',
 'Melvin Porter',
 'Keefe Sellers',
 'Joan Romero',
 'Davis Patrick',
 'Leilani Boyer',
 'Colby Bernard',
 'Bryar Pitts',
 'Rahim Henderson',
 'Noelle Adams',
 'Lillith Daniel',
 'Adria Russell',
 'Hilda Haynes',
 'Sheila Mcintosh',
 'Rebecca Chambers',
 'Christian Emerson',
 'Nevada Ware',
 'Margaret Joseph',
 'Edward Nieves',
 'Imani Talley',
 'Bertha Riggs',
 'Wallace Ross',
 'Chester Bennett',
 'Castor Richardson',
 'Sonya Jordan',
 'Harrison Mcguire',
 'Malcolm Long',
 'Raymond Levy',
 'Hedley Ingram',
 'David Mathews',
 'Xyla Cash',
 'Madeline Gregory',
 'Griffith Daniels',
 'Anne Beasley',
 'Chaney Bennett',
 'Daniel Bernard',
 'Willow Hunt',
 'Judith Floyd',
 'Seth Farley',
 'Zephania Sanders',
 'Calista Merritt',
 'Craig Williams',
 'Lee Preston',
 'Kately

##### To extract the first name and last name separately, give each its own pair of parentheses.

In [11]:
# Two capture groups, so findall yields one (first, last) tuple per match
# instead of a single combined string.
split_name_re = re.compile("<li>([A-Za-z]+) ([A-Za-z]+)<br/>")
names = split_name_re.findall(html)
names

[('Cecilia', 'Chapman'),
 ('Iris', 'Watson'),
 ('Celeste', 'Slater'),
 ('Theodore', 'Lowe'),
 ('Calista', 'Wise'),
 ('Kyla', 'Olsen'),
 ('Forrest', 'Ray'),
 ('Hiroko', 'Potter'),
 ('Nyssa', 'Vazquez'),
 ('Lawrence', 'Moreno'),
 ('Ina', 'Moran'),
 ('Aaron', 'Hawkins'),
 ('Hedy', 'Greene'),
 ('Melvin', 'Porter'),
 ('Keefe', 'Sellers'),
 ('Joan', 'Romero'),
 ('Davis', 'Patrick'),
 ('Leilani', 'Boyer'),
 ('Colby', 'Bernard'),
 ('Bryar', 'Pitts'),
 ('Rahim', 'Henderson'),
 ('Noelle', 'Adams'),
 ('Lillith', 'Daniel'),
 ('Adria', 'Russell'),
 ('Hilda', 'Haynes'),
 ('Sheila', 'Mcintosh'),
 ('Rebecca', 'Chambers'),
 ('Christian', 'Emerson'),
 ('Nevada', 'Ware'),
 ('Margaret', 'Joseph'),
 ('Edward', 'Nieves'),
 ('Imani', 'Talley'),
 ('Bertha', 'Riggs'),
 ('Wallace', 'Ross'),
 ('Chester', 'Bennett'),
 ('Castor', 'Richardson'),
 ('Sonya', 'Jordan'),
 ('Harrison', 'Mcguire'),
 ('Malcolm', 'Long'),
 ('Raymond', 'Levy'),
 ('Hedley', 'Ingram'),
 ('David', 'Mathews'),
 ('Xyla', 'Cash'),
 ('Madeline', '

###### List the first name of every bullet item that starts with 'C'

In [12]:
# Raw string: in an ordinary string literal "\w" is an invalid escape sequence
# (a SyntaxWarning on Python 3.12+); r"..." hands the backslash to the regex
# engine unchanged. The pattern captures the word right after <li> when it
# starts with a capital "C".
names = re.findall(r"<li>(C\w+)", html)
names

['Cecilia',
 'Celeste',
 'Calista',
 'Colby',
 'Christian',
 'Chester',
 'Castor',
 'Chaney',
 'Calista',
 'Craig',
 'Cara',
 'Cleo']

##### The first character after the bullet must be 'C', and the parentheses mark the part of the list item we want to extract.

In [13]:
# Raw string so "\w" is not treated as an (invalid) string escape sequence.
# Captures a first name starting with "C" plus the letter-only word after it.
names = re.findall(r"<li>(C\w+ [A-Za-z]+)", html)
names

['Cecilia Chapman',
 'Celeste Slater',
 'Calista Wise',
 'Colby Bernard',
 'Christian Emerson',
 'Chester Bennett',
 'Castor Richardson',
 'Chaney Bennett',
 'Calista Merritt',
 'Craig Williams',
 'Cara Whitehead',
 'Cleo Best']

##### Let's extract phone numbers

In [14]:
# Raw string so "\(", "\d" and "\)" are not treated as (invalid) string escape
# sequences. The escaped parentheses match literal "(" and ")"; the overall
# shape matched is "(ddd) ddd-dddd".
phone_numbers = re.findall(r"\(\d{3}\) \d{3}-\d{4}", html)
phone_numbers

['(257) 563-7401',
 '(372) 587-2335',
 '(786) 713-8616',
 '(793) 151-6230',
 '(492) 709-6392',
 '(654) 393-5734',
 '(404) 960-3807',
 '(314) 244-6306',
 '(947) 278-5929',
 '(684) 579-1879',
 '(389) 737-2852',
 '(660) 663-4518',
 '(608) 265-2215',
 '(959) 119-8364',
 '(468) 353-2641',
 '(248) 675-4007',
 '(939) 353-1107',
 '(570) 873-7090',
 '(302) 259-2375',
 '(717) 450-4729',
 '(453) 391-4650',
 '(559) 104-5475',
 '(387) 142-9434',
 '(516) 745-4496',
 '(326) 677-3419',
 '(746) 679-2470',
 '(455) 430-0989',
 '(490) 936-4694',
 '(985) 834-8285',
 '(662) 661-1446',
 '(802) 668-8240',
 '(477) 768-9247',
 '(791) 239-9057',
 '(832) 109-0213',
 '(837) 196-3274',
 '(268) 442-2428',
 '(850) 676-5117',
 '(861) 546-5032',
 '(176) 805-4108',
 '(715) 912-6931',
 '(993) 554-0563',
 '(357) 616-5411',
 '(121) 347-0086',
 '(304) 506-6314',
 '(425) 288-2332',
 '(145) 987-4962',
 '(187) 582-9707',
 '(750) 558-3965',
 '(492) 467-3131',
 '(774) 914-2510',
 '(888) 106-8550',
 '(539) 567-3573',
 '(693) 337-

In [15]:
# Raw string so "\d" is not treated as an (invalid) string escape sequence.
# Matches exactly two letter-only words followed by a five-digit number, so
# when the city/state part has more than two words only the two words directly
# before the number are returned (e.g. 'New York 39531' in the output).
cities = re.findall(r"[A-Za-z]+ [A-Za-z]+ \d{5}", html)
cities

['Mankato Mississippi 96522',
 'Frederick Nebraska 20620',
 'Roseville NH 11523',
 'New York 39531',
 'Antonio MI 47096',
 'Tamuning PA 10855',
 'New Mexico 08219',
 'Muskegon KY 12482',
 'Chelsea MI 67708',
 'Rosa MN 98804',
 'Lebanon KY 69409',
 'Rhode Island 24975',
 'Latrobe DE 38100',
 'South Dakota 45149',
 'Woodruff SC 49854',
 'Falls Ohio 19253',
 'Bethlehem Utah 02913',
 'Bernardino ND 09289',
 'Amesbury HI 93373',
 'Dodge GA 20783',
 'City Ohio 90255',
 'Gardena Colorado 37547',
 'Centennial Delaware 48432',
 'North Dakota 58563',
 'Weirton IN 93479',
 'Monica FL 30309',
 'Liberal Vermont 51324',
 'Hills Georgia 92358',
 'Bay Indiana 19759',
 'Tamuning Washington 55797',
 'Rhode Island 37232',
 'Yigo Massachusetts 50654',
 'Easthampton TN 31626',
 'Park Hawaii 43526',
 'Minot AZ 95302',
 'Lynchburg DC 29738',
 'Visalia VA 54886',
 'Fernando ID 77373',
 'Charles Maine 11292',
 'New Mexico 73585',
 'Independence Texas 87535',
 'Moscow Kentucky 77382',
 'Boise CT 35282',
 'Walla