In [16]:
# Importing the necessary modules(pandas, numpy , matplotlib.pyplot, seaborn)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### To perform web Scrapping, you should also import the libraries shown below.

   * The urlib.request module is used to open URLs
   * The BeautifulSoup package is used to extract data from html files.
   * The BeautifulSoup library's name is bs4 which stands for Beautiful Soup,Version 4

In [17]:
from urllib.request import urlopen 
from bs4 import BeautifulSoup

In [18]:
# Getting the html of the page is just the first step.
# After importing necessray modules, specify the URL containing the dataset and pass it to openurl() to get the html of the page.

url = "https://news.ycombinator.com/"
html = urlopen(url)
html

<http.client.HTTPResponse at 0x22710170880>

In [19]:
# Next step is to create a BeautifulSoup object from the html
# This is done by passing the html to the BeautifulSoup() function
# The BeautifulSoup package is used to parse the html, that is take the raw html text and break it into Python objects.
# The second argument'lxml' is the html parser.

soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

In [20]:
# The soup object allows you to extract interesting information about the website you are scraping such ad getting the title of the page as shown below.

# Getting the title

title = soup.title
print(title)

<title>Hacker News</title>


#### find_all() method of soup is used to extract the useful html tags within a webpage.
Examples of the useful tags include 
* < a > for hyperlinks
* < table > for tables 
* < tr > for table rows
* < td > for table cells

#### The code below shows how to extract all the hyperlinks within the webpage.


In [21]:
soup.find_all('a')

[<a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a>,
 <a href="news">Hacker News</a>,
 <a href="newest">new</a>,
 <a href="front">past</a>,
 <a href="newcomments">comments</a>,
 <a href="ask">ask</a>,
 <a href="show">show</a>,
 <a href="jobs">jobs</a>,
 <a href="submit" rel="nofollow">submit</a>,
 <a href="login?goto=news">login</a>,
 <a href="vote?id=41353328&amp;how=up&amp;goto=news" id="up_41353328"><div class="votearrow" title="upvote"></div></a>,
 <a href="https://www.gkogan.co/removing-stuff/">Removing stuff is never obvious yet often better</a>,
 <a href="from?site=gkogan.co"><span class="sitestr">gkogan.co</span></a>,
 <a class="hnuser" href="user?id=mooreds">mooreds</a>,
 <a href="item?id=41353328">1 hour ago</a>,
 <a href="hide?id=41353328&amp;goto=news">hide</a>,
 <a href="item?id=41353328">11 comments</a>,
 <a href="vote?id=41353284&amp;how=up&amp;goto=news" id="up_41353284"><div class="votear

In [8]:
# As you see from the output above, html tags sometimes come with as class, src, etc.
# These attribiutes provide additional information about the html elements.
# You can use a for loop and the get('href') method to extract and print out only hyperlinks.

all_links = soup.find_all('a')

for link in all_links:
    print(link.get('href'))

https://news.ycombinator.com
news
newest
front
newcomments
ask
show
jobs
submit
login?goto=news
vote?id=41353079&how=up&goto=news
https://www.cinder.co/blog-posts/north-korean-engineers-in-our-application-pile
from?site=cinder.co
user?id=erehweb
item?id=41353079
hide?id=41353079&goto=news
item?id=41353079
vote?id=41353328&how=up&goto=news
https://www.gkogan.co/removing-stuff/
from?site=gkogan.co
user?id=mooreds
item?id=41353328
hide?id=41353328&goto=news
item?id=41353328
vote?id=41351446&how=up&goto=news
https://notgull.net/announcing-dozer/
from?site=notgull.net
user?id=todsacerdoti
item?id=41351446
hide?id=41351446&goto=news
item?id=41351446
vote?id=41351219&how=up&goto=news
https://www.raphkoster.com/2009/01/08/database-sharding-came-from-uo/
from?site=raphkoster.com
user?id=fanf2
item?id=41351219
hide?id=41351219&goto=news
item?id=41351219
vote?id=41350225&how=up&goto=news
https://ntietz.com/blog/til-uses-for-the-different-uuid-versions/
from?site=ntietz.com
user?id=fagnerbrack
ite

In [23]:
# To print out table rows only, pass the  'tr' arguments in soup.find_all()

# Print the first 4 rows for sanity check
rows = soup.find_all("tr")
print(rows[:4])

[<tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a></td>
<td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
<a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a href="jobs">jobs</a> | <a href="submit" rel="nofollow">submit</a> </span></td><td style="text-align:right;padding-right:4px;"><span class="pagetop">
<a href="login?goto=news">login</a>
</span></td>
</tr></table></td></tr>, <tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a></td>
<td style="line-height:12pt; height:10p

The goal of this tutorial is to take a table from a webpage and convert into a dataframe for easier manipualtion using Python
To get there, you should get all table rows in list form first and then convert that list into a dataframe.
Below is a for loop that iterates through table rows and prints out the cells of the rows. 

In [24]:
for row in rows:
    row_td = row.find_all("td")
print(row_td)
type(row_td)

[<td bgcolor="#ff6600"></td>]


bs4.element.ResultSet

The output above shows that each row is printed with html tags embedded in each rows. This is not what you want. You can use remove the html tags using BeautifulSoup or regular expressions. 

The easiest way  to remove html tags is to use BeautifulSoup, and it takes just a line of code to do this. Pass the string of interest into BeautifulSoup() and use the get_text() method to extract the text without html tags.


In [28]:
str_cells = str(row_td)
cleantxt = BeautifulSoup(str_cells, "lxml").get_text()
print(cleantxt)

[]


In [30]:
import re 

list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '', str_cells))
    list_rows.append(clean2)
print(clean2)
type(clean2)
    

[]


str

In [33]:
df = pd.DataFrame(list_rows)
df.head()

Unnamed: 0,0
0,[\nHacker News\nnew | past | comments | ask | ...
1,"[, Hacker News\nnew | past | comments | ask | ..."
2,[]
3,[\n\n1. Removing stuff is never obvious yet of...
4,"[1., , Removing stuff is never obvious yet oft..."
