Date : 2020.03.19

Writer : Sumi Kim, sumikim323@naver.com

Theme : Scraping & parsing 국사편찬위원회(http://db.history.go.kr/) top page

#Web Scraping
**HTTP** is a set of rules for communication between a client computer and a server computer.
**'requests'** (a Python module) speaks and understands HTTP.
As a client program, your code will use 'requests' to communicate with the servers you want to get data from.

In [25]:
from urllib.request import urlopen
# Download the top page; the with-block closes the connection once the body is read.
with urlopen('http://db.history.go.kr/') as html:
    # read() returns the raw response body as bytes (note the b'...' in the output)
    print(html.read())

b'\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">\r\n<html>\r\n\t<head>\r\n\t\t<meta http-equiv="X-UA-Compatible" content="IE=edge">\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\r\n\t\t<title>\xed\x95\x9c\xea\xb5\xad\xec\x82\xac\xeb\x8d\xb0\xec\x9d\xb4\xed\x84\xb0\xeb\xb2\xa0\xec\x9d\xb4\xec\x8a\xa4</title>\r\n\t\t<link rel="shortcut icon" type="image/x-icon" href="/favicon.ico" />\r\n\t\t<link rel="stylesheet" type="text/css" href="/css/basic.css"/> \r\n\t\t<link rel="stylesheet" type="text/css" href="/css/main.css"/> \r\n\t\t<style type="text/css">\r\n\t\t</style>\r\n\t\t<script type="text/javascript" src="/jQuery/jquery-1.8.3.js"></script>\r\n\t\t<script type="text/javascript" src="/js/jquery.selectbox-0.2.js"></script>\r\n\t\t<script type="text/javascript" src="/js/common.js"></script>\r\n\t\t<script type="text/javascript" src="/js/makePCookie.js"></script>\r

In [26]:
# Request a page that does not exist to see how urlopen reports the failure:
# the output below shows an HTTPError being raised.

r2 = urlopen("http://db.history.go.kr/thisurldoesnotexist")
print(r2)

HTTPError: ignored

In [27]:
# With requests, asking for a page that does not exist does not stop the program:
# requests.get returns a Response object, printed below as <Response [404]>.

import requests
r = requests.get("http://db.history.go.kr/thisurldoesnotexist")
print(r)

<Response [404]>


In [28]:
# To avoid runtime errors that stop the program, wrap the request in try/except.
# Catching the specific error classes lets you print a friendly message for each case.

from urllib.error import HTTPError
from urllib.error import URLError

try:
  html = urlopen("http://db.history.go.kr/thisurldoesnotexist")
# HTTPError is a subclass of URLError, so it must be listed first to get its own message
except HTTPError :
  print("The server returned an HTTP error")
except URLError :
  print("The server could not be found!")
else:
  # Runs only when urlopen raised nothing
  print(html.read())

The server returned an HTTP error


#Parsing Web data
with BeautifulSoup
https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)

In [29]:
from urllib.request import urlopen
!pip install bs4 # To install bs4 to use BeautifulSoup
from bs4 import BeautifulSoup # Previously !pip install bs4

# Open the page again because the earlier html.read() call already consumed
# that response; the with-block closes this connection once the body is parsed.
with urlopen('http://db.history.go.kr/') as r:
    print(r)  # the HTTP response object itself, not the page content
    bs = BeautifulSoup(r.read(), 'html.parser')

# Uncomment to look at the whole parsed document before moving on:
# print(bs)
print(bs.h1)

# Uncomment to get the whole text content:
# print(bs.get_text())

<http.client.HTTPResponse object at 0x7f762aee4320>
<h1><img src="/images/main/h1_logo.png" title="한국사 데이터베이스"/></h1>


In [30]:
# Same as above, but requests is a bit simpler
import requests
!pip install bs4
from bs4 import BeautifulSoup # Previously !pip install bs4

# Fetch the top page with requests; printing the Response shows its status code.
url = 'http://db.history.go.kr/'
r = requests.get(url)
print(r)
# Uncomment to look at the raw HTML before moving on:
# print(r.text)

# Parse the response text with Python's built-in html.parser and show the <title> tag
bs = BeautifulSoup(r.text, 'html.parser')
print(bs.title)
# Uncomment to get the whole text content:
# print(bs.get_text())

<Response [200]>
<title>한국사데이터베이스</title>


In [31]:
# Pull individual elements out of the parsed page by tag name, one per line.
# More ways to navigate the tree are described in the documentation linked below.
print(bs.h1, bs.h2, bs.h3, sep="\n")
# https://www.crummy.com/software/BeautifulSoup/bs4/doc

<h1><img src="/images/main/h1_logo.png" title="한국사 데이터베이스"/></h1>
<h2><a href="http://www.history.go.kr" target="_blank"><img src="/images/common/h2_logo.gif" title="국사편찬위원회"/></a></h2>
<h3 class="blind">대메뉴</h3>


In [32]:
# Define your own user-defined function with error handling
# (this version uses urlopen)
import sys
from urllib.error import HTTPError

def getTitle(url):
    """Fetch *url* with ``urlopen`` and return the ``<h1>`` found inside ``<body>``.

    Args:
        url: Address of the page to download.

    Returns:
        The ``<h1>`` tag located by BeautifulSoup, or ``None`` when the page
        cannot be fetched or has no ``<body>``/``<h1>``.
    """
    # Imported locally so the function works even when the surrounding cell
    # imported only HTTPError.
    from urllib.error import URLError

    try:
        # The context manager closes the connection once the body is read.
        with urlopen(url) as response:
            markup = response.read()
    except URLError as e:
        # URLError is the base class of HTTPError, so this handles both a bad
        # status code (e.g. 404) and a host or resource that cannot be reached.
        print(e, file=sys.stderr)
        return None

    bsObj = BeautifulSoup(markup, 'html.parser')
    try:
        return bsObj.body.h1
    except AttributeError:
        # bsObj.body is None when the document has no <body> element
        return None

# Fetch the top page; swap in "http://db.history.go.kr/abc" to see the failure path.
url = 'http://db.history.go.kr/'
title = getTitle(url=url)
# getTitle signals failure with None; None is compared by identity (PEP 8)
if title is None:
    print("Title could not be found")
else:
    print(title)

<h1><img src="/images/main/h1_logo.png" title="한국사 데이터베이스"/></h1>


In [33]:
# Call the function with a URL that does not exist
url = 'http://db.history.go.kr/abc'
title = getTitle(url=url)
# getTitle signals failure with None; None is compared by identity (PEP 8)
if title is None:
    print("Title could not be found")
else:
    print(title)

Title could not be found


HTTP Error 404: Not Found


In [34]:
# Define your own user-defined function with error handling
# (this version uses requests)
import sys
import requests

def getTitle(url):
    """Download *url* with ``requests`` and return the ``<h1>`` found inside ``<body>``.

    An HTTP error status such as 404 does not raise here: requests returns
    the error page and it is parsed like any other page.

    Args:
        url: Address of the page to download.

    Returns:
        The ``<h1>`` tag located by BeautifulSoup, or ``None`` when the
        request fails or the page has no ``<body>``/``<h1>``.
    """
    try:
        r = requests.get(url)
    except requests.exceptions.RequestException as e:
        # requests never raises urllib's HTTPError; its own failures
        # (connection errors, timeouts, invalid URLs) derive from RequestException.
        print(e, file=sys.stderr)
        return None
    try:
        bsObj = BeautifulSoup(r.text, 'html.parser')
        return bsObj.body.h1
    except Exception:
        # Deliberately broad best-effort: a page without <body> (AttributeError)
        # or any parsing problem simply yields None.
        return None

# Fetch the top page; swap in "http://db.history.go.kr/abc" to see the failure path.
url = 'http://db.history.go.kr/'
title = getTitle(url=url)
# getTitle signals failure with None; None is compared by identity (PEP 8)
if title is None:
    print("Title could not be found")
else:
    print(title)

<h1><img src="/images/main/h1_logo.png" title="한국사 데이터베이스"/></h1>


In [35]:
# Call the function with a URL that does not exist
url = 'http://db.history.go.kr/abc'
title = getTitle(url=url)
# getTitle signals failure with None; None is compared by identity (PEP 8)
if title is None:
    print("Title could not be found")
else:
    print(title)

Title could not be found


In [36]:
# Define your own user-defined function with error handling
# (this version uses requests and prints the downloaded page)
import sys
import requests

def getTitle(url):
    """Download *url* with ``requests``, print the body, and return the ``<h1>`` in ``<body>``.

    Args:
        url: Address of the page to download.

    Returns:
        The ``<h1>`` tag located by BeautifulSoup, or ``None`` when the
        request fails or the page has no ``<body>``/``<h1>``.
    """
    try:
        r = requests.get(url)
    except requests.exceptions.RequestException as e:
        # requests never raises urllib's HTTPError; its own failures
        # (connection errors, timeouts, invalid URLs) derive from RequestException.
        print(e, file=sys.stderr)
        return None
    # requests does not raise for an HTTP error status such as 404, so for a
    # missing page this prints the server's error page instead of failing.
    print(r.text)
    try:
        bsObj = BeautifulSoup(r.text, 'html.parser')
        return bsObj.body.h1
    except Exception:
        # Deliberately broad best-effort: a page without <body> (AttributeError)
        # or any parsing problem simply yields None.
        return None

# Use a URL that does not exist to see what requests downloads for it
url = 'http://db.history.go.kr/abc'
title = getTitle(url=url)
# getTitle signals failure with None; None is compared by identity (PEP 8)
if title is None:
    print("Title could not be found")
else:
    print(title)











<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="ko" xml:lang="ko">
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
<title>한국사데이터베이스시스템</title>
<link rel="stylesheet" type="text/css" href="/css/basic.css;jsessionid=7E19670F8C3EF7E05D0D66485958C010"/> 
<link rel="stylesheet" type="text/css" href="/css/error.css;jsessionid=7E19670F8C3EF7E05D0D66485958C010"/> 
</head>
<body>
<div id="pop_wrap">
	<div class="errorMessage">
		<img src="/images/etc/errorMessage.gif" border="0" usemap="#Map" />
        <map name="Map" id="Map">
          <area shape="rect" coords="107,164,239,214" href="/index.jsp;jsessionid=7E19670F8C3EF7E05D0D66485958C010" title="되돌아가기" />
          <area shape="rect" coords="290,251,329,279" href="#" title="메일보내기" />
        </map>
    </div>

</div>
</body>
</html>
Title could not be f