In [1]:
from builtwith import builtwith

# Ask builtwith which technologies it can detect for the site; the result is a
# dict mapping a category name to a list of detected technologies.
builtwith('http://www.google.com')

{'web-servers': ['Google Web Server']}

In [2]:
from whois import whois

# Look up the WHOIS registration record (registrar, dates, name servers, ...) for the site.
# NOTE(review): a full URL is passed here rather than a bare domain name — presumably
# the library extracts the domain itself; confirm against the whois package docs.
whois('http://www.naver.com')

{'domain_name': ['NAVER.COM', 'naver.com'],
 'registrar': 'Gabia, Inc.',
 'whois_server': 'whois.gabia.com',
 'referral_url': None,
 'updated_date': [datetime.datetime(2016, 8, 5, 6, 37, 57),
  datetime.datetime(2018, 2, 28, 11, 27, 15)],
 'creation_date': [datetime.datetime(1997, 9, 12, 4, 0),
  datetime.datetime(1997, 9, 12, 0, 0)],
 'expiration_date': [datetime.datetime(2023, 9, 11, 4, 0),
  datetime.datetime(2023, 9, 11, 0, 0)],
 'name_servers': ['NS1.NAVER.COM',
  'NS2.NAVER.COM',
  'ns1.naver.com',
  'ns2.naver.com'],
 'status': ['clientDeleteProhibited https://icann.org/epp#clientDeleteProhibited',
  'clientTransferProhibited https://icann.org/epp#clientTransferProhibited',
  'clientUpdateProhibited https://icann.org/epp#clientUpdateProhibited',
  'ok https://icann.org/epp#ok'],
 'emails': ['white.4818@navercorp.com',
  'dl_ssl@navercorp.com',
  'abuse@gabia.com'],
 'dnssec': 'unsigned',
 'name': 'NAVER Corp.',
 'org': 'NAVER Corp.',
 'address': '6 Buljung-ro, Bundang-gu, Seongn

### 로봇파서
`RobotFileParser` answers questions about whether or not a particular user agent can fetch a URL on the website that published the robots.txt file.

In [3]:
from urllib import robotparser

# Load Google's robots.txt and ask whether any crawler ('*') may fetch the path.
# Handing the URL to the constructor is equivalent to calling set_url() afterwards.
parser = robotparser.RobotFileParser('http://www.google.com/robots.txt')
parser.read()  # download and parse the robots.txt file
parser.can_fetch('*', '/?h1=')

False

In [4]:
# Same check against Naver Cafe: fetch its robots.txt, then test one path
# for the wildcard user agent.
parser = robotparser.RobotFileParser(url='http://cafe.naver.com/robots.txt')
parser.read()  # download and parse the robots.txt file
parser.can_fetch('*', '/starbuckgossip')

True

### urllib.request

In [2]:
from urllib import request

# Issue a GET request with urllib's default headers; the response object is
# inspected (final URL, status code, headers) in the cells that follow.
resp = request.urlopen('http://www.google.com')

In [3]:
# URL of the resource that was retrieved. The `url` attribute replaces
# geturl(), which is deprecated since Python 3.9.
resp.url

'http://www.google.com'

In [4]:
# HTTP status code of the response. The `status` attribute replaces
# getcode(), which is deprecated since Python 3.9.
resp.status

200

In [5]:
# Response headers as a list of (name, value) tuples; repeated headers such as
# Set-Cookie appear once per occurrence.
resp.getheaders()

[('Date', 'Wed, 15 May 2019 09:51:21 GMT'),
 ('Expires', '-1'),
 ('Cache-Control', 'private, max-age=0'),
 ('Content-Type', 'text/html; charset=ISO-8859-1'),
 ('P3P', 'CP="This is not a P3P policy! See g.co/p3phelp for more info."'),
 ('Server', 'gws'),
 ('X-XSS-Protection', '0'),
 ('X-Frame-Options', 'SAMEORIGIN'),
 ('Set-Cookie',
  '1P_JAR=2019-05-15-09; expires=Fri, 14-Jun-2019 09:51:21 GMT; path=/; domain=.google.com'),
 ('Set-Cookie',
  'NID=183=B23tGiqZeKiPF_GWlSK8kPf7bImZdPbQPy7aAFBvipZZ4--RLuqSf066jeqwiTyV_ohziZDdueWxeDDS0DtkaokM_V-EfitpNjc3MHddenk0XmO1yjOFDynrAYOQ8FlMmacMB96rNANDP4fQLVu8epQ_GV3nIQ68PVhRhlYjIVM; expires=Thu, 14-Nov-2019 09:51:21 GMT; path=/; domain=.google.com; HttpOnly'),
 ('Accept-Ranges', 'none'),
 ('Vary', 'Accept-Encoding'),
 ('Connection', 'close')]

# 박보영 검색

In [13]:
from urllib import error

# Request a Google search results page with urllib's default headers. If the
# server answers with an HTTP error, show its status code, reason phrase and
# response headers, one per line.
try:
    resp = request.urlopen('https://www.google.com/search?ei=vjPIXLq4J5DmwQPElJOoDQ&q=%EB%B0%95%EB%B3%B4%EC%98%81&oq=%EB%B0%95%EB%B3%B4%EC%98%81&gs_l=psy-ab.3..0i67l3j0i131l3j0i67j0j0i131j0.3377.3949..4013...0.0..0.90.354.4......0....1..gws-wiz.nODVn5n3xr8')
except error.HTTPError as http_err:
    print(http_err.code, http_err.reason, http_err.headers, sep='\n')

403
Forbidden
Content-Type: text/html; charset=UTF-8
Date: Wed, 15 May 2019 10:09:31 GMT
Server: gws
Cache-Control: private
X-XSS-Protection: 0
X-Frame-Options: SAMEORIGIN
Alt-Svc: quic=":443"; ma=2592000; v="46,44,43,39"
Accept-Ranges: none
Vary: Accept-Encoding
Connection: close




In [14]:
from urllib import parse

url = 'https://www.google.com/search'
# Browser-like User-Agent sent with every download() call unless the caller
# supplies its own headers.
header = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}


def download(url, params=None, retries=3, headers=None):
    """Fetch ``url`` with optional query parameters, retrying on server errors.

    Args:
        url: Base URL to request.
        params: Optional mapping (or sequence of pairs) that is URL-encoded and
            appended as the query string. Nothing is appended when it is empty.
        retries: How many additional attempts to make after a 5xx response.
        headers: Optional request headers; defaults to the module-level
            ``header`` dict.

    Returns:
        The response object returned by ``urllib.request.urlopen`` on success,
        or ``None`` when the request ultimately fails with an HTTP error (the
        error's code, reason, URL and headers are printed in that case).

    Raises:
        Only ``HTTPError`` is handled here; anything else raised by
        ``urlopen`` (for example ``URLError``) propagates to the caller.
    """
    query = parse.urlencode(params) if params else ''
    full_url = url
    if query:
        # Do not emit a second '?' if the base URL already carries a query.
        full_url += ('&' if '?' in url else '?') + query

    request_headers = header if headers is None else headers
    attempts_left = retries

    while True:
        try:
            req = request.Request(full_url, headers=request_headers)
            return request.urlopen(req)
        except error.HTTPError as e:
            # Only 5xx responses are treated as transient and worth retrying.
            if 500 <= e.code < 600 and attempts_left > 0:
                attempts_left -= 1
                continue
            print(e.code)
            print(e.reason)
            print(e.geturl())
            print(e.headers)
            return None

In [15]:
# Build the query mapping, then percent-encode it into a query string
# (non-ASCII text is encoded as UTF-8 bytes).
params = dict(q='박보영')
parse.urlencode(params)

'q=%EB%B0%95%EB%B3%B4%EC%98%81'

In [16]:
# NOTE: only the last expression of a cell is echoed, so the urlparse() result
# on the next line is computed but never displayed.
parse.urlparse(url)
# Resolve the relative reference 'urljoin' against url: the last path segment
# of the base URL is replaced by the relative part.
parse.urljoin(url,'urljoin')

'https://www.google.com/urljoin'

In [17]:
# Split the search URL into its six components
# (scheme, netloc, path, params, query, fragment) and show them as a list.
result = parse.urlparse('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81')
list(result)

['https', 'www.google.com', '/search', '', 'q=%EB%B0%95%EB%B3%B4%EC%98%81', '']

In [18]:
# Decode the percent-encoded bytes back into the original text.
parse.unquote('%EB%B0%95%EB%B3%B4%EC%98%81')

'박보영'

In [19]:
# Compare the two encoders on text containing spaces: quote() emits %20 for a
# space, while quote_plus() emits '+'.
parse.quote('박 보 영'), parse.quote_plus('박 보 영')

('%EB%B0%95%20%EB%B3%B4%20%EC%98%81', '%EB%B0%95+%EB%B3%B4+%EC%98%81')

# requests

In [22]:
import requests

# Same search through requests, which URL-encodes `params` itself.
# An explicit timeout is set because requests otherwise waits indefinitely
# for a server that never answers.
resp = requests.get('http://www.google.com/search', params={'q':'박보영'}, timeout=10)
# resp.text  # uncomment to inspect the response body

In [23]:
# Status code, reason phrase and response headers of the requests response.
resp.status_code, resp.reason, resp.headers

(200,
 'OK',
 {'Date': 'Wed, 15 May 2019 10:10:59 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Transfer-Encoding': 'chunked', 'Set-Cookie': '1P_JAR=2019-05-15-10; expires=Fri, 14-Jun-2019 10:10:59 GMT; path=/; domain=.google.com, CGIC=IgMqLyo; expires=Mon, 11-Nov-2019 10:10:59 GMT; path=/complete/search; domain=.google.com; HttpOnly, CGIC=IgMqLyo; expires=Mon, 11-Nov-2019 10:10:59 GMT; path=/search; domain=.google.com; HttpOnly, NID=183=MeelhFolOy0te6L2LnTib_cK05rBvra7vQnbwb5BX-vXecHZv3TQPcgFXN_ycESTXRTpszHzd4JJdpvqvu__FYtMCeLJE-gs0YnQyO1aM0aThV4rzKdnoAmPjWH4apLJadZW6oycmXjOzu_Lgws1vd2pNol5cWTl4y0eTM4uDl0; expires=Thu, 14-Nov-2019 10:10:59 GMT; path=/; domain=.google.com; HttpOnly'})

In [24]:
# Headers that requests actually sent with the outgoing request, including
# the default User-Agent that identifies the client as python-requests.
resp.request.headers

{'User-Agent': 'python-requests/2.20.1', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}