# 1.1 urlopen

In [1]:
import urllib.request
import urllib.parse
import socket
import urllib.error

## 1.1.1 了解urlopen - 发送请求

In [2]:
# Send a plain GET request and keep the response object for the next cell.
response = urllib.request.urlopen('https://www.python.org/')
print(type(response))

# read() yields the raw body as bytes, so decode it before printing.
page_bytes = response.read()
print(page_bytes.decode('utf-8'))

<class 'http.client.HTTPResponse'>
<!doctype html>
<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->
<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->
<!--[if IE 8]>      <html class="no-js ie8 lt-ie9">                 <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en" dir="ltr">  <!--<![endif]-->

<head>
    <!-- Google tag (gtag.js) -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=G-TF35YF9CVH"></script>
    <script>
      window.dataLayer = window.dataLayer || [];
      function gtag(){dataLayer.push(arguments);}
      gtag('js', new Date());
      gtag('config', 'G-TF35YF9CVH');
    </script>

    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <link rel="prefetch" href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js">
    <link rel="prefetch" href="//ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/jquery-ui.min.js">

    <meta name="appl

In [3]:
# status     -> the HTTP status code
# getheaders -> a list holding every response header
# getheader  -> the value of one specific header, e.g. 'Server'
status_code = response.status
all_headers = response.getheaders()
server_header = response.getheader('Server')

# One item per line, in the same order as above.
print(status_code, all_headers, server_header, sep='\n')

200
[('Connection', 'close'), ('Content-Length', '50071'), ('Report-To', '{"group":"heroku-nel","max_age":3600,"endpoints":[{"url":"https://nel.heroku.com/reports?ts=1698731661&sid=67ff5de4-ad2b-4112-9289-cf96be89efed&s=i%2BfDTTLz7SWng1F9ZWG8e7V4bN7jm227fieKQ9bDAXE%3D"}]}'), ('Reporting-Endpoints', 'heroku-nel=https://nel.heroku.com/reports?ts=1698731661&sid=67ff5de4-ad2b-4112-9289-cf96be89efed&s=i%2BfDTTLz7SWng1F9ZWG8e7V4bN7jm227fieKQ9bDAXE%3D'), ('Nel', '{"report_to":"heroku-nel","max_age":3600,"success_fraction":0.005,"failure_fraction":0.05,"response_headers":["Via"]}'), ('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'SAMEORIGIN'), ('Via', '1.1 vegur, 1.1 varnish, 1.1 varnish'), ('Accept-Ranges', 'bytes'), ('Date', 'Tue, 31 Oct 2023 06:45:20 GMT'), ('Age', '3004'), ('X-Served-By', 'cache-iad-kiad7000025-IAD, cache-bur-kbur8200116-BUR'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '10, 2'), ('X-Timer', 'S1698734720.106884,VS0,VE0'), ('Vary', 'Co

## 1.1.2 post请求  - data参数

In [4]:
# Passing `data` turns the request into a POST; the payload must be bytes,
# so the form fields are url-encoded to a string and then encoded as UTF-8.
form_string = urllib.parse.urlencode({'name': 'germy'})
data = form_string.encode('utf-8')

response = urllib.request.urlopen('https://www.httpbin.org/post', data=data)
body_text = response.read().decode('utf-8')
print(body_text)

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "germy"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "10", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "www.httpbin.org", 
    "User-Agent": "Python-urllib/3.11", 
    "X-Amzn-Trace-Id": "Root=1-6540a282-133102ce28f8cb801826a25b"
  }, 
  "json": null, 
  "origin": "206.189.175.168", 
  "url": "https://www.httpbin.org/post"
}



## 1.1.3 get请求 - timeout参数

#### 其他参数

1. context 参数，该参数必须是ssl.SSLContext类型，用来指定SSL的设置
2. cafile和capath分别指定CA证书和其路径，这两个在请求HTTPS链接时会有用

In [5]:
# Fetch with a 5-second timeout and report any failure instead of hiding it.
try:
    response = urllib.request.urlopen('https://www.httpbin.org/get', timeout=5)
    print(response.read().decode('utf-8'))
except urllib.error.URLError as e:
    # A timeout hit while opening the request is wrapped in URLError.reason.
    if isinstance(e.reason, socket.timeout):
        print('Time out')
    else:
        # Any other URL error used to be swallowed silently; show its reason.
        print(e.reason)
except socket.timeout:
    # A timeout hit while reading the body is raised directly, not wrapped in URLError.
    print('Time out')

{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Host": "www.httpbin.org", 
    "User-Agent": "Python-urllib/3.11", 
    "X-Amzn-Trace-Id": "Root=1-6540a284-700304ad194a523b3e03fed1"
  }, 
  "origin": "206.189.175.168", 
  "url": "https://www.httpbin.org/get"
}



## 1.2 Request

In [6]:
# Build a Request object first, then pass that object (not the bare URL) to urlopen.
python_home = 'https://www.python.org/'
request = urllib.request.Request(url=python_home)
response = urllib.request.urlopen(request)
# The body is intentionally left unread here; response.read().decode('utf-8') would return it.

## 1.2.1 parse方法

In [7]:
from urllib import request,parse

In [8]:
# POST a small form to httpbin using an explicit Request with custom headers.
url = 'https://www.httpbin.org/post'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
           'Host': 'www.httpbin.org'}
# Named form_fields rather than `dict`, so the builtin dict type is not
# shadowed for every cell that runs afterwards in the same kernel.
form_fields = {'name': 'germy'}

# urlencode serialises the mapping to a query string; the request body must be bytes.
data = bytes(parse.urlencode(form_fields), encoding='utf-8')
req = request.Request(url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "germy"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "10", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "www.httpbin.org", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", 
    "X-Amzn-Trace-Id": "Root=1-6540a286-0d7721195d5e2c204a0dd8e3"
  }, 
  "json": null, 
  "origin": "206.189.175.168", 
  "url": "https://www.httpbin.org/post"
}



## 1.3 高级用法
1. HTTPDefaultErrorHandler 用于处理HTTP响应错误，所有错误都会抛出HTTPError类型的异常
2. HTTPRedirectHandler 用于处理重定向
3. HTTPCookieProcessor 用于处理Cookie
4. ProxyHandler 用于设置代理，代理默认为空
5. HTTPPasswordMgr 用于管理密码，它维护着用户名密码的对照表
6. HTTPBasicAuthHandler 用于管理认证，如果一个链接在打开时需要认证，那么可以用这个类来解决认证问题。

## 1.3.1 验证

In [52]:
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

In [53]:
# Login pair and target page used by the basic-auth example below.
# NOTE(review): presumably throw-away demo credentials — never hardcode real ones.
username, password = 'admin', 'admin'
url = 'https://ssr3.scrape.center/'

In [54]:
# Password manager: register the username/password for `url`
# (realm=None is the manager's default, catch-all realm).
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(realm=None, uri=url, user=username, passwd=password)

# The handler that answers HTTP basic-auth challenges looks credentials up in `p`.
auth_handler = HTTPBasicAuthHandler(p)

# build_opener turns the handler into an opener whose open() works like urlopen.
opener = build_opener(auth_handler)

try:
    result = opener.open(url)
    raw_body = result.read()
    html = raw_body.decode('utf-8')
    print(html)
except URLError as err:
    print(err.reason)

<html lang="en">
<head>
  
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width,initial-scale=1">
  <link rel="icon" href="/static/img/favicon.ico">
  <title>Scrape | Movie</title>
  

  <link href="/static/css/app.css" type="text/css" rel="stylesheet">
  
<link href="/static/css/index.css" type="text/css" rel="stylesheet">

</head>
<body>
<div id="app">
  <div data-v-74e8b908="" class="el-row" id="header">
    <div data-v-74e8b908="" class="container el-col el-col-18 el-col-offset-3">
      <div data-v-74e8b908="" class="el-row">
        <div data-v-74e8b908="" class="logo el-col el-col-4">
          <a data-v-74e8b908="" href="/" class="router-link-exact-active router-link-active">
            <img data-v-74e8b908="" src="/static/img/logo.png" class="logo-image">
            <span data-v-74e8b908="" class="logo-title">Scrape</span>
          </a>
        </div>
      </div>
    </div>
  </div>
  
<div dat

## 1.3.2 代理

In [12]:
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

In [49]:
# Map each URL scheme to the proxy that should carry it. The original literal
# repeated the 'http' key, so the first address was silently discarded and
# nothing applied to the https URL requested below.
# NOTE(review): these look like sample proxy addresses and may not be reachable.
proxy_handler = ProxyHandler({
    'http': '183.64.239.19',
    'https': '123.169.34.113'
})

url = "https://www.baidu.com/"
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url, headers=headers)

try:
    # Use the proxy handler built above; the previous code passed `handler`,
    # a name this cell never defines (it only resolved via leftover kernel state).
    opener = urllib.request.build_opener(proxy_handler)
    response = opener.open(request)
    print(len(response.read().decode('utf-8')))
except URLError as e:
    # e.g. the proxy refused the connection or could not be reached
    print(e.reason)

445973


## 1.3.3 Cookie

In [14]:
import http.cookiejar, urllib.request

In [15]:
# An in-memory jar; the processor stores cookies from responses into it.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookiejar=cookie)

# Opening the page through this opener is what populates `cookie`.
opener = urllib.request.build_opener(handler)
response = opener.open('https://www.baidu.com')

In [16]:
# Show every cookie in the jar as name=value, one per line.
for item in cookie:
    cookie_line = "=".join((item.name, item.value))
    print(cookie_line)

BD_NOT_HTTPS=1
BIDUPSID=769FCD1900F12D7B20063FCA80D76C2C
PSTM=1698734731
BAIDUID=769FCD1900F12D7B2AB03A64470FD28B:FG=1


In [17]:
filename = 'cookie.txt'
# LWPCookieJar is a file-backed jar; http.cookiejar.MozillaCookieJar() is the
# alternative if a different on-disk cookie format is wanted.
cookie = http.cookiejar.LWPCookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
# By default save() skips cookies marked "discard" (session cookies) and
# expired ones. Keep them, matching the ignore_discard/ignore_expires flags
# used when the file is loaded back.
cookie.save(filename, ignore_discard=True, ignore_expires=True)

In [55]:
cookie = http.cookiejar.LWPCookieJar()
# load() fills the jar in place and returns None. Assigning its result back to
# `cookie` (as the code did before) replaced the jar with None, so
# HTTPCookieProcessor fell back to a brand-new empty jar and the saved cookies
# were never sent.
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)

# Target URL and a browser-like User-Agent for the request.
url = "https://www.baidu.com/"
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url, headers=headers)

# Send the request through an opener that attaches the loaded cookies.
opener = urllib.request.build_opener(handler)
response = opener.open(request)
print(len(response.read().decode('utf-8')))
print(response.status)

445900
200


## 1.3.4 处理异常

1. URLError
2. HTTPError

In [19]:
from urllib import request, error

In [20]:
# HTTPError is a subclass of URLError, so it has to be caught first;
# the else branch runs only when no exception was raised.
try:
    response = request.urlopen('https://cuiqingcai.com/404')
except error.HTTPError as http_err:
    # reason, status code and response headers, one per line
    for detail in (http_err.reason, http_err.code, http_err.headers):
        print(detail)
except error.URLError as url_err:
    print(url_err.reason)
else:
    page_text = response.read().decode('utf-8')
    print(page_text)

Not Found
404
Server: GitHub.com
Content-Type: text/html; charset=utf-8
Access-Control-Allow-Origin: *
ETag: "65020b3f-24a3"
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; img-src data:; connect-src 'self'
x-proxy-cache: MISS
X-GitHub-Request-Id: B312:28D6A0:A7CC6:C1069:6540A28E
Accept-Ranges: bytes
Date: Tue, 31 Oct 2023 06:45:34 GMT
Via: 1.1 varnish
Age: 0
X-Served-By: cache-qpg1255-QPG
X-Cache: MISS
X-Cache-Hits: 0
X-Timer: S1698734734.489585,VS0,VE279
Vary: Accept-Encoding
X-Fastly-Request-ID: 0190eb4309715434edb721892f921b6316044b3f
X-Cache-Lookup: Cache Miss
X-Cache-Lookup: Cache Miss
X-Cache-Lookup: Cache Miss
Content-Length: 9379
X-NWS-LOG-UUID: 2870570413158923470
Connection: close
X-Cache-Lookup: Cache Miss




In [21]:
import socket
import urllib.request
import urllib.error

In [22]:
url = 'https://www.baidu.com'

# A 0.01 s timeout is short enough to make the request fail on purpose,
# so the type carried in URLError.reason can be inspected.
try:
    response = urllib.request.urlopen(url, timeout=0.01)
except urllib.error.URLError as err:
    failure = err.reason
    print(type(failure))
    timed_out = isinstance(failure, socket.timeout)
    if timed_out:
        print('TIME OUT')

<class 'TimeoutError'>
TIME OUT


## 1.3 parse模块

###  解析链接 -- urlparse

--- scheme://netloc/path;params?query#fragment

--- scheme:默认协议（http or https）

--- allow_fragments:是否忽略fragment,若被忽略会被解析为path、params或者query的一部分

In [23]:
from urllib.parse import urlparse

In [24]:
# Split a URL of the form scheme://netloc/path;params?query#fragment
# into its six components.
url = "https://www.baidu.com/index.html;user?id=5#comment"
result = urlparse(url)

# Show the result's type, then the parsed fields.
print(type(result), result, sep='\n')

<class 'urllib.parse.ParseResult'>
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')


In [25]:
# With fragment handling switched off, the '#comment' suffix is not split out
# into its own field; it stays attached to the preceding component.
parse_options = {'allow_fragments': False}
result = urlparse(url, **parse_options)
print(result)

ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')


In [26]:
# The parsed fields can be read by attribute name or by position.
for field_value in (result.scheme, result[0], result.netloc, result[1]):
    print(field_value)

https
https
www.baidu.com
www.baidu.com


### urlunparse

--- 构造URL，长度必须是6，否则存在参数不足或者过多问题

In [27]:
from urllib.parse import urlunparse

In [28]:
# The six components expected by urlunparse, in order.
data = [
    'https',          # scheme
    'www.baidu.com',  # netloc
    'index.html',     # path
    'user',           # params
    'a=6',            # query
    'comment',        # fragment
]
assembled_url = urlunparse(data)
print(assembled_url)

https://www.baidu.com/index.html;user?a=6#comment


### urlsplit

--- 只返回5个结果，params会合并到path中

In [29]:
from urllib.parse import urlsplit

In [30]:
# urlsplit yields five fields; the ';user' params stay inside the path.
url = 'https://www.baidu.com/index.html;user?id=5#comment'
result = urlsplit(url)
print(result)

SplitResult(scheme='https', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')


In [31]:
# Same field, read once by attribute name and once by index.
for scheme_value in (result.scheme, result[0]):
    print(scheme_value)

https
https


### urlunsplit

--- 构造URL，长度必须是5，否则存在参数不足或者过多问题

In [32]:
from urllib.parse import urlunsplit

In [33]:
# The five components expected by urlunsplit, in order.
data = [
    'https',            # scheme
    'wwwbaidu.com',     # netloc
    'index.html/user',  # path
    'id=5',             # query
    'comment',          # fragment
]
result = urlunsplit(data)
print(result)

https://wwwbaidu.com/index.html/user?id=5#comment


### urljoin

--- base_url,new_url

--- urljoin会解析base_url的scheme,netloc,path这3个内容，并对新链接缺失的部分进行补充

In [34]:
from urllib.parse import urljoin

In [35]:
# Each pair is (base_url, new_url); the joined result is printed in order.
join_cases = [
    # relative file name: scheme and netloc are taken from the base
    ('https://wwwbaidu.com', 'FAQ.html'),
    # new_url is already absolute, so it is returned as is
    ('https://wwwbaidu.com', 'https://cuiqingcai.com/FAQ.html'),
    # the base path (about.html) is dropped in favour of new_url's path
    ('https://wwwbaidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'),
    # the base query (?wd=abc) is not carried over either
    ('https://wwwbaidu?wd=abc', 'https://cuiqingcai.com/index.php'),
    # new_url supplies only a query and a fragment; the rest comes from the base
    ('https://wwwbaidu.com', '?category=2#comment'),
    ('wwwbaidu.com', '?category=2#comment'),
    # the base fragment (#comment) is replaced when new_url brings a query
    ('wwwbaidu.com#comment', '?category=2'),
]
for base, addition in join_cases:
    print(urljoin(base, addition))

https://wwwbaidu.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/index.php
https://wwwbaidu.com?category=2#comment
wwwbaidu.com?category=2#comment
wwwbaidu.com?category=2


### urlencode

--- 序列化参数

In [36]:
from urllib.parse import urlencode

In [37]:
# Query parameters to serialise into the URL.
params = {'name': 'germy', 'age': '25'}
base_url = 'https://www.baidu.com?'

# urlencode turns the mapping into 'key=value&key=value'.
encoded_query = urlencode(params)
url = f"{base_url}{encoded_query}"
print(url)

https://www.baidu.com?name=germy&age=25


### parse_qs

--- 反序列化, 转为字典

In [38]:
from urllib.parse import parse_qs

In [39]:
# Turn a query string back into a dict; every value comes back as a list.
query = 'name=germy&age=25'
query_dict = parse_qs(query)
print(query_dict)

{'name': ['germy'], 'age': ['25']}


### parse_qsl

--- 反序列化，转为元组组成的列表

In [40]:
from urllib.parse import parse_qsl

In [41]:
# Turn a query string back into a list of (key, value) tuples.
query = 'name=germy&age=25'
query_pairs = parse_qsl(query)
print(query_pairs)

[('name', 'germy'), ('age', '25')]


### quote

--- 将内容转为URL编码的格式，例如URL中带有中文参数时，有乱码问题，此时用quote方法可以将中文转化为URL编码

In [42]:
from urllib.parse import quote

In [43]:
# Percent-encode the non-ASCII search keyword before appending it to the URL.
keyword = '壁纸'
encoded_keyword = quote(keyword)
url = f"https://www.baidu.com?wd={encoded_keyword}"
print(url)

https://www.baidu.com?wd=%E5%A3%81%E7%BA%B8


### unquote

--- URL解码

In [44]:
from urllib.parse import unquote

In [45]:
# Decode the percent-encoded query value back into readable text.
url = 'https://www.baidu.com/?wd=%E5%A3%81%E7%BA%B8'
decoded_url = unquote(url)
print(decoded_url)

https://www.baidu.com/?wd=壁纸


## 1.4 Robots协议

### robotparser 解析robots.txt