In [1]:
import requests

In [2]:
# Fetch the demo page. The timeout (in seconds) keeps a stalled server from
# hanging the notebook; without it requests waits indefinitely.
res = requests.get('http://quotes.toscrape.com/', timeout=10)
res

<Response [200]>

In [3]:
# integer HTTP status code of the response
res.status_code

200

In [4]:
# raise_for_status() raises requests.exceptions.HTTPError if the status is an
# HTTP error (4xx/5xx), otherwise it returns None
print(res.raise_for_status())

# example of error
res2 = requests.get('https://github.com/givemeanerror', timeout=10) # this url doesn't exist
try:
    res2.raise_for_status()
except requests.exceptions.HTTPError as err:
    # The 404 is the point of the demo: catch it and show the message so the
    # notebook still runs top to bottom instead of stopping on this cell.
    print('HTTPError:', err)

None


HTTPError: 404 Client Error: Not Found for url: https://github.com/givemeanerror

In [5]:
# URL this response came from (the final URL if redirects were followed)
res.url

'http://quotes.toscrape.com/'

In [6]:
# text encoding requests uses to decode the body into res.text
res.encoding

'utf-8'

In [7]:
# response headers sent by the server, converted to a plain dict for display
dict(res.headers)

{'Server': 'nginx/1.14.0 (Ubuntu)',
 'Date': 'Wed, 29 Jul 2020 02:02:46 GMT',
 'Content-Type': 'text/html; charset=utf-8',
 'Transfer-Encoding': 'chunked',
 'Connection': 'keep-alive',
 'X-Upstream': 'spidyquotes-master_web',
 'Content-Encoding': 'gzip'}

In [8]:
# res.text gives the page HTML (technically the decoded content of the server's
# response, which could be in a different format than HTML)
print(res.text[:500])

<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Quotes to Scrape</title>
    <link rel="stylesheet" href="/static/bootstrap.min.css">
    <link rel="stylesheet" href="/static/main.css">
</head>
<body>
    <div class="container">
        <div class="row header-box">
            <div class="col-md-8">
                <h1>
                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>
                </h1>
            </div>
            <div class="col-md


In [9]:
# res.content is the undecoded body, so its type is bytes rather than str
type(res.content)

bytes

In [10]:
# res.content gives the response body as a bytes string
# (you probably don't want this most of the time; use res.text for decoded text)
res.content[:500]

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div class="col-md'

In [11]:
# check that res.text is just res.content decoded with res.encoding
res.content.decode(res.encoding) == res.text

True

In [12]:
# cookie jar holding any cookies the server sent back with this response
res.cookies

<RequestsCookieJar[]>

In [13]:
# iter_lines() returns a generator that yields the body one line at a time;
# decode_unicode=True yields str lines instead of bytes
lines_gen = res.iter_lines(decode_unicode=True)
# pull the first line from the generator
next(lines_gen)

'<!DOCTYPE html>'

In [14]:
# each next() call advances the same generator to the following line
next(lines_gen)

'<html lang="en">'

In [15]:
# a fresh iter_lines() call starts again from the first line; list() collects all lines at once
lines_list = list(res.iter_lines(decode_unicode=True))
# show only the first few lines
lines_list[:5]

['<!DOCTYPE html>',
 '<html lang="en">',
 '<head>',
 '\t<meta charset="UTF-8">',
 '\t<title>Quotes to Scrape</title>']

In [16]:
# whether this response is itself an HTTP redirect
res.is_redirect

False

In [17]:
# time between sending the request and the response arriving, as a datetime.timedelta
res.elapsed

datetime.timedelta(0, 0, 526064)

In [18]:
# Request a fiverr category page with requests' default headers. The timeout
# (in seconds) keeps a stalled server from hanging the notebook.
res3 = requests.get('https://www.fiverr.com/categories/programming-tech/data-analysis-services/data-mining-scraping', timeout=10)
res3

<Response [200]>

In [19]:
# res3.history lists the earlier responses that were followed to reach res3;
# the first entry is checked with is_redirect (a 307 is a temporary redirect)
res3.history, res3.history[0].is_redirect

([<Response [307]>], True)

In [20]:
# fiverr blocked us: compare the final URL after the redirect with the page we asked for
res3.url

'https://block.fiverr.com/index.html?url=aHR0cDovL3d3dy5maXZlcnIuY29tL2NhdGVnb3JpZXMvcHJvZ3JhbW1pbmctdGVjaC9kYXRhLWFuYWx5c2lzLXNlcnZpY2VzL2RhdGEtbWluaW5nLXNjcmFwaW5nPw==&uuid=9e9a0420-d13f-11ea-92c2-3b5b8c69af0d&vid='

In [21]:
# Use custom, browser-like headers to get around the fiverr block.
# Host and User-Agent seem to be the headers that matter; the other browser
# headers tried (Accept, Accept-Encoding, Accept-Language, Dnt,
# Upgrade-Insecure-Requests) were not needed.
url = 'https://www.fiverr.com/categories/programming-tech/data-analysis-services/data-mining-scraping'
headers = {
    # Host must name the server in `url`; a value for some other site
    # (e.g. "httpbin.org") can be rejected or misrouted by the server.
    "Host": "www.fiverr.com",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}
# timeout (in seconds) keeps a stalled server from hanging the notebook
res4 = requests.get(url, headers=headers, timeout=10)
res4.url

'https://www.fiverr.com/categories/programming-tech/data-analysis-services/data-mining-scraping'

In [22]:
# There's also a builtin JSON decoder, in case you're dealing with JSON data.
# The timeout (in seconds) keeps a stalled server from hanging the notebook.
res5 = requests.get('https://api.github.com/events', timeout=10)

# res5.json() parses the body into Python objects; show only the first 500 characters
str(res5.json())[:500]

"[{'id': '13043565662', 'type': 'CreateEvent', 'actor': {'id': 47148648, 'login': 'svc-software-factory', 'display_login': 'svc-software-factory', 'gravatar_id': '', 'url': 'https://api.github.com/users/svc-software-factory', 'avatar_url': 'https://avatars.githubusercontent.com/u/47148648?'}, 'repo': {'id': 197735801, 'name': 'adeo-gitlab/sync-github-gitlab-31', 'url': 'https://api.github.com/repos/adeo-gitlab/sync-github-gitlab-31'}, 'payload': {'ref': '2020-07-29_01-57-32_527_scenario1', 'ref_t"