In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


# Python 3
## Http, работа с web

MIPT 2020

основное про http - https://ru.wikipedia.org/wiki/HTTP

### HTML

In [2]:
%%file basic.html

<!DOCTYPE html>
<html>
   <head id=4>
      <meta charset="utf-8" />
      <title>HTML Document</title>
   </head>
   <body>
      <p>
         <b>
            Этот текст будет полужирным, <i>а этот — ещё и курсивным</i>.
         </b>
      </p>
   </body>
</html>


Overwriting basic.html


In [3]:
!firefox basic.html


Более продвинутые вещи нужно искать

PyPi - https://pypi.org/

### Urllib

In [4]:
import urllib
import http

with urllib.request.urlopen('http://yandex.ru') as f:
    type(f)
    f.read(100).decode('utf-8')
    f.getcode(), f.geturl(), f.headers['Content-Type']


http.client.HTTPResponse

'<!DOCTYPE html><html class="i-ua_js_no i-ua_css_standart i-ua_browser_ i-ua_browser_desktop document'

(200, 'https://yandex.ru/', 'text/html; charset=UTF-8')

## requests

Более высокоуровневая библиотека для запросов

In [5]:
import requests

In [6]:
with requests.get('http://yandex.ru') as f:
    f.text[:100], f.status_code, f.headers['Content-type']

# f.json()


('<!DOCTYPE html><html class="i-ua_js_no i-ua_css_standart i-ua_browser_unknown i-ua_browser_desktop d',
 200,
 'text/html; charset=UTF-8')

## aiohttp

In [7]:
import aiohttp

async with aiohttp.request('get', 'http://yandex.ru') as resp:
    resp_text = await resp.text()
    resp_text[:100], resp.status, resp.headers['Content-type']


('<!DOCTYPE html><html class="i-ua_js_no i-ua_css_standart i-ua_browser_unknown i-ua_browser_desktop d',
 200,
 'text/html; charset=UTF-8')

## Парсинг HTML

### lxml

Warning

The xml.etree.ElementTree module is not secure against maliciously constructed data. If you need to parse untrusted or unauthenticated data see XML vulnerabilities

In [8]:
%%file my.xml

<cinema>
  <name>BestCinema</name>
  <films>
    <categories>
      <category>Action</category>
      <category>Thriller</category>
      <category>Soap opera</category>
    </categories>
  </films>
</cinema>


Overwriting my.xml


In [9]:
from lxml import etree

tree = etree.parse('my.xml')

root = tree.getroot()
root.tag

def print_all(node):
    print(f'{node.tag} {node.text}')
    for child in node:
        print_all(child)

print_all(root)


'cinema'

cinema 
  
name BestCinema
films 
    
categories 
      
category Action
category Thriller
category Soap opera


`root_iter` рекурсивно обходит xml

In [10]:
for child in root.iter('category'):
    print(f'{child.tag} {child.text}')


category Action
category Thriller
category Soap opera


In [11]:
data_string = """
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>
"""


In [12]:
import xml.etree.ElementTree as ET
import io

tree = ET.parse(io.StringIO(data_string))

root = tree.getroot()
countries = root.findall('country')
countries


[<Element 'country' at 0x7f32115dc630>,
 <Element 'country' at 0x7f32115e20e0>,
 <Element 'country' at 0x7f32115e2270>]

In [13]:
for country in countries:
    rank = country.find('rank').text
    name = country.get('name')
    print(rank, name)


1 Liechtenstein
4 Singapore
68 Panama


In [14]:
for rank in root.iter('rank'):
    new_rank = int(rank.text) + 1
    rank.text = str(new_rank)
    rank.set('updated', 'yes')

ET.dump(root)


<data>
    <country name="Liechtenstein">
        <rank updated="yes">2</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E" />
        <neighbor name="Switzerland" direction="W" />
    </country>
    <country name="Singapore">
        <rank updated="yes">5</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N" />
    </country>
    <country name="Panama">
        <rank updated="yes">69</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W" />
        <neighbor name="Colombia" direction="E" />
    </country>
</data>


In [15]:
for neighbor in root.findall('./country/neighbor'):
    neighbor.get('name')


'Austria'

'Switzerland'

'Malaysia'

'Costa Rica'

'Colombia'

In [16]:
for panama in root.findall("*[@name='Panama']"):
    panama.get('name')
    panama.find('year').text


'Panama'

'2011'

In [17]:
for year in root.findall("*[.='2011']"):
    year.text


### BeautifulSoup

In [18]:
from bs4 import BeautifulSoup


In [19]:
async with aiohttp.request('get', 'http://yandex.ru') as resp:
    resp_text = await resp.text()


In [20]:
soup = BeautifulSoup(resp_text, 'html')

soup.h3


<h3 style="font-size:1.5em;">Правила режима самоизоляции и порядок передвижения по Москве</h3>

In [21]:
for child in soup.recursiveChildGenerator():
    if child.name == 'title':
        child


<title>Яндекс</title>

In [22]:
print(soup.prettify()[:1000])


<!DOCTYPE html>
<html class="i-ua_js_no i-ua_css_standart i-ua_browser_unknown i-ua_browser_desktop document_sticky-extra-logo_yes i-ua_platform_other" lang="ru">
 <head xmlns:og="http://ogp.me/ns#">
  <meta content="text/html;charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   Яндекс
  </title>
  <link href="//yastatic.net/iconostasis/_/8lFaTHLDzmsEZz-5XaQg9iTWZGE.png" rel="shortcut icon"/>
  <link href="//yastatic.net/iconostasis/_/5mdPq4V7ghRgzBvMkCaTzd2fjYg.png" rel="apple-touch-icon" sizes="76x76"/>
  <link href="//yastatic.net/iconostasis/_/s-hGoCQMUosTziuARBks08IUxmc.png" rel="apple-touch-icon" sizes="120x120"/>
  <link href="//yastatic.net/iconostasis/_/KnU823iWwj_vrPra7x9aQ-4yjRw.png" rel="apple-touch-icon" sizes="152x152"/>
  <link href="//yastatic.net/iconostasis/_/wT9gfGZZ80sP0VsoR6dgDyXJf2Y.png" rel="apple-touch-icon" sizes="180x180"/>
  <link href="https://yandex.ru/company/press_releases/news.rss" rel="alternat

In [23]:
soup.find(rel="shortcut icon")


<link href="//yastatic.net/iconostasis/_/8lFaTHLDzmsEZz-5XaQg9iTWZGE.png" rel="shortcut icon"/>

In [24]:
soup.find('title')


<title>Яндекс</title>

In [25]:
import re

soup.find_all(string=re.compile("Я"))


['Яндекс',
 'на Яндекс.Услугах',
 'Популярные сервисы Яндекса',
 'Яндекс.Браузер',
 'Штраф за нарушение самоизоляции.\nДрузья, добрый день!\nВчера выкладывала пост про то, как меня задержали. Всем большое спасибо за дельные советы.\nВыкладываю фото протокола.\nЯ сидела в пункте участкового 2 часа, мне сказали, что если не подпишу согласие с протоколом, буду сидеть ещё дольше. Т. К я никогда ни с чем подобным не стал',
 '©\xa0Яндекс']

#### Полноценный пример

In [26]:
import asyncio


In [27]:
sum_rating = 0

async with aiohttp.request('get', 'http://reddit.com') as resp:
    resp_text = await resp.text()

soup = BeautifulSoup(resp_text, 'html')
posts = soup.find_all(id=re.compile('t3_'))

for post in posts:
    upvotes = post.find(string=re.compile("[0-9].[0-9]k"))
    if upvotes:
        try:
            upvotes = float(upvotes[:-1])
        except:
            continue
        print(upvotes)
        sum_rating += upvotes

sum_rating /= 3
print(f'sum_rating is {sum_rating}')


9.8
9.8
9.8
44.0
44.0
44.0
53.3
53.3
53.3
84.4
84.4
84.4
23.9
23.9
23.9
34.4
34.4
34.4
11.5
11.5
11.5
sum_rating is 261.29999999999995
