# HTML形式

## Webスクレイピング

## データサンプル

## データの取得方法

### HTMLを取得してタグを解析する方法

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://peps.python.org/"  # URL of the website to scrape
attrs = (
    "type-status",
    "number",
    "title",
    "authors",
)  # attribute names used as dict keys for the values read from each table row

# Access the website with urllib ... ①
# A timeout is passed so that an unresponsive server raises an error
# instead of blocking the kernel indefinitely.
with urlopen(url, timeout=30) as f:
    html = f.read()

# Parse the downloaded HTML with BeautifulSoup ... ②
soup = BeautifulSoup(html, "html.parser")

# Look the element up by its HTML id and take the first <table> inside it ... ③
section_id = "historical-meta-peps-and-informational-peps"
section = soup.find(id=section_id)
if section is None or section.table is None:
    # find() and tag navigation yield None when nothing matches; fail here with
    # a clear message rather than with an AttributeError on None further down.
    raise ValueError(f"no <table> found under id={section_id!r} at {url}")
table = section.table

out = []
for tr in table.tbody.find_all("tr"):  # every <tr> inside the table body ... ④
    # Text of every <td> in this row, in document order ... ⑤
    line = [td.text for td in tr.find_all("td")]

    # Append one row to the output list as a dict; zip pairs each attribute
    # name with a cell text and stops at the shorter of the two ... ⑥
    out.append(dict(zip(attrs, line)))

print(out[0])

{'type-status': 'PS', 'number': '5', 'title': 'Guidelines for Language Evolution', 'authors': 'Paul Prescod'}


In [3]:
import pandas as pd

# Build a DataFrame from `out`, the list of per-row dicts; the dict keys become the column labels.
df_html_parse = pd.DataFrame(out)
df_html_parse.head()  # last expression of the cell: displays the first rows

Unnamed: 0,type-status,number,title,authors
0,PS,5,Guidelines for Language Evolution,Paul Prescod
1,PS,6,Bug Fix Releases,"Aahz, Anthony Baxter"
2,IF,160,Python 1.6 Release Schedule,"Fred L. Drake, Jr."
3,IF,200,Python 2.0 Release Schedule,Jeremy Hylton
4,IF,226,Python 2.1 Release Schedule,Jeremy Hylton


In [4]:
# (number of rows, number of columns) of the DataFrame built from the parsed HTML
df_html_parse.shape

(36, 4)

### pandasでテーブルタグを取得する方法

In [6]:
import pandas as pd

url = "https://peps.python.org/"  # page whose HTML tables are read

# pd.read_html fetches the page and returns a list of DataFrames, one per table it parses.
tables = pd.read_html(url)
print(len(tables))  # number of tables found on the page

12


In [7]:
# Show the container type first, then the type of each element it holds.
print(type(tables))
print(list(map(type, tables)))

<class 'list'>
[<class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>]


In [8]:
# Select one table by its position in the list returned by read_html.
# NOTE(review): index 5 presumably depends on the page's current table order — confirm before reuse.
df_table = tables[5]
df_table.head()  # last expression of the cell: displays the first rows

Unnamed: 0.1,Unnamed: 0,PEP,Title,Authors,Unnamed: 4
0,SF,100,Python Unicode Integration,Marc-André Lemburg,2.0
1,SF,201,Lockstep Iteration,Barry Warsaw,2.0
2,SF,202,List Comprehensions,Barry Warsaw,2.0
3,SF,203,Augmented Assignments,Thomas Wouters,2.0
4,SF,205,Weak References,"Fred L. Drake, Jr.",2.1


In [9]:
# (number of rows, number of columns) of the selected table
df_table.shape

(249, 5)

In [11]:
# Column labels for the selected table: the four attribute names used for the
# parsed-HTML data plus one extra label for this table's fifth column.
attrs = (
    "type-status",
    "number",
    "title",
    "authors",
    # Extra label for the fifth column.
    # NOTE(review): the displayed values (2.0, 2.1, ...) look like Python
    # versions — confirm that "name-code" is the intended label.
    "name-code",
)
# Relabel on a new DataFrame instead of assigning to `.columns` in place:
# `df_table` is the same object as an element of `tables`, so an in-place
# assignment would also change the frame an earlier cell selected and displayed.
# Re-running this cell gives the same result.
df_table = df_table.set_axis(list(attrs), axis="columns")
df_table.head()

Unnamed: 0,type-status,number,title,authors,name-code
0,SF,100,Python Unicode Integration,Marc-André Lemburg,2.0
1,SF,201,Lockstep Iteration,Barry Warsaw,2.0
2,SF,202,List Comprehensions,Barry Warsaw,2.0
3,SF,203,Augmented Assignments,Thomas Wouters,2.0
4,SF,205,Weak References,"Fred L. Drake, Jr.",2.1
