In [2]:
from bs4 import BeautifulSoup

In [3]:
# Minimal HTML document used to demonstrate BeautifulSoup navigation and search.
html = '''
<html>
    <head></head>
    <body>
        <div id = 'result'>
           <p class = 'row'>
               <a class = 'red'>Go to page1</a>
               <a class = 'blue'>Go to page2</a>
            </p>
        </div>
    </body>
</html>
'''

# Parse the string with the lxml parser into a navigable tree.
dom = BeautifulSoup(html, 'lxml')

In [4]:
# Attribute-style navigation (dom.html, dom.html.head) yields bs4.element.Tag objects.
type(dom.html.head), type(dom.html)

(bs4.element.Tag, bs4.element.Tag)

In [5]:
# dom.a and dom.find('a') both return the first <a> tag in the document.
dom.a, dom.find('a')

(<a class="red">Go to page1</a>, <a class="red">Go to page1</a>)

In [6]:
# Show the tag name, class list and text of every <a> in the document.
for anchor in dom.find_all('a'):
    print(anchor.name, anchor['class'], anchor.get_text())

a ['red'] Go to page1
a ['blue'] Go to page2


In [7]:
# Restrict the <a> search to tags whose class attribute contains "blue".
dom.find_all('a', attrs={'class': 'blue'})

[<a class="blue">Go to page2</a>]

In [8]:
# A list of tag names matches any of them; here that is the <div> plus both <a> tags.
dom.find_all(['div', 'a'])

[<div id="result">
 <p class="row">
 <a class="red">Go to page1</a>
 <a class="blue">Go to page2</a>
 </p>
 </div>, <a class="red">Go to page1</a>, <a class="blue">Go to page2</a>]

In [9]:
# Search by id attribute only, for any tag name. The name filter is omitted
# instead of being passed as '' -- an empty-string name only works because it
# is falsy, which is not a documented way to say "any tag".
dom.find_all(attrs={'id': 'result'})

[<div id="result">
 <p class="row">
 <a class="red">Go to page1</a>
 <a class="blue">Go to page2</a>
 </p>
 </div>]

### Test URL: http://pythonscraping.com/pages/page3.html

In [11]:
from urllib import parse
import requests

# Request headers with a desktop-Chrome User-Agent string; getDownload passes
# this dict as `headers=` on every request.
header = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}

def getDownload(url, params=None, retries=3, timeout=10):
    """Send a GET request to ``url`` and retry on 5xx server errors.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict, optional
        Query-string parameters forwarded to ``requests.get``. Defaults to
        ``None`` (no parameters) so no mutable default is shared between calls.
    retries : int
        Number of additional attempts allowed when the server answers 5xx.
    timeout : float
        Seconds passed to ``requests.get`` as ``timeout`` so a stalled server
        cannot block the cell forever.

    Returns
    -------
    requests.Response
        The response of the last attempt. An error response is still returned
        (after its status code, reason and headers are printed) for 4xx
        answers and for 5xx answers once the retries are used up, so callers
        should check ``resp.ok`` / ``resp.status_code`` before using the body.

    Raises
    ------
    requests.exceptions.RequestException
        Anything other than ``HTTPError`` (connection errors, timeouts, ...)
        is not caught here and propagates to the caller.
    """
    resp = None

    try:
        resp = requests.get(url, params=params, headers=header, timeout=timeout)
        # Turns 4xx/5xx status codes into an HTTPError handled below.
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= e.response.status_code < 600 and retries > 0:
            # Server-side failure: show the remaining budget and try again.
            print(retries)
            resp = getDownload(url, params, retries - 1, timeout)
        else:
            # Client error or retries exhausted: report and fall through so
            # the failed response is returned to the caller.
            print(e.response.status_code)
            print(e.response.reason)
            print(e.response.headers)

    return resp

In [12]:
# Download the sample page (getDownload returns the response object, so the
# markup is in its .text), parse it, and locate the <div id="footer"> tag.
url = 'http://pythonscraping.com/pages/page3.html'
html = getDownload(url)
dom = BeautifulSoup(html.text, 'lxml')
footer = dom.find('div', attrs={'id': 'footer'})

In [13]:
# find_parent() with no filter returns the tag that directly encloses the footer.
parent = footer.find_parent()
parent.name, parent.attrs

('div', {'id': 'wrapper'})

In [16]:
# find_all() with no arguments is recursive: it collects every descendant of
# `parent` (children, their children, and so on), not only the direct children.
children = parent.find_all()
# children

In [17]:
# recursive=False limits the search to the direct children of `parent`.
children = parent.find_all(recursive=False)
for child in children:
    print(child.name, child.attrs)

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
table {'id': 'giftList'}
div {'id': 'footer'}


In [18]:
# Without recursive=False every nested tag under `parent` is listed.
descendants = parent.find_all()
for node in descendants:
    print(node.name, node.attrs)

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
p {}
br {}
br {}
table {'id': 'giftList'}
tr {}
th {}
th {}
th {}
th {}
tr {'id': 'gift1', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img1.jpg'}
tr {'id': 'gift2', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img2.jpg'}
tr {'id': 'gift3', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img3.jpg'}
tr {'id': 'gift4', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img4.jpg'}
tr {'id': 'gift5', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img6.jpg'}
div {'id': 'footer'}
br {}


In [19]:
# children[2] is the <div id="content"> tag; list only its direct child tags.
divChildren = children[2].find_all(recursive=False)
for child in divChildren:
    print(child.name, child.attrs, child.text)

p {} 
We haven't figured out how to make online shopping carts yet, but you can send us a check to:
123 Main St.
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.


In [20]:
# children[2] is the <div id="content"> tag; the sibling just before it is the
# <h1> stored at children[1].
divTag = children[2]
children[1].name, divTag.find_previous_sibling().name

('h1', 'h1')

In [21]:
divTag.find_next_siblings() # siblings that come after divTag

[<table id="giftList">
 <tr><th>
 Item Title
 </th><th>
 Description
 </th><th>
 Cost
 </th><th>
 Image
 </th></tr>
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005

In [22]:
# The first sibling after divTag is the same tag as children[3] (the giftList table).
children[3] == divTag.find_next_sibling()

True

## 특정부분만 가져오기

In [24]:
# The third cell (index 2) of each table row is the "Cost" column.
aList = dom.find_all('tr')
for tr in aList:
    cells = tr.find_all(recursive=False)
    print(cells[2].text.strip())

Cost
$15.00
$10,000.52
$10,005.00
$0.50
$1.50


## 구글 검색결과

In [25]:
# Pass the search term through `params` so requests URL-encodes it; there is no
# need to hand-build the long encoded query string.
url = 'https://www.google.com/search'
params = {'q':'박보영'}
html = getDownload(url, params)

dom = BeautifulSoup(html.text, 'lxml')

# Result blocks carried class "r" when this cell was run; Google's markup may
# differ now. Search by attribute only (any tag name) and skip blocks without
# an <h3> title instead of failing on None.
for tag in dom.find_all(attrs={'class': 'r'}):
    title = tag.find('h3')
    if title is not None:
        print(title.text)

박보영 - 나무위키
박보영 - 위키백과, 우리 모두의 백과사전
박보영의 작품 목록 - 위키백과, 우리 모두의 백과사전
[8화 예고] 박보영&안효섭, 구슬커플 드디어 로맨스 가나요...? _   어비스 ...
박보영은 오래 지켜본다. 연애도, 연기 변신도 - 중앙일보 - 조인스
박보영, tvN 드라마 '어비스' 여주인공 - MSN.com
`어비스` 박보영·안효섭, 첫 만남부터 눈물…무슨 일? - 스타투데이
'어비스' 박보영·안효섭, '강렬 눈빛 추적자' 변신…'박보영 죽음' 본격 추적 ...
