In [1]:
import requests

In [2]:
def getDownload(url, param = None, retries = 3):
    resp = None
    
    try:
        resp = requests.get(url, params = param, headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'})
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= resp.status_code < 600 and retries > 0:
            print('Retries : {0}'.format(retries))
            return getDownload(url, param, retries -1)
        else:
            print(resp.status_code)
            print(resp.reason)
            print(resp.request.headers)
            
    return resp

In [3]:
from bs4 import BeautifulSoup

In [4]:
url = "http://example.webscraping.com/places/default/index"

In [5]:
html = getDownload(url)
dom = BeautifulSoup(html.text,"lxml")

In [7]:
for tag in dom.select("a"):
    if(tag.has_attr("href")):
        print(tag["href"])

#
/places/default/user/register?_next=/places/default/index
/places/default/user/login?_next=/places/default/index
/places/default/index
/places/default/search
/places/default/view/Afghanistan-1
/places/default/view/Aland-Islands-2
/places/default/view/Albania-3
/places/default/view/Algeria-4
/places/default/view/American-Samoa-5
/places/default/view/Andorra-6
/places/default/view/Angola-7
/places/default/view/Anguilla-8
/places/default/view/Antarctica-9
/places/default/view/Antigua-and-Barbuda-10
/places/default/index/1


In [8]:
from urllib.parse import urljoin

urljoin("http://www.naver.com","/search/about")

'http://www.naver.com/search/about'

In [12]:
urljoin(url,"/places/default/user/register?_next=/places/default/index")
requests.compat.urljoin(url,"/places/default/user/register?_next=/places/default/index")

'http://example.webscraping.com/places/default/user/register?_next=/places/default/index'

In [16]:
for tag in dom.select("a"):
    if(tag.has_attr("href")):
        href = tag["href"]
        print(href)
        
        if href.startswith("#"):
            print("Skipped: "+href)
        elif href.startswith("/"):
            print(requests.compat.urljoin(url,href))
#         print(tag["href"])

#
Skipped: #
/places/default/user/register?_next=/places/default/index
http://example.webscraping.com/places/default/user/register?_next=/places/default/index
/places/default/user/login?_next=/places/default/index
http://example.webscraping.com/places/default/user/login?_next=/places/default/index
/places/default/index
http://example.webscraping.com/places/default/index
/places/default/search
http://example.webscraping.com/places/default/search
/places/default/view/Afghanistan-1
http://example.webscraping.com/places/default/view/Afghanistan-1
/places/default/view/Aland-Islands-2
http://example.webscraping.com/places/default/view/Aland-Islands-2
/places/default/view/Albania-3
http://example.webscraping.com/places/default/view/Albania-3
/places/default/view/Algeria-4
http://example.webscraping.com/places/default/view/Algeria-4
/places/default/view/American-Samoa-5
http://example.webscraping.com/places/default/view/American-Samoa-5
/places/default/view/Andorra-6
http://example.webscraping

In [74]:
def getUrls(url, depth):
    
    if depth > 3:
        return None
    
    html = getDownload(url)
    
    if html.status_code != 200:
        return None
    
    dom = BeautifulSoup(html.text, "lxml")
    
    urls = []
    
    for tag in dom.select("a"):
        if(tag.has_attr("href")):
            href = tag["href"]
#             print(href)

            if href.startswith("http"):
                urls.append({"url":href, "depth":depth+1})
            if href.startswith("#"):
                print("Skipped: "+href)
            if href.startswith("/"):
                if len(href) > 2:
                    newUrl = requests.compat.urljoin(url,href)
    #                 print(requests.compat.urljoin(url,href))
                    if url != newUrl:
                        urls.append({"url":newUrl, "depth":depth+1})                
                if href.startswith("//"):
                    urls.append({"url":"http:"+href, "depth":depth+1})

                
    print("{0} {1} / {2}".format(">"*depth, url, len(urls)))

    return urls

In [26]:
import time
import random

time.sleep(1)
random.randint(1,3)

1

In [75]:
queue = getUrls(url)
visited = [] # timestamp

while queue:
    time.sleep(random.randint(1,3))
    
    seed = queue.pop(0)
    links = getUrls(seed["url"], seed["depth"])
    
    if links != None:
        queue.extend(links)
    
    visited.append(seed)    
    target = [tag for tag in links if tag not in queue and visited]
    ""
    print("Queu: {0}, Links:{1}".format(len(queue), len(target)))
    
    queue.extend(target)

TypeError: getUrls() missing 1 required positional argument: 'depth'

In [34]:
a = [1, 2]
b = [3]
a.append(b)
a

[1, 2, [3]]

In [32]:
a = [1, 2]
b = [3]
a.extend(b)
a

[1, 2, 3]

In [37]:
whos

Variable        Type             Data/Info
------------------------------------------
BeautifulSoup   type             <class 'bs4.BeautifulSoup'>
a               list             n=3
b               list             n=1
dom             BeautifulSoup    <!--[if HTML5]><![endif]-<...>f]-->\n</body>\n</html>\n
getDownload     function         <function getDownload at 0x00000193E3371950>
getUrls         function         <function getUrls at 0x00000193E4B627B8>
href            str              /places/default/index/1
html            Response         <Response [200]>
links           list             n=0
queue           list             n=150
random          module           <module 'random' from 'C:<...>aconda3\\lib\\random.py'>
requests        module           <module 'requests' from '<...>\\requests\\__init__.py'>
seed            str              http://example.webscrapin<...>com/places/default/iso/ME
tag             Tag              <a href="/places/default/index/1">Next &gt;</a>
target 

In [76]:
url = "https://www.google.com/search"
param = {"q":"파이썬"}

html = getDownload(url,param)
dom = BeautifulSoup(html.text,"lxml")

queue= []

for tag in dom.select(".r a > h3"):
    queue.append({"url":tag.find_parent()["href"], "depth":0})
#     print(tag.find_parent()["href"])

In [77]:
queue

[{'url': 'https://www.python.org/', 'depth': 0},
 {'url': 'https://docs.python.org/ko/3/tutorial/index.html', 'depth': 0},
 {'url': 'https://namu.wiki/w/Python', 'depth': 0},
 {'url': 'https://ko.wikipedia.org/wiki/%ED%8C%8C%EC%9D%B4%EC%8D%AC',
  'depth': 0},
 {'url': 'https://programmers.co.kr/learn/courses/2', 'depth': 0},
 {'url': 'https://wikidocs.net/9', 'depth': 0},
 {'url': 'https://wikidocs.net/43', 'depth': 0},
 {'url': 'https://wikidocs.net/6', 'depth': 0},
 {'url': 'https://wikidocs.net/13', 'depth': 0},
 {'url': 'https://opentutorials.org/course/1750', 'depth': 0}]

In [78]:
# getUrls(queue[0])

depth = 0

while queue:
    seed = queue.pop(0)
    
    links = getUrls(seed["url"], seed["depth"])
    
    if links != None:
        queue.extend(links)

Skipped: #content
Skipped: #python-network
Skipped: #top
Skipped: #site-map
Skipped: #
Skipped: #
Skipped: #python-network
Skipped: #python-network
 https://www.python.org/ / 195
Skipped: #the-python-tutorial
 https://docs.python.org/ko/3/tutorial/index.html / 6
Skipped: #
Skipped: #
Skipped: #
Skipped: #
Skipped: #fn-1
Skipped: #s-1
Skipped: #s-2
Skipped: #s-2.1
Skipped: #s-2.2
Skipped: #s-2.3
Skipped: #s-2.4
Skipped: #s-3
Skipped: #s-3.1
Skipped: #s-3.2
Skipped: #s-4
Skipped: #s-4.1
Skipped: #s-4.2
Skipped: #s-4.3
Skipped: #s-4.4
Skipped: #s-4.5
Skipped: #s-4.6
Skipped: #s-4.7
Skipped: #s-4.8
Skipped: #s-4.8.1
Skipped: #s-5
Skipped: #s-5.1
Skipped: #s-6
Skipped: #s-6.1
Skipped: #s-6.2
Skipped: #s-7
Skipped: #s-7.1
Skipped: #s-7.2
Skipped: #s-8
Skipped: #s-8.1
Skipped: #s-8.2
Skipped: #s-9
Skipped: #s-9.1
Skipped: #s-10
Skipped: #s-10.1
Skipped: #toc
Skipped: #fn-2
Skipped: #fn-3
Skipped: #toc
Skipped: #toc
Skipped: #toc
Skipped: #toc
Skipped: #fn-4
Skipped: #toc
Skipped: #fn-5
Skippe

KeyboardInterrupt: 