# 크롤링

In [1]:
import json
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

## 벅스

In [2]:
# Download the Bugs chart page, sending a browser-like User-Agent header.
url = 'https://music.bugs.co.kr/chart'
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko')
# Issue the request exactly once and close the response when done; calling
# urlopen() twice would download the page twice and leave a response open.
with urllib.request.urlopen(req) as r:
    html = r.read()

In [3]:
# Parse the downloaded page with the html5lib parser.
soup = BeautifulSoup(markup=html, features="html5lib")
# Collect every <tr> that is a direct child of a <tbody> directly under ".list".
chart_row_selector = ".list > tbody > tr"
tr_tag_list = soup.select(chart_row_selector)

In [4]:
# Build one record per chart row: the track id comes from the row's "trackid"
# attribute, the rest from the text of the first matching element in the row.
song_list = []
for tr_tag in tr_tag_list:  # no enumerate(): the rank counter was never used
    song_no = tr_tag["trackid"]
    song_tag = tr_tag.select_one("th .title a")
    album_tag = tr_tag.select_one("td.left .album")
    artist_tag = tr_tag.select_one("td.left .artist a")

    song_list.append({
        "song_no": song_no,
        "title": song_tag.text,
        "album": album_tag.text,
        "artist": artist_tag.text,
    })

In [5]:
# Preview the first two parsed chart entries.
song_list[0:2]

[{'song_no': '6171743', 'title': 'POSE', 'album': 'POSE', 'artist': '키노'},
 {'song_no': '6170060',
  'title': 'Attention',
  'album': "NewJeans 1st EP 'New Jeans'",
  'artist': 'NewJeans'}]

## 멜론

In [6]:
# Download the Melon chart page, sending a browser-like User-Agent header.
url = 'https://www.melon.com/chart/index.htm'
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko')
# Issue the request exactly once and close the response when done; calling
# urlopen() twice would download the page twice and leave a response open.
with urllib.request.urlopen(req) as r:
    html = r.read()

In [7]:
# Parse the downloaded page with the html5lib parser.
soup = BeautifulSoup(markup=html, features="html5lib")
# Collect every <tr> found under a <tbody> inside ".d_song_list".
chart_row_selector = ".d_song_list tbody tr"
tr_tag_list = soup.select(chart_row_selector)

In [8]:
# Build one record per chart row: the song number comes from the row's
# "data-song-no" attribute; title, album and artist come from the first link
# in the row whose href contains the corresponding substring.
song_list = []
for tr_tag in tr_tag_list:  # no enumerate(): the rank counter was never used
    song_no = tr_tag["data-song-no"]
    song_tag = tr_tag.select_one("a[href*=playSong]")
    album_tag = tr_tag.select_one(".wrap_song_info a[href*=goAlbumDetail]")
    artist_tag = tr_tag.select_one("a[href*=goArtistDetail]")

    song_list.append({
        "song_no": song_no,
        "title": song_tag.text,
        "album": album_tag.text,
        "artist": artist_tag.text,
    })

In [9]:
# Preview the first five parsed chart entries.
song_list[0:5]

[{'song_no': '35454425',
  'title': 'Attention',
  'album': "NewJeans 1st EP 'New Jeans'",
  'artist': 'NewJeans'},
 {'song_no': '35383397',
  'title': '그때 그 순간 그대로 (그그그)',
  'album': 'WSG워너비 1집',
  'artist': 'WSG워너비 (가야G)'},
 {'song_no': '35383398',
  'title': '보고싶었어',
  'album': 'WSG워너비 1집',
  'artist': 'WSG워너비 (4FIRE)'},
 {'song_no': '35454426',
  'title': 'Hype boy',
  'album': "NewJeans 1st EP 'New Jeans'",
  'artist': 'NewJeans'},
 {'song_no': '34847378',
  'title': 'LOVE DIVE',
  'album': 'LOVE DIVE',
  'artist': 'IVE (아이브)'}]

In [10]:
def get_like_count(song_no_list):
    """Fetch like counts for one or more Melon songs.

    Args:
        song_no_list: A single song number (int or str), or an iterable of
            song numbers.

    Returns:
        A dict mapping each "CONTSID" in the response (converted to str) to
        its "SUMMCNT" value.
    """
    # A bare int/str is a single id; anything else is treated as an iterable
    # of ids. Without the join, a list would be sent as its Python repr
    # ("[1, 2]").
    # NOTE(review): assumes the endpoint accepts comma-separated ids in
    # "contsIds" -- confirm against the live API.
    if isinstance(song_no_list, (str, int)):
        song_ids = str(song_no_list)
    else:
        song_ids = ",".join(str(song_no) for song_no in song_no_list)

    params = urllib.parse.urlencode({"contsIds": song_ids})
    api_url = f'https://www.melon.com/commonlike/getSongLike.json?{params}'
    req = urllib.request.Request(api_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko')
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as r:
        response = json.loads(r.read())

    like_list = response["contsLike"]
    like_dict = {str(song["CONTSID"]): song["SUMMCNT"] for song in like_list}
    return like_dict

In [11]:
# Look up the like count for a single song number.
get_like_count(34847378)

{'34847378': 178221}

## 다나와

In [12]:
# Download the Danawa search results page for the keyword "AMD", sending a
# browser-like User-Agent header.
params = urllib.parse.urlencode({"k1": "AMD"})
url = f'https://search.danawa.com/dsearch.php?{params}&module=goods&act=dispMain'
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko')
# Issue the request exactly once and close the response when done; calling
# urlopen() twice would download the page twice and leave a response open.
with urllib.request.urlopen(req) as r:
    html = r.read()

In [13]:
# Parse the downloaded page with the html5lib parser.
soup = BeautifulSoup(markup=html, features="html5lib")
# Collect every product <li> inside the main product list.
product_item_selector = "div.main_prodlist ul.product_list li.prod_item"
li_tag_list = soup.select(product_item_selector)

In [14]:
# Try the three selectors on the first product item only.
first_item = li_tag_list[0]
title = first_item.select('p.prod_name a')[0].text
spec_list = first_item.select('div.spec_list')[0].text.strip()
# The price text has surrounding whitespace and thousands separators removed.
price_tag = first_item.select('li.rank_one p.price_sect a strong')[0]
price = price_tag.text.strip().replace(',', '')

In [15]:
def _first_text(tag, selector):
    """Return the text of the first element under *tag* matching *selector*.

    Returns "" when nothing matches. This replaces the bare ``except:``
    fallbacks, which also swallowed unrelated errors and KeyboardInterrupt.
    """
    matches = tag.select(selector)
    return matches[0].text if matches else ""


# Build one {'title', 'price'} record per product item; a field whose selector
# matches nothing in the item is stored as "".
prod_data = []
for prod_item in li_tag_list:
    title = _first_text(prod_item, 'p.prod_name a')
    # spec_list is extracted but deliberately not stored in the record below.
    spec_list = _first_text(prod_item, 'div.spec_list').strip()
    # Strip whitespace and thousands separators from the price text.
    price = _first_text(prod_item, 'li.rank_one p.price_sect a strong').strip().replace(',', '')

    prod_data.append({'title': title, 'price': price})
    

In [16]:
# Display all collected product records.
prod_data

[{'title': 'AMD 라이젠5-4세대 5600X (버미어)', 'price': '206010'},
 {'title': 'AMD 라이젠5-4세대 5600 (버미어)', 'price': '184210'},
 {'title': 'LPjl3gIMDg', 'price': '367073'},
 {'title': 'AMD 라이젠7-4세대 5800X3D (버미어)', 'price': '599680'},
 {'title': 'AMD 라이젠7-4세대 5800X (버미어)', 'price': '303420'},
 {'title': 'AMD 라이젠5-4세대 5600G (세잔)', 'price': '195420'},
 {'title': '', 'price': ''},
 {'title': 'AMD 라이젠9-4세대 5900X (버미어)', 'price': '487950'},
 {'title': 'AMD 라이젠5 PRO 4650G (르누아르)', 'price': '154840'},
 {'title': 'AMD 라이젠5-3세대 4600G (르누아르)', 'price': '144040'},
 {'title': 'AMD 라이젠9-4세대 5950X (버미어)', 'price': '669800'},
 {'title': 'AMD 라이젠7-4세대 5700G (세잔)', 'price': '322140'},
 {'title': '', 'price': ''},
 {'title': 'AMD 라이젠7-4세대 5700X (버미어)', 'price': '296390'},
 {'title': 'AMD 라이젠5-4세대 5500 (세잔)', 'price': '159740'},
 {'title': 'AMD 라이젠 스레드리퍼 PRO 5995WX (샤갈 프로)', 'price': '10335000'},
 {'title': 'AMD 라이젠3 PRO 4350G (르누아르)', 'price': '130000'},
 {'title': 'AMD 라이젠7 PRO 4750G (르누아르)', 'price': '258050'},
 