In [1]:
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm

warnings.filterwarnings('ignore')


### 멜론 플레이리스트 인기 테마 / 인기 장르 
- 메인 Tag URL : https://www.melon.com/dj/tag/djtaghub_list.htm?tagSeq=43#params%5BtagSeq%5D=43&params%5BorderBy%5D=POP&po=pageObj&startIndex=1
- PlayListURL : https://www.melon.com/mymusic/dj/mymusicdjplaylistview_inform.htm?plylstSeq=리스트ID


##### 1. 인기 테마/장르별 태그 이름과 태그 ID가 저장된 파일을 불러온다.

In [2]:
# Load the tag table (tag name + tag id columns) from an EUC-KR encoded CSV.
tag_csv_path = 'data/멜론테그번호.CSV'
df = pd.read_csv(tag_csv_path, encoding='euc-kr')

# Report the table size, then preview the first rows as the cell output.
print(df.shape)
df.head()

(45, 2)


Unnamed: 0,tag_name,id
0,가을,2
1,기분전환,3
2,휴식,4
3,힐링,5
4,사랑,6


##### 2. 태그 ID별로 플레이리스트 목록(1~2페이지)을 가져온 뒤, 각 플레이리스트에서 최대 50곡의 노래 목록만 가져온다.

In [3]:
# Tag IDs to crawl, taken from the `id` column of the tag table.
tag_list = df.id.values

# Tag hub URL pieces; the crawl loop joins them as
# f'{tag_url1}{tag}{tag_url2}{tag}{tag_url3}{start_index}'.
tag_url1 = 'https://www.melon.com/dj/tag/djtaghub_list.htm?tagSeq='
tag_url2 = '#params%5BtagSeq%5D='
tag_url3 = '&params%5BorderBy%5D=POP&po=pageObj&startIndex='

# Playlist detail URL; the playlist ID (plylstSeq) is appended to it.
playlist_url = 'https://www.melon.com/mymusic/dj/mymusicdjplaylistview_inform.htm?plylstSeq='

# Browser-like User-Agent header.
# NOTE(review): not referenced by the Selenium calls in the visible cells;
# presumably intended for `requests` -- confirm before removing.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'}

# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable on other machines instead of editing this cell; the default keeps
# the path this notebook was originally run with.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    'C:/Users/sujung/Downloads/chromedriver_win32/chromedriver.exe',
)

# Selenium 4 takes the driver executable through a Service object; passing the
# path as the first positional argument of webdriver.Chrome is deprecated and
# was removed in later 4.x releases.
driver = webdriver.Chrome(service=Service(chromedriver_path))


In [4]:
def _parse_playlist_page(soup, plylst_seq):
    """Extract one playlist record and its song records from a playlist page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of the playlist detail page.
    plylst_seq : str
        Playlist ID; stored under 'plylstSeq' on every returned record.

    Returns
    -------
    tuple[dict, list[dict]] or None
        ``(playlist_record, song_records)``. ``None`` is returned when the
        title, tag-list or like-count element is missing, or the like count
        cannot be parsed as an integer, so the caller can skip the page
        instead of crashing the whole crawl.
    """
    title_el = soup.select_one('.ellipsis.song_name')
    tag_el = soup.select_one('.tag_list.type03')
    like_el = soup.select_one('.cnt')
    if title_el is None or tag_el is None or like_el is None:
        return None

    try:
        # The like count is the last whitespace-separated token, with
        # thousands separators removed.
        like = int(like_el.text.split()[-1].replace(',', ''))
    except (IndexError, ValueError):
        return None

    song_records = []
    for tr in soup.select('.service_list_song.d_song_list > table > tbody > tr'):
        if tr.select_one('.bullet_icons.age_19'):  # skip 19+ (age-restricted) tracks
            continue

        img_el = tr.select_one('img')
        link_el = tr.select_one('.rank01 > span > a')
        artist_el = tr.select_one('.rank02 > span')
        href = link_el.get('href') if link_el is not None else None
        if img_el is None or artist_el is None or not href:
            continue  # row lacks the expected cells; nothing usable to record

        song_records.append({
            'plylstSeq': plylst_seq,
            # Song ID: last comma-separated argument of the href, minus its
            # two trailing characters.
            'songId': href.split(',')[-1][:-2],
            'title': link_el.text.strip(),
            'artist': artist_el.text,
            'img': img_el.get('src'),
        })

    playlist_record = {
        'plylstSeq': plylst_seq,
        'title': title_el.text.strip(),
        'tag': tag_el.text.replace('\n', ' '),
        'like': like,
        'songIds': ' '.join(song['songId'] for song in song_records),
    }
    return playlist_record, song_records


plist1, plist2 = [], []
parsed_pages = {}  # plylstSeq -> parse result (None when the page was unusable)
failed_ids = []    # playlist IDs whose page could not be parsed

for i in tqdm(tag_list):

    # startIndex 1 and 21, i.e. only the first two result pages of each tag.
    for k in range(1, 22, 20):

        driver.get(f'{tag_url1}{i}{tag_url2}{i}{tag_url3}{k}')
        time.sleep(1)  # fixed pause so the page can render before reading page_source

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        lis = soup.select('.service_list_play.d_djcol_list > ul > li > div > a')

        # Playlist ID: last comma-separated argument of each link's href,
        # minus one leading and three trailing characters.
        plist = [li['href'].split(',')[-1][1:-3] for li in lis if li.has_attr('href')]

        for pli in plist:

            # The same playlist can be listed under several tags. Download and
            # parse it once, then reuse the result; a row is still emitted for
            # every listing, so deduplication remains a separate later step.
            if pli not in parsed_pages:
                driver.get(playlist_url + pli)
                time.sleep(1)
                page = BeautifulSoup(driver.page_source, 'html.parser')
                parsed_pages[pli] = _parse_playlist_page(page, pli)
                if parsed_pages[pli] is None:
                    failed_ids.append(pli)

            parsed = parsed_pages[pli]
            if parsed is None:
                continue

            playlist_record, song_records = parsed
            plist1.append(playlist_record)
            plist2.extend(song_records)


df1 = pd.DataFrame(plist1)
df2 = pd.DataFrame(plist2)

print(df1.shape, df2.shape)
if failed_ids:
    print(f'skipped {len(failed_ids)} playlist page(s) that could not be parsed: {failed_ids}')

df1.to_csv('data/playlist1.csv', index=False)
df2.to_csv('data/playlist2.csv', index=False)

# quit() ends the browser and the chromedriver process; close() only closes
# the current window.
driver.quit()


100%|██████████| 45/45 [1:05:00<00:00, 86.68s/it]


(1800, 5) (77932, 5)


In [11]:
# (playlist rows, distinct playlist IDs, song rows, distinct song IDs)
len(df1), df1['plylstSeq'].nunique(), len(df2), df2['songId'].nunique()

(1800, 1108, 77932, 26609)

In [12]:
# Preview the first five playlist records.
df1.head(5)

Unnamed: 0,plylstSeq,title,tag,like,songIds
0,516301214,"빙글빙글 낭만이 물들어가는, LP판 느낌의 감성팝",#낭만 #감성 #LP #힐링 #휴식 #사랑 #분위기 #가을 #추억 #팝,400,33880688 36359244 36148918 35576427 33664280 3...
1,523235480,IVE 멤버들의 추천곡들,#IVE #아이브 #이서 #원영 #레이 #유진 #리즈 #가을,676,33213267 32544401 33502105 33469725 32464068 3...
2,523324194,가을에 듣는 곡들 ༄,#가을 #단풍 #낙엽 #고독 #뉴진스 #아이브 #르세라핌 #NewJeans #IV...,666,354198 5406681 33618864 2425149 4252725 306695...
3,523237139,2023 2 11 아이브 첫 콘서트,#IVE #아이브 #이서 #원영 #헤이 #유진 #리즈 #가을,682,36356992 36356993 36356994 36356995 36356996 3...
4,495831954,시끄러운 건 싫지만 신나고 싶은 당신을 위한 데일리 POP (상시업뎃),#감성 #기분전환 #휴식 #힐링 #신나는 #도입부 #가을 #외출 #산책 #중독성,7118,34041471 36408583 8072233 34572522 36441928 33...


In [14]:
# Every row recorded for one playlist ID (IDs are compared as strings).
df1.loc[df1['plylstSeq'] == '516301214']

Unnamed: 0,plylstSeq,title,tag,like,songIds
0,516301214,"빙글빙글 낭만이 물들어가는, LP판 느낌의 감성팝",#낭만 #감성 #LP #힐링 #휴식 #사랑 #분위기 #가을 #추억 #팝,400,33880688 36359244 36148918 35576427 33664280 3...
106,516301214,"빙글빙글 낭만이 물들어가는, LP판 느낌의 감성팝",#낭만 #감성 #LP #힐링 #휴식 #사랑 #분위기 #가을 #추억 #팝,400,33880688 36359244 36148918 35576427 33664280 3...
146,516301214,"빙글빙글 낭만이 물들어가는, LP판 느낌의 감성팝",#낭만 #감성 #LP #힐링 #휴식 #사랑 #분위기 #가을 #추억 #팝,400,33880688 36359244 36148918 35576427 33664280 3...
181,516301214,"빙글빙글 낭만이 물들어가는, LP판 느낌의 감성팝",#낭만 #감성 #LP #힐링 #휴식 #사랑 #분위기 #가을 #추억 #팝,400,33880688 36359244 36148918 35576427 33664280 3...
270,516301214,"빙글빙글 낭만이 물들어가는, LP판 느낌의 감성팝",#낭만 #감성 #LP #힐링 #휴식 #사랑 #분위기 #가을 #추억 #팝,400,33880688 36359244 36148918 35576427 33664280 3...
1156,516301214,"빙글빙글 낭만이 물들어가는, LP판 느낌의 감성팝",#낭만 #감성 #LP #힐링 #휴식 #사랑 #분위기 #가을 #추억 #팝,400,33880688 36359244 36148918 35576427 33664280 3...


In [7]:
# Preview the first five song records.
df2.head(5)

Unnamed: 0,plylstSeq,songId,title,artist,img
0,516301214,33880688,Until I Found You,Stephen Sanchez,https://cdnimg.melon.co.kr/cm2/album/images/10...
1,516301214,36359244,Sleep Well,d4vd,https://cdnimg.melon.co.kr/cm2/album/images/11...
2,516301214,36148918,3 Boys,Omar Apollo,https://cdnimg.melon.co.kr/cm2/album/images/11...
3,516301214,35576427,THANK YOU 4 LOVIN’ ME,Paul Partohap,https://cdnimg.melon.co.kr/cm2/album/images/11...
4,516301214,33664280,This Could Be,Joel Ansett,https://cdnimg.melon.co.kr/cm2/album/images/10...


In [17]:
# Every row recorded for one song ID (IDs are compared as strings).
df2.loc[df2['songId'] == '33880688']

Unnamed: 0,plylstSeq,songId,title,artist,img
0,516301214,33880688,Until I Found You,Stephen Sanchez,https://cdnimg.melon.co.kr/cm2/album/images/10...
4784,516301214,33880688,Until I Found You,Stephen Sanchez,https://cdnimg.melon.co.kr/cm2/album/images/10...
5329,524842683,33880688,Until I Found You,Stephen Sanchez,https://cdnimg.melon.co.kr/cm2/album/images/10...
6532,516301214,33880688,Until I Found You,Stephen Sanchez,https://cdnimg.melon.co.kr/cm2/album/images/10...
7135,524842683,33880688,Until I Found You,Stephen Sanchez,https://cdnimg.melon.co.kr/cm2/album/images/10...
8158,516301214,33880688,Until I Found You,Stephen Sanchez,https://cdnimg.melon.co.kr/cm2/album/images/10...
12093,516301214,33880688,Until I Found You,Stephen Sanchez,https://cdnimg.melon.co.kr/cm2/album/images/10...
51297,516301214,33880688,Until I Found You,Stephen Sanchez,https://cdnimg.melon.co.kr/cm2/album/images/10...
59107,524842683,33880688,Until I Found You,Stephen Sanchez,https://cdnimg.melon.co.kr/cm2/album/images/10...


##### 4. 데이터 중복 제거

In [None]:
# TODO: drop duplicate plylstSeq rows from playlist1 (not implemented yet)


##### 5. PlaylistID를 알 때 크롤링

In [1]:
# Playlist IDs to crawl directly (looked up from the two files).
ply_list = ['100123440', '482975847']

In [None]:
plist1, plist2 = [], []

# Crawl each playlist whose ID is already known.
# NOTE(review): this needs a live `driver` session. The tag crawl cell shuts
# the driver down when it finishes, so recreate it before running this loop.
for ply in ply_list:

    driver.get(playlist_url + ply)
    time.sleep(1)  # fixed pause so the page can render before reading page_source

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Fields for the playlist-level record (plist1)
    title = soup.select_one('.ellipsis.song_name').text.strip()
    tag = soup.select_one('.tag_list.type03').text.replace('\n', ' ')
    like = int(soup.select_one('.cnt').text.split()[-1].replace(',', '').strip())

    trs = soup.select('.service_list_song.d_song_list > table > tbody > tr' )
    song_ids = []
    # Fields for the song-level records (plist2)
    for tr in trs:

        if tr.select_one('.bullet_icons.age_19') :  # skip 19+ (age-restricted) tracks
            continue

        img = tr.select_one('img')['src']

        tmp = tr.select_one('.rank01 > span > a')
        # Song ID: last comma-separated argument of the href, minus its two
        # trailing characters.
        s_id = tmp['href'].split(',')[-1][:-2]
        song_ids.append(s_id)

        s_title = tmp.text.strip()
        s_artist = tr.select_one('.rank02 > span').text

        # Key each record by the loop's own playlist ID (`ply`). The previous
        # `pli` was a leftover from another cell's loop: undefined on a fresh
        # kernel, or a stale ID that mislabelled every row.
        plist2.append({'plylstSeq': ply, 'songId': s_id, 'title': s_title, 'artist': s_artist, 'img': img})

    plist1.append({'plylstSeq': ply, 'title': title, 'tag': tag, 'like':like, 'songIds': ' '.join(song_ids)})