# YouTube Channel List Crawling
* Author: 고지형, iloveslowfood
* [The Youtube Channel Crawler](https://www.channelcrawler.com/)로부터 채널명 및 url을 크롤링한다.
* Creation Date(채널 가입일)을 조정하여 크롤링할 수 있다.

In [1]:
import os
import time
import json
import glob
from datetime import date

import pandas as pd
pd.set_option('display.max_rows', 300)
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm

## Manual
  
I. 크롤러 객체 생성
```python
crawler = YTCCCrawler(
    driver_path=driver_path, # 크롬드라이버 경로
    url=url, # 채널크롤러 웹사이트 url
    save_path=save_path # 결과 파일을 저장할 경로
)
```
II. 크롤링 시작
```python
crawler.work(
    date_interval=date_interval, # Creation Date
    filter_size=filter_size # 한번에 훑을 날짜 간격(2 또는 3 권장, 기본값은 2)
)
```

In [3]:
# Crawl configuration consumed by the crawler cells.
url = "https://www.channelcrawler.com/"  # YTCC (The YouTube Channel Crawler) site URL
driver_path = "drivers/chromedriver.exe"  # path to the ChromeDriver executable
save_path = "../raw/#3. channel list"  # directory that receives the result CSV files
date_interval = (565, 655)  # (minimum, maximum) Creation Date filter values

In [4]:
# Build the crawler, then sweep the configured Creation Date range two values per search.
crawler = YTCCCrawler(driver_path=driver_path, url=url, save_path=save_path)
crawler.work(date_interval=date_interval, filter_size=2)

In [1]:
class YTCCCrawler:
    """Collect channel titles and URLs from The YouTube Channel Crawler site.

    The crawler drives a Chrome browser through Selenium: for every window of
    Creation Date values it resets the search form, applies the filter, walks
    the result pages and writes one CSV file per window into ``save_path``.

    Args:
        driver_path: Path to the ChromeDriver executable.
        url: URL of the channel-crawler web site.
        save_path: Directory that receives the resulting CSV files.
    """

    # XPath locators of the search-form widgets this crawler drives.
    _XPATH_MIN_DATE = '//*[@id="queryMinPublishedOn"]'
    _XPATH_MAX_DATE = '//*[@id="queryMaxPublishedOn"]'
    _XPATH_MIN_SUBS = '//*[@id="queryMinSubs"]'
    _XPATH_BUTTONS = {
        'clear': '//*[@id="queryIndexForm"]/div[2]/div[1]/div[4]/div/div/div[1]/div/span[1]/i',
        'search': '/html/body/div/div[3]/div/div[1]/div[2]/div/button',
    }

    # CSS class strings used to locate result cards and the pagination bar.
    _CHANNEL_CARD_CLASS = "channel col-xs-12 col-sm-4 col-lg-3"
    _PAGINATION_CLASS = "col-xs-12 text-center"

    # Page cap used while the pagination bar has never been readable.
    _UNKNOWN_LAST_PAGE = 999
    # Page cap used while pagination is readable but the real last page has
    # not been detected yet.
    _PROBE_LAST_PAGE = 15

    def __init__(self, driver_path: str, url: str, save_path: str = '../raw/#3. channel list'):
        self.driver_path = driver_path
        self.url = url
        self.save_path = save_path

    @staticmethod
    def _date_windows(date_interval, filter_size=2):
        """Split an inclusive Creation Date range into consecutive windows.

        Args:
            date_interval: Sequence whose first and last items are the
                minimum and maximum Creation Date values (inclusive).
            filter_size: Number of consecutive values covered by one window.

        Returns:
            A list of ``(min_date, max_date)`` tuples. A trailing window that
            would contain a single value is omitted.

        Raises:
            ValueError: If ``filter_size`` is 0 (raised by ``range``).
        """
        # NOTE(review): dropping the trailing single-value window mirrors the
        # long-standing behaviour (e.g. (565, 655) with size 2 never searches
        # 655 on its own) -- confirm whether the site accepts min == max.
        first, last = date_interval[0], date_interval[-1]
        windows = []
        for start in range(first, last + 1, filter_size):
            end = min(start + filter_size - 1, last)
            if end > start:
                windows.append((start, end))
        return windows

    def work(self, date_interval, filter_size: int = 2):
        """Crawl every Creation Date window inside ``date_interval``.

        Args:
            date_interval: Sequence whose first and last items are the
                minimum and maximum Creation Date values (inclusive).
            filter_size: Number of Creation Date values searched at once.
        """
        # Validate/compute the windows before a browser is launched.
        windows = self._date_windows(date_interval, filter_size)

        # NOTE(review): the positional driver path is the Selenium 3 style
        # constructor call; newer Selenium releases expect a Service object.
        self.driver = webdriver.Chrome(self.driver_path)
        try:
            print(f'<Creation Date: {date_interval[0]}~{date_interval[-1]}>')
            for min_date, max_date in windows:
                print(f'\tOngoing: {min_date}~{max_date}', end='\t')
                self.initialize(min_date, max_date)  # reset the form for this window
                self.get_info(min_date, max_date)  # collect and save this window
        finally:
            # Always end the browser session, even when a window fails.
            self.driver.quit()

    def initialize(self, min_date, max_date):
        """Open the site and run a fresh search for one Creation Date window."""
        self.driver.get(self.url)
        self.click(option='clear')
        self.set_condition(min_date, max_date)
        self.click(option='search')

    @classmethod
    def _pagination_targets(cls, soup):
        """Read the page numbers targeted by the last two pagination links.

        Returns:
            ``(second_to_last, last)`` page numbers taken from the ``href``
            of the final two links in the pagination bar, or ``None`` when
            the bar is missing or cannot be parsed.
        """
        try:
            links = soup.find('div', class_=cls._PAGINATION_CLASS).find_all('a', href=True)
            return (
                int(links[-2]['href'].split(':')[-1]),
                int(links[-1]['href'].split(':')[-1]),
            )
        except (AttributeError, IndexError, ValueError):
            # No pagination bar, fewer than two links, or a non-numeric target.
            return None

    def get_info(self, min_date, max_date):
        """Walk the result pages of the current search and save them as CSV.

        The channel title and URL of every result card are collected page by
        page, de-duplicated and written to
        ``<save_path>/YTCC_<yymmdd>_(<min_date>, <max_date>).csv``.

        Args:
            min_date: Lower Creation Date bound of the window (file name only).
            max_date: Upper Creation Date bound of the window (file name only).
        """
        channel_title_list = []
        channel_url_list = []

        page_num = 1
        end_num = self._UNKNOWN_LAST_PAGE
        need_check = True  # True until the real last page has been detected
        page_id = self.driver.current_url + '/page:'

        while True:
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            for card in soup.find_all('div', class_=self._CHANNEL_CARD_CLASS):
                anchor = card.find('a', href=True)
                if anchor is None:
                    # A card without a link carries nothing to record.
                    continue
                channel_url_list.append(anchor['href'])
                channel_title_list.append(anchor.get('title', ''))

            if page_num >= end_num:
                break

            if need_check:
                targets = self._pagination_targets(soup)
                if targets is not None:
                    pseudo_end_num, next_num = targets
                    if pseudo_end_num == next_num:
                        # The last two links agree: that page is the final one.
                        end_num = pseudo_end_num
                        need_check = False
                    else:
                        end_num = self._PROBE_LAST_PAGE
                # NOTE(review): when the pagination bar is never readable the
                # loop keeps requesting pages up to _UNKNOWN_LAST_PAGE.

            # Move on to the next page.
            page_num += 1
            self.driver.get(page_id + str(page_num))

        result = pd.DataFrame(dict(channel=channel_title_list, url=channel_url_list))
        result.drop_duplicates(ignore_index=True, inplace=True)

        # Save the file. The tuple inside the f-string is intentional legacy
        # naming and yields e.g. 'YTCC_210101_(565, 566).csv'.
        today = date.today().strftime('%y%m%d')
        file_name = f'YTCC_{today}_{min_date, max_date}.csv'
        if self.save_path:
            # Avoid losing a finished crawl because the folder is missing.
            os.makedirs(self.save_path, exist_ok=True)
        path = os.path.join(self.save_path, file_name)
        result.to_csv(path, index=False)
        print(f"'{file_name}' saved.")

    def set_condition(self, min_date, max_date, min_subs=100):
        """Fill in the Creation Date and minimum-subscriber search fields.

        Args:
            min_date: Value typed into the minimum Creation Date field.
            max_date: Value typed into the maximum Creation Date field.
            min_subs: Value typed into the minimum-subscribers field.
        """
        entries = (
            (self._XPATH_MIN_DATE, min_date),
            (self._XPATH_MAX_DATE, max_date),
            (self._XPATH_MIN_SUBS, min_subs),
        )
        # Locate all fields first, then clear them, then type the values.
        fields = [(self.driver.find_element('xpath', xpath), value) for xpath, value in entries]
        for field, _ in fields:
            field.clear()
        for field, value in fields:
            field.send_keys(str(value))

    def click(self, option):
        """Click one of the search-form buttons.

        Args:
            option: ``'clear'`` to reset the form or ``'search'`` to submit it.

        Raises:
            ValueError: If ``option`` is neither ``'clear'`` nor ``'search'``.
        """
        try:
            xpath = self._XPATH_BUTTONS[option]
        except (KeyError, TypeError):
            raise ValueError(
                f"option must be 'clear' or 'search', got {option!r}"
            ) from None
        self.driver.find_element('xpath', xpath).click()