# YouTube Channel Trend Crawling
* Author: 고지형, [iloveslowfood](https://github.com/iloveslowfood)
* 채널의 구독자 추이와 조회수 추이를 수집한다.
* 최근 360일까지 수집이 가능하다.

In [None]:
import os
import time
import json
import glob
from tqdm import tqdm
import datetime
from datetime import timedelta
from tqdm import tqdm

import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 500)

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

In [None]:
# Names of the channels to crawl, taken from the 'channel' column of the list file.
channel_list = pd.read_csv('../raw/channel_list_지형.csv')['channel'].tolist()
save_path = '../raw/#2. subscribe trend data'  # folder the result files are written to
driver_path = './drivers/chromedriver.exe'  # location of the Chrome driver executable

# Cut the channel list into 20 equally sized batches; whatever does not fit
# into those 20 batches ends up in a 21st batch at the end.
batch_size = len(channel_list) // 20
subsample = [
    channel_list[start:start + batch_size]
    for start in (batch_size * k for k in range(20))
]
subsample.append(channel_list[batch_size * 20:])

crawler = TrendCrawler(driver_path, save_path)

In [None]:
# Crawl the second batch (index 1); change the index to process another batch.
crawler.work(subsample[1])

In [None]:
class TrendCrawler:
    URL = 'https://kr.noxinfluencer.com/'
    def __init__(self, driver_path, save_path):
        self.driver_path = driver_path
        self.save_path = save_path
    
    def work(self, channel_list):
        '''Crawl and save the subscriber/view trends of every channel in `channel_list`.

        For each channel the channel page is opened, both trends are crawled,
        merged with `wrap`, and written to
        `ChannelTrend_<sanitised channel name>.csv` inside `self.save_path`.
        A channel that fails for any ordinary reason is reported and skipped so
        that the remaining channels are still processed.

        Args:
            channel_list: Iterable of channel names to crawl.
        '''
        # Create the output folder (and missing parents). A real failure such as
        # a permission problem now surfaces here instead of making every
        # to_csv() call below fail silently.
        os.makedirs(self.save_path, exist_ok=True)

        self.driver = webdriver.Chrome(self.driver_path)
        try:
            self.driver.get(self.URL)
            for channel_name in channel_list:
                try:
                    self.into_channel(channel_name)
                    sub_trend = self.get_trend(channel_name, trend_type='sub_trend')
                    view_trend = self.get_trend(channel_name, trend_type='view_trend')
                    result = self.wrap(sub_trend, view_trend)

                    file_name = f'ChannelTrend_{self.correct_file_name(channel_name)}.csv'
                    result.to_csv(os.path.join(self.save_path, file_name), index=False)
                    print(f'{file_name} saved.\n')
                except Exception as err:
                    # Best effort per channel: say why it was skipped and go on.
                    # KeyboardInterrupt/SystemExit are deliberately not caught so
                    # the crawl can still be stopped by the user.
                    print(f"Skipped '{channel_name}': {type(err).__name__}: {err}")
                    continue
        finally:
            # Every call starts its own browser, so always shut it down again.
            self.driver.quit()
    
    def get_trend(self, channel_name, trend_type):
        '''Crawl one trend series from the chart of the channel page currently open.

        The cursor is moved across the chart one step at a time (range and step
        width come from `grope`) and the tooltip text is parsed out of the page
        source after every move.

        Args:
            channel_name: Channel name, used only in the progress message.
            trend_type: 'sub_trend' for the subscriber chart; any other value
                accepted by `grope` is handled as the view chart.

        Returns:
            A DataFrame with a datetime 'date' column and a 'subscriber'
            (for 'sub_trend') or 'view' column, with duplicate rows dropped.
        '''
        graph_elements = self.grope(trend_type)
        start_point = graph_elements['start_point']
        element = graph_elements['element']
        pix = graph_elements['pix']
        # NOTE(review): offsets 0 and date_interval themselves are never probed;
        # kept as in the original - confirm whether that is intended.
        steps = range(1, graph_elements['date_interval'])

        date_list = []
        value_list = []

        if trend_type == 'sub_trend':
            value_column = 'subscriber'
            print(f'Getting subscribe trend from {channel_name}...', end='\t')
            # Placeholder for "date unknown"; correct_timeline() looks for
            # exactly this date and replaces it using the neighbouring rows.
            unknown_date = '1900-01-01'
            last_date = None  # most recent date string known so far
            for n in steps:
                self.move_cursor(n, start_point, element, pix)
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                info = soup.find_all('div', id="channel-history-sub-chart")[0].get_text()
                if '획기적' in info:
                    # This kind of tooltip is parsed without a date, so the
                    # date is taken as "the day after the previous tooltip".
                    if last_date is None:
                        # No dated tooltip seen yet. The original code meant to
                        # use the placeholder here, but that handler was
                        # unreachable and NaN was stored instead.
                        date = unknown_date
                    else:
                        try:
                            date = (pd.to_datetime(last_date) + timedelta(1)).strftime('%Y-%m-%d')
                        except (TypeError, ValueError, OverflowError):
                            # Previous tooltip text was not a usable date.
                            date = np.nan
                        else:
                            last_date = date
                    n_sub = self.calc_n_str(info.split(':')[-1].split('구독자 ')[-1])
                else:
                    # Regular tooltip: first 10 characters are the date,
                    # the rest is the subscriber count.
                    date = info[:10]
                    last_date = date
                    n_sub = self.calc_n_str(info[10:])
                date_list.append(date)
                value_list.append(n_sub)
        else:
            value_column = 'view'
            print(f'Getting view trend from {channel_name}...')
            for n in steps:
                self.move_cursor(n, start_point, element, pix)
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                info = soup.find('div', id="channel-history-view-chart").find_all('div')[-1].get_text()
                date = info[:10]
                count_text = info[10:]
                if '조회수' in count_text:
                    # Keep only the part in front of the '조회수' label.
                    n_view = self.calc_n_str(count_text.strip().split('조회수')[0])
                else:
                    n_view = self.calc_n_str(count_text)

                date_list.append(date)
                value_list.append(n_view)

        trend = pd.DataFrame({'date': date_list, value_column: value_list})
        trend['date'] = pd.to_datetime(trend['date'])
        trend = self.correct_timeline(trend)
        trend = trend.drop_duplicates(ignore_index=True)

        return trend
        
    def grope(self, trend_type):
        '''Work out the date range of a chart and how it maps to cursor offsets.

        Waits (up to 10 seconds) for the chart canvas, then uses `find_edges`
        to locate the first and last probe-able points of the graph.

        Args:
            trend_type: 'sub_trend' or 'view_trend'.

        Returns:
            A dict with:
              - 'date_interval': number of days between both edges,
              - 'start_point': cursor offset of the start edge,
              - 'element': the canvas WebElement,
              - 'pix': offset distance per day (float).

        Raises:
            NotImplementedError: If `trend_type` is not one of the two above.
        '''
        chart_ids = {
            'sub_trend': 'channel-history-sub-chart',
            'view_trend': 'channel-history-view-chart',
        }
        if trend_type not in chart_ids:
            raise NotImplementedError(f'Unsupported trend_type: {trend_type!r}')

        canvas_xpath = f'//*[@id="{chart_ids[trend_type]}"]/div[1]/canvas'
        wait = WebDriverWait(self.driver, 10)
        element = wait.until(lambda x: x.find_element_by_xpath(canvas_xpath))

        # Locate the graph: half of the canvas width is the reference offset.
        origin = element.size['width'] // 2
        self.move_cursor(0, origin, element)

        # Start probing slightly inside both ends of the canvas.
        start_origin = -origin + 60
        end_origin = origin - 20

        start_point, start_date = self.find_edges(element, start_origin, trend_type, 'start')
        end_point, end_date = self.find_edges(element, end_origin, trend_type, 'end')
        pix_interval = end_point - start_point
        date_interval = (end_date - start_date).days
        # NOTE: raises ZeroDivisionError when both edges report the same date.
        pix = pix_interval / date_interval

        return dict(date_interval=date_interval, start_point=start_point, element=element, pix=pix)
    
    def find_edges(self, element, origin, trend_type, option, margin=-2) -> ('point', 'date'):
        '''Search for the cursor offset and date of one edge of the graph.

        The tooltip text read with the cursor at offset 0 is used as the
        reference (`standard`). Starting at `origin`, the cursor is re-placed
        step by step and the first 10 characters of the tooltip are compared
        with the reference; the search stops once they are equal again.
        (Presumably the tooltip stays at the reference text once the cursor
        is outside the plotted area - TODO confirm.)

        Args:
            element: Chart canvas element handed to `move_cursor`.
            origin: Cursor offset at which the search starts.
            trend_type: 'sub_trend' or 'view_trend'. Any other value matches
                no branch, so the method implicitly returns None.
            option: 'start' searches the start edge; every other value
                searches the end edge.
            margin: Step added to `origin` per probe for the start edge and
                subtracted for the end edge (default -2).

        Returns:
            Tuple `(point, date)`: last recorded offset and the date
            (pandas Timestamp) belonging to it.

        NOTE(review): if the very first probe already equals the reference,
        the point/date variables are never assigned and the `return`
        raises UnboundLocalError.
        '''
        if trend_type == 'sub_trend':
            # Reference tooltip: first 10 characters shown at offset 0.
            self.move_cursor(0, 0, element)
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            standard = soup.find_all('div', id="channel-history-sub-chart")[0].get_text()[:10]
            if option=='start':
                compare = None
                while True:
                    # Probe the tooltip at the current offset.
                    self.move_cursor(n_offset=0, origin=origin, element=element)
                    soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                    temp = soup.find_all('div', id="channel-history-sub-chart")[0].get_text()
                    compare = temp[:10]

                    if compare != standard:
                        if '획기적' not in compare: # a regular date, not a '획기적 사건' tooltip
                            # Step on by `margin` and remember the date just read;
                            # note the recorded point is the already-stepped offset.
                            origin += margin
                            start_date = pd.to_datetime(soup.find_all('div', id="channel-history-sub-chart")[0].get_text()[:10])
                            start_point = origin
                            continue

                        else: # a '획기적 사건' tooltip showed up
                            # Step in the opposite direction until a regular
                            # dated tooltip appears, then take the day before it.
                            while True:
                                self.move_cursor(n_offset=0, origin=origin, element=element)
                                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                                pseudo_start = soup.find_all('div', id="channel-history-sub-chart")[0].get_text()[:10]
                                start_point = origin
                                if '획기적' not in pseudo_start:
                                    start_date = pd.to_datetime(pseudo_start[:10]) - timedelta(1)
                                    break
                                else:
                                    origin -= margin
                                    continue
                            break
                    else:
                        # Tooltip equals the reference again: stop searching.
                        break
                return start_point, start_date

            else:
                compare = None
                while True:
                    # Probe the tooltip at the current offset.
                    self.move_cursor(n_offset=0, origin=origin, element=element)
                    soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                    temp = soup.find_all('div', id="channel-history-sub-chart")[0].get_text()
                    compare = temp[:10]

                    if compare != standard:
                        if '획기적' not in compare: # a regular date, not a '획기적 사건' tooltip
                            # Mirror image of the start search: step by -margin.
                            origin -= margin
                            end_date = pd.to_datetime(soup.find_all('div', id="channel-history-sub-chart")[0].get_text()[:10])
                            end_point = origin
                            continue

                        else: # a '획기적 사건' tooltip showed up
                            # Step back by `margin` until a regular dated tooltip appears.
                            # NOTE(review): this subtracts one day exactly like the start
                            # search; confirm whether + timedelta(1) was intended here.
                            while True:
                                self.move_cursor(0, origin, element)
                                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                                pseudo_end = soup.find_all('div', id="channel-history-sub-chart")[0].get_text()[:10]
                                end_point = origin
                                if '획기적' not in pseudo_end:
                                    end_date = pd.to_datetime(pseudo_end[:10]) - timedelta(1)
                                    break
                                else:
                                    origin += margin
                                    continue
                            break
                    else:
                        # Tooltip equals the reference again: stop searching.
                        break
                return end_point, end_date

        elif trend_type == 'view_trend':
            # Reference tooltip: text of the last <div> inside the view chart at offset 0.
            self.move_cursor(0, 0, element)
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            standard = soup.find('div', id="channel-history-view-chart").find_all('div')[-1].get_text()[:10]
            if option=='start':
                compare = None
                while True:
                    self.move_cursor(n_offset=0, origin=origin, element=element)
                    soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                    temp = soup.find('div', id="channel-history-view-chart").find_all('div')[-1].get_text()
                    compare = temp[:10]

                    if compare != standard:
                        # Still a different tooltip: step on and remember date/offset.
                        origin += margin
                        start_date = pd.to_datetime(compare)
                        start_point = origin
                        continue
                    else:
                        break
                return start_point, start_date

            else:
                compare = None
                while True:
                    self.move_cursor(n_offset=0, origin=origin, element=element)
                    soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                    temp = soup.find('div', id="channel-history-view-chart").find_all('div')[-1].get_text()
                    compare = temp[:10]

                    if compare != standard:
                        # Still a different tooltip: step on (by -margin) and remember date/offset.
                        origin -= margin
                        end_date = pd.to_datetime(compare)
                        end_point = origin
                        continue
                    else:
                        break
                return end_point, end_date
            
    def move_cursor(self, n_offset, origin, element, pix=2):
        '''Place the mouse on `element`, then shift it horizontally.

        The horizontal shift is `origin + pix*n_offset`; the vertical shift is 0.

        Args:
            n_offset: Number of steps to move away from `origin`.
            origin: Base horizontal offset.
            element: WebElement the cursor is first moved onto.
            pix: Width of one step (default 2).
        '''
        horizontal_shift = origin + pix*n_offset
        chain = webdriver.common.action_chains.ActionChains(self.driver)
        chain.move_to_element(element).move_by_offset(horizontal_shift, 0).perform()
        
    def into_channel(self, channel_name):
        '''Search for `channel_name` and open the page of the matching channel.

        Types the name into the header search box, waits until a search result
        can be parsed from the page source (up to 11 attempts, 0.5 s apart)
        and loads the link of that result.

        Args:
            channel_name: Channel name exactly as it should appear in the result.

        Raises:
            NotImplementedError: No search result could be parsed in time
                (exception type kept from the original implementation).
            NameError: The first listed channel name differs from
                `channel_name` (exception type kept from the original).
        '''
        search_box = self.driver.find_element_by_xpath('//*[@id="header-search-input"]')
        search_box.clear()
        search_box.send_keys(channel_name)
        time.sleep(0.5)
        search_box.send_keys(Keys.ENTER)

        max_retries = 10
        for attempt in range(max_retries + 1):
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            try:
                anchor = soup.find('div', class_='result').find('a', class_="channel-name ellipsis", href=True)
                link = self.URL[:-1] + anchor['href']
                break
            except (AttributeError, TypeError):
                # find() returned None: the result list is not rendered yet.
                # Only these "not there yet" errors are retried, so Ctrl-C and
                # real driver failures are no longer swallowed.
                if attempt == max_retries:
                    raise NotImplementedError(
                        f"No search result found for channel '{channel_name}'"
                    ) from None
                time.sleep(0.5)

        compare = soup.find('a', class_='channel-name ellipsis').get_text().strip()

        if compare != channel_name.strip():
            print(f"Could not found channel '{channel_name.strip()}'")
            raise NameError(f"Search result '{compare}' does not match '{channel_name.strip()}'")

        self.driver.get(link)
        
    def wrap(self, sub_trend, view_trend):
        '''구독자 추이, 조회수 추이를 병합하는 함수'''
        result = sub_trend.merge(view_trend, how='outer', on='date').sort_values(by='date', ignore_index=True)
        result = self.imputate(result)
        result['subscriber'] = result['subscriber'].astype(int)
        result['view'] = result['view'].astype(int)
        return result    
        
    @staticmethod
    def calc_n_str(str_n):
        if '만' in str_n:
            num = str_n.split('만')[0]
            return int(float(num) * 10000)
        elif '천' in str_n:
            num = str_n.split('천')[0]
            return int(float(num) * 1000)
        else:
            return int(float(str_n))
        
    @staticmethod
    def correct_file_name(title):
        invalid_file_name_list = ['\\', '/', ':', '*', '?', '"', '<', '>', '|']
        for inv in invalid_file_name_list:
            if inv in title:
                title = title.replace(inv, '')
        return title

    @staticmethod
    def correct_timeline(data):
        idx_to_correct = data[data['date'] == pd.to_datetime('1900-01-01')].index.tolist()
        if idx_to_correct:
            idx_shift = list(np.array(idx_to_correct) + 1)
            data.loc[idx_to_correct, 'date'] = (data.loc[idx_shift, 'date'] - timedelta(1)).values
        return data
    
    @staticmethod
    def imputate(df):
        detect = df.isnull().sum()
        missing_cols = detect[detect != 0].index.tolist()
        for col in missing_cols:
            missing_idx = df[df[col].isnull()].index.tolist()
            for m in missing_idx:
                if m == 0:
                    upper_fill = df.loc[m+1:, col]
                    upper_bound = upper_fill[upper_fill.notnull()].tolist()[0]
                    fill_value = int(upper_bound)
                    pass
                elif m == df.shape[0]-1:
                    lower_fill = df.loc[:m-1, col]
                    lower_bound = lower_fill[lower_fill.notnull()].tolist()[-1]
                    fill_value = int(lower_bound)
                else:
                    lower_fill = df.loc[:m-1, col]
                    upper_fill = df.loc[m+1:, col]
                    lower_bound = lower_fill[lower_fill.notnull()].tolist()[-1]
                    upper_bound = upper_fill[upper_fill.notnull()].tolist()[0]
                    fill_value = int(np.mean([lower_bound, upper_bound]))
                df.loc[m, col] = fill_value
        return df