In [66]:
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
import re


def check_comment_count_is_zero(html_source, css_selector):
    is_comment_count_zero = False
    
    soup = BeautifulSoup(html_source, 'lxml')
    datas = soup.select(css_selector)
    print(datas)
    if len(datas) > 0:
        comment_count_data = datas[0]
        
        if comment_count_data.text == "0 Comments":
            is_comment_count_zero = True
            
    return is_comment_count_zero

def scroll(driver, height=700):
    driver.execute_script(f"window.scrollTo(0, {height});")

def scroll_page(driver, target='essential'):
    last_page_height = driver.execute_script("return document.documentElement.scrollHeight")

    if target == 'essential':
        for _ in range(3):
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(1.0)
            new_page_height = driver.execute_script("return document.documentElement.scrollHeight")
            last_page_height = new_page_height
    else:
        while True:
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            
            time.sleep(2.0)
            
            new_page_height = driver.execute_script("return document.documentElement.scrollHeight")
            
            if new_page_height == last_page_height:
                break
                
            last_page_height = new_page_height
    
    return driver




def get_url_title_in_html_source(html_source, css_selector):
    titles, urls = [], []
    
    soup = BeautifulSoup(html_source, 'lxml')
    
    datas = soup.select(css_selector)
    
    for data in datas:
        title = data.text.replace('\n', '')
        url = "https://www.youtube.com" + data.get('href')
        
        titles.append(title)
        urls.append(url)
        
    return titles, urls

        
def get_channel_video_url_list(channel_url):
    titles = []
    urls = []
    
    driver = wd.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    
    driver.get(channel_url)
    
    driver = scroll_page(driver=driver)
        
    html_source = driver.page_source

    driver.quit()
    
    url_title_css_selector = "#a.yt-simple-endpoint.focus-on-expand.style-scope.ytd-rich-grid-media"
    
    titles, urls = get_url_title_in_html_source(
        html_source=html_source, 
        css_selector=url_title_css_selector
    )
    return titles, urls

def crawl_youtube_page_html_sources(urls):
    html_sources = []

    for idx in range(len(urls)):
        driver = wd.Chrome(ChromeDriverManager().install())
        driver.maximize_window()
        driver.get(urls[idx])
        
        time.sleep(3.0)
        
        scroll(driver)
        
        # comment_css_selector = "ytd-comments-header-renderer.style-scope.ytd-item-section-renderer > div#title > h2#count > yt-formatted-string.count-text.style-scope.ytd-comments-header-renderer"
        # comment_css_selector = "div#contents > ytd-comment-thread-renderer"
        comment_css_selector = "yt-formatted-string.count-text.style-scope.ytd-comments-header-renderer"
        #count > yt-formatted-string
        # driver.find_elements(By.CSS_SELECTOR, comment_css_selector)
        # WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, comment_css_selector)))
        
        html_source = driver.page_source
        
        is_comment_count_zero = check_comment_count_is_zero(
            html_source=html_source, css_selector=comment_css_selector
        )
        
        if not is_comment_count_zero:
            driver = scroll_page(driver=driver)

        html_source = driver.page_source
        html_sources.append(html_source)

        driver.quit()
        
    return html_sources

def post_processing_text(text):
    return text.replace('\n', '').replace('\t', '').replace('                ','') if text is not None else ""

def pack_space(text):
    return " ".join(text.split())

def divide_watch_shorts(titles, urls):
    watch_url, shorts_url = [], []
    
    for title, url in zip(titles, urls):
        url_type = url.split("/")[3].split("?")[0]
        
        if url_type == "watch":
            watch_url.append({
                "title": title, 
                "url": url
            })
        elif url_type == "watch":
            shorts_url.append({
                "title": title, 
                "url": url
            })
            
    return watch_url, shorts_url

def get_user_IDs_and_comments(url_dict, video_type, html_source):
    comment_crawl_result_dict = {
        "title": url_dict['title'], 
        "video_url": url_dict['url'], "video_type": video_type,
        "comment": []
    }
    
    comment_id_css_selector = "ytd-comment-renderer#comment > div#body > div#main > div#header > div#header-author > h3.style-scope.ytd-comment-renderer > a#author-text"
    comment_text_css_selector = "ytd-comment-renderer#comment > div#body > div#main > div#comment-content > ytd-expander#expander > div#content > yt-formatted-string#content-text"
    soup = BeautifulSoup(html_source, 'lxml')


    youtube_user_ID_list = soup.select(comment_id_css_selector)
    youtube_comment_list = soup.select(comment_text_css_selector)

    for youtube_user_id, youtube_comment in zip(youtube_user_ID_list, youtube_comment_list):
        user_id = pack_space(text=post_processing_text(text=youtube_user_id.text))
        comment = post_processing_text(text=youtube_comment.text)

        comment_data_dict = {"id":user_id, "comment":comment}
        
        comment_crawl_result_dict['comment'].append(comment_data_dict)
    
    return comment_crawl_result_dict


def convert_crawl_result_dict_to_csv(crawl_result_dict):
    title = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…《\》]', '', crawl_result_dict['title'])
    
    temp_df = pd.DataFrame(crawl_result_dict['comment'])
    
    try :
        temp_df = temp_df[['id', 'comment']]
    except:
        temp_df = pd.DataFrame()
    
    temp_df.to_csv(f"{title}.csv", index=False)

In [67]:
driver_dir = '/Users/jyh/Downloads/chromedriver-mac-arm64/chromedriver'
driver = wd.Chrome(service=Service(driver_dir))

crawling_result_list = []
url = "https://www.youtube.com/@snubh/videos"
driver.get(url)

css_selector = "a.yt-simple-endpoint.focus-on-expand.style-scope.ytd-rich-grid-media"

driver = scroll_page(driver=driver, target='comment')
html_source = driver.page_source

titles, urls = get_url_title_in_html_source(css_selector=css_selector,  html_source=html_source)
print("Crawling the title, url of all videos completed!")
print(f"No. of videos: {len(titles)}")

urls, shorts_url = divide_watch_shorts(titles, urls)
html_sources = []

# Crawling page sources
for idx in range(len(urls)):
    driver = wd.Chrome(service=Service(driver_dir))
    driver.maximize_window()
    driver.get(urls[idx]['url'])
    
    time.sleep(2.0)
    click_detail = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#expand')))
    click_detail.click()
    scroll(driver)
    # comment_css_selector = "yt-formatted-string.count-text.style-scope.ytd-comments-header-renderer"
    # temp =  WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, comment_css_selector)))
    # is_comment_count_zero = temp.text == "0 Comments"
    
    driver = scroll_page(driver=driver)
    # if not is_comment_count_zero:
    html_source = driver.page_source
    html_sources.append(html_source)
    driver.quit()

# Crawl comments and id
for url_dict, html_source in zip(urls, html_sources):
    video_info_result_dict = {
        "title": url_dict['title'], 
        "video_url": url_dict['url'],
        "comment": []
    }

    title_selector = "#title > h1 > yt-formatted-string"
    view_selector = "#info > span:nth-child(1)"
    date_selector = "#info > span:nth-child(3)"
    description_selector = "#description-inline-expander > yt-attributed-string > span > span:nth-child(6)"
    no_comment_selector = "#count > yt-formatted-string > span:nth-child(1)"
    #info > span:nth-child(1)
    soup = BeautifulSoup(html_source, 'lxml')
    
    title_list = soup.select(title_selector)
    view_count_list = soup.select(view_selector)
    date_list = soup.select(date_selector)
    no_of_comment_list = soup.select(no_comment_selector)
    
    for title, view_count, date in zip(title_list, view_count_list, date_list):
        print(f"title: {title.text}, views: {view_count.text}, date:{date.text}")
        
        title = title.text
        views = view_count.text.split(' ')[0]
        dates = date.text
        # num_comment = no_of_comment.text
        
        video_info_dict = {"title":title, 'views': views, 'dates': dates}
        video_info_result_dict['comment'].append(video_info_dict)
    
    crawling_result_list.append(video_info_result_dict)

#     comment_id_css_selector = "ytd-comment-renderer#comment > div#body > div#main > div#header > div#header-author > h3.style-scope.ytd-comment-renderer > a#author-text"
#     comment_text_css_selector = "ytd-comment-renderer#comment > div#body > div#main > div#comment-content > ytd-expander#expander > div#content > yt-formatted-string#content-text"
#     soup = BeautifulSoup(html_source, 'lxml')

#     youtube_user_ID_list = soup.select(comment_id_css_selector)
#     youtube_comment_list = soup.select(comment_text_css_selector)

#     for youtube_user_id, youtube_comment in zip(youtube_user_ID_list, youtube_comment_list):
#         user_id = pack_space(text=post_processing_text(text=youtube_user_id.text))
#         comment = post_processing_text(text=youtube_comment.text)

#         comment_data_dict = {"id":user_id, "comment":comment}
#         comment_crawl_result_dict['comment'].append(comment_data_dict)

#     # convert_crawl_result_dict_to_csv(comment_crawl_result_dict)
#     crawling_result_list.append(comment_crawl_result_dict)

result_df = pd.DataFrame()
for i in crawling_result_list:
  result_df = pd.concat([result_df, pd.DataFrame(i['comment'])], axis=0) 
result_df['dates'] = pd.to_datetime(result_df['dates']).dt.strftime('%Y-%m-%d')
result_df['views'] = result_df['views'].str.replace(",","").astype(int)
result_df.to_csv('SNUBH_Youtube_crawling.csv')
    

KeyboardInterrupt: 

In [49]:
result_df = pd.DataFrame()
for i in crawling_result_list:
    result_df = pd.concat([result_df, pd.DataFrame(i['comment'])], axis=0) 
result_df['dates'] = pd.to_datetime(result_df['dates']).dt.strftime('%Y-%m-%d')
result_df['views'] = result_df['views'].str.replace(",","").astype(int)


In [65]:
result_df['views'].str.replace(",","").astype(int)

0    1225
0     770
0    1000
0    1803
0    3893
Name: views, dtype: int64

In [61]:
result_df['dates'] = pd.to_datetime(result_df['dates']).dt.strftime('%Y-%m-%d')

In [13]:

def get_essesential_info(): 

  title_selector = "#title > h1 > yt-formatted-string"
  view_selector = "#info > span:nth-child(1)"
  date_selector = "#info > span:nth-child(3)"
  description_selector = "#description-inline-expander > yt-attributed-string > span > span:nth-child(6)"
  no_comment_selector = "#count > yt-formatted-string > span:nth-child(1)"
  title = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, title_selector)))

  view_count = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, view_selector)))

  date = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, date_selector)))

  description = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, description_selector)))

  no_of_comment = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, no_comment_selector)))

  print(f"title: {title}, views: {view_count}, date:{date}, No.of comment: {no_of_comment}")



TimeoutException: Message: 
Stacktrace:
0   chromedriver                        0x0000000100bde65c chromedriver + 4318812
1   chromedriver                        0x0000000100bd6d00 chromedriver + 4287744
2   chromedriver                        0x00000001008087ec chromedriver + 296940
3   chromedriver                        0x0000000100846048 chromedriver + 548936
4   chromedriver                        0x000000010087ed28 chromedriver + 781608
5   chromedriver                        0x000000010083a178 chromedriver + 500088
6   chromedriver                        0x000000010083afc0 chromedriver + 503744
7   chromedriver                        0x0000000100b9ec40 chromedriver + 4058176
8   chromedriver                        0x0000000100ba3160 chromedriver + 4075872
9   chromedriver                        0x0000000100b66e68 chromedriver + 3829352
10  chromedriver                        0x0000000100ba3c4c chromedriver + 4078668
11  chromedriver                        0x0000000100b7bf08 chromedriver + 3915528
12  chromedriver                        0x0000000100bc0140 chromedriver + 4194624
13  chromedriver                        0x0000000100bc02c4 chromedriver + 4195012
14  chromedriver                        0x0000000100bd04d0 chromedriver + 4261072
15  libsystem_pthread.dylib             0x000000018c36ffa8 _pthread_start + 148
16  libsystem_pthread.dylib             0x000000018c36ada0 thread_start + 8
