In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import re
from datetime import datetime, timedelta

ModuleNotFoundError: No module named 'selenium'

In [None]:
def convert_relative_time_to_date(relative_time, reference_date=None):
    """
    Convert relative time strings like '3 months ago', '2 years ago' to YYYY-MM format

    The first "<number> <word>" pair found in the text is used, so surrounding
    text such as a trailing "(edited)" is ignored.

    Parameters:
    relative_time (str): Relative time string from YouTube
    reference_date (datetime, optional): The moment the relative time is
        measured from. Defaults to datetime.now(); pass an explicit value to
        get reproducible results.

    Returns:
    str: Date in YYYY-MM format, or "Unknown" when the text is not a string,
        contains no "<number> <unit>" pair, uses an unrecognised unit, or
        points to a date outside the range datetime can represent
    """
    # Missing values (e.g. None) cannot be searched by the regex
    if not isinstance(relative_time, str):
        return "Unknown"

    current_date = reference_date if reference_date is not None else datetime.now()

    # Extract the number and unit from the relative time
    match = re.search(r'(\d+)\s+(\w+)', relative_time)
    if not match:
        return "Unknown"

    value, unit = int(match.group(1)), match.group(2).lower()

    try:
        # Substring checks so both singular and plural units match
        if 'second' in unit or 'minute' in unit or 'hour' in unit:
            # For very recent comments, just use current month
            result_date = current_date
        elif 'day' in unit:
            result_date = current_date - timedelta(days=value)
        elif 'week' in unit:
            result_date = current_date - timedelta(weeks=value)
        elif 'month' in unit:
            # Count months on a single zero-based axis so that crossing one or
            # more year boundaries falls out of a single divmod
            absolute_month = current_date.year * 12 + (current_date.month - 1) - value
            year, month_index = divmod(absolute_month, 12)
            result_date = datetime(year, month_index + 1, 1)
        elif 'year' in unit:
            result_date = datetime(current_date.year - value, current_date.month, 1)
        else:
            return "Unknown"
    except (ValueError, OverflowError):
        # The requested offset lands outside datetime's supported range
        return "Unknown"

    # Format to YYYY-MM
    return result_date.strftime('%Y-%m')


In [None]:
def _parse_like_count(like_count_text):
    """Convert a like label such as '', '17', '1.2K' or '3M' to an int (0 if it cannot be parsed)."""
    label = (like_count_text or "").strip().upper()
    if not label:
        return 0

    suffix_multipliers = {"K": 1000, "M": 1000000, "B": 1000000000}
    multiplier = suffix_multipliers.get(label[-1])
    if multiplier is None:
        number_part, multiplier = label, 1
    else:
        number_part = label[:-1]

    try:
        # round() rather than int() so binary float error cannot truncate the result
        return int(round(float(number_part) * multiplier))
    except (ValueError, OverflowError):
        return 0


def _read_relative_timestamp(comment):
    """Return the comment's relative-time text, trying each known selector in turn ('Unknown' if none match)."""
    timestamp_selectors = (
        "span#published-time-text",
        "a.yt-simple-endpoint.style-scope.ytd-comment-view-model",
    )
    for selector in timestamp_selectors:
        try:
            return comment.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            # Best effort: fall through to the next selector
            continue
    return "Unknown"


def _extract_comment_record(comment):
    """Build one output row (dict) from a comment-thread element; raises if author or text is missing."""
    author = comment.find_element(By.CSS_SELECTOR, "#author-text").text.strip()
    comment_text = comment.find_element(By.CSS_SELECTOR, "#content-text").text.strip()

    # Like count might not be available for all comments
    try:
        like_count = _parse_like_count(
            comment.find_element(By.CSS_SELECTOR, "#vote-count-middle").text
        )
    except Exception:
        like_count = 0

    relative_timestamp = _read_relative_timestamp(comment)
    if relative_timestamp != "Unknown":
        timestamp = convert_relative_time_to_date(relative_timestamp)
    else:
        timestamp = "Unknown"

    return {
        'Author': author,
        'Comment': comment_text,
        'Likes': like_count,
        'RelativeTimestamp': relative_timestamp,  # Keep the original for reference
        'Timestamp': timestamp  # Converted to YYYY-MM
    }


def scrape_youtube_comments(video_url, max_comments=100, max_scroll_attempts=30, scroll_pause=3):
    """
    Scrape comments from a YouTube video

    Opens the video in headless Chrome, scrolls the page to load comment
    threads, and records author, text, like count and timestamp for each one.
    Comments whose author or text cannot be read are reported and skipped;
    they do not count towards max_comments.

    Parameters:
    video_url (str): URL of the YouTube video
    max_comments (int): Maximum number of comments to scrape
    max_scroll_attempts (int): Number of consecutive scrolls that load no new
        comment threads before giving up
    scroll_pause (float): Seconds to wait after each scroll for comments to load

    Returns:
    pandas.DataFrame: DataFrame containing comments data, with columns
        Author, Comment, Likes, RelativeTimestamp and Timestamp (the columns
        are present even when no comment was scraped)

    Raises:
    Whatever Selenium raises if the page cannot be opened or the comments
    section does not appear within 20 seconds; the browser is closed first.
    """
    columns = ['Author', 'Comment', 'Likes', 'RelativeTimestamp', 'Timestamp']
    thread_selector = "ytd-comment-thread-renderer"

    # Set up Chrome options
    chrome_options = Options()
    for flag in (
        "--headless",  # Run in headless mode
        "--disable-infobars",
        "--disable-extensions",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--no-sandbox",
    ):
        chrome_options.add_argument(flag)

    # Initialize the Chrome driver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        # Open the YouTube video
        driver.get(video_url)
        print(f"Opened video: {video_url}")

        # Wait for the comments section to load; the wait hands back the located element
        wait = WebDriverWait(driver, 20)
        comment_section = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ytd-comments#comments"))
        )
        print("Comments section loaded")

        # Bring the comments into view so YouTube starts loading them
        driver.execute_script("arguments[0].scrollIntoView();", comment_section)

        comments_data = []
        processed_count = 0   # comment threads already visited in the DOM
        stalled_scrolls = 0   # consecutive scrolls that loaded nothing new

        # Scroll and collect comments
        while len(comments_data) < max_comments and stalled_scrolls < max_scroll_attempts:
            comments = driver.find_elements(By.CSS_SELECTOR, thread_selector)

            # Process only the threads that appeared since the last pass
            for comment in comments[processed_count:]:
                # Cap on rows actually collected, not on DOM position, so a
                # failed extraction cannot make the cap unreachable
                if len(comments_data) >= max_comments:
                    break

                try:
                    comments_data.append(_extract_comment_record(comment))
                    print(f"Scraped comment {len(comments_data)}/{max_comments}")
                except Exception as e:
                    print(f"Error extracting comment data: {e}")

            processed_count = len(comments)

            # Done: skip the final scroll and wait
            if len(comments_data) >= max_comments:
                break

            # Scroll down to load more comments
            driver.execute_script("window.scrollBy(0, 2000);")
            time.sleep(scroll_pause)  # Wait for comments to load

            # Check if we've loaded new comments
            loaded_count = len(driver.find_elements(By.CSS_SELECTOR, thread_selector))
            if loaded_count == processed_count:
                stalled_scrolls += 1
            else:
                stalled_scrolls = 0

        print(f"Scraped a total of {len(comments_data)} comments")

        # Create a DataFrame with a fixed schema
        return pd.DataFrame(comments_data, columns=columns)

    finally:
        # Close the browser
        driver.quit()


In [None]:


# Example usage: scrape one video's comments and write them to a CSV file
if __name__ == "__main__":
    # Replace with the YouTube video URL you want to scrape
    video_url = "https://www.youtube.com/watch?v=55XJ1ObZKaM"

    # Set the maximum number of comments to scrape
    max_comments = 50

    # Single source of truth for the output file, so the message below
    # always names the file that was actually written
    output_path = "year6.csv"

    # Scrape comments
    comments_df = scrape_youtube_comments(video_url, max_comments)

    # Save to CSV
    comments_df.to_csv(output_path, index=False)

    print(f"Comments saved to {output_path}")


