In this notebook, we extract the comments of Reddit posts and YouTube videos related to the search keyword "Tesla Model".


--To get the comments from Reddit, we use PRAW (the Python Reddit API Wrapper). We first search for posts matching the keyword 'Tesla Model', then extract the comments under these posts.

--To get the comments from YouTube, we use Selenium to search YouTube for the keyword "Tesla Model" and scrape the comments of the videos it returns. By default, we scrape 20 comments per video and collect over 1700 comments in total.

Finally, we convert the results to a csv file *Reddit&YouTube Comments.csv*.

# Extract Comments from Reddit

## Install and import libraries

In [1]:
# Import Libraries
import praw
import json
import pandas as pd
import numpy as np
from datetime import datetime
from praw.models import MoreComments
import csv

## Search Reddit for 'Tesla model' and save the comments

In [2]:
# Reddit API credentials. These are placeholders: replace them with the values
# of your own Reddit application and account before running this cell.
client_id = "......"
client_secret = "......"
username = "......"
password = "******"
user_agent = "......"

# PRAW client built from the credentials above; the search cell below uses it.
reddit = praw.Reddit(
    username=username,
    password=password,
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

In [None]:
# Retrieve from the Search API

# Imported in this cell as well so it keeps working when re-run after the
# YouTube cells, whose `import datetime` rebinds `datetime` to the module.
from datetime import datetime, timezone

# Set keyword and search in all subreddits
keyword = "tesla model"
search = reddit.subreddit("all").search(keyword)

# One row per submission returned by the search
search_posts = []

# Extract features
for submission in search:

    # Collect the comment bodies of this submission. MoreComments entries are
    # placeholders rather than actual comments, so they are skipped.
    search_comments = [
        comment.body
        for comment in submission.comments
        if not isinstance(comment, MoreComments)
    ]

    # Add submission information to the search results
    search_posts.append([
        submission.url,
        submission.title,
        submission.id,
        submission.author,
        # created_utc is a UTC epoch timestamp: convert it explicitly as UTC so
        # the stored time does not depend on the timezone of this machine.
        datetime.fromtimestamp(submission.created_utc, tz=timezone.utc),
        submission.num_comments,
        # Plain list (not a NumPy array) so the CSV cell keeps a separator
        # between the individual comments.
        search_comments,
    ])

# Build a DataFrame to store the search results, and save it to a .csv file.
submissions = pd.DataFrame(
    search_posts,
    columns=['Url', 'Title', 'Title_id', 'Author', 'Sub_time', 'No. of comments', 'Comments'],
)
submissions.to_csv("comments_Reddit.csv")

# Extract Comments from YouTube




## Install and import libraries

In [None]:
import time
import random
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import datetime
import pandas as pd

## Create an Edge instance and search YouTube with the keyword

In [None]:
# Create an Edge instance with a fixed user agent and an English UI language.
edge_options = webdriver.EdgeOptions()
for browser_arg in (
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'lang=en-US',
):
    edge_options.add_argument(browser_arg)
service = Service('edgedriver_win64/msedgedriver.exe')
bor = webdriver.Edge(service=service, options=edge_options)
driver = bor  # both names refer to the same browser session

# Open the YouTube search results page for the keyword.
key_word = 'Tesla model'
url = f"https://www.youtube.com/results?search_query={key_word}"
bor.get(url)
time.sleep(15)   # wait to ensure the page is fully loaded

# Among the filled buttons on the page, keep those labelled "Accept all".
accept_buttons = driver.find_elements(By.CSS_SELECTOR, '.yt-spec-button-shape-next.yt-spec-button-shape-next--filled')
accept_button = list(filter(lambda b: b.text == 'Accept all', accept_buttons))

# Click the "Accept all" button if it exists; otherwise just report it.
if not accept_button:
    print("The button not found")
else:
    accept_button[0].click()
    time.sleep(5)  # wait to ensure the click is finished

In [None]:
# Search videos on YouTube with a keyword, and save the results in a list.

def search(key_word, fetch_numbers=100, max_stalls=3):
    """Collect video entries from the YouTube search results for a keyword.

    Opens the search results page in the shared browser ``bor`` and keeps
    scrolling to the bottom of the page until more than ``fetch_numbers``
    ``ytd-video-renderer`` elements are present, or until ``max_stalls``
    consecutive scrolls load no additional results.

    Args:
        key_word: Search keyword; it is URL-encoded before being placed in
            the query string.
        fetch_numbers: Scrolling stops once the number of loaded videos is
            strictly greater than this value.
        max_stalls: Number of consecutive scrolls without new results after
            which scrolling is abandoned, so the loop cannot run forever when
            fewer than ``fetch_numbers`` results exist.

    Returns:
        A list with one dict per loaded video, holding its ``title``, ``link``
        and ``meta`` text. All loaded videos are returned, so the list can be
        longer than ``fetch_numbers``.
    """
    from urllib.parse import quote_plus

    # Encode the keyword so spaces and characters such as '&' or '#' cannot
    # break the query string.
    url = f"https://www.youtube.com/results?search_query={quote_plus(str(key_word))}"
    bor.get(url)
    print('scrolling ....', end='  ')

    previous_count = -1  # sentinel: the first pass is never counted as a stall
    stalls = 0
    while True:  # scroll down to load more videos
        bor.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        # Wait 10 to 15 seconds randomly, to let results load and to avoid
        # being detected as a scraper.
        time.sleep(random.randint(10, 15))
        search_items = bor.find_elements(by='tag name', value='ytd-video-renderer')
        found = len(search_items)
        print(found, end=' ')
        if found > fetch_numbers:
            break

        # Track consecutive scrolls that did not load anything new.
        stalls = stalls + 1 if found == previous_count else 0
        if stalls >= max_stalls:
            break
        previous_count = found

    search_page = []  # save search results
    for one_item in search_items:
        title_div = one_item.find_element(by='id', value='video-title')
        meta_text = one_item.find_element(by='tag name', value='ytd-video-meta-block').text
        search_page.append(
            dict(
                title=title_div.text,
                link=title_div.get_attribute('href'),
                meta=meta_text,
            )
        )
    return search_page

In [None]:
# Collect comments of videos.
# Scroll down to load more comments, until the target number of comments is
# reached or no new comments are loaded.

def get_comments(url, fetch_comment_num=30):
    """Scrape the comments shown on one YouTube video page.

    Opens ``url`` in the shared browser ``bor`` and scrolls down step by step
    so that more ``ytd-comment-thread-renderer`` elements get loaded.

    Args:
        url: Address of the video page.
        fetch_comment_num: Scrolling stops once strictly more than this many
            comment threads are loaded.

    Returns:
        A list with one dict per loaded comment thread, holding the page
        ``url``, the page ``title`` and the ``comment`` text. All loaded
        threads are returned, so the list can be longer than
        ``fetch_comment_num``.
    """
    bor.get(url)
    time.sleep(random.randint(5, 8))
    print(f'scrolling ....{url}', end='  ')

    height = 362        # first scroll offset -- presumably brings the comment section into view; TODO confirm
    scroll_step = 1500  # pixels added to the scroll offset on every pass
    last_num = -100     # sentinel: the first pass is never counted as a stall
    zero_delta_cnt = 0  # passes (cumulative) on which no new comment appeared

    # Scroll the page to load more comments.
    while True:
        bor.execute_script(f"window.scrollTo(0, {height});")
        # The step is kept in its own variable: it must not be overwritten by
        # the new-comment count, otherwise the page stops scrolling.
        height += scroll_step
        time.sleep(random.randint(3, 5))

        # Find all comments on the page
        comments = bor.find_elements(by='tag name', value='ytd-comment-thread-renderer')
        cnt_comments = len(comments)

        # If no new comment was loaded on this pass, count it as a stall.
        if cnt_comments == last_num:
            zero_delta_cnt += 1
        print(cnt_comments, end=' ')

        # Stop when enough comments are loaded, or after more than 3 stalls.
        if cnt_comments > fetch_comment_num or zero_delta_cnt > 3:
            break

        # Update last_num to the current number of comments
        last_num = cnt_comments

    print('')

    # Read the page title once instead of once per comment.
    page_title = bor.title

    # Extract the text of each loaded comment.
    return [
        dict(
            url=url,
            title=page_title,
            comment=one_comment.find_element(by='id', value='content-text').text,
        )
        for one_comment in comments
    ]

In [None]:
# Run the video search, then scrape the comments of every video it returned.
search_items = search(key_word)
print("===================")

comments_data = []
for one_item in search_items:
    comments_data += get_comments(one_item['link'])

# Turn the comments and the video entries into pandas DataFrames,
# and save each of them to its own .csv file.
df_item = pd.DataFrame(search_items)
df = pd.DataFrame(comments_data)

df.to_csv("comments_Youtbe.csv")
df_item.to_csv(f"{key_word}_new.csv")