In [1]:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time

In [10]:
# Insert link for video topic to be searched (make sure to filter type with video)
url = 'https://www.youtube.com/results?search_query=costco+rotisserie+chicken'

In [11]:
driver = webdriver.Chrome()
driver.get(url)

# Need to wait for page to fully load
#time.sleep(10)

#check your browser to see if the correct page loaded

In [12]:
# Scroll to bottom of page

#The function takes a single argument, "driver", which is an instance of the WebDriver class. 
#This argument is used to interact with the web page and execute JavaScript code.

#The function uses a while loop to continuously scroll the web page until the bottom is reached. 
#The loop continues until the new scroll position is equal to the old scroll position.
#This indicates that the page has reached the bottom.

#In each iteration of the loop, the function first gets the current scroll position of the page using JavaScript code 
#executed through the WebDriver's "execute_script()" method. The script checks if the 
#"window.pageYOffset" property is defined, which gives the current vertical scroll
#position of the page, and returns that value. If "window.pageYOffset" is undefined, 
#the script checks if "document.documentElement" or "document.body.parentNode" or 
#"document.body" has a scroll position and returns that value instead.

#The function then sleeps for 5 seconds using the "time.sleep()" method, 
#to allow time for the page to load and the user to see the scrolling.

#Next, the function uses the WebDriver's "execute_script()" method to scroll the page down to the bottom. 
#The script sets the "scrollTop" property of the "scrollingElement" to the "scrollHeight". 
#The "scrollingElement" can be either "document.scrollingElement" or "document.body". 
#Setting "scrollTop" to "scrollHeight" scrolls the page to the bottom.

#The function then sleeps for another 5 seconds to allow time for the page to load and for the user to see the scrolling.

#Finally, the function gets the new scroll position of the page using JavaScript code similar to the one used earlier, 
#and assigns it to the "new_position" variable.
def scroll_to_bottom(driver):
    old_position = 0
    new_position = None
    
    while (new_position != old_position):
        # Get old scroll position
        old_position = driver.execute_script(("return (window.pageYOffset !== undefined) ?"
                                            " window.pageYOffset : (document.documentElement ||"
                                            " document.body.parentNode || document.body);"))
        # Sleep and scroll
        time.sleep(5)
        driver.execute_script(("var scrollingElement = (document.scrollingElement ||"
                              " document.body);scrollingElement.scrollTop = "
                              " scrollingElement.scrollHeight"))
        
        time.sleep(5)
        # Get new position
        new_position = driver.execute_script(("return (window.pageYOffset !== undefined) ?"
                                             " window.pageYOffset : (document.documentElement ||"
                                             " document.body.parentNode || documentBody);"))

In [13]:
scroll_to_bottom(driver)

#this will start to scroll to the bottom of the page of the search results on its own

In [14]:
user_data = driver.find_elements(by=By.XPATH,value='//*[@id="video-title"]')

In [15]:
print(len(user_data))

55


In [17]:
#The first line initializes an empty list called "links" that will store the extracted links.

#The second line starts a for loop that iterates through each element in the "user_data" list.

#The third line checks whether the current element has an "href" attribute, which is commonly used in HTML to indicate a hyperlink. If the "href" attribute is not None, meaning it exists, then the code extracts the value of the "href" attribute using the "get_attribute()" method and appends it to the "links" list using the "append()" method.

#The fourth line prints the length of the "links" list, which is the total number of links found in the "user_data" list.

links = []
for i in user_data:
    if (i.get_attribute('href') != None):
        links.append(i.get_attribute('href'))

print(len(links))

41


In [18]:
df = pd.DataFrame(columns = ['link', 'title', 'description', 'category'])

In [23]:
#The first line initializes a string variable "v_category" to store the video category.
v_category = "costco_chicken"

#The second line initializes a WebDriverWait object called "wait" with a timeout of 50 seconds. 
#This object will be used to wait for elements to appear on the web page before interacting with them.

wait = WebDriverWait(driver, 50)

##The code then starts a for loop that iterates through each link in the "links" list.
#For each link, the code navigates the WebDriver to that link using the "driver.get()" method.
#The code then initializes 3 variables, "v_id", "v_title", and "v_description", to store the video ID, title, and description.
#To extract the video title, the code uses the "wait.until()" method to wait for the presence of an element with the CSS selector "h1.style-scope.ytd-watch-metadata yt-formatted-string". 
#Once the element is present, the "text" property is extracted and stored in the "v_title" variable.

#To extract the video description, the code uses the "wait.until()" method again to wait for the presence of an element with the CSS selector "div#snippet yt-formatted-string". 
#Once the element is present, the "text" property is extracted and stored in the "v_description" variable.
   
for x in links:
    driver.get(x)
    v_id = x
    v_title = wait.until(EC.presence_of_element_located(
                   (By.CSS_SELECTOR,"h1.style-scope.ytd-watch-metadata yt-formatted-string"))).text

v_description =  wait.until(EC.presence_of_element_located(
                                (By.CSS_SELECTOR,"div#snippet yt-formatted-string"))).text

#Finally, the video data is added to a pandas DataFrame called "df" using the "df.loc[]" method. The data is added as a new row, with the video ID, title, description, and category in separate columns.
df.loc[len(df)] = [v_id, v_title, v_description, v_category]

In [24]:
driver.quit()

In [25]:
print(df)

                                                link  \
0  https://www.youtube.com/watch?v=IkWt-o4FcyE&pp...   

                                               title description  \
0  Who Has The Best Rotisserie Chickens? Sam’s Cl...               

         category  
0  costco_chicken  


In [26]:
# Save the DataFrame as a CSV file
df.to_csv('comm170_youtube.csv', index=False)