In [2]:
import time
import random
import jsonlines
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

def click_button_if_exists(driver):
    try:
        button = driver.find_element(By.CLASS_NAME, 'AssemblyButtonBase')
        button.click()
        print("Button clicked successfully.")
    except NoSuchElementException:
        print("Button not found. Skipping click.")

def scrape_quizlet(quizlet_url, output_file):
    chrome_options = Options()
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")

    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(quizlet_url)
        time.sleep(random.uniform(3, 7))  # Random delay 
        
        # Click the button if it exists
        click_button_if_exists(driver)
        
        terms = []
        definitions = []
        
        term_elements = driver.find_elements(By.CLASS_NAME, 'SetPageTerms-term')

        for term_elem in term_elements:
            term_x = term_elem.find_element(By.XPATH, ".//div[@data-testid='set-page-card-side'][1]/div/span/span")
            term = term_x.text.strip()

            def_x = term_elem.find_element(By.XPATH, ".//div[@data-testid='set-page-card-side'][2]/div/span/span")
            definition = def_x.text.strip()

            terms.append(term)
            definitions.append(definition)
        
        with jsonlines.open(output_file, mode='a') as writer:
            for term, definition in zip(terms, definitions):
                data = {'input': term, 'output': definition}
                writer.write(data)
        
        print(f"Scraping completed for {quizlet_url}. Data appended to {output_file}")
    
    except Exception as e:
        print(f"Error occurred during scraping for {quizlet_url}: {str(e)}")
    
    finally:
        driver.quit()

def scrape_multiple_quizlets(quizlet_urls, output_file):
    for i, url in enumerate(quizlet_urls):
        if i > 0:
            delay_time = random.randint(300, 600) 
            print(f"Pausing scraping for {delay_time} seconds before next page...")
            time.sleep(delay_time)
        
        scrape_quizlet(url, output_file)

quizlet_urls = [
    'https://quizlet.com/748183543/ap-environmental-science-flash-cards/',
    'https://quizlet.com/204426778/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/904782900/ap-environmental-science-flash-cards/',
    'https://quizlet.com/507844728/ap-environmental-science-flash-cards/',
    'https://quizlet.com/565189431/unit-1-ap-environmental-science-flash-cards/',
    'https://quizlet.com/395328278/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/281899442/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/293345572/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/282329185/ap-environmental-science-flash-cards/',
    'https://quizlet.com/84674637/ap-environmental-science-32-flash-cards/',
    'https://quizlet.com/45223659/apes-ap-environmental-science-flash-cards/',
    'https://quizlet.com/59814533/ap-environmental-science-flash-cards/',
    'https://quizlet.com/691812124/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/4933824/ap-environmental-science-exam-review-flash-cards/',
    'https://quizlet.com/203209444/ap-environmental-science-environmental-laws-flash-cards/',
    'https://quizlet.com/204310948/ap-environmental-science-review-flash-cards/',
]


output_file = 'APES_dataset.jsonl'
scrape_multiple_quizlets(quizlet_urls, output_file)


Scraping completed for https://quizlet.com/748183543/ap-environmental-science-flash-cards/. Data appended to APES_dataset.jsonl
Pausing scraping for 600 seconds before next page...
Error occurred during scraping for https://quizlet.com/204426778/ap-environmental-science-review-flash-cards/: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//div[@data-testid='set-page-card-side'][2]/div/span/span"}
  (Session info: chrome=124.0.6367.63); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6E02A1502+60802]
	(No symbol) [0x00007FF6E021AC02]
	(No symbol) [0x00007FF6E00D7CE4]
	(No symbol) [0x00007FF6E0126D4D]
	(No symbol) [0x00007FF6E0126E1C]
	(No symbol) [0x00007FF6E011A8FC]
	(No symbol) [0x00007FF6E014ABBF]
	(No symbol) [0x00007FF6E011A7C6]
	(No symbol) [0x00007FF6E014AD90]
	(No symbol) [0x00007FF6E016A224]
	(No symbol) [0x

In [8]:
import time
import jsonlines
from selenium import webdriver
from selenium.webdriver.common.by import By

def scrape_quizlet(quizlet_url, output_file):
    driver = webdriver.Chrome()

    try:
        driver.get(quizlet_url)
        time.sleep(5)
        
        terms = []
        definitions = []
        
        term_elements = driver.find_elements(By.CLASS_NAME, 'SetPageTerms-term')

        for term_elem in term_elements:
            # Extract term
            term_x = term_elem.find_element(By.XPATH, ".//div[@data-testid='set-page-card-side'][1]/div/span/span")
            term = term_x.text.strip()

            # Extract definition
            def_x = term_elem.find_element(By.XPATH, ".//div[@data-testid='set-page-card-side'][2]/div/span/span")
            definition = def_x.text.strip()

            terms.append(term)
            definitions.append(definition)
        
        with jsonlines.open(output_file, mode='a') as writer:
            for term, definition in zip(terms, definitions):
                data = {'input': term, 'output': definition}
                writer.write(data)
        
        print(f"Scraping completed for {quizlet_url}. Data appended to {output_file}")
    
    except Exception as e:
        print(f"Error occurred during scraping for {quizlet_url}: {str(e)}")
    
    finally:
        driver.quit()

def scrape_multiple_quizlets(quizlet_urls, output_file):
    for url in quizlet_urls:
        scrape_quizlet(url, output_file)

# Example usage
quizlet_urls = [
    'https://quizlet.com/748183543/ap-environmental-science-flash-cards/',
    'https://quizlet.com/204426778/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/904782900/ap-environmental-science-flash-cards/',
    'https://quizlet.com/507844728/ap-environmental-science-flash-cards/',
    'https://quizlet.com/565189431/unit-1-ap-environmental-science-flash-cards/',
    'https://quizlet.com/395328278/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/281899442/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/293345572/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/282329185/ap-environmental-science-flash-cards/',
    'https://quizlet.com/84674637/ap-environmental-science-32-flash-cards/',
    'https://quizlet.com/45223659/apes-ap-environmental-science-flash-cards/',
    'https://quizlet.com/59814533/ap-environmental-science-flash-cards/',
    'https://quizlet.com/691812124/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/4933824/ap-environmental-science-exam-review-flash-cards/',
    'https://quizlet.com/203209444/ap-environmental-science-environmental-laws-flash-cards/',
    'https://quizlet.com/204310948/ap-environmental-science-review-flash-cards/',
]

output_file = 'APES_dataset.jsonl'
scrape_multiple_quizlets(quizlet_urls, output_file)


Scraping completed for https://quizlet.com/748183543/ap-environmental-science-flash-cards/. Data appended to APES_dataset.jsonl
Scraping completed for https://quizlet.com/204426778/ap-environmental-science-review-flash-cards/. Data appended to APES_dataset.jsonl
Scraping completed for https://quizlet.com/904782900/ap-environmental-science-flash-cards/. Data appended to APES_dataset.jsonl


Exception ignored in: <function Service.__del__ at 0x0000011ED8D73920>
Traceback (most recent call last):
  File "c:\Users\shrey\AppData\Local\Programs\Python\Python312\Lib\site-packages\selenium\webdriver\common\service.py", line 189, in __del__
    self.stop()
  File "c:\Users\shrey\AppData\Local\Programs\Python\Python312\Lib\site-packages\selenium\webdriver\common\service.py", line 146, in stop
    self.send_remote_shutdown_command()
  File "c:\Users\shrey\AppData\Local\Programs\Python\Python312\Lib\site-packages\selenium\webdriver\common\service.py", line 126, in send_remote_shutdown_command
    request.urlopen(f"{self.service_url}/shutdown")
  File "c:\Users\shrey\AppData\Local\Programs\Python\Python312\Lib\urllib\request.py", line 215, in urlopen
    return opener.open(url, data, timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\shrey\AppData\Local\Programs\Python\Python312\Lib\urllib\request.py", line 515, in open
    response = self._open(req, data)
         

Scraping completed for https://quizlet.com/507844728/ap-environmental-science-flash-cards/. Data appended to APES_dataset.jsonl


In [1]:
import time
import jsonlines
from selenium import webdriver
from selenium.webdriver.common.by import By

def scrape_quizlet(quizlet_url, output_file):
    driver = webdriver.Chrome()

    try:
        driver.get(quizlet_url)
        time.sleep(5)
        
        terms = []
        definitions = []
        
        term_elements = driver.find_elements(By.CLASS_NAME, 'SetPageTerms-term')

        for term_elem in term_elements:
            # Extract term
            term_x = term_elem.find_element(By.XPATH, ".//div[@data-testid='set-page-card-side'][1]/div/span/span")
            term = term_x.text.strip()

            # Extract definition
            def_x = term_elem.find_element(By.XPATH, ".//div[@data-testid='set-page-card-side'][2]/div/span/span")
            definition = def_x.text.strip()

            terms.append(term)
            definitions.append(definition)
        
        with jsonlines.open(output_file, mode='a') as writer:
            for term, definition in zip(terms, definitions):
                data = {'input': term, 'output': definition}
                writer.write(data)
        
        print(f"Scraping completed. Data saved to {output_file}")
    
    except Exception as e:
        print(f"Error occurred during scraping: {str(e)}")
    
    finally:
        driver.quit()

quizlet_url = 'https://quizlet.com/847731671/ap-music-theory-flash-cards/'
output_file = 'quizlet_data.jsonl'
scrape_quizlet(quizlet_url, output_file)


KeyboardInterrupt: 