In [1]:
import random
import jsonlines
import requests
from bs4 import BeautifulSoup

# JSON Lines file that save_data_to_jsonl() appends the scraped records to.
output_file = 'output.jsonl'
# Quizlet set pages that main() downloads and scrapes, in this order.
quizlet_urls = [
    'https://quizlet.com/748183543/ap-environmental-science-flash-cards/',
    'https://quizlet.com/204426778/ap-environmental-science-review-flash-cards/',
    # Add more Quizlet URLs as needed
]

def download_quizlet_page(url, filename, timeout=30):
    """Download ``url`` and save the response body to ``filename`` as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        filename: Path of the file to create or overwrite with the response text.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never saved and later parsed as if it were a set page.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Check the status before touching the file: an error page holds no cards.
    response.raise_for_status()
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(response.text)

def _card_side_text(side):
    """Return the stripped text of a card side, or None when its markup is missing."""
    # The text is read from two nested <span> tags; any level may be absent.
    outer = side.span if side is not None else None
    inner = outer.span if outer is not None else None
    return inner.text.strip() if inner is not None else None


def scrape_quizlet_page(html_filename):
    """Extract term/definition pairs from a saved Quizlet set page.

    Args:
        html_filename: Path of a UTF-8 HTML file to parse.

    Returns:
        A ``(terms, definitions)`` tuple of two equally long lists of strings;
        entries at the same index belong to the same card. Cards for which
        either side cannot be located are skipped so the lists stay aligned
        and one malformed card does not abort the whole page.
    """
    with open(html_filename, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    terms = []
    definitions = []

    side_attrs = {'data-testid': 'set-page-card-side'}

    for term_elem in soup.find_all('div', class_='SetPageTerms-term'):
        # The term is the first card side inside the card container.
        term = _card_side_text(term_elem.find('div', side_attrs, recursive=True))
        # The definition is the card side that also carries the definition class.
        definition = _card_side_text(
            term_elem.find('div', side_attrs, recursive=True, class_='SetPageTerm-definition')
        )

        if term is None or definition is None:
            continue

        terms.append(term)
        definitions.append(definition)

    return terms, definitions

def save_data_to_jsonl(terms, definitions):
    """Append one ``{'input': term, 'output': definition}`` record per pair.

    Records go to the module-level ``output_file``, opened in append mode.
    Pairing follows ``zip``, so surplus items of the longer sequence are ignored.
    """
    with jsonlines.open(output_file, mode='a') as sink:
        for question, answer in zip(terms, definitions):
            sink.write({'input': question, 'output': answer})

def main():
    """Download, parse and store every set listed in ``quizlet_urls``."""
    for page_url in quizlet_urls:
        # The path segment before the trailing '/' names the local HTML copy.
        slug = page_url.rsplit('/', 2)[-2]
        cached_html = slug + '.html'

        download_quizlet_page(page_url, cached_html)
        scraped_terms, scraped_definitions = scrape_quizlet_page(cached_html)
        save_data_to_jsonl(scraped_terms, scraped_definitions)

        print(f"Scraping completed for {page_url}. Data appended to {output_file}")

# Run the scraper when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Scraping completed for https://quizlet.com/748183543/ap-environmental-science-flash-cards/. Data appended to output.jsonl
Scraping completed for https://quizlet.com/204426778/ap-environmental-science-review-flash-cards/. Data appended to output.jsonl


In [12]:
import asyncio
import random
import jsonlines
from selenium.webdriver.common.by import By
from undetected_chromedriver.v2 import Chrome, ChromeOptions

# JSON Lines file that scrape_quizlet() appends the scraped records to.
output_file = 'output.jsonl'
# Quizlet set pages to scrape; main() schedules one scrape_quizlet() call per URL.
quizlet_urls = [
    'https://quizlet.com/748183543/ap-environmental-science-flash-cards/',
    'https://quizlet.com/204426778/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/904782900/ap-environmental-science-flash-cards/',
    'https://quizlet.com/507844728/ap-environmental-science-flash-cards/',
    'https://quizlet.com/565189431/unit-1-ap-environmental-science-flash-cards/',
    'https://quizlet.com/395328278/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/281899442/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/293345572/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/282329185/ap-environmental-science-flash-cards/',
    'https://quizlet.com/84674637/ap-environmental-science-32-flash-cards/',
    'https://quizlet.com/45223659/apes-ap-environmental-science-flash-cards/',
    'https://quizlet.com/59814533/ap-environmental-science-flash-cards/',
    'https://quizlet.com/691812124/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/4933824/ap-environmental-science-exam-review-flash-cards/',
    'https://quizlet.com/203209444/ap-environmental-science-environmental-laws-flash-cards/',
    'https://quizlet.com/204310948/ap-environmental-science-review-flash-cards/',
]



async def scrape_quizlet(url, driver):
    """Scrape one Quizlet set with a Selenium-style driver and append it to ``output_file``.

    The page is opened, the ``AssemblyButtonBase`` button is clicked until a
    lookup or click fails, and every ``SetPageTerms-term`` card is written as
    an ``{'input': term, 'output': definition}`` record.

    Args:
        url: Address of the Quizlet set page.
        driver: Browser driver exposing the synchronous Selenium WebDriver API
            (``get``, ``find_element``, ``find_elements``).

    Note:
        The function navigates ``driver`` and reads from it after long pauses,
        so calls that share one driver must be awaited one at a time.
    """
    # Selenium's WebDriver methods are synchronous and return plain values, not
    # awaitables; awaiting them raises TypeError. Only the pauses are awaited.
    driver.get(url)
    await asyncio.sleep(random.uniform(30, 70))

    while True:
        try:
            button = driver.find_element(By.CLASS_NAME, 'AssemblyButtonBase')
            button.click()
            await asyncio.sleep(random.uniform(10, 20))
            print("Clicked on Load More button")
        except Exception as e:
            # Any lookup or click failure is taken to mean there is nothing left to load.
            print(e)
            break

    terms = []
    definitions = []

    term_elements = driver.find_elements(By.CLASS_NAME, 'SetPageTerms-term')

    for term_elem in term_elements:
        # First card side holds the term, second card side holds the definition.
        term_x = term_elem.find_element(By.XPATH, ".//div[@data-testid='set-page-card-side'][1]/div/span/span")
        term = term_x.text.strip()

        def_x = term_elem.find_element(By.XPATH, ".//div[@data-testid='set-page-card-side'][2]/div/span/span")
        definition = def_x.text.strip()

        terms.append(term)
        definitions.append(definition)

    with jsonlines.open(output_file, mode='a') as writer:
        for term, definition in zip(terms, definitions):
            data = {'input': term, 'output': definition}
            writer.write(data)

    print(f"Scraping completed for {url}. Data appended to {output_file}")
    # Long randomized pause after the page has been processed.
    await asyncio.sleep(random.uniform(200, 800))

async def main():
    """Scrape every URL in ``quizlet_urls`` with a single headless Chrome instance.

    The pages are processed one after another because they all share the same
    browser; the browser is closed even when a page fails.
    """
    chrome_options = ChromeOptions()
    chrome_options.add_argument("--headless")
    driver = Chrome(options=chrome_options)

    try:
        # Running the coroutines concurrently (asyncio.gather) would let each
        # one navigate the shared window while the others are still waiting to
        # read their own page, so they are awaited sequentially instead.
        for url in quizlet_urls:
            await scrape_quizlet(url, driver)
    finally:
        driver.quit()

# Script entry point: drive the async main() to completion.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running; if this cell is executed inside a Jupyter kernel, `await main()` is
# presumably needed instead -- confirm for the target environment.
if __name__ == "__main__":
    asyncio.run(main())


ModuleNotFoundError: No module named 'undetected_chromedriver.v2'

In [10]:
import nodriver as uc
import time
import random
import jsonlines

# JSON Lines file that main() appends the scraped records to.
output_file = 'output.jsonl'

async def main():
    """Scrape every set in ``quizlet_urls`` with nodriver and append it to ``output_file``.

    For each URL the page is opened, the ``AssemblyButtonBase`` button is
    clicked until none is found (or a click fails), and each
    ``SetPageTerms-term`` card is stored as an
    ``{'input': term, 'output': definition}`` record. Cards without two
    readable sides are skipped.

    ``quizlet_urls`` is not defined in this cell; it must already exist in the
    notebook namespace when the cell runs.
    """
    # Local import: this cell's import block does not provide asyncio.
    import asyncio

    card_side_css = "div[data-testid='set-page-card-side']"
    card_text_css = "div > span > span"

    driver = await uc.start()
    try:
        for url in quizlet_urls:
            # Browser.get() resolves to the tab that loaded the page; nodriver
            # element lookups are coroutines on that tab, not Selenium-style
            # find_element(By...) calls on the browser object.
            tab = await driver.get(url)
            # A blocking time.sleep() would stall the event loop nodriver uses
            # to talk to the browser, so the pauses are awaited.
            await asyncio.sleep(random.uniform(30, 70))

            while True:
                try:
                    button = await tab.query_selector('.AssemblyButtonBase')
                    if not button:
                        break
                    await button.click()
                    await asyncio.sleep(random.uniform(10, 20))
                    print("Clicked on Load More button")
                except Exception as e:
                    print(e)
                    break

            terms = []
            definitions = []

            for term_elem in await tab.query_selector_all('.SetPageTerms-term'):
                sides = await term_elem.query_selector_all(card_side_css)
                if len(sides) < 2:
                    continue

                # First side holds the term, second side holds the definition.
                term_x = await sides[0].query_selector(card_text_css)
                def_x = await sides[1].query_selector(card_text_css)
                if not term_x or not def_x:
                    continue

                terms.append(term_x.text_all.strip())
                definitions.append(def_x.text_all.strip())

            with jsonlines.open(output_file, mode='a') as writer:
                for term, definition in zip(terms, definitions):
                    data = {'input': term, 'output': definition}
                    writer.write(data)

            print(f"Scraping completed for {url}. Data appended to {output_file}")
            await asyncio.sleep(random.uniform(200, 800))
    finally:
        # Shut the browser down even when a page fails.
        driver.stop()


# Top-level await: presumably relies on the notebook kernel (IPython) allowing
# `await` at cell level; plain Python scripts do not accept this syntax.
await main()

NotImplementedError: 

In [16]:
import time
import random
import jsonlines
import undetected_chromedriver as uc


def click_button_if_exists(driver):
    """Click the first element with class ``AssemblyButtonBase``, if the page has one.

    Args:
        driver: Selenium-compatible WebDriver positioned on the page to inspect.

    A missing button is reported on stdout and otherwise ignored; other
    WebDriver errors (for example from the click itself) propagate.
    """
    # Imported here because this cell's import block provides neither name.
    # Without them the lookup and the ``except`` clause raise NameError
    # instead of skipping a missing button.
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    try:
        button = driver.find_element(By.CLASS_NAME, 'AssemblyButtonBase')
        button.click()
        print("Button clicked successfully.")
    except NoSuchElementException:
        print("Button not found. Skipping click.")

def scrape_quizlet(quizlet_url, output_file):
    """Scrape one Quizlet set in a fresh Chrome session and append it to ``output_file``.

    Args:
        quizlet_url: Address of the Quizlet set page.
        output_file: Path of the JSON Lines file; records are appended as
            ``{'input': term, 'output': definition}``.

    Any exception raised while loading or parsing the page is reported on
    stdout and swallowed, so a failing page does not stop a multi-page run.
    Nothing is written for a page that fails. The browser is always closed.
    """
    # Imported here because this cell's import block does not provide ``By``;
    # without it every lookup raised NameError and each page ended in the
    # error branch below.
    from selenium.webdriver.common.by import By

    driver = uc.Chrome()

    try:
        driver.get(quizlet_url)
        time.sleep(random.uniform(3, 7))  # Random delay

        # Click the button if it exists
        click_button_if_exists(driver)
        time.sleep(random.uniform(30, 70))  # Random delay

        terms = []
        definitions = []

        term_elements = driver.find_elements(By.CLASS_NAME, 'SetPageTerms-term')

        for term_elem in term_elements:
            # First card side holds the term, second card side holds the definition.
            term_x = term_elem.find_element(By.XPATH, ".//div[@data-testid='set-page-card-side'][1]/div/span/span")
            term = term_x.text.strip()

            def_x = term_elem.find_element(By.XPATH, ".//div[@data-testid='set-page-card-side'][2]/div/span/span")
            definition = def_x.text.strip()

            terms.append(term)
            definitions.append(definition)

        with jsonlines.open(output_file, mode='a') as writer:
            for term, definition in zip(terms, definitions):
                data = {'input': term, 'output': definition}
                writer.write(data)

        print(f"Scraping completed for {quizlet_url}. Data appended to {output_file}")

    except Exception as e:
        print(f"Error occurred during scraping for {quizlet_url}: {str(e)}")

    finally:
        driver.quit()

def scrape_multiple_quizlets(quizlet_urls, output_file):
    """Scrape each URL in order, pausing 300-600 seconds between consecutive pages.

    Args:
        quizlet_urls: Iterable of Quizlet set page addresses.
        output_file: Path handed unchanged to ``scrape_quizlet`` for every page.
    """
    for position, page_url in enumerate(quizlet_urls):
        is_first_page = position == 0
        if not is_first_page:
            # Randomized pause before every page except the first one.
            pause_seconds = random.randint(300, 600)
            print(f"Pausing scraping for {pause_seconds} seconds before next page...")
            time.sleep(pause_seconds)

        scrape_quizlet(page_url, output_file)

# Quizlet set pages to scrape, in the order they are visited.
quizlet_urls = [
    'https://quizlet.com/748183543/ap-environmental-science-flash-cards/',
    'https://quizlet.com/204426778/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/904782900/ap-environmental-science-flash-cards/',
    'https://quizlet.com/507844728/ap-environmental-science-flash-cards/',
    'https://quizlet.com/565189431/unit-1-ap-environmental-science-flash-cards/',
    'https://quizlet.com/395328278/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/281899442/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/293345572/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/282329185/ap-environmental-science-flash-cards/',
    'https://quizlet.com/84674637/ap-environmental-science-32-flash-cards/',
    'https://quizlet.com/45223659/apes-ap-environmental-science-flash-cards/',
    'https://quizlet.com/59814533/ap-environmental-science-flash-cards/',
    'https://quizlet.com/691812124/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/4933824/ap-environmental-science-exam-review-flash-cards/',
    'https://quizlet.com/203209444/ap-environmental-science-environmental-laws-flash-cards/',
    'https://quizlet.com/204310948/ap-environmental-science-review-flash-cards/',
]


# Destination JSON Lines file; scrape_quizlet() opens it in append mode.
output_file = 'APES_dataset.jsonl'
# Start the run: one browser session per URL, with a 300-600 s pause between pages.
scrape_multiple_quizlets(quizlet_urls, output_file)


ModuleNotFoundError: No module named 'distutils'

In [1]:
import time
import random
import jsonlines
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException



def click_button_if_exists(driver):
    """Click the first ``AssemblyButtonBase`` element on the current page, if any.

    Prints whether the click happened. Only a missing element is tolerated;
    every other WebDriver error propagates to the caller.
    """
    try:
        driver.find_element(By.CLASS_NAME, 'AssemblyButtonBase').click()
    except NoSuchElementException:
        print("Button not found. Skipping click.")
    else:
        print("Button clicked successfully.")

def scrape_quizlet(quizlet_url, output_file):
    """Scrape one Quizlet set in a fresh Chrome session and append it to ``output_file``.

    Args:
        quizlet_url: Address of the Quizlet set page.
        output_file: Path of the JSON Lines file; one
            ``{'input': term, 'output': definition}`` record is appended per card.

    Errors raised while loading or parsing are printed and swallowed; in that
    case nothing is written for the page. The browser is always closed.
    """
    browser = webdriver.Chrome(options=Options())

    try:
        browser.get(quizlet_url)
        time.sleep(random.uniform(3, 7))  # short randomized wait after navigation

        # Click the button if it exists, then wait again.
        click_button_if_exists(browser)
        time.sleep(random.uniform(30, 70))

        term_xpath = ".//div[@data-testid='set-page-card-side'][1]/div/span/span"
        definition_xpath = ".//div[@data-testid='set-page-card-side'][2]/div/span/span"

        records = []
        for card in browser.find_elements(By.CLASS_NAME, 'SetPageTerms-term'):
            front = card.find_element(By.XPATH, term_xpath).text.strip()
            back = card.find_element(By.XPATH, definition_xpath).text.strip()
            records.append({'input': front, 'output': back})

        # Written only after every card was read, so a failing page adds nothing.
        with jsonlines.open(output_file, mode='a') as sink:
            for record in records:
                sink.write(record)

        print(f"Scraping completed for {quizlet_url}. Data appended to {output_file}")

    except Exception as e:
        print(f"Error occurred during scraping for {quizlet_url}: {str(e)}")

    finally:
        browser.quit()

def scrape_multiple_quizlets(quizlet_urls, output_file):
    """Run ``scrape_quizlet`` for every URL, sleeping 300-600 seconds between pages.

    Args:
        quizlet_urls: Iterable of Quizlet set page addresses, visited in order.
        output_file: Path forwarded to ``scrape_quizlet`` for each page.
    """
    for index, set_url in enumerate(quizlet_urls):
        if index != 0:
            # No wait before the first page; a randomized one before each later page.
            wait_seconds = random.randint(300, 600)
            print(f"Pausing scraping for {wait_seconds} seconds before next page...")
            time.sleep(wait_seconds)

        scrape_quizlet(set_url, output_file)

# Quizlet set pages to scrape, in the order they are visited.
quizlet_urls = [
    'https://quizlet.com/748183543/ap-environmental-science-flash-cards/',
    'https://quizlet.com/204426778/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/904782900/ap-environmental-science-flash-cards/',
    'https://quizlet.com/507844728/ap-environmental-science-flash-cards/',
    'https://quizlet.com/565189431/unit-1-ap-environmental-science-flash-cards/',
    'https://quizlet.com/395328278/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/281899442/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/293345572/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/282329185/ap-environmental-science-flash-cards/',
    'https://quizlet.com/84674637/ap-environmental-science-32-flash-cards/',
    'https://quizlet.com/45223659/apes-ap-environmental-science-flash-cards/',
    'https://quizlet.com/59814533/ap-environmental-science-flash-cards/',
    'https://quizlet.com/691812124/ap-environmental-science-review-flash-cards/',
    'https://quizlet.com/4933824/ap-environmental-science-exam-review-flash-cards/',
    'https://quizlet.com/203209444/ap-environmental-science-environmental-laws-flash-cards/',
    'https://quizlet.com/204310948/ap-environmental-science-review-flash-cards/',
]


# Destination JSON Lines file; scrape_quizlet() opens it in append mode.
output_file = 'APES_dataset.jsonl'
# Start the run: one browser session per URL, with a 300-600 s pause between pages.
scrape_multiple_quizlets(quizlet_urls, output_file)


Button clicked successfully.


KeyboardInterrupt: 