In [4]:
pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.1 webdriver-manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install --upgrade selenium

Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading trio-0.27.0-py3-none-any.whl (481 kB)
Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Installing collected packages: trio, trio-websocket, selenium
  Attempting uninstall: selenium
    Found existing installation: selenium 3.141.0
    Uninstalling selenium-3.141.0:
      Successfully uninstalled selenium-3.141.0
Successfully installed selenium-4.25.0 trio-0.27.0 trio-websocket-0.11.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import os

# List of country codes and sections to scrape
country_codes = [
    'en-us', 'en-in', 'en-gb', 'fr-fr', 'de-de', 'ja-jp', 'ko-kr', 'vi-vn',
    'in-id', 'it-it', 'es-es', 'ru-ru', 'en-au', 'en-my', 'en-sg', 'en-ie',
    'pt-br', 'pt-pt', 'es-latam', 'zh-tw', 'nl-nl', 'th-th', 'fil-ph', 
    'tr-tr', 'en-eu', 'en-ca', 'en-africa', 'km-kh', 'my-mm', 'ar-mena', 
    'sv-se', 'pl-pl', 'he-il', 'ru-kz', 'ur-pk', 'bn-bd', 'ro-ro', 'nb-no', 
    'az-az'
]

sections = ['news', 'product', 'safety', 'community']

# Function to scrape articles with Selenium headless mode and scroll
def scrape_articles_with_scroll(country_code, section):
    try:
        # Set Chrome options to run in headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run headless
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Initialize Selenium WebDriver
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

        # Open the TikTok Newsroom page for the given section and country
        url = f"https://newsroom.tiktok.com/{country_code}/{section}"
        driver.get(url)

        # Scroll the page to load dynamic content
        last_height = driver.execute_script("return document.body.scrollHeight")
        
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # Adjust based on page load speed
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")
        articles = set()

        # Locate and extract article links
        article_section = soup.find_all("a", class_="jsx-3645149265")
        for article in article_section:
            try:
                link = f"https://newsroom.tiktok.com{article['href']}"
                articles.add(link)
            except Exception as e:
                print(f"Error parsing article: {e}")

        driver.quit()
        return list(articles)
    
    except Exception as e:
        print(f"Error scraping {country_code} - {section}: {e}")
        return []


# Function to scrape all countries and sections concurrently
def scrape_all_countries_sections():
    all_links = set()

    # Using ThreadPoolExecutor for concurrent execution
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_country_section = {
            executor.submit(scrape_articles_with_scroll, country_code, section): (country_code, section)
            for country_code in country_codes
            for section in sections
        }

        # Process each future as it completes
        for future in as_completed(future_to_country_section):
            country_code, section = future_to_country_section[future]
            try:
                articles = future.result()
                all_links.update(articles)
            except Exception as e:
                print(f"Error in {country_code} - {section}: {e}")

    return all_links

# Function to fetch HTML content from a list of links
def fetch_html_from_links(links):
    html_folder = "scraped_html"
    os.makedirs(html_folder, exist_ok=True)  # Create folder to store HTML files if not exists
    
    for link in links:
        try:
            response = requests.get(link)
            if response.status_code == 200:
                # Save the HTML content to a file
                filename = os.path.join(html_folder, link.split('/')[-1] + ".html")
                with open(filename, "w", encoding="utf-8") as file:
                    file.write(response.text)
                print(f"Saved HTML from {link} to {filename}")
            else:
                print(f"Failed to retrieve {link} (Status Code: {response.status_code})")
        except Exception as e:
            print(f"Error fetching HTML from {link}: {e}")

# Main function to scrape links and fetch HTML
def main():
    # Scrape the links from all country sections
    scraped_links = scrape_all_countries_sections()

    # Fetch HTML from the scraped links
    fetch_html_from_links(scraped_links)

# Call the main function
if __name__ == "__main__":
    main()


Error scraping de-de - safety: Message: javascript error: Cannot read properties of null (reading 'scrollHeight')
  (Session info: chrome=130.0.6723.59)
Stacktrace:
0   chromedriver                        0x0000000102e87634 cxxbridge1$str$ptr + 3645404
1   chromedriver                        0x0000000102e7fe94 cxxbridge1$str$ptr + 3614780
2   chromedriver                        0x00000001028ec104 cxxbridge1$string$len + 88416
3   chromedriver                        0x00000001028f119c cxxbridge1$string$len + 109048
4   chromedriver                        0x00000001028f2b0c cxxbridge1$string$len + 115560
5   chromedriver                        0x0000000102968808 cxxbridge1$string$len + 598116
6   chromedriver                        0x0000000102967bd0 cxxbridge1$string$len + 594988
7   chromedriver                        0x0000000102922f54 cxxbridge1$string$len + 313264
8   chromedriver                        0x0000000102923ba4 cxxbridge1$string$len + 316416
9   chromedriver              

In [1]:
import os
from bs4 import BeautifulSoup

# Folder where HTML files are stored
html_folder = "scraped_html"
# Folder where you want to save the resulting text files
output_folder = "extracted_text"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Function to extract content from the specific class and save to text file
def process_html_file(file_path, output_path):
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Find the div with the class "jsx-4151113370 jsx-3313709575 post-content post-full-content"
    post_content_div = soup.find("div", class_="jsx-4151113370 jsx-3313709575 post-content post-full-content")

    if post_content_div:
        # Prepare an empty string to store the extracted text
        extracted_text = ""

        # Loop through each paragraph (<p> tag) to maintain structure
        paragraphs = post_content_div.find_all('p')
        for paragraph in paragraphs:
            # Extract the text without adding spaces after punctuation
            paragraph_text = ''.join(paragraph.stripped_strings)
            extracted_text += paragraph_text + "\n\n"  # Add double newline between paragraphs

        # Save the extracted content to a text file
        with open(output_path, "w", encoding="utf-8") as text_file:
            text_file.write(extracted_text)
        print(f"Saved extracted text to {output_path}")
    else:
        print(f"Div not found in {file_path}")

# Loop through all HTML files in the scraped_html folder
for html_file in os.listdir(html_folder):
    if html_file.endswith(".html"):
        # Full path to the HTML file
        html_file_path = os.path.join(html_folder, html_file)

        # Output text file path
        output_file_name = os.path.splitext(html_file)[0] + ".txt"
        output_file_path = os.path.join(output_folder, output_file_name)

        # Process the HTML file and save the result
        process_html_file(html_file_path, output_file_path)


Saved extracted text to extracted_text/international-mother-language-day-v-tiktok.txt
Saved extracted text to extracted_text/adding-clarity-to-content-removals-au.txt
Saved extracted text to extracted_text/tiktok-launches-a-new-campaign-on-march-9-to-support-correct-understanding-of-new-coronavirus-infections-and-eliminate-discrimination-and-prejudice.txt
Saved extracted text to extracted_text/tiktok-wear.txt
Saved extracted text to extracted_text/tiktok-orizuru.txt
Saved extracted text to extracted_text/crediting-tools-tw.txt
Saved extracted text to extracted_text/helping-creators-understand-our-rules-with-refreshed-community-guidelines-cambodia.txt
Saved extracted text to extracted_text/tiktok-change-makers-in-southeast-asia.txt
Saved extracted text to extracted_text/new-prompts-to-help-people-consider-before-they-share-my.txt
Saved extracted text to extracted_text/un-creator-al-giorno-weedy.txt
Saved extracted text to extracted_text/israel-celebrating-earth-day.txt
Saved extracted t

In [None]:
df.head()