In [5]:
import os
import ast
import pandas as pd
import requests
from urllib.parse import urlparse
from pathlib import Path
from tqdm import tqdm
from twitter_scraper_selenium import scrape_profile
import ipywidgets as widgets
from IPython.display import display, clear_output
from threading import Thread

def download_file(url, save_path, chunk_size=8192, log_callback=None, timeout=30):
    """
    Downloads a file from the specified URL and saves it to the given path.

    The response body is streamed into a temporary ``<save_path>.part`` file
    which is renamed to ``save_path`` only after every chunk has been written,
    so an interrupted transfer never leaves a truncated file under the final
    name.

    Args:
        url (str): The URL of the file to download.
        save_path (str): The local path where the file will be saved.
        chunk_size (int, optional): Size of each chunk to read from the response. Defaults to 8192.
        log_callback (function, optional): Function to call for logging messages. Defaults to None.
        timeout (float or tuple, optional): Timeout in seconds passed to ``requests.get``.
            Defaults to 30.

    Returns:
        bool: True if the file was saved, False if the request failed.
    """
    temp_path = f"{save_path}.part"
    try:
        # Without a timeout a stalled server would block this call indefinitely.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            with open(temp_path, 'wb') as f, tqdm(
                desc=os.path.basename(save_path),
                # No Content-Length header: let tqdm show a plain counter.
                total=total_size or None,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # filter out keep-alive new chunks
                        size = f.write(chunk)
                        bar.update(size)
        # Publish the file under its final name only once it is complete.
        os.replace(temp_path, save_path)
    except requests.exceptions.RequestException as e:
        if log_callback:
            log_callback(f"Failed to download {url}: {e}")
        return False
    finally:
        # After a successful rename the temporary file no longer exists;
        # anything still here is a partial download and must not be kept.
        if os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass  # best-effort cleanup; the real error is already reported/raised
    if log_callback:
        log_callback(f"Successfully downloaded: {save_path}")
    return True

def sanitize_filename(filename):
    """
    Strips characters that are not safe to use in a file name.

    Alphanumeric characters, spaces, dots, underscores and hyphens are kept;
    everything else is dropped. Trailing whitespace is removed from the result.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    allowed_punctuation = {' ', '.', '_', '-'}

    def _is_allowed(ch):
        return ch.isalnum() or ch in allowed_punctuation

    cleaned = "".join(filter(_is_allowed, filename))
    return cleaned.rstrip()

def get_file_extension(url, default='jpg'):
    """
    Determines the file extension for a media URL.

    The extension is taken from the URL path when it is a recognised image or
    video extension. Otherwise the ``format`` query parameter is consulted
    (e.g. ``https://pbs.twimg.com/media/<id>?format=png&name=small``), and only
    if that is also missing or unrecognised is ``default`` used.

    Args:
        url (str): The URL of the file.
        default (str, optional): The default file extension if none is found,
            with or without a leading dot. Defaults to 'jpg'.

    Returns:
        str: The file extension, including the leading dot.
    """
    # Local import: the file's top-level imports only bring in urlparse.
    from urllib.parse import parse_qs

    known_extensions = {
        '.jpg', '.jpeg', '.png', '.gif',   # images
        '.mp4', '.mov', '.avi', '.wmv',    # videos
    }

    parsed = urlparse(url)
    ext = os.path.splitext(parsed.path)[1].lower()
    if ext in known_extensions:
        return ext

    # Some media URLs have no extension in the path and carry the type in
    # the query string instead.
    for fmt in parse_qs(parsed.query).get('format', []):
        candidate = f".{fmt.strip().lower().lstrip('.')}"
        if candidate in known_extensions:
            return candidate

    # Tolerate a default given as '.jpg' as well as 'jpg'.
    return f".{default.lstrip('.')}"

def scrape_tweets(twitter_username, output_format, browser, tweets_count, filename, directory, log_callback=None):
    """
    Scrapes a Twitter profile by delegating to ``twitter_scraper_selenium.scrape_profile``.

    A message is sent to ``log_callback`` (when provided) before the scrape
    starts and again after ``scrape_profile`` returns.

    Args:
        twitter_username (str): Twitter handle of the profile to scrape.
        output_format (str): Format to save the scraped data (e.g., 'csv').
        browser (str): Browser to use for scraping (e.g., 'firefox').
        tweets_count (int): Number of tweets to scrape.
        filename (str): Name of the output file without extension.
        directory (str): Directory where the output file will be saved.
        log_callback (function, optional): Function to call for logging messages. Defaults to None.
    """
    if log_callback:
        log_callback(f"Starting to scrape tweets from @{twitter_username}...")

    # All arguments are forwarded unchanged, by keyword.
    scrape_options = {
        'twitter_username': twitter_username,
        'output_format': output_format,
        'browser': browser,
        'tweets_count': tweets_count,
        'filename': filename,
        'directory': directory,
    }
    scrape_profile(**scrape_options)

    if not log_callback:
        return
    output_path = os.path.join(directory, filename + '.' + output_format)
    log_callback(f"Scraping completed. Data saved to {output_path}")

def _parse_media_urls(raw_value, label, tweet_id, log_callback=None):
    """Parses one CSV cell holding a stringified list of URLs; returns [] on any problem."""
    try:
        urls = ast.literal_eval(raw_value) if pd.notna(raw_value) else []
        if not isinstance(urls, list):
            raise ValueError(f"{label.capitalize()}s field is not a list")
    except Exception as e:
        # Best-effort: a malformed cell only costs this tweet's media of that kind.
        if log_callback:
            log_callback(f"Error parsing {label}s for tweet_id {tweet_id}: {e}")
        return []
    return urls


def _download_media_group(urls, tweet_dir, tweet_id, label, default_ext, log_callback=None):
    """Downloads every URL in ``urls`` into ``tweet_dir`` as ``<label>_<n><ext>``."""
    counter = 1
    for url in urls:
        # Skip empty entries and anything that is not a URL string.
        if not isinstance(url, str) or not url.strip():
            continue
        try:
            ext = get_file_extension(url, default=default_ext)
            name = f"{label}_{counter}{ext}"
            save_path = os.path.join(tweet_dir, name)
            if os.path.exists(save_path):
                if log_callback:
                    log_callback(f"File already exists: {save_path}. Skipping download.")
            else:
                if log_callback:
                    log_callback(f"Downloading {label}: {url} as {name}")
                download_file(url, save_path, log_callback=log_callback)
            # Only reached when no exception occurred, so a failed item
            # does not consume a sequence number.
            counter += 1
        except Exception as e:
            if log_callback:
                log_callback(f"Error downloading {label} {url} for tweet_id {tweet_id}: {e}")


def download_media(csv_file, output_base_dir, log_callback=None):
    """
    Processes the CSV file to download images and videos from tweets.

    The CSV is expected to have a ``tweet_id`` column and ``images`` /
    ``videos`` columns whose cells are Python-literal lists of URLs. Media for
    each tweet is stored in ``<output_base_dir>/<tweet_id>/`` as
    ``image_<n><ext>`` and ``video_<n><ext>``. Rows without a tweet id or
    without any media are skipped; files that already exist are not downloaded
    again.

    Args:
        csv_file (str): Path to the CSV file containing tweet data.
        output_base_dir (str): Base directory where media will be downloaded.
        log_callback (function, optional): Function to call for logging messages. Defaults to None.
    """
    # Create the base output directory if it doesn't exist
    Path(output_base_dir).mkdir(parents=True, exist_ok=True)
    if log_callback:
        log_callback(f"Reading CSV file: {csv_file}")

    try:
        df = pd.read_csv(csv_file, dtype=str)  # Read all data as strings to avoid issues
    except Exception as e:
        if log_callback:
            log_callback(f"Error reading CSV file: {e}")
        return

    for index, row in df.iterrows():
        # An empty cell is read as NaN, and str(NaN) is the non-empty string
        # 'nan', so missing values must be detected before converting.
        raw_id = row.get('tweet_id')
        tweet_id = '' if pd.isna(raw_id) else str(raw_id).strip()
        if not tweet_id:
            if log_callback:
                log_callback(f"Row {index} missing 'tweet_id'. Skipping.")
            continue

        image_urls = _parse_media_urls(row.get('images', '[]'), 'image', tweet_id, log_callback)
        video_urls = _parse_media_urls(row.get('videos', '[]'), 'video', tweet_id, log_callback)

        if not image_urls and not video_urls:
            if log_callback:
                log_callback(f"No media found for tweet_id {tweet_id}. Skipping.")
            continue

        # The id becomes a directory name: drop path separators and refuse
        # names such as '.' or '..' that would escape output_base_dir.
        dir_name = sanitize_filename(tweet_id)
        if not dir_name.strip('. '):
            if log_callback:
                log_callback(f"tweet_id {tweet_id!r} is not usable as a directory name. Skipping.")
            continue

        tweet_dir = os.path.join(output_base_dir, dir_name)
        os.makedirs(tweet_dir, exist_ok=True)
        if log_callback:
            log_callback(f"Created directory: {tweet_dir}")

        _download_media_group(image_urls, tweet_dir, tweet_id, 'image', 'jpg', log_callback)
        _download_media_group(video_urls, tweet_dir, tweet_id, 'video', 'mp4', log_callback)

def start_process(twitter_username, output_format, browser, tweets_count, filename, directory, log_callback):
    """
    Starts the scraping and downloading processes.

    Tweets are scraped into ``<directory>/<filename>.<output_format>``. When
    the output format is CSV, the media referenced by that file is then
    downloaded into ``<directory>/data``. For any other format the media step
    is skipped, because the downloader only reads CSV.

    Args:
        twitter_username (str): Twitter handle to scrape.
        output_format (str): Output format for scraped data.
        browser (str): Browser to use for scraping.
        tweets_count (int): Number of tweets to scrape.
        filename (str): Filename for the scraped data.
        directory (str): Directory to save scraped data and downloads.
        log_callback (function): Function to call for logging messages.
    """
    # Step 1: Scrape tweets
    scrape_tweets(
        twitter_username=twitter_username,
        output_format=output_format,
        browser=browser,
        tweets_count=tweets_count,
        filename=filename,
        directory=directory,
        log_callback=log_callback
    )

    # download_media parses its input with pandas.read_csv, so handing it a
    # JSON export would only produce misleading per-row errors.
    if str(output_format).lower() != 'csv':
        if log_callback:
            log_callback(
                f"Media download is only supported for 'csv' output; "
                f"skipping it for '{output_format}'."
            )
        return

    # Step 2: Download media from scraped tweets
    csv_file = os.path.join(directory, f"{filename}.{output_format}")
    output_base_dir = os.path.join(directory, "data")
    download_media(csv_file, output_base_dir, log_callback=log_callback)

def create_gui():
    """
    Creates the GUI for user input using ipywidgets.

    Builds input widgets for the scrape parameters, a directory "Browse"
    button backed by a tkinter dialog, a "Start" button and a log area, then
    displays them. Clicking "Start" validates the inputs and runs
    ``start_process`` in a background thread; the button is disabled while
    that thread runs and re-enabled when it ends, whether it succeeded or not.
    """
    # Define Widgets
    twitter_username = widgets.Text(
        value='nyushanghai',
        placeholder='Enter Twitter username',
        description='Twitter Username:',
        disabled=False,
        layout=widgets.Layout(width='50%')
    )

    output_format = widgets.Dropdown(
        options=['csv', 'json'],
        value='csv',
        description='Output Format:',
        disabled=False,
        layout=widgets.Layout(width='50%')
    )

    browser = widgets.Dropdown(
        options=['firefox', 'chrome'],
        value='firefox',
        description='Browser:',
        disabled=False,
        layout=widgets.Layout(width='50%')
    )

    tweets_count = widgets.IntText(
        value=10,
        description='Number of Tweets:',
        disabled=False,
        layout=widgets.Layout(width='50%')
    )

    filename = widgets.Text(
        value='nyushanghai',
        placeholder='Enter filename',
        description='Filename:',
        disabled=False,
        layout=widgets.Layout(width='50%')
    )

    directory = widgets.Text(
        value='/Users/princess/Documents/RA/X-scraper',  # Adjust as needed
        placeholder='Enter save directory',
        description='Save Directory:',
        disabled=False,
        layout=widgets.Layout(width='50%')
    )

    browse_button = widgets.Button(
        description='Browse',
        button_style='',
        tooltip='Browse for directory',
        icon='folder',
        layout=widgets.Layout(width='10%')
    )

    # Log Output Widget
    log_output = widgets.Output()

    # Define Log Callback
    def log(message):
        with log_output:
            print(message)

    # Define Browse Function
    def on_browse_clicked(b):
        import tkinter as tk
        from tkinter import filedialog

        # A Tk root is required for the dialog; keep it hidden and destroy it
        # afterwards so repeated clicks do not accumulate Tk instances.
        root = tk.Tk()
        try:
            root.withdraw()
            selected_dir = filedialog.askdirectory()
        finally:
            root.destroy()
        if selected_dir:
            directory.value = selected_dir

    browse_button.on_click(on_browse_clicked)

    # Define Start Button
    start_button = widgets.Button(
        description='Start',
        button_style='success',
        tooltip='Start scraping and downloading',
        icon='play',
        layout=widgets.Layout(width='20%')
    )

    # Define Start Button Callback
    def on_start_clicked(b):
        # Disable the start button to prevent multiple clicks
        start_button.disabled = True

        # Clear previous logs
        with log_output:
            clear_output()

        # Retrieve widget values
        twitter_username_val = twitter_username.value.strip()
        output_format_val = output_format.value
        browser_val = browser.value
        tweets_count_val = tweets_count.value
        filename_val = filename.value.strip()
        directory_val = directory.value.strip()

        # Input Validation
        error_messages = []
        if not twitter_username_val:
            error_messages.append("Twitter Username cannot be empty.")
        if output_format_val not in ['csv', 'json']:
            error_messages.append("Output Format must be 'csv' or 'json'.")
        if browser_val not in ['firefox', 'chrome']:
            error_messages.append("Browser must be 'firefox' or 'chrome'.")
        if not isinstance(tweets_count_val, int) or tweets_count_val <= 0:
            error_messages.append("Number of Tweets must be a positive integer.")
        if not filename_val:
            error_messages.append("Filename cannot be empty.")
        if not os.path.isdir(directory_val):
            error_messages.append("Save Directory is invalid or does not exist.")

        if error_messages:
            with log_output:
                for msg in error_messages:
                    print(f"Error: {msg}")
            start_button.disabled = False
            return

        # Run the scraping and downloading in a single background thread.
        # The worker reports its own outcome and re-enables the button in a
        # finally block, so no polling thread is needed and a long run can
        # never exhaust the recursion limit.
        def run_and_report():
            try:
                start_process(
                    twitter_username_val,
                    output_format_val,
                    browser_val,
                    tweets_count_val,
                    filename_val,
                    directory_val,
                    log
                )
            except Exception as e:
                log(f"Error: {e}")
                # Re-raise so the traceback is still reported by the
                # thread's exception hook.
                raise
            else:
                log("Process Completed.")
            finally:
                start_button.disabled = False

        thread = Thread(target=run_and_report)
        thread.start()

    start_button.on_click(on_start_clicked)

    # Arrange Widgets in Layout
    ui = widgets.VBox([
        widgets.HBox([twitter_username]),
        widgets.HBox([output_format]),
        widgets.HBox([browser]),
        widgets.HBox([tweets_count]),
        widgets.HBox([filename]),
        widgets.HBox([directory, browse_button]),
        widgets.HBox([start_button]),
        widgets.Label("Log:"),
        log_output
    ])

    display(ui)

# Build the widget UI and display it as soon as this cell is executed.
create_gui()


VBox(children=(HBox(children=(Text(value='nyushanghai', description='Twitter Username:', layout=Layout(width='…

Starting to scrape tweets from @nyushanghai...


2025-02-04 23:33:29,574 - INFO - Data Successfully Saved to nyushanghai.csv


Scraping completed. Data saved to /Users/princess/Documents/RA/Web Scraping/X/nyushanghai.csv
Reading CSV file: /Users/princess/Documents/RA/Web Scraping/X/nyushanghai.csv
Created directory: /Users/princess/Documents/RA/Web Scraping/X/data/1184026399574654976
Downloading image: https://pbs.twimg.com/media/EG6BulKU4AAdnge?format=jpg&name=medium as image_1.jpg


image_1.jpg: 100%|██████████| 271k/271k [00:00<00:00, 2.64MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1184026399574654976/image_1.jpg
Created directory: /Users/princess/Documents/RA/Web Scraping/X/data/1488038488528195584
Downloading image: https://pbs.twimg.com/media/FKaSUKsUUAAPukp?format=jpg&name=large as image_1.jpg


image_1.jpg: 100%|██████████| 515k/515k [00:00<00:00, 12.9MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1488038488528195584/image_1.jpg
Created directory: /Users/princess/Documents/RA/Web Scraping/X/data/1266368706705125382
Downloading image: https://pbs.twimg.com/media/EZK93q-U8AM6tjU?format=jpg&name=900x900 as image_1.jpg


image_1.jpg: 100%|██████████| 74.2k/74.2k [00:00<00:00, 18.6MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1266368706705125382/image_1.jpg
Downloading image: https://pbs.twimg.com/media/EZK98B2VAAEgbx1?format=jpg&name=900x900 as image_2.jpg


image_2.jpg: 100%|██████████| 71.2k/71.2k [00:00<00:00, 10.5MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1266368706705125382/image_2.jpg
Created directory: /Users/princess/Documents/RA/Web Scraping/X/data/1581148521473855488
Downloading image: https://pbs.twimg.com/ext_tw_video_thumb/1581148256263802880/pu/img/WFziqAaO5AVTOGTS.jpg as image_1.jpg


image_1.jpg: 100%|██████████| 27.5k/27.5k [00:00<00:00, 19.4MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1581148521473855488/image_1.jpg
Created directory: /Users/princess/Documents/RA/Web Scraping/X/data/1697108903711109226
Downloading image: https://pbs.twimg.com/media/F41XYhLbwAAVV4j?format=jpg&name=small as image_1.jpg


image_1.jpg: 100%|██████████| 80.4k/80.4k [00:00<00:00, 27.0MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1697108903711109226/image_1.jpg
Downloading image: https://pbs.twimg.com/media/F41XcdGbUAAlI_i?format=jpg&name=small as image_2.jpg


image_2.jpg: 100%|██████████| 75.4k/75.4k [00:00<00:00, 17.4MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1697108903711109226/image_2.jpg
Downloading image: https://pbs.twimg.com/media/F41XUq1a8AAJA4s?format=jpg&name=small as image_3.jpg


image_3.jpg: 100%|██████████| 47.7k/47.7k [00:00<00:00, 39.9MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1697108903711109226/image_3.jpg
Downloading image: https://pbs.twimg.com/media/F41XgP6a4AAc0qg?format=jpg&name=small as image_4.jpg


image_4.jpg: 100%|██████████| 84.1k/84.1k [00:00<00:00, 18.4MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1697108903711109226/image_4.jpg
Created directory: /Users/princess/Documents/RA/Web Scraping/X/data/1783378627859751236
Downloading image: https://pbs.twimg.com/media/GL_QzBuawAEl_13?format=jpg&name=small as image_1.jpg


image_1.jpg: 100%|██████████| 68.3k/68.3k [00:00<00:00, 10.4MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1783378627859751236/image_1.jpg
Downloading image: https://pbs.twimg.com/media/GL_Q0yha0AAPhwV?format=jpg&name=small as image_2.jpg


image_2.jpg: 100%|██████████| 70.6k/70.6k [00:00<00:00, 24.4MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1783378627859751236/image_2.jpg
Downloading image: https://pbs.twimg.com/media/GL_Q1P7agAEkFLD?format=jpg&name=small as image_3.jpg


image_3.jpg: 100%|██████████| 68.5k/68.5k [00:00<00:00, 826kiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1783378627859751236/image_3.jpg
Downloading image: https://pbs.twimg.com/media/GL_Q2JeasAAGu8X?format=jpg&name=small as image_4.jpg


image_4.jpg: 100%|██████████| 86.2k/86.2k [00:00<00:00, 15.0MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1783378627859751236/image_4.jpg
Created directory: /Users/princess/Documents/RA/Web Scraping/X/data/1026302117890277378
Downloading image: https://pbs.twimg.com/media/Dj4oMt_VsAAvRRi?format=jpg&name=medium as image_1.jpg


image_1.jpg: 100%|██████████| 174k/174k [00:00<00:00, 9.92MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1026302117890277378/image_1.jpg
Created directory: /Users/princess/Documents/RA/Web Scraping/X/data/1188294190460878848
Downloading image: https://pbs.twimg.com/media/EH2rQ5qUwAAH1wu?format=jpg&name=medium as image_1.jpg


image_1.jpg: 100%|██████████| 82.8k/82.8k [00:00<00:00, 26.2MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1188294190460878848/image_1.jpg
Created directory: /Users/princess/Documents/RA/Web Scraping/X/data/973822217732665344
Downloading image: https://pbs.twimg.com/media/DYO1y60X4AEOez7?format=jpg&name=small as image_1.jpg


image_1.jpg: 100%|██████████| 63.0k/63.0k [00:00<00:00, 42.6MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/973822217732665344/image_1.jpg
Downloading image: https://pbs.twimg.com/media/DYO18SJWsAAlq9D?format=jpg&name=small as image_2.jpg


image_2.jpg: 100%|██████████| 65.8k/65.8k [00:00<00:00, 63.9MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/973822217732665344/image_2.jpg
Downloading image: https://pbs.twimg.com/media/DYO1_LjXkAEix_G?format=jpg&name=small as image_3.jpg


image_3.jpg: 100%|██████████| 54.1k/54.1k [00:00<00:00, 13.6MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/973822217732665344/image_3.jpg
Downloading image: https://pbs.twimg.com/media/DYO2GQjXUAEL-v5?format=jpg&name=small as image_4.jpg


image_4.jpg: 100%|██████████| 54.4k/54.4k [00:00<00:00, 26.0MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/973822217732665344/image_4.jpg
Created directory: /Users/princess/Documents/RA/Web Scraping/X/data/1418055882579931141
Downloading image: https://pbs.twimg.com/media/E63xNvzVUAMgSjP?format=jpg&name=900x900 as image_1.jpg


image_1.jpg: 100%|██████████| 133k/133k [00:00<00:00, 7.16MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1418055882579931141/image_1.jpg
Downloading image: https://pbs.twimg.com/media/E63xOOdVUAAm64q?format=jpg&name=900x900 as image_2.jpg


image_2.jpg: 100%|██████████| 118k/118k [00:00<00:00, 27.1MiB/s]


Successfully downloaded: /Users/princess/Documents/RA/Web Scraping/X/data/1418055882579931141/image_2.jpg
Process Completed.
