Profile Scraping

In [None]:
!pip install webdriver-manager
!pip install selenium
!pip install beautifulsoup4 requests pandas

In [None]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import mysql.connector

import time
# NOTE(review): this pause runs exactly once, when this cell/module is
# executed. It is not inside the scraping loop, so it does not space out
# the individual page requests.
time.sleep(5)  # One-off 5 second pause at import time

# Build the Chrome session that the scraper drives
def setup_driver(driver_path):
    """Create and return a headless Chrome WebDriver.

    Args:
        driver_path: Filesystem path to the ChromeDriver executable.

    Returns:
        The ``webdriver.Chrome`` instance created with the options below.
    """
    # Command-line switches handed to Chrome, applied in this order.
    chrome_flags = (
        "--verbose",
        "--headless",  # no visible browser window
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--log-level=3",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # The service is configured with "chromedriver.log" as its log path.
    browser = webdriver.Chrome(
        service=Service(driver_path, log_path="chromedriver.log"),
        options=chrome_options,
    )
    print(f"Using ChromeDriver from: {driver_path}")
    return browser


# Fields scraped from a profile page, in the order they are looked up.
# Each entry is (key in the returned dict, label used in log messages, XPath).
_PROFILE_FIELDS = (
    ("Bio", "bio", "//div[@data-testid='UserDescription']"),
    ("Following Count", "following count", "//a[contains(@href, '/following')]//span"),
    (
        "Followers Count",
        "followers count",
        # NOTE(review): the plain '/followers' link timed out in a recorded run;
        # the page appears to link the count via '/verified_followers' now.
        # Both are accepted so either layout works - confirm against the live page.
        "//a[contains(@href, '/followers') or contains(@href, '/verified_followers')]//span",
    ),
    ("Location", "location", "//span[@data-testid='UserLocation']"),
    ("Website", "website", "//a[@data-testid='UserUrl']"),
)


def _wait_for_text(driver, xpath, label, profile_url, timeout):
    """Return the stripped text of the first element matching *xpath*.

    Waits up to *timeout* seconds for the element to be present. On any
    failure a message is printed and the placeholder "N/A" is returned, so a
    missing field never aborts the scrape of the remaining fields.
    """
    # Imported here so the helper is self-contained within this block.
    from selenium.common.exceptions import TimeoutException

    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        return element.text.strip()
    except TimeoutException:
        # A timeout just means the element never appeared; the driver's
        # stack trace adds nothing useful, so keep the message to one line.
        print(f"Error fetching {label} for {profile_url}: not found within {timeout} seconds")
    except Exception as e:
        # Best-effort scraping: report anything else and fall back to "N/A".
        print(f"Error fetching {label} for {profile_url}: {e}")
    return "N/A"


# Function to scrape Twitter profile information
def scrape_twitter_profile(driver, profile_url, timeout=20):
    """Load a profile page and collect its visible profile fields.

    Args:
        driver: Selenium WebDriver used to load the page.
        profile_url: URL of the profile page to open.
        timeout: Maximum number of seconds to wait for each field's element
            to appear (default 20, applied per field).

    Returns:
        dict with the keys "Bio", "Following Count", "Followers Count",
        "Location", "Website" and "Profile URL". A field whose element could
        not be read is reported as the string "N/A"; "Profile URL" is always
        the *profile_url* argument.
    """
    # Open Twitter profile page
    driver.get(profile_url)

    profile = {
        key: _wait_for_text(driver, xpath, label, profile_url, timeout)
        for key, label, xpath in _PROFILE_FIELDS
    }
    profile["Profile URL"] = profile_url
    return profile

# Function to insert scraped data into MySQL
def insert_data_to_mysql(db_config, profile_data):
    """Insert one scraped profile as a row of the ``twitter_profiles`` table.

    Args:
        db_config: Mapping with the keys "host", "user", "password" and
            "database" used to open the MySQL connection.
        profile_data: Mapping with the keys "Bio", "Following Count",
            "Followers Count", "Location", "Website" and "Profile URL".

    Returns:
        None. MySQL errors are printed rather than raised, so one failed
        insert does not stop the caller.
    """
    # Both handles start as None so the cleanup in `finally` can tell which
    # of them were actually created before an error occurred.
    conn = None
    cursor = None
    try:
        # Connect to the database
        conn = mysql.connector.connect(
            host=db_config["host"],
            user=db_config["user"],
            password=db_config["password"],
            database=db_config["database"]
        )
        cursor = conn.cursor()

        # Insert data into the table (parameterized, values are never
        # interpolated into the SQL text)
        insert_query = """
        INSERT INTO twitter_profiles (Bio, Following_Count, Followers_Count, Location, Website, Profile_URL)
        VALUES (%s, %s, %s, %s, %s, %s)
        """
        data_tuple = (
            profile_data["Bio"],
            profile_data["Following Count"],
            profile_data["Followers Count"],
            profile_data["Location"],
            profile_data["Website"],
            profile_data["Profile URL"]
        )
        cursor.execute(insert_query, data_tuple)
        conn.commit()
        print(f"Data inserted successfully for {profile_data['Profile URL']}")

    except mysql.connector.Error as e:
        print(f"Error: {e}")
    finally:
        # Close the database connection if it was successfully opened
        if conn is not None and conn.is_connected():
            # The cursor may not exist if conn.cursor() itself failed;
            # without this guard the cleanup would raise UnboundLocalError
            # and hide the original error.
            if cursor is not None:
                cursor.close()
            conn.close()


# Main function to handle the workflow
def main(csv_filename, driver_path, db_config, request_delay=5.0):
    """Scrape every profile URL listed in a CSV file and store the results.

    Args:
        csv_filename: Path to a CSV file whose first column holds one
            profile URL per row.
        driver_path: Path to the ChromeDriver executable.
        db_config: MySQL connection settings passed to
            ``insert_data_to_mysql``.
        request_delay: Seconds to pause between consecutive profile
            requests (default 5). Use 0 to disable the pause.
    """
    # Step 1: Read the already-downloaded CSV file with Twitter profile URLs.
    # "utf-8-sig" also reads plain UTF-8 but drops a leading byte-order mark,
    # which would otherwise be glued onto the first URL.
    with open(csv_filename, newline='', encoding="utf-8-sig") as file:
        # Blank lines come back from csv.reader as empty rows; skip them and
        # rows whose first cell is empty instead of failing on row[0].
        profile_urls = [
            row[0].strip()
            for row in csv.reader(file)
            if row and row[0].strip()
        ]

    # Step 2: Initialize Selenium WebDriver
    driver = setup_driver(driver_path)

    # Step 3: Scrape data and insert into MySQL
    try:
        for index, url in enumerate(profile_urls):
            # Pause between requests (not before the first one).
            if index and request_delay > 0:
                time.sleep(request_delay)
            print(f"Scraping data for: {url}")
            profile_data = scrape_twitter_profile(driver, url)
            insert_data_to_mysql(db_config, profile_data)
    finally:
        # Always shut the browser down, even if scraping raised, so no
        # Chrome/ChromeDriver process is left running.
        driver.quit()


# Script entry point: only runs when executed directly
if __name__ == "__main__":
    # Connection settings for the MySQL server that stores the results.
    # Adjust user, password and database to match your own installation.
    db_config = dict(
        host="localhost",
        user="root",
        password="root",
        database="twitter_scraper_db",
    )

    # Location of the ChromeDriver executable on this machine; change this
    # to wherever your copy of the driver lives.
    driver_path = r"E:\Deep_Learning\Scraping\chromedriver-win64\chromedriver.exe"

    # CSV file (already on disk) listing the profile URLs to scrape.
    csv_filename = "twitter_profiles.csv"

    main(csv_filename, driver_path, db_config)


Using ChromeDriver from: E:\Deep_Learning\Scraping\chromedriver-win64\chromedriver.exe
Scraping data for: https://twitter.com/GTNUK1
Error fetching followers count for https://twitter.com/GTNUK1: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6EB2638A5+3004357]
	(No symbol) [0x00007FF6EAEF9970]
	(No symbol) [0x00007FF6EADA582A]
	(No symbol) [0x00007FF6EADF5B8E]
	(No symbol) [0x00007FF6EADF5E7C]
	(No symbol) [0x00007FF6EAE3EC27]
	(No symbol) [0x00007FF6EAE1BC1F]
	(No symbol) [0x00007FF6EAE3BA4C]
	(No symbol) [0x00007FF6EAE1B983]
	(No symbol) [0x00007FF6EADE7628]
	(No symbol) [0x00007FF6EADE8791]
	GetHandleVerifier [0x00007FF6EB28A00D+3161901]
	GetHandleVerifier [0x00007FF6EB2DE060+3506048]
	GetHandleVerifier [0x00007FF6EB2D400D+3465005]
	GetHandleVerifier [0x00007FF6EB050EEB+830987]
	(No symbol) [0x00007FF6EAF0467F]
	(No symbol) [0x00007FF6EAF009D4]
	(No symbol) [0x00007FF6EAF00B6D]
	(No symbol) [0x00007FF6EAEF0149]
	BaseThreadInitThunk [0x00007FFF3E1D7374+20]
	RtlUserThreadStart [0