Profile Scraping

In [None]:
!pip install webdriver-manager
!pip install selenium
!pip install beautifulsoup4 requests pandas

In [4]:
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

# Set up Selenium WebDriver
def setup_driver(driver_path):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=options)
    return driver

# Function to scrape Twitter profile information
def scrape_twitter_profile(driver, profile_url):
    # Open Twitter profile page
    driver.get(profile_url)
    
    try:
        # Wait for bio to load (adjust for actual element)
        bio_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@data-testid='UserDescription']"))
        )
        bio = bio_element.text.strip()
    except Exception as e:
        print(f"Error fetching bio for {profile_url}: {e}")
        bio = "N/A"

    try:
        # Wait for following count to load
        following_count_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//a[@href='/" + profile_url.split("/")[-1] + "/following']"))
        )
        following_count = following_count_element.find_element(By.XPATH, ".//span").text.strip()
    except Exception as e:
        print(f"Error fetching following count for {profile_url}: {e}")
        following_count = "N/A"

    try:
        # Wait for followers count to load
        followers_count_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//a[@href='/" + profile_url.split("/")[-1] + "/followers']"))
        )
        followers_count = followers_count_element.find_element(By.XPATH, ".//span").text.strip()
    except Exception as e:
        print(f"Error fetching followers count for {profile_url}: {e}")
        followers_count = "N/A"

    try:
        # Wait for location to load
        location_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//span[@data-testid='UserLocation']"))
        )
        location = location_element.text.strip()
    except Exception as e:
        print(f"Error fetching location for {profile_url}: {e}")
        location = "N/A"

    try:
        # Wait for website to load
        website_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//a[@data-testid='UserUrl']"))
        )
        website = website_element.text.strip()
    except Exception as e:
        print(f"Error fetching website for {profile_url}: {e}")
        website = "N/A"

    return {
        "Bio": bio,
        "Following Count": following_count,
        "Followers Count": followers_count,
        "Location": location,
        "Website": website,
        "Profile URL": profile_url
    }

# Main function to handle the workflow
def main(csv_filename, driver_path):
    # Step 1: Read the already-downloaded CSV file with Twitter profile URLs
    with open(csv_filename, newline='', encoding="utf-8") as file:
        reader = csv.reader(file)
        profile_urls = [row[0] for row in reader]  # Assuming each row has one URL

    # Step 2: Initialize Selenium WebDriver
    driver = setup_driver(driver_path)

    # Step 3: Scrape data from each Twitter profile
    profile_data = []
    for url in profile_urls:
        print(f"Scraping data for: {url}")
        data = scrape_twitter_profile(driver, url)
        profile_data.append(data)

    # Step 4: Save data to a new CSV file
    output_filename = "twitter_profile_data.csv"
    df = pd.DataFrame(profile_data)
    df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f"Data saved to {output_filename}")

    # Close the WebDriver
    driver.quit()

# Run the script
if __name__ == "__main__":
    # Path to the already-downloaded CSV file
    csv_filename = "twitter_profiles.csv"  # Replace with the actual path to your local CSV file

    # Path to Chrome WebDriver
    driver_path = r"E:\Deep_Learning\Scraping\chromedriver-win64\chromedriver.exe"  # Replace with the actual path to your WebDriver

    main(csv_filename, driver_path)


In [6]:
# Load the CSV file into a DataFrame
df = pd.read_csv("twitter_profile_data.csv")

# Display the DataFrame
#print(df)
df.head(30)


Unnamed: 0,Bio,Following Count,Followers Count,Location,Website,Profile URL
0,Providing Entertainment & Travel to Commercial...,456.0,,,,https://twitter.com/GTNUK1
1,"push, push",,,California,bit.ly/WatchPushPush,https://twitter.com/whatsapp
2,Customs Broker,124.0,,,,https://twitter.com/aacb_CBPTrade
3,A & A Freight | Warehousing | Customs Brokerag...,3896.0,,,,https://twitter.com/aacbdotcom
4,A commercial glass and glazing company serving...,,,,,https://twitter.com/@AAWindowPRODUCT
5,"A&B Kia is a Kia dealer in Benwood, WV. Stay c...",,,,,https://www.twitter.com/aandb_kia
6,"Industry leader in wholesale home decor, furni...",178.0,,,,https://twitter.com/ABHomeInc
7,From large format black & white prints to the ...,123.0,,,,https://twitter.com/Abrepro
8,,,,,,http://www.twitter.com
9,A & C CHRISTOFI LTD is a fast growing professi...,291.0,,,,https://twitter.com/ACChristofiLtd
