In [1]:
import logging
from collections import deque
from pathlib import Path
from threading import Lock, Thread

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# CSS selectors for the parts of a profile page this notebook reads,
# keyed by the name of the field each one targets.
selectors = dict(
    # column wrapping the whole profile; its innerHTML is what gets captured
    Container="[data-testid='primaryColumn']",
    Bio="[data-testid='UserDescription']",
    # class-name based selector; the parsing step treats the first match as
    # "following" and the second as "followers"
    FollowingFollowers=".css-175oi2r:not(.r-1mf7evn) .r-bcqeeo.r-qvutc0.r-poiln3.r-1b43r93.r-1cwl3u0.r-b88u0q span",
    Location="[data-testid='UserLocation']",
    Website="[data-testid='UserUrl']",
)

# Module-level declaration of the shared browser handle. The instance itself is
# assigned in the "Initialise the webdriver" cell and then used by the scraping
# cell and by the cell that quits the driver.
# NOTE(review): this comment used to point at `structure_twitter_data` as the
# reason the driver is public, but that function never touches the driver.
driver: webdriver.Firefox


class TwitterUser:
    """Plain record of the profile fields scraped for one Twitter account.

    Values are stored exactly as passed in; no validation or conversion is
    performed by this class.
    """

    link: str  # profile URL the data came from, kept for reference
    bio: str  # callers may pass None when the profile has no bio element
    following: int
    followers: int
    location: str  # callers may pass None when no location is shown
    website: str  # callers may pass None when no website is shown

    def __init__(self, link, bio, following, followers, location, website):
        self.link = link
        self.bio = bio
        self.following = following
        self.followers = followers
        self.location = location
        self.website = website

    def __repr__(self):
        # Without this, displaying a list of users in a notebook cell only
        # shows opaque "<TwitterUser object at 0x...>" entries.
        return (
            f"TwitterUser(link={self.link!r}, bio={self.bio!r}, "
            f"following={self.following!r}, followers={self.followers!r}, "
            f"location={self.location!r}, website={self.website!r})"
        )

Flow of operations:
1. Initialise the webdriver             with `start_scraping_twitter`.
2. Scrape raw data from the given links with `scrape_raw_twitter_data`.
3. Free the driver.
4. Get needed information from data     with `structure_twitter_data`  (threaded).
5. Stop.

In [3]:
import csv
from datetime import datetime

# Profile URLs to scrape: one per row, taken from the first column of the CSV.
with open("twitter_links.csv", newline="") as csvfile:
    # csv.reader yields an empty list for blank lines, so indexing row[0]
    # unconditionally would raise IndexError; skip empty rows and empty cells.
    links = [
        row[0].strip()
        for row in csv.reader(csvfile)
        if row and row[0].strip()
    ]

In [4]:
# Raw innerHTML of each captured profile column; filled by the scraping cell
# and consumed from the left by the parsing step.
information = deque()
# Structured results: the parsing step appends one TwitterUser per captured page.
users: "list[TwitterUser]" = []

# Effective level of the root logger at this point.
# NOTE(review): nothing in the visible cells reads this value back - confirm
# whether it is still needed.
saved_logging_level = logging.getLogger().getEffectiveLevel()

## 1. Initialise the webdriver

In [5]:
# firefox is probably the best familiar enough option out there
HEADLESS = False  # set to True to run Firefox without a visible window

options = webdriver.FirefoxOptions()
if HEADLESS:
    options.add_argument("-headless")
driver = webdriver.Firefox(options=options)

KeyboardInterrupt: 

## 2. Scrape raw data from the given links

In [None]:
# Start from a clean slate so re-running this cell cannot leave stale or
# duplicated pages in the queue.
information.clear()
scraped_links = []  # links whose page was actually captured, in capture order

for link in links:
    # Essential exception handling since we don't want to stop at any point:
    # navigation, the wait and the element lookup are all guarded. Only
    # Exception is caught so that interrupting the kernel still stops the loop.
    try:
        driver.get(link)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, selectors["FollowingFollowers"])
            )
        )
        container = driver.find_element(
            By.CSS_SELECTOR, selectors["Container"]
        ).get_attribute("innerHTML")
    except Exception:
        logging.exception(f"Exception with link {link}")
        continue

    information.append(container)
    scraped_links.append(link)

# The parsing step pairs information[i] with links[i] by position, so links
# that produced no page must be dropped here; otherwise every page after the
# first failure would be attributed to the wrong profile. Failed links are
# still visible in the log output above.
links[:] = scraped_links

## 3. Free driver

In [None]:
# Every page is already held as an HTML string in `information`, so the
# browser is no longer needed for the parsing step.
driver.quit()

## 4. Scrape needed information from data (threaded)

In [None]:
# Guards the paired pops below so that, with several worker threads running,
# a page is always matched with the link at the same position.
_pairing_lock = Lock()


def structure_twitter_data(information: deque, links, data: "list[TwitterUser]"):
    """Drain ``information`` and turn each captured page into a ``TwitterUser``.

    Safe to run from several threads at once on the same arguments.

    Args:
        information: deque of profile-column HTML strings; consumed from the left.
        links: list of profile links, positionally aligned with ``information``;
            consumed from the front. If it runs out early the link is ``None``.
        data: list that receives one ``TwitterUser`` per page. With multiple
            threads the append order is not guaranteed to match input order.

    Returns:
        None. Results are delivered through ``data``.
    """
    while True:
        with _pairing_lock:
            # Checking the length and popping separately lets another thread
            # take the last item in between; pop and treat "empty" as done.
            try:
                page_html = information.popleft()
            except IndexError:
                return
            link = links.pop(0) if links else None

        soup = BeautifulSoup(page_html, "html.parser")
        bio = soup.select_one(selectors["Bio"])

        # First match is "following", second is "followers". Pad with None so
        # a page with fewer matches yields blanks instead of killing the thread.
        counts = [tag.text for tag in soup.select(selectors["FollowingFollowers"])[:2]]
        counts += [None] * (2 - len(counts))
        following, followers = counts

        location = soup.select_one(selectors["Location"])
        website = soup.select_one(selectors["Website"])

        data.append(
            TwitterUser(
                link,
                bio.text if bio else None,
                following,
                followers,
                location.text if location else None,
                website.text if website else None,
            )
        )


# structure_twitter_data takes (information, links, data); all three must be
# passed or every worker dies immediately with a TypeError.
workers = [
    Thread(target=structure_twitter_data, args=(information, links, users))
    for _ in range(4)
]
for worker in workers:
    worker.start()

# only continue once the threads are done: join() waits for each worker to
# finish its last page, which an empty deque alone does not guarantee, and it
# does not burn CPU in a busy loop
for worker in workers:
    worker.join()

In [4]:
# Show the parsed records as a quick sanity check before they are exported.
users

Now we convert the data to CSV, finally.

In [13]:
output_path = f"./out/data-{datetime.now().strftime('%Y-%m-%d_%H.%M.%S')}.csv"

# open() does not create missing folders, so make sure ./out exists first
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# finally save the data; UTF-8 is set explicitly so bios and locations with
# non-ASCII characters can be written regardless of the platform default
with open(output_path, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Link", "Bio", "Following", "Followers", "Location", "Website"])
    writer.writerows(
        [
            user.link,
            user.bio,
            user.following,
            user.followers,
            user.location,
            user.website,
        ]
        for user in users
    )
