In [1]:
import os
import re
from dataclasses import dataclass
from threading import Lock, Thread
from typing import Optional

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# CSS selectors used to locate the interesting parts of a profile page.
# Keys are the logical names used throughout the notebook.
selectors = dict(
    Container="[data-testid='primaryColumn']",
    Bio="[data-testid='UserDescription']",
    # matches both the "following" and the "followers" count
    FollowingFollowers=(
        ".css-175oi2r:not(.r-1mf7evn) "
        ".r-bcqeeo.r-qvutc0.r-poiln3.r-1b43r93.r-1cwl3u0.r-b88u0q span"
    ),
    Location="[data-testid='UserLocation']",
    Website="[data-testid='UserUrl']",
)

# the driver is public for a reason, see `structure_twitter_data`
# NOTE: this is an annotation only -- no value is bound to `driver` here.
# The actual Firefox instance is created in the "Initialise the webdriver"
# step further down.
driver: webdriver.Firefox


@dataclass(eq=False)  # eq=False keeps identity-based equality and hashing
class TwitterUser:
    """Structured profile information extracted for one Twitter account.

    Instances are created positionally as
    ``TwitterUser(link, bio, following, followers, location, website)``.
    The text fields are ``None`` when the corresponding element was not
    found on the profile page.
    """

    link: str  # for reference
    bio: Optional[str]
    # normally an int; the parsing step may leave a non-numeric value when
    # the displayed count could not be parsed
    following: int
    followers: int
    location: Optional[str]
    website: Optional[str]

Flow of operations:
1. Initialise the webdriver             with `start_scraping_twitter`.
2. Scrape raw data from the given links with `scrape_raw_twitter_data`.
3. Free the driver.
4. Get needed information from data     with `structure_twitter_data` (threaded).
5. Stop.

In [3]:
import csv

# Profile links to scrape: the first column of every row in the input CSV.
# Blank rows (csv.reader yields [] for them) and empty first cells are
# skipped so a stray empty line does not raise IndexError.
with open("twitter_links.csv", newline="") as csvfile:
    links = [
        row[0].strip()
        for row in csv.reader(csvfile)
        if row and row[0].strip()
    ]

In [4]:
from collections import deque
import logging

In [5]:
# Raw scrape results: one {"html": ..., "link": ...} record per profile.
information: list = []
# Structured results (`TwitterUser` instances) built from `information`.
users: list = []

# Effective level of the root logger before this notebook changes it
# (presumably kept so it can be restored later).
saved_logging_level = logging.getLogger().getEffectiveLevel()

In [6]:
# show the per-link progress messages emitted while scraping
logging.root.setLevel(logging.INFO)

## 1. Initialise the webdriver

In [7]:
# firefox is probably the best familiar enough option out there
options = webdriver.FirefoxOptions()
options.add_argument("-headless") # runs without a visible window; comment this line out to watch the browser
driver = webdriver.Firefox(options=options)

## 2. Scrape raw data from the given links

In [8]:
# Visit every profile and store the inner HTML of its primary column in
# `information`. A link that fails at any step (navigation, waiting for the
# counts, locating the container) is logged and dropped, so one bad profile
# never aborts the whole run.
processed_links = []  # links that were scraped successfully, in order

for link in links:
    # essential exception handling since we don't want to stop at any point;
    # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works
    try:
        driver.get(link)
        # wait up to 10 s until the following/followers counts are present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, selectors["FollowingFollowers"])
            )
        )
        container = driver.find_element(
            By.CSS_SELECTOR, selectors["Container"]
        ).get_attribute("innerHTML")
    except Exception:
        logging.exception(f"Exception with {link}")
        continue  # completely get rid of the link

    logging.info(f"successfully added {link}")
    information.append({"html": container, "link": link})
    processed_links.append(link)

# from here on only the successfully scraped links matter
links = processed_links.copy()

INFO:root:successfully added https://twitter.com/GTNUK1
INFO:root:successfully added https://twitter.com/whatsapp
INFO:root:successfully added https://twitter.com/aacb_CBPTrade
INFO:root:successfully added https://twitter.com/aacbdotcom
INFO:root:successfully added https://twitter.com/@AAWindowPRODUCT
INFO:root:successfully added https://www.twitter.com/aandb_kia
INFO:root:successfully added https://twitter.com/ABHomeInc
INFO:root:successfully added https://twitter.com/Abrepro
INFO:root:successfully added https://twitter.com/ACChristofiLtd
INFO:root:successfully added https://twitter.com/aeclothing1
INFO:root:successfully added https://twitter.com/AETechnologies1
INFO:root:successfully added http://www.twitter.com/wix
INFO:root:successfully added https://twitter.com/AGInsuranceLLC


## 3. Free driver

In [9]:
# the raw HTML is already stored in `information`, so the browser can be shut down
driver.quit()

## 4. Scrape needed information from data

In [15]:
# Turn every scraped record in `information` into a `TwitterUser` in `users`.
# NOTE: this cell needs `re` and `BeautifulSoup` (`from bs4 import BeautifulSoup`)
# to be available in the kernel.

# number, optional decimal part, optional magnitude suffix
_count_pattern = re.compile(r"(\d+(\.\d+)?)\s*([KLM]?)")
_count_multipliers = {
    "K": 10**3,  # thousand
    "L": 10**5,  # lakh
    "M": 10**6,  # million
}


def _parse_count(text):
    """Convert a displayed count such as "1,234" or "12.5k" to an int.

    Thousands separators are removed before matching, otherwise "1,234"
    would be read as 1. When the text does not start with a number it is
    returned (stripped) as a string so the information is not lost.
    """
    cleaned = text.strip().upper().replace(",", "")  # upper() for case insensitivity
    matching = _count_pattern.match(cleaned)
    if not matching:
        return text.strip()

    number, _, suffix = matching.groups()
    # round() instead of int(): float products can land just below the
    # intended integer and would otherwise be truncated
    return round(float(number) * _count_multipliers.get(suffix, 1))


for record in information:
    # bs4 handles from this point since we don't need resource heavy selenium anymore
    soup = BeautifulSoup(record["html"], "html.parser")

    # since following and follower count both have the same CSS selector,
    # we get them together: following first, followers second
    counts = [
        _parse_count(tag.text)
        for tag in soup.select(selectors["FollowingFollowers"])
    ]
    following = counts[0] if len(counts) > 0 else None
    followers = counts[1] if len(counts) > 1 else None

    bio = soup.select_one(selectors["Bio"])
    location = soup.select_one(selectors["Location"])
    website = soup.select_one(selectors["Website"])

    users.append(
        TwitterUser(
            record["link"],  # each record carries its own link, so no pairing by position
            bio.text if bio else None,
            following,
            followers,
            location.text if location else None,
            website.text if website else None,
        )
    )

# Both work queues are drained once everything has been structured, so a later
# pass over them (e.g. `structure_twitter_data`) does not process the same
# profiles a second time.
information.clear()
links.clear()

In [19]:
# Run `structure_twitter_data` on 4 worker threads and wait for all of them.
required_len = len(processed_links)

lock = Lock()
workers = [
    Thread(target=structure_twitter_data, args=(information, links, users, lock))
    for _ in range(4)
]

# Start every worker before joining any of them; joining right after each
# start would run the workers one after another.
# NOTE(review): this assumes `structure_twitter_data` guards its shared state
# with `lock` -- confirm, since the workers now really run concurrently.
for worker in workers:
    worker.start()
for worker in workers:
    worker.join()

# All workers are done at this point, so a mismatch cannot resolve itself any
# more; report it instead of spinning forever in a busy-wait loop.
if len(users) != required_len:
    logging.warning(
        "expected %d structured users but got %d", required_len, len(users)
    )

In [24]:
# sanity check: records still queued, users structured so far, and the links that were scraped successfully
len(information), len(users), processed_links

(0,
 13,
 ['https://twitter.com/GTNUK1',
  'https://twitter.com/whatsapp',
  'https://twitter.com/aacb_CBPTrade',
  'https://twitter.com/aacbdotcom',
  'https://twitter.com/@AAWindowPRODUCT',
  'https://www.twitter.com/aandb_kia',
  'https://twitter.com/ABHomeInc',
  'https://twitter.com/Abrepro',
  'https://twitter.com/ACChristofiLtd',
  'https://twitter.com/aeclothing1',
  'https://twitter.com/AETechnologies1',
  'http://www.twitter.com/wix',
  'https://twitter.com/AGInsuranceLLC'])

Now we convert the data to CSV, finally.

In [22]:
from datetime import datetime

In [23]:
# Timestamped file name, so each run writes its own file.
output_path = f"./out/data-{datetime.now().strftime('%Y-%m-%d_%H.%M.%S')}.csv"

# `open` does not create missing directories
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# finally save the data; utf-8 so emoji / non-Latin text in bios can be
# written regardless of the platform's default encoding
with open(output_path, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Link", "Bio", "Following", "Followers", "Location", "Website"])
    writer.writerows(
        [
            user.link,
            user.bio,
            user.following,
            user.followers,
            user.location,
            user.website,
        ]
        for user in users
    )