<a href="https://colab.research.google.com/github/risehi/data-analysis-colab-notebooks/blob/main/scrape_twitter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install playwright

Collecting playwright
  Downloading playwright-1.44.0-py3-none-manylinux1_x86_64.whl (37.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.8/37.8 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyee==11.1.0 (from playwright)
  Downloading pyee-11.1.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.44.0 pyee-11.1.0


In [2]:
!playwright install

Downloading Chromium 125.0.6422.26 (playwright build v1117)[2m from https://playwright.azureedge.net/builds/chromium/1117/chromium-linux.zip[22m
[1G156.8 MiB [] 0% 0.0s[0K[1G156.8 MiB [] 0% 32.3s[0K[1G156.8 MiB [] 0% 27.8s[0K[1G156.8 MiB [] 0% 15.3s[0K[1G156.8 MiB [] 0% 12.3s[0K[1G156.8 MiB [] 0% 9.3s[0K[1G156.8 MiB [] 1% 8.0s[0K[1G156.8 MiB [] 1% 7.5s[0K[1G156.8 MiB [] 1% 6.9s[0K[1G156.8 MiB [] 2% 6.4s[0K[1G156.8 MiB [] 2% 6.5s[0K[1G156.8 MiB [] 3% 6.5s[0K[1G156.8 MiB [] 3% 6.4s[0K[1G156.8 MiB [] 3% 5.9s[0K[1G156.8 MiB [] 4% 6.0s[0K[1G156.8 MiB [] 4% 6.1s[0K[1G156.8 MiB [] 4% 5.9s[0K[1G156.8 MiB [] 5% 5.6s[0K[1G156.8 MiB [] 5% 5.4s[0K[1G156.8 MiB [] 6% 5.4s[0K[1G156.8 MiB [] 6% 5.3s[0K[1G156.8 MiB [] 6% 5.1s[0K[1G156.8 MiB [] 7% 5.1s[0K[1G156.8 MiB [] 7% 5.0s[0K[1G156.8 MiB [] 8% 4.8s[0K[1G156.8 MiB [] 8% 4.7s[0K[1G156.8 MiB [] 9% 4.5s[0K[1G156.8 MiB [] 9% 4.6s[0K[1G156.8 MiB [] 9% 4.5s[0K[1G156.8 MiB [] 10% 4.4s[0K[1G156.

In [3]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import re
import numpy as np

In [4]:
async def scrape_microsoft_tweets(url: str, num_tweets: int) -> pd.DataFrame:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        page = await browser.new_page()
        await page.goto(url)

        data = []
        for _ in range(num_tweets):
            # Wait for tweet elements to load
            await page.wait_for_selector('[data-testid="tweetText"]')

            # Extract data
            tweet_elements = await page.query_selector_all('[data-testid="tweet"]')
            for tweet_element in tweet_elements:
                # Corrected selector for timestamp
                timestamp_element = await tweet_element.query_selector('time')
                timestamp = await timestamp_element.get_attribute('datetime') if timestamp_element else ""

                tweet_text_element = await tweet_element.query_selector('[data-testid="tweetText"]')
                tweet_text = await tweet_text_element.inner_text() if tweet_text_element else ""

                hashtags = re.findall(r'#(\w+)', tweet_text)
                mentions = re.findall(r'@(\w+)', tweet_text)

                data.append({
                    'Timestamp': timestamp,
                    'Tweets': tweet_text,
                    'Hashtags': hashtags if hashtags else np.nan,
                    'Mentions': mentions if mentions else np.nan
                })

            # Scroll down for more tweets (adjust if needed)
            await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')

        await browser.close()
        return pd.DataFrame(data)

# Get the running event loop
loop = asyncio.get_running_loop()

# Execute the coroutine within the existing loop
df = await loop.create_task(scrape_microsoft_tweets("https://x.com/Microsoft", 32*4))

In [5]:
df.to_csv("/content/drive/MyDrive/twitter-data-ms/microsoft_tweets_data.csv", index=False)

In [6]:
pd.read_csv("/content/drive/MyDrive/twitter-data-ms/microsoft_tweets_data.csv")

Unnamed: 0,Timestamp,Tweets,Hashtags,Mentions
0,2024-06-18T19:30:00.000Z,When thinking about what matters to early-in-c...,,
1,2024-06-18T16:30:02.000Z,"With what he calls “super empathy,” Joao is a ...","['InclusionIsInnovation', 'DiversityAndInclusi...",
2,2024-06-01T16:00:35.000Z,This Pride we invite you to learn more about t...,"['InclusionIsInnovation', 'Pride']",
3,2024-05-30T19:01:47.000Z,Microsoft Copilot for Microsoft 365 helps you ...,['AI'],
4,2024-05-30T16:30:00.000Z,The ability to clearly articulate your thought...,,
...,...,...,...,...
507,2024-05-28T16:30:01.000Z,"When Petrus joined the US Marines, he promised...",['InclusionIsInnovation'],
508,2024-05-30T19:01:47.000Z,Microsoft Copilot for Microsoft 365 helps you ...,['AI'],
509,2024-05-30T16:30:00.000Z,The ability to clearly articulate your thought...,,
510,2024-05-29T16:30:00.000Z,We partnered with \n@HerCampus\n to give you t...,,['HerCampus']
