In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import uuid
import hashlib
import pandas as pd  

def create_uuid_from_string(val):
    """Derive a short, deterministic identifier from a string.

    The UTF-8 bytes of ``val`` are hashed with MD5, the digest is rendered
    as a UUID string, and the first eight characters of that string are
    returned. The same input always yields the same identifier.
    """
    digest_hex = hashlib.md5(val.encode("UTF-8")).hexdigest()
    as_uuid = uuid.UUID(digest_hex)
    return f"{as_uuid}"[:8]

def _parse_rekt_article(article):
    """Extract one article record from an ``<article class="post">`` tag, or None if its markup is incomplete."""
    try:
        header = article.header
        return {
            'title': header.h5.a.text,
            'date': header.span.time.text,
            'tags': header.span.p.text.replace(" - ", ","),
            'summary': article.section.p.text,
        }
    except AttributeError:
        # BeautifulSoup returns None for a missing child tag, so any absent
        # element in the chains above surfaces here as AttributeError.
        return None


def scrape_rekt_news(base_url='https://rekt.news/', num_pages=28):
    """Scrape article listings from paginated index pages.

    Pages are requested as ``{base_url}?page=N`` for N = 0 .. num_pages - 1
    using a local Chrome WebDriver. Scraping stops early when a page shows no
    element with class ``post`` within 10 seconds, or when no
    ``<article class="post">`` tags are found in the page source.

    Args:
        base_url: Index URL to which the ``?page=N`` query is appended.
        num_pages: Maximum number of pages to visit.

    Returns:
        A list of dicts with the keys ``title``, ``date``, ``tags`` and
        ``summary``. Articles whose markup lacks any of the expected elements
        are skipped (with a printed notice) instead of aborting the run.
    """
    all_articles = []
    page = 0

    # Set up Selenium WebDriver (make sure you have the appropriate driver installed)
    driver = webdriver.Chrome()  # or webdriver.Firefox(), etc.

    try:
        while page < num_pages:
            url = f'{base_url}?page={page}'
            driver.get(url)

            # Wait for the articles to load
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "post"))
                )
            except TimeoutException:
                print(f"No more articles found on page {page}. Stopping.")
                break

            # Parse the page source with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            articles = soup.find_all('article', class_='post')

            if not articles:
                print(f"No more articles found on page {page}. Stopping.")
                break

            for article in articles:
                record = _parse_rekt_article(article)
                if record is None:
                    # One malformed card must not discard everything scraped so far.
                    print(f"Skipping malformed article on page {page}")
                    continue
                all_articles.append(record)

            print(f"Scraped page {page}")
            page += 1
            if page < num_pages:
                time.sleep(2)  # Be polite and avoid overwhelming the server

    finally:
        # Always release the browser, even when a page load or parse fails.
        driver.quit()

    return all_articles

# Run the scraper with its default arguments (base_url='https://rekt.news/',
# num_pages=28) and keep the returned list of article dicts for the cells below.
scraped_articles = scrape_rekt_news()

WebDriverException: Message: Service /home/seacevedo/.cache/selenium/chromedriver/linux64/128.0.6613.137/chromedriver unexpectedly exited. Status code was: 127


In [None]:
# Show the scraped records as this cell's output.
# NOTE(review): this renders the whole list; a slice may be easier to read.
scraped_articles

In [19]:
# Build a DataFrame from the list of article dicts held in scraped_articles.
rekt_df = pd.json_normalize(scraped_articles)

In [20]:
# Write the rekt.news articles to CSV (path is relative to the working directory).
# NOTE(review): index=False is not passed here, so the DataFrame index is written
# as an extra unnamed first column, whereas the web3isgoinggreat export further
# down does pass index=False. Confirm the difference is intended.
rekt_df.to_csv('../datasets/rekt_dataset.csv')

In [9]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.webdriver import WebDriver
from bs4 import BeautifulSoup
import time
import pandas as pd
from datetime import datetime, timedelta
from typing import List
from selenium.common.exceptions import WebDriverException
import uuid
import hashlib

def create_uuid_from_string(val):
    """Return an 8-character deterministic id computed from ``val``.

    ``val`` is UTF-8 encoded and hashed with MD5; the 16-byte digest is
    wrapped in a UUID and the leading eight characters of its string form
    are returned.
    """
    md5_digest = hashlib.md5(val.encode("UTF-8")).digest()
    full_id = str(uuid.UUID(bytes=md5_digest))
    return full_id[:8]

def is_date_in_current_week(date: datetime) -> bool:
    """Check whether ``date`` falls in the current Monday-to-Sunday week.

    The comparison is made on calendar dates only. Comparing full datetimes
    against ``datetime.today()`` would put the week boundaries at the current
    time of day, which wrongly rejects a Monday parsed at midnight and any
    Sunday time later than the current clock time.

    Args:
        date: The value to test. A ``datetime`` is reduced to its calendar
            date; a plain ``datetime.date`` is accepted as-is.

    Returns:
        True if the date lies between this week's Monday and Sunday, inclusive.
    """
    today = datetime.today().date()
    start_of_week = today - timedelta(days=today.weekday())  # weekday(): Monday == 0
    end_of_week = start_of_week + timedelta(days=6)

    day = date.date() if isinstance(date, datetime) else date
    return start_of_week <= day <= end_of_week

def setup_driver(remote_url: str = "http://127.0.0.1:4444/wd/hub") -> WebDriver:
    """Open a Chrome session on a remote Selenium server.

    Args:
        remote_url: Address of the Selenium server endpoint. Defaults to the
            local hub this notebook was written against.

    Returns:
        The connected remote WebDriver.

    Raises:
        WebDriverException: Re-raised after printing a short notice. The
            function used to fall through and return None here, which made
            the caller fail later with an unrelated AttributeError.
    """
    chrome_options = Options()
    # NOTE(review): these flags are commonly needed when Chrome runs inside a
    # container; confirm they are still required for the target server.
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    try:
        return webdriver.Remote(remote_url, options=chrome_options)
    except WebDriverException:
        print("Unable to connect to driver")
        raise

def scroll_to_bottom(driver: "WebDriver", max_scroll_height: int, scroll_pause: float = 2.0) -> None:
    """Scroll the page down repeatedly until it stops growing or a height cap is hit.

    Each iteration scrolls to the current bottom of the document, waits
    ``scroll_pause`` seconds, then re-reads ``document.body.scrollHeight``.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        max_scroll_height: Stop once the document height reaches this many
            pixels. A falsy value (0 or None) disables the cap.
        scroll_pause: Seconds to wait after each scroll so newly requested
            content can load. Defaults to the original fixed 2 seconds.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if max_scroll_height and new_height >= max_scroll_height:
            print(f"Reached maximum scroll height of {max_scroll_height} px")
            break
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was loaded.
            break
        last_height = new_height
        
def scrape_article_cards(driver: WebDriver, current_week_only: bool = False) -> List[dict]:
    """Extract the 'Hack'-tagged timeline cards from the page currently loaded in ``driver``.

    The page source is parsed with BeautifulSoup and every
    ``div.timeline-description`` card is inspected. A card is kept when its
    theme-tag text contains ``'Hack'``.

    Args:
        driver: WebDriver whose ``page_source`` is parsed.
        current_week_only: When True, additionally keep only cards whose date
            passes ``is_date_in_current_week``. Defaults to False, which
            keeps every 'Hack'-tagged card.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``date``
        (``YYYY-MM-DD``), ``summary`` and ``tags``. Cards missing an expected
        element, or whose date does not match ``"%B %d, %Y"``, are skipped
        with a printed notice instead of aborting the scrape.
    """
    articles = []
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    cards = soup.find_all('div', class_='timeline-description')

    for card in cards:
        title_node = card.find('h2')
        date_node = card.find('span', class_='timestamp')
        summary_node = card.find('div', class_='timeline-body-text-wrapper')
        tags_node = card.find('div', class_='tag-list theme')

        # find() returns None for a missing element; skip the card rather
        # than fail on ``None.text`` and lose every card parsed so far.
        if any(node is None for node in (title_node, date_node, summary_node, tags_node)):
            print("Skipping card with missing title, date, summary or theme tags")
            continue

        title = title_node.text.strip()
        tags = tags_node.text.replace('Theme tags: ', '').strip()

        # Filter first so rejected cards incur no date parsing or hashing.
        if 'Hack' not in tags:
            continue

        date_text = date_node.text.strip()
        try:
            date_object = datetime.strptime(date_text, "%B %d, %Y")
        except ValueError:
            print(f"Skipping card with unparseable date {date_text!r}: {title}")
            continue

        if current_week_only and not is_date_in_current_week(date_object):
            continue

        articles.append({
            # Named article_id locally to avoid shadowing the imported ``uuid`` module.
            'id': create_uuid_from_string(title),
            'title': title,
            'date': date_object.strftime("%Y-%m-%d"),
            'summary': summary_node.text.strip(),
            'tags': tags
        })

    return articles

def main(url: str = 'https://www.web3isgoinggreat.com/',
         max_scroll_height: int = 800000,
         output_path: str = '../datasets/web3isgoinggreat_dataset_1.csv') -> None:
    """Scrape the timeline at ``url``, save the articles to CSV and print them.

    Steps: open a driver, load the page, wait up to 10 seconds for the first
    ``timeline-description`` element, scroll to load more content, extract the
    cards, write them to ``output_path`` and print each article.

    Args:
        url: Page to scrape.
        max_scroll_height: Height cap in pixels passed to ``scroll_to_bottom``.
        output_path: Destination CSV file (written without the index column).

    Raises:
        RuntimeError: If ``setup_driver`` did not return a driver.
    """
    driver = setup_driver()
    if driver is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise RuntimeError("WebDriver could not be created; is the Selenium server running?")

    try:
        # Navigation happens inside the try so a failed page load still
        # releases the remote session in the finally clause.
        driver.get(url)

        # Wait for the first card to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "timeline-description"))
        )

        # Scroll to load all content
        scroll_to_bottom(driver, max_scroll_height)

        # Scrape the loaded content
        articles = scrape_article_cards(driver)

        df = pd.json_normalize(articles)

        df.to_csv(output_path, index=False)

        # Print the results
        for article in articles:
            print(f"Title: {article['title']}")
            print(f"Date: {article['date']}")
            print(f"Summary: {article['summary']}")
            print(f"Tags: {article['tags']}")
            print("---")

        print(f"Total articles scraped: {len(articles)}")

    finally:
        driver.quit()

# Run the scraper when this code executes as the top-level module.
if __name__ == "__main__":
    main()

Title: Victim loses over $32 million to wallet drainer
Date: 2024-09-28
Summary: A victim lost 12,083 spWETH tokens (~$32.4 million) after signing a malicious transaction stemming from someone using wallet drainer software. These drainers are "scam-as-a-service" products, where the drainer creators allow others to operate the drainer software in exchange for a 20% cut of stolen funds.The victim wallet sent a message to the thief, offering "a peaceful resolution to this situation" in which the thief could keep 20% of the total amount taken (around $6.5 million).
Tags: Hack or scam
---
Title: Bedrock staking platform loses $2 million after bug that allowed users to trade Bitcoin and Ethereum 1:1
Date: 2024-09-27
Summary: A staking platform called Bedrock lost around $2 million after exploiters discovered a bug that allowed them to swap 1 ETH for 1 BTC despite the more than $63,000 difference in prices for the two assets.A security firm working with Bedrock had tried to warn Bedrock of th

In [None]:
# NOTE(review): scratch expression, presumably leftover debugging. If the
# `from datetime import datetime` import from the scraper cell is in effect,
# `datetime` is the class itself and this attribute access raises AttributeError.
datetime.datetime