# Create Sam Altman Essays Dataset


In [1]:
import re
import requests
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup

In [2]:
# Root URL of the blog; the paginated listing URLs below are built from it.
base_url = "https://blog.samaltman.com/"

In [3]:
# Make sure the output directory exists before anything is written into it.
# Re-running the cell is harmless: an already existing directory is accepted.
dataset_dir = Path("./dataset")
dataset_dir.mkdir(exist_ok=True, parents=True)

## Extract Article URLs


In [4]:
# One listing URL per page number, for page numbers 0 through 24.
# NOTE(review): page numbering starts at 0 here; if the site treats page 0
# like page 1 the first listing is fetched twice -- confirm against the site.
urls = ["{}?page={}".format(base_url, page_number) for page_number in range(25)]

In [5]:
# Visit every listing page and collect the link of each post shown on it.
# `articles` ends up holding one URL per post, in listing order.
articles = []
for url in urls:
    # Without a timeout a single stalled connection would block the whole
    # run forever; fail after 30 seconds instead.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Each post on a listing page is an <article class="post"> element; the
    # first link inside it is taken as the URL of the post.
    articles.extend(
        post.find_all("a")[0]["href"]
        for post in soup.find_all("article", class_="post")
    )

## Process Articles


In [6]:
# Parallel lists: entry i of each list describes the i-th URL in `articles`.
article_dates = []
article_titles = []
article_texts = []

# Download every article page and record its date, title and cleaned body text.
for article in articles:
    # Without a timeout a single stalled connection would block the whole
    # run forever; fail after 30 seconds instead.
    response = requests.get(article, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    articles_div = soup.find_all("article", class_="post")[0]

    # Extract the date of the article. The `data-unix-time` attribute is
    # interpreted as whole seconds since the epoch; unit="s" converts it with
    # integer arithmetic instead of scaling to nanoseconds through a float.
    date_span = soup.find_all("span", class_="posthaven-formatted-date")[0]
    post_date = pd.to_datetime(int(date_span.attrs["data-unix-time"]), unit="s")
    article_dates.append(post_date)

    # Extract the title of the article (text of the first link in the post)
    post_title = articles_div.find_all("a")[0].text
    article_titles.append(post_title)

    # Collect the text-bearing elements of the post body.
    # NOTE(review): find_all searches recursively, so an element nested inside
    # another matched element (e.g. a <p> inside a <div>) would contribute its
    # text twice -- confirm whether the post markup ever nests these tags.
    post_body = articles_div.find_all("div", class_="posthaven-post-body")[0]
    elements = post_body.find_all(["p", "li", "div"])

    # Swap every <br> tag for a literal marker string so that explicit line
    # breaks can be told apart from newlines that are mere HTML formatting.
    for element in elements:
        for br in element.find_all("br"):
            br.replace_with("<br />")

    # Extract the text from each HTML element in the article
    fragments = []
    for element in elements:
        # Elements carrying the "posthaven-file" class contribute no text.
        if "posthaven-file" in element.attrs.get("class", []):
            fragments.append("")
            continue

        fragment = (
            element.text.replace("\n", " ")  # source newlines are not breaks
            .replace("<br />", "\n")  # markers become real line breaks
            .replace("\xa0", " ")  # non-breaking space -> plain space
        )
        # Collapse every run of spaces to one; a single replace("  ", " ")
        # pass would leave doubles behind for runs of three or more.
        fragments.append(re.sub(r" {2,}", " ", fragment))

    # Join the fragments as paragraphs, then squeeze any run of blank or
    # whitespace-only lines down to exactly one empty line.
    article_text = "\n\n".join(fragments)
    article_text = re.sub(r"(\s*\n\s*){2,}", "\n\n", article_text)
    article_texts.append(article_text)

## Save Dataset


In [7]:
# Assemble the three parallel lists into one DataFrame, one row per article

columns = {
    "title": article_titles,
    "text": article_texts,
    "date": article_dates,
}
df = pd.DataFrame(columns)

In [8]:
# Put the columns in their final order and label the index "id"

df = df.loc[:, ["title", "date", "text"]]
df.index.name = "id"

In [9]:
# Drop rows whose text repeats an earlier row, keeping the first occurrence

is_repeated_text = df.duplicated(subset=["text"], keep="first")
df = df[~is_repeated_text]

In [10]:
# Write the DataFrame to a CSV file inside the dataset directory

df.to_csv(path_or_buf="./dataset/sam_altman_essays.csv")