# Create Paul Graham Essays Dataset


In [2]:
import os
import re
import requests
import itertools
import pandas as pd
from pathlib import Path

## Process PG Essays txt

The raw text file containing all of the essays can be found here: https://github.com/dbredvick/paul-graham-to-kindle/blob/main/paul_graham_essays.txt


In [3]:
# Make sure both working directories exist before anything is written to them.
# Existing directories are left untouched.
for required_dir in ("./text_data", "./dataset"):
    Path(required_dir).mkdir(parents=True, exist_ok=True)

In [4]:
# Download the combined essays file into ./text_data.
url = "https://raw.githubusercontent.com/dbredvick/paul-graham-to-kindle/main/paul_graham_essays.txt"
# Bound the wait so a stalled connection cannot hang the notebook forever.
response = requests.get(url, timeout=60)
# Fail loudly on HTTP errors so an error page is never saved (and later
# parsed) as if it were essay text.
response.raise_for_status()
with open("./text_data/paul_graham_essays.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

In [5]:
# Load txt file with all essays

pg_essay_path = Path("./text_data/paul_graham_essays.txt")
# The file is written as UTF-8, so decode it as UTF-8 explicitly instead of
# relying on the platform's default encoding, which is not UTF-8 everywhere.
pg_essay_text = pg_essay_path.read_text(encoding="utf-8")

In [6]:
# Separate each essay

# Essays are delimited by top-level markdown headings ("\n# "). The first two
# chunks and the final chunk produced by the split are discarded.
heading_chunks = pg_essay_text.split("\n# ")
# Splitting consumed the "# " heading marker, so put it back on every essay.
essays = ["# " + chunk for chunk in heading_chunks[2:-1]]

In [7]:
# Data Cleaning

# Any paragraph that contains one of these substrings is dropped entirely.
filtered_strings = [
    "Translation](",
    "**Want to start a startup?** Get funded by [Y Combinator](http://ycombinator.com/apply.html).",
    "[](https://sep.yimg.com",
    "**Like to build things?** Try [Hacker News](http://news.ycombinator.com).",
    "Watch how this essay was [written](https://code.stypi.com/hacks/13sentences?doomed=true).",
    "[Comment](http://news.ycombinator.com",
    "[](http://reddit.com)[Comment](http://reddit.com",
    "the mere consciousness of an engagement will sometimes worry a whole day",
    "� Charles Dickens",
]

for idx, raw_essay in enumerate(essays):
    # Strip horizontal-rule markers and empty index links, then outer whitespace.
    cleaned = raw_essay.replace("* * *", "").replace("[](index.html)", "").strip()
    # Collapse every run of two or more line breaks (with any surrounding
    # whitespace) into exactly one blank line, so paragraphs split on "\n\n".
    cleaned = re.sub(r"(\s*\n\s*){2,}", "\n\n", cleaned)
    kept_paragraphs = [
        paragraph
        for paragraph in cleaned.split("\n\n")
        if not any(marker in paragraph for marker in filtered_strings)
    ]
    # Normalise the paragraph breaks once more after filtering.
    essays[idx] = re.sub(r"(\s*\n\s*){2,}", "\n\n", "\n\n".join(kept_paragraphs))

In [8]:
# Save each essay

for i, essay in enumerate(essays):
    output_path = Path(f"./text_data/{i}.txt")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(essay)

In [9]:
# Delete the original txt file after processing

os.remove("./text_data/paul_graham_essays.txt")

## Create Raw Dataset


In [10]:
# Loop through each text file, adding them to the dataset

dataset = []
collection_path = Path("./text_data")
txt_files = collection_path.glob("*.txt")

collection_data = []
for file in txt_files:
    text = file.read_text()

    # Change dataset columns here
    file_data = {
        "text": text,
    }

    dataset.append(file_data)

# Convert data to pandas DataFrame
df = pd.DataFrame(dataset)

## Clean Dataset


In [11]:
# Date helpers and constants

months = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]
years = [str(year_number) for year_number in range(1900, 2032)]

# Every "Month Year" string (all years for January, then February, ...),
# followed by the bare year strings.
dates = [f"{m} {y}" for m, y in itertools.product(months, years)]
dates += years

# Assign a unique index to each date capturing their relative ordering.
# Each year owns a band of 15 consecutive integers: offset 0 is the bare year
# and offsets 1-12 are its months, so a bare year ranks just before January of
# that year and after December of the previous one.
dates_indices = {}
for month_number, month_name in enumerate(months, start=1):
    for year_text in years:
        year_base = int(year_text) * 15
        dates_indices[f"{month_name} {year_text}"] = year_base + month_number
        dates_indices[year_text] = year_base


def latest_date(dates):
    """Return the chronologically latest entry from a list of date strings.

    Entries are ranked by their value in ``dates_indices``, so each one is
    expected to be either "Month Year" or a bare "Year" string known to that
    table. Unknown strings rank as ``None``.
    """

    def chronological_rank(date_string):
        return dates_indices.get(date_string)

    return max(dates, key=chronological_rank)

In [12]:
# Loop through each essay to extract the title and date while cleaning remaining text

# Compiled once: the same paragraph-break normalisation runs twice per essay.
blank_line_runs = re.compile(r"(\s*\n\s*){2,}")
# Set for fast exact-match lookups; the `dates` list is still used for the
# substring scan further down.
known_dates = set(dates)

for i in range(len(df)):
    df.iloc[i, 0] = blank_line_runs.sub("\n\n", df.iloc[i, 0]).strip()
    sentences = df.iloc[i, 0].split("\n\n")

    # The date is expected in one of the two paragraphs right after the title.
    # Very short essays may not have them, so fall back to "" (never a valid
    # date) instead of raising IndexError.
    second = sentences[1] if len(sentences) > 1 else ""
    third = sentences[2] if len(sentences) > 2 else ""

    # Extract and remove date
    if second in known_dates:
        df.loc[i, "date"] = second
        sentences.pop(1)
    elif third in known_dates:
        df.loc[i, "date"] = third
        sentences.pop(2)
    else:
        # A revised/corrected essay mentions several dates in one paragraph;
        # collect every known date string that occurs in it.
        mentioned_dates = []
        if "rev" in second or "corrected" in second:
            mentioned_dates = [d for d in dates if d in second]

        if len(mentioned_dates) > 1:
            # Keep the most recent of the mentioned dates.
            df.loc[i, "date"] = latest_date(mentioned_dates)
            sentences.pop(1)
        else:
            df.loc[i, "date"] = None

    # Rejoin remaining sentences, excluding title
    df.iloc[i, 0] = "\n\n".join(sentences[1:])

    # Extract and remove title
    df.loc[i, "title"] = sentences[0].replace("# ", "")

    # Remove extra whitespace
    df.iloc[i, 0] = blank_line_runs.sub("\n\n", df.iloc[i, 0]).strip()

In [13]:
# Handle date edge case

# Rows whose date is only the bare year "1993" get the month-level date
# "September 1993" instead.
bare_1993_rows = df["date"] == "1993"
df.loc[bare_1993_rows, "date"] = "September 1993"

In [14]:
# Name the index and put the columns in their final order

column_order = ["title", "date", "text"]
df = df.loc[:, column_order]
# The row index is labelled "id".
df.index.name = "id"

In [15]:
# Remove duplicates

# Two rows count as duplicates when their essay text is identical; only the
# first occurrence is kept.
dedupe_columns = ["text"]
df = df.drop_duplicates(subset=dedupe_columns, keep="first")

In [16]:
# Save dataset

# NOTE(review): the file name spells "pual" rather than "paul". It looks like
# a typo, but it is kept as-is in case other code expects this exact path;
# confirm before renaming.
output_csv = "./dataset/pual_graham_essays.csv"
df.to_csv(output_csv)