In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
pip install reportlab

Collecting reportlab
  Downloading reportlab-4.2.5-py3-none-any.whl.metadata (1.5 kB)
Downloading reportlab-4.2.5-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.2.5


In [3]:
# URL of the webpage to scrape
website_url = 'https://www.paulgraham.com/articles.html'

# Fetch the webpage content
response = requests.get(website_url)

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
# Display the parsed HTML content in a readable format
# print(soup.prettify())

In [6]:
# Find all table rows with 'valign' set to 'top'
table_rows = soup.find_all('tr', {'valign': 'top'})

In [7]:
# Get the number of table rows found
row_count = len(table_rows)

228

In [8]:
# Lists to store blog titles and links
blog_titles = []
blog_links = []

# Iterate through the table rows to extract blog titles and links
for i in range(2, 225):
    try:
        # Extract the title and link from the current row
        title = table_rows[i].find('a').text
        blog_titles.append(title)
        link = table_rows[i].find('a')['href']
        blog_links.append(link)
    except:
        # Skip rows that cause errors
        continue

In [9]:
# Get the lengths of the blog titles and links lists
titles_count, links_count = len(blog_titles), len(blog_links)

In [10]:
# Remove the last three items from the blog titles and links lists
del blog_titles[-3:]
del blog_links[-3:]

In [11]:
# Get the updated lengths of the blog titles and links lists
titles_count, links_count = len(blog_titles), len(blog_links)

In [12]:
titles_count, links_count

(220, 220)

In [13]:
# Lists to store essay content and titles
essay_content = []
essay_titles = []

# Iterate through the blog links to fetch essay content and titles
for link in blog_links:
    essay_url = f"https://www.paulgraham.com/{link}"
    try:
        # Fetch the essay page
        response = requests.get(essay_url)
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract the essay content
        content_rows = soup.find_all('tr', {'valign': 'top'})
        content = content_rows[1].find('font', {'face': 'verdana', 'size': '2'}).text.replace('\n', '').strip()
        essay_content.append(content)

        # Extract the essay title
        title_rows = soup.find_all('tr', {'valign': 'top'})
        img_tag = title_rows[1].find('img')
        title = img_tag['alt'] if img_tag and 'alt' in img_tag.attrs else np.nan
        essay_titles.append(title)

    except Exception as e:
        # Handle errors by appending NaN values
        essay_titles.append(np.nan)
        essay_content.append(np.nan)
        continue

In [14]:
# Get the lengths of the essay content and titles lists
content_count, titles_count = len(essay_content), len(essay_titles)
content_count, titles_count

(220, 220)

In [15]:
# Create a DataFrame to store essay details
df = pd.DataFrame({
    'Essay Name': blog_titles,
    'Essay Link': blog_links,
    'Essay Content': essay_content,
    'Title of Essay': essay_titles
})

In [16]:
df.sample(5)

Unnamed: 0,Essay Name,Essay Link,Essay Content,Title of Essay
120,What I've Learned from Hacker News,hackernews.html,February 2009Hacker News was two yearsold last...,What I've Learned from Hacker News
39,Being a Noob,noob.html,"January 2020When I was young, I thought old pe...",Being a Noob
61,The Ronco Principle,ronco.html,"January 2015No one, VC or angel, has invested ...",The Ronco Principle
207,Design and Research,desres.html,January 2003(This article is derived from a ke...,Design and Research
154,Is It Worth Being Wise?,wisdom.html,February 2007A few days ago I finally figured ...,Is It Worth Being Wise?


In [17]:
# Remove the 'title_of_essay' column from the DataFrame
df.drop(['Title of Essay'], axis=1, inplace=True)

In [18]:
# Access the content of the second essay in the DataFrame
essay_content = df['Essay Content'][1]

In [19]:
# Remove newline characters from the 'Essay Content' column
df['Essay Content'] = df['Essay Content'].str.replace('\n', '')

In [20]:
# Load the spaCy English language model for tokenization
import spacy
nlp = spacy.load("en_core_web_sm")

In [21]:
# Define a function to tokenize text using spaCy
def tokenize_spacy(texts, nlp_model):
    tokenized_texts = [nlp_model(text).text for text in texts]
    return tokenized_texts

# Tokenize the 'Essay Content' column and store the result
df['tokenized_text'] = df['Essay Content'].apply(lambda x: tokenize_spacy([x], nlp)[0])

In [22]:
# Access the tokenized text of the second essay
tokenized_essay = df['tokenized_text'][1]

In [23]:
# Convert the 'Essay Content' column to a list
essay_content_list = df['Essay Content'].to_list()

In [24]:
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak

# Function to create a PDF from essay titles and content
def create_pdf(essay_titles, essay_content, filename):
    # Initialize the PDF document
    doc = SimpleDocTemplate(filename, pagesize=letter)

    # List to hold the PDF content
    story = []

    # Get predefined styles
    styles = getSampleStyleSheet()

    # Iterate through essay titles and content
    for title, content in zip(essay_titles, essay_content):
        # Add the essay title
        title_paragraph = Paragraph(title, styles['Title'])
        story.append(title_paragraph)

        # Add a horizontal line below the title
        line = Paragraph("<hr/>", styles['Normal'])
        story.append(line)

        # Add spacing
        story.append(Spacer(1, 12))

        # Add the essay content
        content_paragraph = Paragraph(content, styles['Normal'])
        story.append(content_paragraph)

        # Add more spacing
        story.append(Spacer(1, 24))

        # Add a page break after each essay
        story.append(PageBreak())

    # Generate the PDF
    doc.build(story)

# Create the PDF file
create_pdf(blog_titles, essay_content_list, "book.pdf")