In [6]:
import requests
import pandas as pd

# Base URL and initial parameters for pagination
base_url = 'https://www.lennysnewsletter.com/api/v1/archive'
params = {
    'sort': 'new',
    'search': '',
    'offset': 0,
    'limit': 12,
    'type': 'podcast',
    'rss_episodes_only': 'true'
}

# Headers for the request (with a sample user-agent and necessary headers)
headers = {
    'accept': '*/*',
    'accept-language': 'en-US,en;q=0.9',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    'referer': 'https://www.lennysnewsletter.com/podcast/archive?sort=new'
}

# List to store all results
all_podcasts = []

# Loop to handle pagination
while True:
    try:
        # Make the request
        response = requests.get(base_url, headers=headers, params=params)
        response.raise_for_status()  # Ensure the request was successful
        data = response.json()

        # Check if data is a list
        if not isinstance(data, list):
            print("Unexpected response format, expected a list.")
            break

        # Break the loop if no more data is returned
        if not data:
            break

        # Extract relevant data and store it
        for item in data:
            # Failsafe mechanism for transcripts
            podcast_upload = item.get('podcastUpload')
            transcript_url = 'N/A'  # Default value

            if podcast_upload and isinstance(podcast_upload, dict):
                transcription = podcast_upload.get('transcription', {})
                if transcription and isinstance(transcription, dict):
                    transcript_url = transcription.get('cdn_url', 'N/A')
            
            all_podcasts.append({
                'Title': item.get('title', 'N/A'),
                'URL': item.get('canonical_url', 'N/A'),
                'Transcript CDN URL': transcript_url
            })

        # Update the offset for the next page
        params['offset'] += params['limit']

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        break
    except Exception as e:
        print(f"An error occurred: {e}")
        break

# Convert results to a DataFrame
df = pd.DataFrame(all_podcasts)

# Display the DataFrame
print(df)

# Optionally save to a CSV
# df.to_csv('podcasts_with_transcripts.csv', index=False)


                                                 Title  \
0    Becoming an AI PM | Aman Khan (Arize AI, ex-Sp...   
1    Everything you’ve ever wanted to know about SA...   
2    Breaking the rules of growth: Why Shopify bans...   
3    Just evil enough: Subversive marketing strateg...   
4    4 questions Shreyas Doshi wishes he’d asked hi...   
..                                                 ...   
216  Gokul Rajaram on designing your product develo...   
217  April Dunford on product positioning, segmenta...   
218  Shreyas Doshi on pre-mortems, the LNO framewor...   
219  Julie Zhuo on accelerating your career, impost...   
220                         Welcome to my new podcast!   

                                                   URL  \
0    https://www.lennysnewsletter.com/p/becoming-an...   
1    https://www.lennysnewsletter.com/p/product-own...   
2    https://www.lennysnewsletter.com/p/shopifys-gr...   
3    https://www.lennysnewsletter.com/p/just-evil-e...   
4    https://

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
import random

# Function to extract <p> tags and keep all text except what's inside <a> tags
def extract_text_and_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the specific div containing the content
        content_div = soup.find('div', class_='available-content')
        if not content_div:
            return []  # Return empty if the div is not found

        # Find all <p> tags inside the "available-content" div
        p_tags = content_div.find_all('p')
        extracted_data = []

        # Iterate over each <p> tag and clean the text
        for p_tag in p_tags:
            # Check if the <p> tag also contains <a> tags for extracting hrefs
            # a_tag = p_tag.find('a', href=True)

            # Remove all <a> tags from the <p> tag
            for a_tag in p_tag.find_all('a'):
                a_tag.decompose()
            
            # Get the cleaned text from the <p> tag
            cleaned_text = p_tag.get_text(separator=' ').strip()

            
            if cleaned_text and a_tag:
                href_value = a_tag['href']
                extracted_data.append({'Span Value': cleaned_text, 'Link': href_value})

        return extracted_data
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        return []

# Sample DataFrame (assume 'df' is your DataFrame with columns 'Title' and 'URL')
# df = pd.DataFrame({'Title': ['Podcast 1', 'Podcast 2'], 'URL': ['https://example.com/page1', 'https://example.com/page2']})

# Iterate over the DataFrame and extract data with a random delay
extracted_data_list = []
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing URLs"):
    url = row['URL']
    podcast_title = row['Title']
    
    # Extract data from the current URL
    references = extract_text_and_links(url)
    extracted_data_list.append({
        'Podcast Title': podcast_title,
        'Podcast URL': url,
        'References': references
    })

    # print(references)
    
    # Wait for a random amount of time between 1 and 5 seconds
    time_to_wait = random.randint(1, 2)
    time.sleep(time_to_wait)

# Display the collected data
for item in extracted_data_list:
    print(item)

# Optionally convert to DataFrame and save to CSV if needed
# extracted_df = pd.DataFrame(extracted_data_list)
# extracted_df.to_csv('extracted_podcast_references.csv', index=False)


Processing URLs:   0%|          | 0/180 [00:00<?, ?it/s]

Error processing URL https://www.lennysnewsletter.com/p/becoming-an-ai-pm-aman-khan: can only concatenate str (not "NoneType") to str


Processing URLs:   1%|          | 1/180 [00:01<03:19,  1.11s/it]

Error processing URL https://www.lennysnewsletter.com/p/product-owners-melissa-perri: can only concatenate str (not "NoneType") to str


Processing URLs:   1%|          | 1/180 [00:02<07:41,  2.58s/it]


KeyboardInterrupt: 

In [118]:
# Function to filter references to only include those with "amazon" or "goodreads" in the link
def filter_amazon_goodreads_references(references):
    return [
        item for item in references
        if 'amazon' in item['Link'] or 'goodreads/book/show/' in item['Link']
    ]

# Iterate over each row in extracted_data_list and apply the filtering
for item in extracted_data_list:
    item['References Clean'] = filter_amazon_goodreads_references(item['References'])

# Display the final filtered data
for item in extracted_data_list:
    print(item)

# Optionally convert to DataFrame and save to CSV if needed
# final_filtered_df = pd.DataFrame(extracted_data_list)
# final_filtered_df.to_csv('final_filtered_podcast_references.csv', index=False)


{'Podcast Title': 'Becoming an AI PM | Aman Khan (Arize AI, ex-Spotify, Apple, Cruise)', 'Podcast URL': 'https://www.lennysnewsletter.com/p/becoming-an-ai-pm-aman-khan', 'References': [], 'References Clean': []}
{'Podcast Title': 'Everything you’ve ever wanted to know about SAFe and the product owner role | Melissa Perri (author, founder of Product Institute)', 'Podcast URL': 'https://www.lennysnewsletter.com/p/product-owners-melissa-perri', 'References': [], 'References Clean': []}
{'Podcast Title': 'Just evil enough: Subversive marketing strategies for startups | Alistair Croll (author, advisor, entrepreneur)', 'Podcast URL': 'https://www.lennysnewsletter.com/p/just-evil-enough-alistair-croll', 'References': [], 'References Clean': []}
{'Podcast Title': '4 questions Shreyas Doshi wishes he’d asked himself sooner | Former PM leader at Stripe, Twitter, Google', 'Podcast URL': 'https://www.lennysnewsletter.com/p/shreyas-doshi-live', 'References': [], 'References Clean': []}
{'Podcast Ti

In [119]:
# Convert extracted_data_list to a DataFrame with one row per reference
flattened_data = []

for item in extracted_data_list:
    podcast_title = item['Podcast Title']
    podcast_url = item['Podcast URL']
    
    for reference in item['References Clean']:
        flattened_data.append({
            'Episode Title': podcast_title,
            'Episode URL': podcast_url,
            'Reference Title': reference['Span Value'],
            'Reference Link': reference['Link']
        })

# Create a DataFrame from the flattened data
references_df = pd.DataFrame(flattened_data)

# Display the DataFrame
print(references_df)

# Optionally save to CSV
# references_df.to_csv('podcast_references_expanded.csv', index=False)


Empty DataFrame
Columns: []
Index: []


In [91]:
import requests
from openai import OpenAI
from pydantic import BaseModel
import pandas as pd
from tqdm import tqdm
import concurrent.futures
import json

# Initialize OpenAI client
client = OpenAI()

# Define response models using Pydantic for book validation
class BookValidationResponse(BaseModel):
    is_real_book: bool

class Book(BaseModel):
    title: str
    author: str

# Function to call OpenAI for book extraction
def extract_book_references(transcript_text):
    prompt = (
        "This is a podcast transcript text. Your goal is to find any book references and store them as a JSON list of books. "
        "Each book should have a 'title' and 'author'. If the author is not available, leave it blank. "
        f"This is the text: {transcript_text}"
    )
    
    try:
        response = client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format=BookValidationResponse
        )
        return response.choices[0].message.parsed.books
    except Exception as e:
        return [{"error": str(e)}]

# Function to call Serper API to retrieve the Goodreads URL
def get_goodreads_url(title):
    url = "https://google.serper.dev/search"
    payload = json.dumps({"q": f"{title} book url goodreads"})
    headers = {
        'X-API-KEY': '46e6377865b21659da0a212efadbadf2129740f5',
        'Content-Type': 'application/json'
    }
    
    try:
        response = requests.request("POST", url, headers=headers, data=payload)
        data = response.json()
        
        for result in data.get('organic', []):
            if 'https://www.goodreads.com/book/show/' in result['link'] and title in result['title']:
                return result['link']
        
    except Exception as e:
        print(f"Error fetching Goodreads URL: {e}")
        return ""

# Function to process each book entry
def process_book_entry(args):
    index, episode_title, episode_url, book = args
    print(book)
    title = book.title

    # Get the Goodreads URL using Serper
    book_url = get_goodreads_url(title)
    
    return {
        'Index': index,  # Include index for later sorting
        'Title': episode_title,
        'URL': episode_url,
        'Book Title': title,
        'Book URL': book_url
    }

# Filter DataFrame to remove rows with empty or invalid 'Cleaned Transcript'
df = df[~df['Cleaned Transcript'].isin(["", "403 Forbidden - Access Denied", "Failed to fetch transcript", "Error in transcript format"])]

# Apply the function in parallel using ThreadPoolExecutor to extract book references
def extract_books_in_parallel(df):
    results_map = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
        futures = {executor.submit(extract_book_references, row['Cleaned Transcript']): idx for idx, row in df.iterrows()}
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Extracting books"):
            idx = futures[future]
            try:
                results_map[idx] = future.result()
            except Exception as e:
                results_map[idx] = [{"error": str(e)}]
    return results_map




In [None]:
# Extract books and store results in the DataFrame
results_map = extract_books_in_parallel(df)

In [92]:
for idx, result in results_map.items():
    df.at[idx, 'Book References'] = result

# Prepare arguments for parallel processing of book entries
book_entries = []
for index, row in df.iterrows():
    episode_title = row['Title']
    episode_url = row['URL']
    book_references = row['Book References']  # Assuming this is a list of dictionaries
    
    for book in book_references:
        book_entries.append((index, episode_title, episode_url, book))

# Use ThreadPoolExecutor for parallel processing of book verification and URL retrieval
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
    results = list(tqdm(executor.map(process_book_entry, book_entries), total=len(book_entries), desc="Processing books"))

# Convert results to DataFrame and sort by 'Index' to ensure mapping integrity
verified_books_df = pd.DataFrame(results)
verified_books_df.sort_values(by='Index', inplace=True)
verified_books_df.drop(columns=['Index'], inplace=True)

# Save to CSV
verified_books_df.to_csv('verified_books.csv', index=False)

print("CSV file 'verified_books.csv' has been created.")


{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookValidationResponse' object has no attribute 'books'"}
{'error': "'BookVali

Processing books:   0%|          | 0/180 [00:00<?, ?it/s]


AttributeError: 'dict' object has no attribute 'title'

In [84]:
# Filter out rows where the book is not real, the author is empty, or the book title is empty
filtered_books_df = verified_books_df[
    # (verified_books_df['Is Real Book']) &
    (verified_books_df['Book Title'].str.strip() != '')
]

# Save the filtered DataFrame to a new CSV file
filtered_books_df.to_csv('filtered_verified_books.csv', index=False)

print("CSV file 'filtered_verified_books.csv' has been created with only valid book entries.")


CSV file 'filtered_verified_books.csv' has been created with only valid book entries.
