Analyzing the responses to a Hacker News thread: "Who is looking for a job?"

Used Gemini; had to iterate twice:
- escape the html properly
- specify the fields explicitly

Lots more could be done, but this was pretty straightforward for the output here!

In [None]:
import requests
import re
from collections import defaultdict
import html # For unescaping HTML entities
from bs4 import BeautifulSoup # For parsing HTML

In [None]:
def get_top_item_ids(num_items=500, timeout=10):
    """Fetch the IDs of the current top stories from Hacker News.

    Args:
        num_items: Maximum number of IDs to return, taken from the start of
            the list the API sends back.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The first ``num_items`` entries of the decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://hacker-news.firebaseio.com/v0/topstories.json"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()[:num_items]

def get_item_details(item_id, timeout=10):
    """Fetch the details of a single item (story or comment) by its ID.

    Args:
        item_id: Identifier interpolated into the item endpoint URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response. Callers in this file guard
        against a falsy result before indexing into it.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

def find_who_is_hiring_thread_id(top_item_ids):
    """Return the ID of the first story whose title names a hiring thread.

    Each ID is looked up in order via ``get_item_details``; the first item
    whose title contains "Who is hiring?" or "Who wants to be hired?" is
    announced on stdout and its ID returned. Returns ``None`` when no item
    matches (including when ``top_item_ids`` is empty).
    """
    wanted_phrases = ("Who is hiring?", "Who wants to be hired?")

    for candidate_id in top_item_ids:
        story = get_item_details(candidate_id)
        # Skip missing items and items that carry no title at all.
        if not story or 'title' not in story:
            continue

        headline = story['title']
        if any(phrase in headline for phrase in wanted_phrases):
            print(f"Found thread: '{headline}' with ID: {candidate_id}")
            return candidate_id

    return None

def extract_links_from_text(text):
    """
    Classify the <a href> links found in a comment's HTML.

    HTML entities in ``text`` are unescaped, the result is parsed with
    BeautifulSoup, and every anchor that has an href is sorted into one of
    three buckets:

    - GitHub: the href contains "github.com" (the last such link wins).
    - Resume: the lower-cased href contains "resume", "cv", ".pdf", ".doc"
      or ".docx" (the last such link wins).
    - Website: anything else; only the first such link is kept.

    Returns a ``(website, resume, github)`` tuple; a slot is ``None`` when no
    link fell into that bucket.
    """
    resume_markers = ("resume", "cv", ".pdf", ".doc", ".docx")
    website = resume = github = None

    markup = BeautifulSoup(html.unescape(text), 'html.parser')

    for anchor in markup.find_all('a', href=True):
        href = anchor['href']

        # The GitHub check is case-sensitive and takes priority over the rest.
        if "github.com" in href:
            github = href
            continue

        lowered = href.lower()
        if any(marker in lowered for marker in resume_markers):
            resume = href
        elif website is None:
            website = href

    return website, resume, github

def parse_structured_info(comment_text):
    """
    Parse the labelled "Field: value" lines of a comment.

    The text is HTML-unescaped, split into lines on ``<p>`` tags (in any
    letter case) and newlines, and each line of the form ``Label: value`` is
    matched against the known labels. Label matching ignores letter case and
    whitespace around the label, so variants such as "Willing to Relocate:"
    or "location :" are recognised. If a label occurs more than once, the
    last occurrence wins.

    Args:
        comment_text: Raw comment HTML. ``None`` or an empty string yields a
            result with every field unset.

    Returns:
        A dict with the keys 'Location', 'Remote', 'Willing to relocate',
        'Technologies', 'Résumé/CV' and 'Email'. Values are the stripped text
        after the first colon, or ``None`` when the label was not found.
        'Résumé/CV' is always ``None`` here: the résumé URL is taken from the
        comment's <a> tags by the link extraction instead.
    """
    data = {
        'Location': None,
        'Remote': None,
        'Willing to relocate': None,
        'Technologies': None,
        'Résumé/CV': None,
        'Email': None,
    }

    if not comment_text:
        return data

    # Labels whose value is captured, keyed by their case-folded spelling so
    # the lookup below is case-insensitive. 'Résumé/CV' is deliberately absent.
    captured_labels = {
        name.casefold(): name
        for name in ('Location', 'Remote', 'Willing to relocate',
                     'Technologies', 'Email')
    }

    # Unescape entities first, as the structure itself may contain them.
    unescaped_text = html.unescape(comment_text)
    lines = re.split(r'(?:<p>|\n)', unescaped_text, flags=re.IGNORECASE)

    for line in lines:
        # Only the first colon separates label from value, so values that
        # themselves contain colons (e.g. URLs) are kept intact.
        label, separator, value = line.strip().partition(':')
        if not separator:
            continue

        field = captured_labels.get(label.strip().casefold())
        if field is not None:
            data[field] = value.strip()

    return data

In [None]:
def analyze_hacker_news_thread(thread_id):
    """
    Analyze the top-level comments of a Hacker News thread.

    Every ID listed under the thread's 'kids' is fetched; comments that are
    missing or have no 'text' are skipped. For each remaining comment the
    structured fields and links are extracted, and a summary (link shares,
    top 10 locations, top 15 technologies) is printed to stdout.

    Args:
        thread_id: ID of the thread item to analyze.

    Returns:
        A list with one dict per analyzed comment (keys: 'id', 'text',
        'website', 'resume_link', 'github_repo', 'parsed_location',
        'parsed_remote', 'parsed_relocate', 'parsed_technologies',
        'parsed_email'), or ``None`` when the thread is missing or has no
        'kids' entry.
    """
    thread_details = get_item_details(thread_id)
    if not thread_details or 'kids' not in thread_details:
        print("Could not find comments for this thread.")
        return

    job_responses = []
    total_responses = 0
    with_website = 0
    with_resume_link = 0
    with_github_repo = 0

    location_counts = defaultdict(int)
    technology_counts = defaultdict(int)

    for comment_id in thread_details['kids']:
        comment = get_item_details(comment_id)
        if not comment or 'text' not in comment:
            continue

        total_responses += 1
        comment_text = comment['text']

        # Extract structured info (Location, Technologies, ...)
        structured_info = parse_structured_info(comment_text)

        # Extract links (Website, Resume, Github)
        website, resume_link, github_repo = extract_links_from_text(comment_text)

        if website:
            with_website += 1
        if resume_link:
            with_resume_link += 1
        if github_repo:
            with_github_repo += 1

        # Both fields are comma-separated lists; tally each non-empty entry
        # under its lower-cased spelling so case variants are merged.
        for field, counts in (('Location', location_counts),
                              ('Technologies', technology_counts)):
            raw_value = structured_info[field]
            if not raw_value:
                continue
            for entry in raw_value.split(','):
                entry = entry.strip()
                if entry:
                    counts[entry.lower()] += 1

        job_responses.append({
            'id': comment_id,
            'text': comment_text,
            'website': website,
            'resume_link': resume_link,
            'github_repo': github_repo,
            'parsed_location': structured_info['Location'],
            'parsed_remote': structured_info['Remote'],
            'parsed_relocate': structured_info['Willing to relocate'],
            'parsed_technologies': structured_info['Technologies'],
            'parsed_email': structured_info['Email']
        })

        # TODO: for comments without a GitHub link, the website/resume could
        # be fetched and searched for one; not implemented.

    def _share(count):
        # A thread can list 'kids' yet yield no comment with text; report 0%
        # in that case instead of dividing by zero.
        return count / total_responses if total_responses else 0.0

    print(f"\n--- Analysis Summary ({total_responses} responses) ---")
    print(f"Responses with website: {with_website} ({_share(with_website):.2%})")
    print(f"Responses with resume link: {with_resume_link} ({_share(with_resume_link):.2%})")
    print(f"Responses with GitHub repo: {with_github_repo} ({_share(with_github_repo):.2%})")

    print("\n--- Top Locations ---")
    sorted_locations = sorted(location_counts.items(), key=lambda item: item[1], reverse=True)
    for loc, count in sorted_locations[:10]:  # Print top 10
        print(f"{loc.title()}: {count}")  # Capitalize for display

    print("\n--- Top Technologies ---")
    sorted_technologies = sorted(technology_counts.items(), key=lambda item: item[1], reverse=True)
    for tech, count in sorted_technologies[:15]:  # Print top 15
        print(f"{tech.title()}: {count}")

    return job_responses



In [None]:
# --- Main execution ---
if __name__ == "__main__":
    # Script entry point: locate the hiring thread among the top stories and,
    # if one is found, run the comment analysis on it.
    print("Fetching top Hacker News stories...")

    # Ask for a large batch of IDs to raise the odds that the thread is in it.
    top_ids = get_top_item_ids(num_items=500)
    who_is_hiring_thread_id = find_who_is_hiring_thread_id(top_ids)

    if not who_is_hiring_thread_id:
        print("Could not find a 'Who is hiring?' or 'Who wants to be hired?' thread among the top stories.")
    else:
        print(f"Analyzing thread ID: {who_is_hiring_thread_id}")
        # job_data holds the per-comment records for any further processing
        # (e.g. saving to CSV or deeper analysis).
        job_data = analyze_hacker_news_thread(who_is_hiring_thread_id)