In [1]:
import requests
import orchest
import orchest.transfer
import time
import traceback
import re
import queue

import bs4
from bs4 import BeautifulSoup
from tqdm import tqdm
from lxml.html import fromstring
from itertools import cycle

In [2]:
# Root of the Hacker News site; every page URL below is built from it.
base_url = "https://news.ycombinator.com/"

# Step parameters read from Orchest: whether to follow "more" links past the
# first page, and which accounts to collect comments for.
all_pages = orchest.get_step_param("all_pages")
usernames = orchest.get_step_param("usernames")

In [3]:
# One record per requested account: the name plus the URL of its comment-threads page.
users = [
    dict(username=name, thread_url=base_url + "threads?id=" + name)
    for name in usernames
]

In [4]:
def get_proxies(timeout=10):
    """Scrape free-proxy-list.net and return the proxies it lists.

    Args:
        timeout: Seconds to wait for the proxy list page before giving up.

    Returns:
        A set of proxy URLs of the form "http://<ip>:<port>", one for every
        table row whose seventh column contains "yes" (presumably the site's
        HTTPS-support column -- confirm against the live page layout).

    Raises:
        requests.RequestException: If the page cannot be fetched within
            `timeout` or the server answers with an error status.
    """
    url = "https://free-proxy-list.net/"

    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page rather than parsing it into an empty set.
    response.raise_for_status()
    parser = fromstring(response.text)

    proxies = set()

    for row in parser.xpath("//tbody/tr"):
        if not row.xpath('.//td[7][contains(text(),"yes")]'):
            continue

        host = row.xpath(".//td[1]/text()")
        port = row.xpath(".//td[2]/text()")
        # Skip rows with an empty address or port cell instead of raising
        # IndexError and discarding every proxy collected so far.
        if not host or not port:
            continue

        proxies.add("http://" + ":".join([host[0], port[0]]))

    return proxies

In [5]:
# Orchest step parameter: whether page requests should be routed through
# the scraped proxy pool.
proxy_enabled = orchest.get_step_param("proxy_enabled")

In [6]:
# Always bind `proxy_pool` so later cells can pass it along even when
# proxying is switched off (it used to be undefined in that case, which
# raised NameError in the crawl loop).
proxy_pool = None

if proxy_enabled:
    proxies = get_proxies()

    # At least able to fetch a couple of proxies
    assert len(proxies) > 3
    print("Fetching with %d proxies" % len(proxies))

    # Endless round-robin over the scraped proxies.
    proxy_pool = cycle(proxies)

Fetching with 167 proxies


In [7]:
def request(url, proxy_enabled, proxy_pool, timeout=2):
    """Make a single best-effort GET request for `url`.

    Args:
        url: Address to fetch.
        proxy_enabled: When truthy, the request goes through the next entry
            drawn from `proxy_pool`.
        proxy_pool: Iterator yielding proxy URLs; only consulted (and advanced
            by one) when `proxy_enabled` is truthy.
        timeout: Timeout in seconds handed to `requests.get`.

    Returns:
        The response object, or None if the request raised any exception.
    """
    options = dict(timeout=timeout)

    if proxy_enabled:
        # The same proxy endpoint serves both schemes.
        endpoint = next(proxy_pool)
        options["proxies"] = dict(http=endpoint, https=endpoint)

    try:
        response = requests.get(url, **options)
    except Exception:
        # Deliberately swallowed: a failed attempt is reported to the caller
        # as None rather than as an exception.
        return None
    return response

In [8]:
def fetch_till_success(url, proxy_enabled, proxy_pool, tries=25):
    """Fetch `url` repeatedly until a response with status 200 arrives.

    Args:
        url: Address to fetch.
        proxy_enabled: Forwarded to `request`.
        proxy_pool: Forwarded to `request`.
        tries: Maximum number of attempts.

    Returns:
        The first response whose status code is 200, or None once `tries`
        attempts have been used up (a failure message is printed then).
    """
    for _ in range(tries):
        response = request(url, proxy_enabled, proxy_pool)
        # A None response (failed attempt) and any non-200 status both lead to a retry.
        if response is not None and response.status_code == 200:
            return response

    print(f"Failed to fetch {url} after {tries} tries.")
    return None

In [9]:
def find_comments(soup, username_filter):
    """Collect the comments on a parsed page that were written by one user.

    Args:
        soup: Parsed page; every element matching "tr.comtr" is treated as
            one comment row.
        username_filter: Only comments whose author equals this name are kept.

    Returns:
        A dict keyed by the row's `id` attribute. Each value holds the
        "username", "date", "permalink" and "body" of one comment. Rows that
        raise during extraction are reported with a printed message and
        skipped.

    Note:
        The permalink is built from the module-level `base_url`, and links
        inside the comment are rewritten in place, so `soup` is mutated.
    """
    found = {}

    for row in soup.select("tr.comtr"):
        try:
            author = row.select(".comhead a.hnuser")[0].text
            posted = row.select(".comhead .age")[0].get('title')
            comment_id = row.get('id')
            link = base_url + "item?id=" + comment_id

            comment_node = row.select(".comment")[0]

            # Replace <a> text with their href attribute content to avoid collapsed links
            for anchor in comment_node.select("a"):
                before = anchor.previousSibling
                # Exact type check on purpose: only plain text nodes are merged.
                if type(before) is not bs4.element.NavigableString:
                    continue

                merged = before.string + anchor.get('href')
                after = anchor.nextSibling
                if type(after) is bs4.element.NavigableString:
                    # Fold the trailing text into the same string as well.
                    merged += after.string
                    after.extract()
                before.string.replace_with(merged)
                anchor.extract()

            raw_text = comment_node.getText(separator='\n\n').strip()
            text = re.sub(' +', ' ', raw_text)

            if author == username_filter:
                found[comment_id] = dict(
                    username=author,
                    date=posted,
                    permalink=link,
                    body=text,
                )
        except Exception as e:
            print(f"Failed to extract comment {e}[{type(e)}]")

    return found

def find_more_link(soup, base_url):
    """Return the absolute URL of the page's "more" link, if it has one.

    Args:
        soup: Parsed page to search for an "a.morelink" element.
        base_url: Prefix prepended to the link's `href` attribute.

    Returns:
        `base_url` joined with the first match's `href`, or None when the
        page has no such link.
    """
    matches = soup.select("a.morelink")
    if not matches:
        return None
    return base_url + matches[0].get('href')

In [10]:
# Crawl each user's comment-threads page (and, when `all_pages` is set, the
# pages reached through "more" links), gathering that user's comments into
# `comment_store` keyed by comment id.
comment_store = {}

for user in tqdm(users):

    tqdm.write(f"Fetching users {user['username']}")

    # Pages still to be fetched for this user, seeded with the first threads page.
    url_queue = queue.Queue()
    url_queue.put(user["thread_url"])

    while not url_queue.empty():
        url = url_queue.get()
        # The conditional short-circuits, so `proxy_pool` is only read when
        # proxying is on; with proxying off it may not exist at all.
        resp = fetch_till_success(
            url, proxy_enabled, proxy_pool if proxy_enabled else None
        )
        if resp is None:
            # Page could not be fetched; nothing to parse and no "more" link to follow.
            continue

        soup = BeautifulSoup(resp.content, features="html.parser")
        comment_store.update(find_comments(soup, user["username"]))

        if all_pages:
            # Put more link in the queue if it's on the page
            more_link = find_more_link(soup, base_url)
            if more_link:
                url_queue.put(more_link)
                # tqdm.write keeps the progress bar intact; a bare print would garble it.
                tqdm.write(f"Adding {more_link}")

  0%|          | 0/1 [00:00<?, ?it/s]

                                     



  0%|          | 0/1 [00:00<?, ?it/s]

Fetching users ricklamers


100%|██████████| 1/1 [00:25<00:00, 25.81s/it]

100%|██████████| 1/1 [00:25<00:00, 25.81s/it]




In [11]:
# Write the collected comments out through Orchest's transfer API under the name "comments".
orchest.transfer.output_to_disk(comment_store, name="comments")