In [1]:
# line_count.py
import sys

count = 0
for line in sys.stdin:
    count += 1

# print goes to sys.stdout
print(count)

0


In [2]:
def get_domain(email_address: str) -> str:
    """Split on '@' and return the last piece"""
    return email_address.lower().split("@")[-1]

# a couple of tests
assert get_domain('joelgrus@gmail.com') == 'gmail.com'
assert get_domain('joel@m.datasciencester.com') == 'm.datasciencester.com'

In [2]:
from bs4 import BeautifulSoup
import requests

url = "https://www.house.gov/representatives"
text = requests.get(url).text
soup = BeautifulSoup(text, "html5lib")

all_urls = [a['href']
            for a in soup('a')
            if a.has_attr('href')]

print(len(all_urls))  # 966 for me, way too many

966


In [3]:
import re

# Must start with http:// or https://
# Must end with .house.gov or .house.gov/
regex = r"^https?://.*\.house\.gov/?$"

# Let's write some tests!
assert re.match(regex, "http://joel.house.gov")
assert re.match(regex, "https://joel.house.gov")
assert re.match(regex, "http://joel.house.gov/")
assert re.match(regex, "https://joel.house.gov/")
assert not re.match(regex, "joel.house.gov")
assert not re.match(regex, "http://joel.house.com")
assert not re.match(regex, "https://joel.house.gov/biography")

# And now apply
good_urls = [url for url in all_urls if re.match(regex, url)]

print(len(good_urls))  # still 874 for me

874


In [4]:
good_urls = list(set(good_urls))
print(len(good_urls))

437


In [5]:
from typing import Dict, Set

press_releases: Dict[str, Set[str]] = {}

for house_url in good_urls:
    html = requests.get(house_url).text
    soup = BeautifulSoup(html, 'html5lib')
    pr_links = {a['href'] for a in soup('a') if 'press releases'
                                             in a.text.lower()}
    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://lamborn.house.gov/: {'/media/press-releases'}
https://waters.house.gov: {'/media-center/press-releases'}
https://buck.house.gov/: {'/media-center/press-releases'}
https://stewart.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://scalise.house.gov/: {'/media/press-releases'}
https://susielee.house.gov: {'/media/press-releases'}
https://kevinbrady.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=2657'}
https://gosar.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://ruiz.house.gov: {'/media-center/press-releases'}
https://fulcher.house.gov/: {'/press-releases'}
https://correa.house.gov: {'/news'}
https://roy.house.gov: {'/media/press-releases'}
https://carter.house.gov/: {'/press-releases/'}
https://cohen.house.gov/: {'/media-center/press-releases'}
https://morgangriffith.house.gov/: {'/News/DocumentQuery.aspx?DocumentTypeID=2235'}
https://arrington.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://mikelevin.house.gov: {'/media

https://wenstrup.house.gov: {'/news/documentquery.aspx?DocumentTypeID=2491'}
https://velazquez.house.gov: {'/media-center/press-releases'}
https://sablan.house.gov/: set()
https://nunes.house.gov/: {'/News/DocumentQuery.aspx?DocumentTypeID=2133'}
https://jeffduncan.house.gov/: {'/media/press-releases'}
https://dean.house.gov: {'/media/press-releases'}
https://dustyjohnson.house.gov/: {'/media/press-releases'}
https://gottheimer.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://keller.house.gov: {'/media/press-releases'}
https://reed.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://teddeutch.house.gov/: set()
https://comer.house.gov/: {'/media/press-releases'}
https://plaskett.house.gov/: {'/news/documentquery.aspx?documenttypeid=27'}
https://norton.house.gov/: {'/media-center/press-releases'}
https://mikegarcia.house.gov/: {'/media/press-releases'}
https://dunn.house.gov: {'/press-releases'}
https://lee.house.gov/: set()
https://hayes.house.gov: {'/med

https://scottpeters.house.gov: {'/media-center/press-releases'}
https://frankel.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://timryan.house.gov/: {'/media/press-releases'}
https://horn.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://flores.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=2467'}
https://wassermanschultz.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://rutherford.house.gov: {'/media/press-releases'}
https://napolitano.house.gov/: {'/media/press-releases'}
https://case.house.gov/: {'/news/documentsingle.aspx?DocumentID=321', '/news/documentquery.aspx?DocumentTypeID=27'}
https://wexton.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://lujan.house.gov/: {'https://lujan.house.gov/media-center/press-releases'}
https://clyburn.house.gov/: {'/press-releases'}
https://sherrill.house.gov/: {'/media/press-releases'}
https://gonzalez-colon.house.gov: {'/media/press-releases'}
https://thompson.house.gov: 

https://norcross.house.gov: set()
https://jhb.house.gov/: {'/News/DocumentQuery.aspx?DocumentTypeID=2113'}
https://jacksonlee.house.gov/: {'/media-center/press-releases'}
https://cartwright.house.gov: {'/media-center/press-releases'}
https://neal.house.gov/: {'/press-releases'}
https://fudge.house.gov/: set()
https://defazio.house.gov/: {'/media-center/press-releases'}
https://crenshaw.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://rouzer.house.gov/: {'/press-releases'}
https://lofgren.house.gov/: {'/media/press-releases'}
https://brooks.house.gov/: set()
https://vargas.house.gov: {'/media-center/press-releases'}
https://craig.house.gov: {'/media/press-releases'}
https://algreen.house.gov: {'/press-releases'}
https://steil.house.gov: {'/media/press-releases'}
https://lamb.house.gov: {'/media/press-releases'}
https://mcnerney.house.gov/: {'/media-center/press-releases'}
https://mooney.house.gov/: {'/media-center/press-releases'}
https://rubengallego.house.gov/: {'/me

In [6]:
def paragraph_mentions(text: str, keyword: str) -> bool:
    """
    Returns True if a <p> inside the text mentions {keyword}
    """
    soup = BeautifulSoup(text, 'html5lib')
    paragraphs = [p.get_text() for p in soup('p')]

    return any(keyword.lower() in paragraph.lower()
               for paragraph in paragraphs)

text = """<body><h1>Facebook</h1><p>Twitter</p>"""
assert paragraph_mentions(text, "twitter")       # is inside a <p>
assert not paragraph_mentions(text, "facebook")  # not inside a <p>

In [7]:
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        url = f"{house_url}/{pr_link}"
        text = requests.get(url).text

        if paragraph_mentions(text, 'data'):
            print(f"{house_url}")
            break  # done with this house_url

https://gosar.house.gov/
https://ruiz.house.gov
https://correa.house.gov
https://cicilline.house.gov/
https://long.house.gov/
https://anthonybrown.house.gov
https://crist.house.gov
https://steube.house.gov/
https://palazzo.house.gov/
https://tonko.house.gov/
https://roybal-allard.house.gov
https://delbene.house.gov
https://jeffduncan.house.gov/
https://chu.house.gov/
https://kim.house.gov/
https://buddycarter.house.gov/
https://foxx.house.gov/
https://castor.house.gov/
https://gaetz.house.gov
https://quigley.house.gov/
https://luria.house.gov
https://cardenas.house.gov
https://mucarsel-powell.house.gov
https://rubengallego.house.gov/
https://robinkelly.house.gov/
https://schakowsky.house.gov


In [8]:
import requests, json

github_user = "joelgrus"
endpoint = f"https://api.github.com/users/{github_user}/repos"

repos = json.loads(requests.get(endpoint).text)
repos

[{'id': 112873601,
  'node_id': 'MDEwOlJlcG9zaXRvcnkxMTI4NzM2MDE=',
  'name': 'advent2017',
  'full_name': 'joelgrus/advent2017',
  'private': False,
  'owner': {'login': 'joelgrus',
   'id': 1308313,
   'node_id': 'MDQ6VXNlcjEzMDgzMTM=',
   'avatar_url': 'https://avatars1.githubusercontent.com/u/1308313?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/joelgrus',
   'html_url': 'https://github.com/joelgrus',
   'followers_url': 'https://api.github.com/users/joelgrus/followers',
   'following_url': 'https://api.github.com/users/joelgrus/following{/other_user}',
   'gists_url': 'https://api.github.com/users/joelgrus/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/joelgrus/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/joelgrus/subscriptions',
   'organizations_url': 'https://api.github.com/users/joelgrus/orgs',
   'repos_url': 'https://api.github.com/users/joelgrus/repos',
   'events_url': 'https://api.github.com/users/