In [4]:
# Native imports
import sys
import os
import os.path
import json
from pprint import pprint
import urllib.request
import urllib.parse

# 3rd-party scraping/parsing imports
import grequests
import requests
from bs4 import BeautifulSoup
import dateutil.parser as dt

# 3rd-party data science imports
import pandas as pd
import seaborn as sns

# Used to make the plots bigger
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8, 8)

In [5]:
# Source: https://stackoverflow.com/a/312464
def split_into_chunks(lst, n):
    """Split ``lst`` into consecutive slices of at most ``n`` items.

    Args:
        lst: A sequence supporting ``len()`` and slicing (list, tuple, str, ...).
        n: Maximum length of each chunk. It is used as the ``range`` step, so
            it must be a non-zero integer (``range`` raises ``ValueError`` for 0).

    Returns:
        A list of slices of ``lst`` in their original order. Every slice holds
        ``n`` items except possibly the last one, which holds the remainder.
        An empty ``lst`` yields an empty list.
    """
    return [lst[start:start + n] for start in range(0, len(lst), n)]

# CalLink discovery-search endpoint for organizations. The query string is
# URL-encoded: it reads `orderBy[0]=UpperName asc` (%5B/%5D are the brackets,
# %20 is the space) and `top=1600`.
# NOTE(review): `top=1600` presumably caps how many organizations are returned;
# confirm it still exceeds the total organization count.
ORGS_CATALOG_URL = 'https://callink.berkeley.edu/api/discovery/search/organizations?orderBy%5B0%5D=UpperName%20asc&top=1600'
# Prefix to which an organization's `WebsiteKey` is appended to build its page URL.
ORG_BASE_URL = 'https://callink.berkeley.edu/organization/'


def fetch_init_app_state(soup):
    """Return the object assigned to ``window.initialAppState`` in a parsed page.

    The first ``<script>`` element whose first child mentions
    ``window.initialAppState`` and contains an assignment is used. The JSON
    document following the ``=`` sign is decoded.

    Args:
        soup: A parsed document exposing ``find_all('script')``, whose results
            have a ``contents`` list (e.g. a ``BeautifulSoup`` object).

    Returns:
        The decoded JSON value, or ``None`` when no script element contains a
        ``window.initialAppState`` assignment.

    Raises:
        json.JSONDecodeError: If the text after the ``=`` is not valid JSON.
    """
    marker = 'window.initialAppState'
    decoder = json.JSONDecoder()

    for element in soup.find_all('script'):
        if len(element.contents) == 0:
            continue

        script_text = element.contents[0]
        if marker not in script_text:
            continue

        # Keep everything after the first '=' that follows the marker. Splitting
        # on every ' = ' would cut the payload short whenever a JSON string
        # value itself contains ' = '.
        _, _, after_marker = script_text.partition(marker)
        _, equals_sign, payload = after_marker.partition('=')
        if not equals_sign:
            continue

        # raw_decode stops at the end of the first JSON document, so a trailing
        # ';', whitespace or further statements are ignored. It does not skip
        # leading whitespace itself, hence the lstrip().
        app_state, _ = decoder.raw_decode(payload.lstrip())
        return app_state

    return None


def extract_email_from_org_html(org_html):
    """Extract the contact email embedded in an organization page.

    The page is parsed with ``html.parser``, its ``window.initialAppState``
    object is loaded via ``fetch_init_app_state``, and the value at
    ``['preFetchedData']['organization']['email']`` is read.

    Args:
        org_html: HTML source of an organization page, as a string.

    Returns:
        The email with surrounding whitespace removed, or ``None`` when the
        page has no app state, any key on the path is missing, or the email is
        not a non-blank string.
    """
    soup = BeautifulSoup(org_html, 'html.parser')
    node = fetch_init_app_state(soup)

    # Walk the nested state one level at a time so that a page without app
    # state (e.g. an error page) or with an unexpected layout yields None
    # instead of raising TypeError/KeyError and aborting the whole scrape.
    for key in ('preFetchedData', 'organization'):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    if not isinstance(node, dict):
        return None

    email = node.get('email')
    if not isinstance(email, str):
        return None

    # Some entries carry stray leading/trailing whitespace; blank strings are
    # treated the same as a missing email.
    email = email.strip()
    return email or None


def scrape_emails(chunk_size=20, timeout=30):
    """Collect the contact email of every organization in the CalLink catalog.

    The catalog at ``ORGS_CATALOG_URL`` is fetched once. Each organization's
    page (``ORG_BASE_URL`` + its ``WebsiteKey``) is then requested, with
    ``chunk_size`` pages fetched concurrently per batch through ``grequests``.
    Progress and summary counts are printed along the way.

    Args:
        chunk_size: Number of organization pages requested concurrently in one
            batch. Must be at least 1. Defaults to 20.
        timeout: Timeout in seconds passed to every HTTP request. Defaults to 30.

    Returns:
        A list of the emails found, in catalog order. Organizations whose page
        could not be fetched or exposes no email are skipped.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        requests.HTTPError: If the catalog request returns an error status.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')

    catalog_reply = requests.get(ORGS_CATALOG_URL, timeout=timeout)
    # Fail early with a clear HTTP error instead of a confusing JSON decode
    # error on an error page.
    catalog_reply.raise_for_status()
    callink_data = json.loads(catalog_reply.text.strip())

    num_orgs = callink_data['@odata.count']
    orgs = callink_data['value']

    org_urls = [ORG_BASE_URL + org['WebsiteKey'] for org in orgs]
    org_url_chunks = split_into_chunks(org_urls, chunk_size)

    callink_emails = []
    num_processed = 0
    num_failed = 0

    for orgs_chunk in org_url_chunks:
        reqs = [grequests.get(org_url, timeout=timeout) for org_url in orgs_chunk]
        replies = grequests.map(reqs)

        for reply in replies:
            # grequests.map puts None in place of any request that raised
            # (connection error, timeout, ...); error statuses are skipped too
            # because their body is not an organization page.
            if reply is None or not reply.ok:
                num_failed += 1
                continue

            email = extract_email_from_org_html(reply.text.strip())
            if email is not None:
                callink_emails.append(email)

        # Count the pages actually handled so the final, possibly shorter,
        # chunk is not over-reported.
        num_processed += len(orgs_chunk)
        print('Emails processed:', num_processed)

    print('Number of organizations:', num_orgs)
    if num_failed:
        print('Number of failed page requests:', num_failed)
    print('Number of valid emails collected:', len(callink_emails))
    return callink_emails

In [6]:
%%time
scrape_emails()

Emails processed: 20
Emails processed: 40
Emails processed: 60
Emails processed: 80
Emails processed: 100
Emails processed: 120
Emails processed: 140
Emails processed: 160
Emails processed: 180
Emails processed: 200
Emails processed: 220
Emails processed: 240
Emails processed: 260
Emails processed: 280
Emails processed: 300
Emails processed: 320
Emails processed: 340
Emails processed: 360
Emails processed: 380
Emails processed: 400
Emails processed: 420
Emails processed: 440
Emails processed: 460
Emails processed: 480
Emails processed: 500
Emails processed: 520
Emails processed: 540
Emails processed: 560
Emails processed: 580
Emails processed: 600
Emails processed: 620
Emails processed: 640
Emails processed: 660
Emails processed: 680
Emails processed: 700
Emails processed: 720
Emails processed: 740
Emails processed: 760
Emails processed: 780
Emails processed: 800
Emails processed: 820
Emails processed: 840
Emails processed: 860
Emails processed: 880
Emails processed: 900
Emails process

['recruitment.berkeley@180dc.org',
 '3dma.at.berkeley@gmail.com',
 '3dmc.berkeley@gmail.com',
 'fiveandtwo.berkeley@gmail.com',
 'officers@abetterway.berkeley.edu',
 'seanhkim@berkeley.edu',
 'abbamodern@gmail.com',
 'abcsketches@gmail.com',
 'cal5@acacia.org',
 'acecoaches@berkeley.edu',
 'achievementim@gmail.com',
 'info@acts2fellowship.org',
 'calasae@gmail.com',
 'calasapresident@gmail.com',
 'aasdsocialmedia@gmail.com',
 'afxdance@gmail.com',
 'dalilalva@berkeley.edu',
 'adseberkeley@gmail.com',
 'axopichapterpresident@gmail.com',
 'axs.sigma.president@gmail.com',
 'info@aquadelt.org',
 'caladpipresident@gmail.com',
 'aakashr@berkeley.edu',
 'ago_actives@lists.berkeley.edu',
 'AKARHO1921@gmail.com',
 'brandonqinqin@gmail.com',
 'calalphaphipresident@gmail.com ',
 'icecolddynasty@gmail.com',
 'president@calaphio.com',
 'calapx@gmail.com',
 'ato.berkeley@gmail.com',
 'berkeleyaltprotein@gmail.com',
 'berkeleyaltbreaks@gmail.com',
 'kvillacorta@berkeley.edu',
 'blast@law.berkeley.edu