In [11]:
import requests
import newspaper
import numpy as np
import spacy
import pickle
import sys
import time
import re
from newspaper import Article
from newspaper import Config
from bs4 import BeautifulSoup
from collections import defaultdict

from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException

# For Spacy
nlp = spacy.load("en_core_web_sm")

# For Newspaper3k
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent

# For Selenium
option = webdriver.ChromeOptions()
option.add_argument(" — incognito")
executable_path = '/Users/kenmiyachi/Desktop/chromedriver'

In [3]:
def get_base_url(url):
    return '/'.join(url.split('/')[:3])

def print_with_newlines(strings):
    for s in strings:
        print(s + '\n')

def print_summary(article):
    article.nlp()
    strings = [article.title, article.authors, article.keywords, article.summary]
    print_with_newlines(strings)
    print('\n-------------------\n')
    
def form_query_strings(search_query):
    search_terms = search_query.split(' ')
    plus_query = '+'.join(search_terms)
    percent_query = '%20'.join(search_terms)
    return plus_query, percent_query
        
def get_articles(url, base_url, max_articles):
    """Returns all scraped articles from a website up to a maximum number of articles

    Parameters
    ----------
    url : str
        The url of the website including the page number
    base_url : str
        The root url of the website
    max_articles : int
        The maximum number of articles that will be scraped 
        (will be less than this if there are not enough articles on the webpage)
    """
    assert(base_url[-1] != '/')
    articles = []
    
    if base_url == 'https://www.coindesk.com': return get_coindesk_articles(url, base_url, max_articles)
    elif base_url == 'https://cointelegraph.com': return get_cointelegraph_articles(url, base_url, max_articles)
    elif base_url == 'https://www.forbes.com': return get_forbes_articles(url, base_url, max_articles)
    elif base_url == 'https://nulltx.com': return get_nulltx_articles(url, base_url, max_articles)
    elif base_url == 'https://techstartups.com': return get_techstartups_articles(url, base_url, max_articles)
    elif base_url == 'https://medium.com': return get_medium_articles(url, base_url, max_articles)
    elif base_url == 'https://www.newsbtc.com': return get_newsbtc_articles(url, base_url, max_articles)
    elif base_url == 'https://bitcoinist.com': return get_bitcoinist_articles(url, base_url, max_articles)
    
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    all_links = soup.findAll('a', href=True)

    for link in all_links:
        link = link['href']

        # Format hrefs that only include the endpoint names
        if link != '' and link[0] == '/':
            link = base_url + link
            
        # Check for valid urls
        if (base_url in link and '#' not in link and len(link) > len(base_url) + 1 
            and '-' in ''.join(link.split('/')[3:])):
            try:
                print(link)
                article = get_article(link)
                if article and article.title not in [article.title for article in articles]:
                    articles.append(article)
                    print('added')
                    if len(articles) >= max_articles:
                        return articles
                    
            except Exception as e:
                print('%s\n' % e)

    return articles

# Helper function for get_articles
def get_article(link):
    article = Article(link, config=config)
    article.download()
    article.parse()

    if article.publish_date: # Check if link is a valid article
        return article
    return None

def extract_startup_names(article):
    startup_names = []
    print_with_newlines([article.title])
    doc = nlp(article.text)
    for ent in doc.ents:
        if ent.label_ == 'ORG' and '@' not in ent.text:
            #print(ent.text, ent.start_char, ent.end_char, ent.label_)
            startup_names.append(ent.text)
        
    return startup_names

### Scraping Helper Functions (For 3 types of websites)

In [4]:
def scrape_paged_site(url, base_url, max_articles, articles_per_page, get_page_url, scrape_links):
    """Scrapes website that has multiple pages that are accessible by changing the url

    Parameters
    ----------
    url : str
        The url of the website including the page number
    base_url : str
        The root url of the website
    max_articles : int
        The maximum number of articles that will be scraped 
        (will be less than this if there are not enough articles on the webpage)
    articles_per_page : int
        The number of articles on a single page
    get_page_url : function
        Function that take as input a page number and returns a url for the page
    scrape_links : function
        Function that returns all scraped article urls on the webpage
    """
    assert(base_url[-1] != '/')
    articles = []
    
    for i in range(1, int(max_articles / articles_per_page) + 1):
        url = get_page_url(url, base_url, i)
        print(url)
        try:
            page = requests.get(url)
        except Exception as e:
            print(e)
            break
        soup = BeautifulSoup(page.content, 'html.parser')

        for link in scrape_links(soup):
            try:
                print(link)
                article = get_article(link)
                if article and article.title not in [article.title for article in articles]:
                    articles.append(article)
                    print('added')
                    if len(articles) >= max_articles:
                        return articles

            except Exception as e:
                print('%s\n' % e)
    return articles

def scrape_scroll_site(url, max_articles, articles_per_page, scrape_links):
    """Scrapes website that generates new content as the user scrolls down

    Parameters
    ----------
    url : str
        The url of the website including the page number
    max_articles : int
        The maximum number of articles that will be scraped 
        (will be less than this if there are not enough articles on the webpage)
    articles_per_page : int
        The number of articles on a single page
    scrape_links : function
        Function that returns all scraped article urls on the webpage
    """
    for attempt in range(10):
        try:
            browser = webdriver.Chrome(executable_path=executable_path, chrome_options=option)
            browser.get(url)
        except Exception as e:
            print(e)
            print('trying again')
            time.sleep(10)
            break
        else:
            break
    else:
        return []
    
    for _ in range(int(np.ceil(max_articles / articles_per_page))): # Ensures that at least max_articles count can be scraped
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
    
    articles = []
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    
    for link in scrape_links(soup):
        print(link)
        try:
            print(link)
            article = get_article(link)
            if article and article.title not in [article.title for article in articles]:
                articles.append(article)
                print('added')
                if len(articles) >= max_articles:
                    return articles

        except Exception as e:
            print('%s\n' % e)

    return articles

def scrape_button_site(url, max_articles, articles_per_page, seemore_xpath, scrape_links, close_toast_xpath=''):
    """Scrapes website that has a button user presses to show more articles on the page

    Parameters
    ----------
    url : str
        The url of the website including the page number
    max_articles : int
        The maximum number of articles that will be scraped 
        (will be less than this if there are not enough articles on the webpage)
    articles_per_page : int
        The number of articles on a single page
    seemore_xpath : str or function
        The string or string returning function that is the xpath of the button that generates more articles
    scrape_links : function
        Function that returns all scraped article urls on the webpage
    close_toast_xpath: str, optional
        The xpath of the button that closes toast pop-ups that inform the user about cookies
    """
    for attempt in range(10):
        try:
            browser = webdriver.Chrome(executable_path=executable_path, chrome_options=option)
            browser.get(url)

            # Close the "accept cookies" toast pop-up
            if close_toast_xpath != '':
                WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, close_toast_xpath)))
                WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.XPATH, close_toast_xpath)))
                time.sleep(3)
                browser.find_element_by_xpath(close_toast_xpath).click()
                time.sleep(3)

            # Click on "See More" button
            for i in range(1, int(np.ceil(max_articles / articles_per_page))+1):
                try:
                    if callable(seemore_xpath):
                        seemore_xpath_string = seemore_xpath(i)
                    else:
                        seemore_xpath_string = seemore_xpath
                    WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.XPATH, seemore_xpath_string)))
                    python_button = browser.find_element_by_xpath(seemore_xpath_string)
                    python_button.click()
                    time.sleep(3)
                except Exception as e:
                    print(e)
                    break
        except Exception as e:
            print(e)
            print('trying again')
            time.sleep(10)
            break
        else:
            break
    else:
        return []

    articles = []
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for link in scrape_links(soup):
        try:
            print(link)
            article = get_article(link)
            if article and article.title not in [article.title for article in articles]:
                articles.append(article)
                print('added')
                if len(articles) >= max_articles:
                    return articles
        except Exception as e:
            print('%s\n' % e)
    return articles

### Scraping Functions for Specific Websites (All wrapped in the Get_Articles function above)

In [5]:
def get_page_url(url, base_url, page_num):
    return base_url + '/page/' + str(i) + '/' + url.split('/')[-1]

def get_nulltx_articles(url, base_url, max_articles):
    def scrape_links(soup):
        links = []
        all_items = soup.findAll('a', rel='bookmark', href=True)
        for link in all_items:
            links.append(link['href'])
        return links
    
    articles_per_page = 19
    #page_urls = get_page_urls(url, base_url, max_articles, articles_per_page)
    articles = scrape_paged_site(url, base_url, max_articles, articles_per_page, get_page_url, scrape_links)
    return articles

def get_medium_articles(url, base_url, max_articles):
    def scrape_links(soup):
        links = []
        all_items = soup.findAll('div', class_='postArticle-content')
        for link in all_items:
            links.append(link.find('a', href=True)['href'])
        return links
    
    articles_per_page = 10
    articles = scrape_scroll_site(url, max_articles, articles_per_page, scrape_links)
    return articles

def get_techstartups_articles(url, base_url, max_articles):
    def scrape_links(soup):
        links = []
        all_items = soup.findAll('div', class_='post_img static one_third')
        for link in all_items:
            links.append(link.find('a', href=True)['href'])
        return links
    
    articles_per_page = 15
    #page_urls = get_page_urls(url, base_url, max_articles, articles_per_page)
    articles = scrape_paged_site(url, base_url, max_articles, articles_per_page, get_page_url, scrape_links)
    return articles

def get_coindesk_articles(url, base_url, max_articles):
    def scrape_links(soup):
        links = []
        all_items = soup.findAll('div', class_='text-content')
        for link in all_items:
            endpoint = link.findAll('a')[1]['href']
            link = 'https://www.coindesk.com' + str(endpoint)
            links.append(link)
        return links
    
    def get_seemore_xpath(i):
        xpath = '//*[@id="__next"]/main/section/div/section[2]/article/div/section/div[%d1]/button' % i
        return xpath
    
    articles_per_page = 10
    articles = scrape_button_site(url, max_articles, articles_per_page, get_seemore_xpath, scrape_links)
    return articles

def get_cointelegraph_articles(url, base_url, max_articles):
    def scrape_links(soup):
        links = []
        all_items = soup.findAll('h2', class_='header')
        for link in all_items:
            links.append(link.find('a')['href'])
        return links
    
    seemore_xpath = '//*[@id="search-results"]/div/div[3]/nav/div/div[2]/a'
    close_toast_xpath = '//*[@id="vue-footer"]/div/div/div/div[2]/a[2]'
    articles_per_page = 28
    articles = scrape_button_site(url, max_articles, articles_per_page, seemore_xpath, scrape_links, close_toast_xpath)
    return articles

def get_forbes_articles(url, base_url, max_articles):
    def scrape_links(soup):
        links = []
        all_items = soup.findAll('a', class_='stream-item__title', href=True)
        for link in all_items:
            links.append(link['href'])
        return links
    
    seemore_xpath = '/html/body/div[1]/main/div[1]/div[1]/div[5]'
    close_toast_xpath = '//*[@id="truste-consent-button"]'
    articles_per_page = 20
    articles = scrape_button_site(url, max_articles, articles_per_page, seemore_xpath, scrape_links, close_toast_xpath)
    return articles

def get_newsbtc_articles(url, base_url, max_articles):
    def scrape_links(soup):
        links = []
        all_items = soup.findAll('h2', class_='title medium')
        for link in all_items:
            links.append(link.find('a', href=True)['href'])
        return links

    seemore_xpath = '//*[@id="content"]/div[2]/div/div/div[2]/span'
    close_toast_xpath = '/html/body/div[2]/div/div[2]/div[1]/a[2]'
    articles_per_page = 3*8
    articles = scrape_button_site(url, max_articles, articles_per_page, seemore_xpath, scrape_links, close_toast_xpath)
    return articles

def get_bitcoinist_articles(url, base_url, max_articles):
    def scrape_links(soup):
        links = []
        all_items = soup.findAll('h3', class_='title')
        for link in all_items:
            link = link.find('a', href=True)
            if link:
                links.append(link['href'])
        return links
    
    seemore_xpath = '//*[@id="content"]/div[2]/section/div[2]/a'
    articles_per_page = 3*3
    articles = scrape_button_site(url, max_articles, articles_per_page, seemore_xpath, scrape_links)
    return articles

### List of websites to choose from

In [6]:
"""
not_working = [
    # Possible to write code for these with a bit of time
    'https://www.wired.com/search/?q=%s&page=1&sort=score' % percent_query,
    'https://bitcoinmagazine.com/search?text=%s&page=1' % percent_query,
    'https://www.tomshardware.com/search?searchTerm=%s' % plus_query, 
    'https://www.cnet.com/search/?query=%s' % plus_query,
    'https://www.ccn.com/?s=%s' % plus_query,
    'https://gigaom.com/?s=%s' % plus_query,
    
    # False positives; dates found for irrelevant articles - May need specific code for scraping
    'https://mashable.com/search/?t=stories&q=%s' % percent_query,
    'https://thenextweb.com/?q=%s' % percent_query,
    'https://www.firstpost.com/search?q=%s' % percent_query,
    'https://www.theverge.com/search?q=%s' % plus_query,
    
    # No search feature
    'https://www.todayonchain.com/',
    
    # Results may not be that helpful
    'https://cryptoslate.com/?s=%s' % plus_query,
    'https://search.techcrunch.com/search?p=%s&fr2=sb-top&fr=techcrunch' % percent_query,
    
]

working = [
    'https://www.coindesk.com/search?q=%s&s=relevant' % percent_query,
    'https://cointelegraph.com/search?query=%s' % percent_query,
    'https://nulltx.com/?s=%s' % plus_query,
    'https://techstartups.com//?s=%s' % plus_query,
    'https://www.forbes.com/search/?q=%s' % plus_query,
    'https://medium.com/search?q=%s' % percent_query,
    'https://www.newsbtc.com/?s=%s&lang=en' % plus_query,
    'https://bitcoinist.com/?s=%s&lang=en' % plus_query,
]
""";

## Scrape Articles

In [12]:
# Enter search query here
search_query = 'top blockchain'

# Enter max number of articles to retrieve from each site
max_articles = 20


plus_query, percent_query = form_query_strings(search_query)

# Can be the same as "working"
chosen_sites = [
    'https://cointelegraph.com/search?query=%s' % percent_query,
    'https://www.coindesk.com/search?q=%s&s=relevant' % percent_query,
    'https://nulltx.com/?s=%s' % plus_query,
    'https://techstartups.com//?s=%s' % plus_query,
    'https://medium.com/search?q=%s' % percent_query,
    'https://www.newsbtc.com/?s=%s&lang=en' % plus_query,
    'https://bitcoinist.com/?s=%s&lang=en' % plus_query,
]


all_articles = defaultdict() # key is index, value is articles
try:
    for i in range(len(chosen_sites)):
        print(chosen_sites[i] + '\n')
        all_articles[i] = get_articles(chosen_sites[i], get_base_url(chosen_sites[i]), max_articles)
        print()

except KeyboardInterrupt: # Save articles in case of error
    #pickle.dump(all_articles, open('all_articles_backup.p', 'wb'))
    pass
#except Exception as e:
    #pickle.dump(all_art icles, open('all_articles_backup.p', 'wb'))
    #print(e)

https://cointelegraph.com/search?query=top%20blockchain





Message: 

https://cointelegraph.com/news/cointelegraph-announces-chinese-hq-bolstering-its-international-expansion
added
https://cointelegraph.com/news/cross-border-blockchain-b2b-volume-to-hit-44-trillion-by-2024
added
https://cointelegraph.com/news/bitfinex-hack-new-twist-two-arrested-in-israel-after-15m-moved
added
https://cointelegraph.com/news/major-blockchain-investors-arrington-xrp-neo-global-back-dex-focused-startup
added

https://www.coindesk.com/search?q=top%20blockchain&s=relevant

https://www.coindesk.com/ibm-blockchain-vp-every-dollar-spent-on-blockchain-yields-15-on-cloud
added
https://www.coindesk.com/discussion-us-crypto-taxes-are-a-nightmare-could-these-proposals-help
added
https://www.coindesk.com/why-defis-billion-dollar-milestone-matters
added
https://www.coindesk.com/now-more-than-ever-serc-is-scrutinizing-unregistered-token-offerings
added
https://www.coindesk.com/chinas-coronavirus-whistleblower-is-now-memorialized-on-ethereum
added
https://www.coindesk.com/95-c

NameError: name 'get_page_urls' is not defined

In [13]:
for i in range(len(all_articles)):
    print(len(all_articles[i]))

4
20


### Save Articles

In [14]:
# TODO - Save articles
#pickle.dump(all_articles, open('all_articles.p', 'wb'))

# Test that it is saved properly
#saved_articles = pickle.load(open('all_articles.p', 'rb'))
#saved_articles

## Extract Startup Names

In [15]:
startup_names = []
for key, articles in all_articles.items():
    print(key)
    for article in articles:
        startup_names.extend(extract_startup_names(article))

0
Cointelegraph Announces Chinese HQ, Bolstering Its International Expansion

Global Blockchain B2B Volume Expected to Hit $4.4 Trillion by 2024

Bitfinex Hack New Twist: Two Arrested in Israel After $1.5M Moved

Major Blockchain Investors Arrington XRP, NEO Global Back DEX-Focused Startup

1
IBM Blockchain VP: Every Dollar Spent on Blockchain Yields $15 on Cloud

US Crypto Taxes Are a Nightmare. Could These Proposals Help?

Why DeFi’s Billion-Dollar Milestone Matters

Now More Than Ever, SEC Is Scrutinizing Unregistered Token Offerings

China’s Coronavirus Whistleblower Is Now Memorialized on Ethereum

‘95% Confidence’: Ethereum Developers Pencil In July 2020 for Eth 2.0 Launch

How Blockchain Will Track Taxes (and Tax Cheats)

SEC Commissioner Hester Peirce Proposes 3-Year Safe Harbor Period for Crypto Token Sales

Blockstack’s New Consensus Mechanism Creates New Use Case for Bitcoin

How the Long Tail of the Coronavirus Might Slow Bitcoin’s Hash Power Growth

Libra Vice Chair Not Wo

### Clean Extracted Names

In [16]:
to_remove = []

unique_startup_names = np.unique(startup_names).tolist()
for i, name in enumerate(unique_startup_names):
    unique_startup_names[i] = name.strip().lower()
    if name == '' or 'http' in name or not re.search('[a-zA-Z]', name):
        to_remove.append(i)
    else:
        name = name.replace(u'’', u"'").lower()
        for symbol in [',', '\n', '\'s', ' and ', '/', ' vs ']:
            if symbol in name:
                #print(name)
                to_remove.append(i)
                unique_startup_names.extend(name.split(symbol))

to_remove.reverse()
for i in to_remove:
    unique_startup_names.pop(i)

            
unique_startup_names = np.unique(np.array(unique_startup_names))
unique_startup_names

array(['"tendermint inc.', 'amentum investment management', 'antminer s9',
       'ap', 'api', 'apple podcasts', 'apps', 'arrington',
       'artificial intelligence laboratory (csail', 'at&t', 'atom',
       'bain capital ventures', 'bank of china', 'bfx', 'big blue',
       'bitmain', 'bits inc.', 'block', 'blockstack', 'brave.com',
       'buterin', 'canaan creative', 'carnegie mellon university',
       'chain capital', 'chinese communist party', 'clarity',
       'clipboard hijackers’', 'coindesk', 'coinfund', 'computer science',
       'consensys', 'consensys labs', 'cornell university', 'cpc',
       'crown', 'csail', 'cuomo', 'cusack', 'dai', 'davispolk', 'defi',
       'democratic caucus', 'digital currency research institute',
       'disgorgement', 'disparte', 'dtc capital', 'ecc', 'elliptic',
       'eos', 'erc-20 eos', 'ernst & young', 'eth', 'eth 1.x', 'eu',
       'exchange commission', 'exchanges."satoshi', 'facebook',
       'federal reserve', 'fenwick & west', 'finast

## Evaluation

In [17]:
blockchain_startup_names = None
with open('blockchain_startup_names.txt', 'r') as f:
    blockchain_startup_names = [line.strip() for line in f]
print('Names from the Master List')
blockchain_startup_names

Names from the Master List


['Abra',
 'Aeternity',
 'AlphaPoint',
 'AirFox',
 'Ardor',
 'Ark',
 'Ascribe',
 'Augur',
 'Auroracoin',
 'Backfeed',
 'BigchainDB',
 'Bitfury',
 'BitGive',
 'Bitmark',
 'BitPagos',
 'Bitpay',
 'BitSE',
 'Bitswift',
 'BlackCoin',
 'Blakecoin',
 'BlockMedX',
 'Blockphase',
 'Blockstream',
 'BlockVerify',
 'Bloq',
 'BTL Group',
 'CareX',
 'Cashaa',
 'Chain Inc.',
 'Chain of Things',
 'Chainy',
 'Circle',
 'Cognate',
 'Coinbase',
 'Colony',
 'Colu',
 'COMIT',
 'ConnectJob',
 'Consensys Systems',
 'ContentKid',
 'CrowdWiz',
 'Crowdz',
 'CureCoin',
 'Cypherium',
 'Decent',
 'Decissio',
 'DFINITY',
 'Digital Asset Holdings',
 'DMarket',
 'Earthport',
 'Enigma',
 'Epiphyte',
 'Ethereum',
 'Experty',
 'ExtraCredit',
 'Filecoin',
 'Flowchain',
 'Global Blockchain',
 'Hello Block',
 'Herosphere',
 'HIVE BLOCKCHAIN',
 'Horizon State',
 'Humaniq',
 'Hyperledger',
 'ICOBox',
 'Kalpa Digital Health',
 'Komodo',
 'KYC-CHAIN',
 'LAToken',
 'Lisk',
 'Loci',
 'Luna',
 'MazaCoin',
 'MintHealth',
 'Monax',

In [18]:
target_startup_names = blockchain_startup_names
found = 0
for target in target_startup_names:
    if target.lower() in unique_startup_names:
        print(target.lower())
        found += 1
print('\nPercent of Startups found from the master list: %.3f' % (found / len(target_startup_names)))

parity technologies

Percent of Startups found from the master list: 0.009


## Explore Text Data

In [19]:
article_choice = all_articles[0][0]

In [20]:
print(article_choice.title)
print(article_choice.publish_date)
print(article_choice.text)

Cointelegraph Announces Chinese HQ, Bolstering Its International Expansion
2019-12-04 19:26:00+00:00
To support our international expansion and global reach, Cointelegraph is delighted to announce the launch of the Chinese-language version of the publication. Today, Dec. 4, we celebrated the opening of the office of Cointelegraph China (Cointelegraph 中文).

The news — which marks another milestone moment in Cointelegraph’s growth — was announced at the Nova Global Blockchain Investment Institutions Summit hosted by the investment ecosystem alliance, Nova Club. Nova Club was formed by top blockchain organizations and aims to facilitate blockchain project development by consolidating resources and expertise.

The new expansion will be led from the heart of Guangzhou, with other offices in Beijing and Shanghai.

Meet the Cointelegraph China business team

Cointelegraph China has brought together leading names in the industry to highlight blockchain and crypto trends in the area.

Co-founde

In [21]:
article.nlp()
article.keywords

['loans',
 'bitcoin',
 'monitor',
 'loan',
 'banks',
 'bank',
 'able',
 'silvergate',
 'crypto',
 'touch',
 'lane',
 'wants',
 'sen',
 'firms']