# Scrape every MarketWatch article from June through September 2017

This notebook uses selenium and BeautifulSoup to scrape every marketwatch.com news article from June through September 2017. It first scrapes all the links and titles and then segments the articles based on the section they appeared in, specifically looking for articles related to economic and Federal Reserve news. 

In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import re
import numpy as np
import pandas as pd
import string
from datetime import datetime, timedelta
import pickle

# Filesystem location of the ChromeDriver executable handed to webdriver.Chrome
chromedriver = '/Applications/chromedriver'
# Publish the same path under the 'webdriver.chrome.driver' environment key
os.environ.update({'webdriver.chrome.driver': chromedriver})

### Use selenium to remotely navigate marketwatch.com and set search parameters

In [None]:
# Open the MarketWatch home page in a selenium-driven Chrome session
url = 'http://www.marketwatch.com'
driver = webdriver.Chrome(chromedriver)
driver.get(url)

# Click the search link, then activate the "Advanced Search" link with RETURN
path = '//a[@class="btn btn--outline btn--search"]'
driver.find_element_by_xpath(path).click()
driver.find_element_by_xpath('//a[text()="Advanced Search"]').send_keys(Keys.RETURN)

# Click the 'refinesearchtoggle' element, then the first checkbox input on the page
# NOTE(review): presumably this expands the refine panel and enables the date filter -- confirm
driver.find_element_by_id('refinesearchtoggle').click()
driver.find_element_by_xpath('//input[@type="checkbox"]').click()

# Choose option 806 in the 'mp' drop-down
# NOTE(review): presumably the "All MarketWatch" choice -- confirm against the page
search_mode = '//select[@id="mp"]/option[@value="806"]'
driver.find_element_by_xpath(search_mode).click()

# Show 100 results per page
results = '//select[@id="rpp"]/option[@value="100"]'
driver.find_element_by_xpath(results).click()

# Type the cut-off date into the 'bdv' date field
date = driver.find_element_by_id('bdv')
date.click()
date.send_keys('09/30/2017')

# Submit the search and remember the url of the first results page
driver.find_element_by_xpath('//input[@value="Search"]').click()
current_url = driver.current_url

### Starting url

In [None]:
# Hard-coded search url; should correspond to
# {Keyword: All MarketWatch, Results Per Page: 100, News On Or Before: 9/30/2017}
# NOTE(review): running this cell overwrites any current_url captured by the selenium cell above.
# The original note said to run it only after that cell; presumably it is meant as a shortcut
# that replaces the selenium navigation -- confirm.
current_url = 'http://www.marketwatch.com/search?q=&m=Keyword&rpp=100&mp=806&bd=true&bd=false&bdv=09%2F30%2F2017&rs=true'

### Function that scrapes each article's link

In [None]:
def get_links(url, store=None):
    '''
    Scrape one search-results page and record the title, url, and date of each article
    ---
    IN: url = search-results page to download
        store = dictionary to fill, keyed by article title with {'url': ..., 'date': ...} values;
                when omitted, the module-level `econ` dictionary is used (the original behaviour)
    OUT: raw date text of the first result on the page
         (raises IndexError when the page contains no 'searchresult' elements)
    '''
    # `econ` is only looked up when no dictionary is passed, so one-argument
    # callers keep working and other collections can be filled without editing this function.
    if store is None:
        store = econ

    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    pages = soup.find_all(class_='searchresult')
    for div in pages:
        # Keep only results that contain a link and whose first <a> sits directly inside a <div>
        if div.find('a', href=True) and div.a.parent.name == 'div':
            store[div.a.text] = {
                'url': div.a['href'],
                # the date text lives in the <span> of the node that follows the result
                'date': div.next_sibling.span.text,
            }

    # Callers use the first result's date to decide where the next search should start
    return pages[0].next_sibling.span.text

### Format date

In [None]:
def get_date(date):
    '''
    Convert the date text scraped from a search result into a datetime
    ---
    IN: scraped date string (e.g. a time followed by "Sept. 29, 2017")
    OUT: datetime for the month/day/year found in the string
    '''
    # Drop every punctuation character except ':'
    removable = string.punctuation.replace(':', '')
    stripped = re.sub('[%s]' % re.escape(removable), '', date)

    # Keep everything from the first capital letter onwards (the month name)
    from_month = re.search(r'([A-Z].*)', stripped)[1]

    # Shorten each alphabetic word to its first three letters so '%b' can parse the month
    short_form = re.sub(r'([A-Za-z]{3})[a-z]*', r'\1', from_month)

    return datetime.strptime(short_form, '%b %d %Y')

### Continuously scrape search result pages

In [None]:
def _restart_search(driver, search_date):
    '''
    Type search_date into the 'bdv' date field, submit the search, and return the resulting url
    '''
    date_field = driver.find_element_by_id('bdv')
    date_field.clear()
    date_field.send_keys(search_date)
    driver.find_element_by_xpath('//input[@value="Search"]').click()
    return driver.current_url


def continuous_search(month, current_url, stop_month):
    '''
    Walk through the search-result pages with selenium, scraping every page with get_links.
    When there is no "Next" link, or a page cannot be scraped, a new search is started from the
    day after the most recently scraped date, and the loop continues until stop_month is reached.
    ---
    IN: month = start month (September: 9), current_url = first url to search from,
        stop_month = end month (May: 5)
    OUT: None -- the scraped articles are stored by get_links
    '''
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.get(current_url)

        # Date to restart from; stays None until one page has been scraped successfully
        search_date = None

        while month != stop_month:
            try:
                first_link_date = get_links(current_url)
                date = get_date(first_link_date) + timedelta(days=1)
                search_date = date.strftime('%m/%d/%Y')
                month = date.month
            except Exception:
                # `except Exception` (not a bare except) so Ctrl-C can still stop the scrape
                if search_date is None:
                    # Nothing scraped yet, so there is no date to restart from:
                    # surface the real error instead of an UnboundLocalError
                    raise
                current_url = _restart_search(driver, search_date)
                continue

            try:
                driver.find_element_by_xpath('//a[text()="Next"]').click()
                current_url = driver.current_url
            except Exception:
                # No usable "Next" link: start a fresh search from the latest date
                current_url = _restart_search(driver, search_date)
    finally:
        # Always close the browser this function opened
        driver.quit()

## Call the function to get all/relevant links for the 4 months
## (Before each run, verify that the dictionary written to inside `get_links` is the one initialized for that run)

### Initialize the links dictionary

In [None]:
# Empty container for every scraped article
links = {}

### Run function for all articles

In [None]:
# Scrape from September (9) backwards until May (5), starting at current_url
# NOTE(review): results land in the dictionary named inside get_links -- check it before running
continuous_search(month=9, current_url=current_url, stop_month=5)

In [None]:
# Number of entries currently in `links`
len(links)

In [None]:
# Serialize `links` to links.pkl
with open('links.pkl', mode='wb') as out_file:
    pickle.dump(links, out_file)

### Initialize fed dictionary

In [None]:
# Empty container for the Federal Reserve section articles
fed_dict = {}

### Run function for Fed articles

In [None]:
# Scrape the "Economy & Politics | Federal Reserve" section search from September (9) back to May (5)
# NOTE(review): results land in the dictionary named inside get_links -- check it before running
continuous_search(
    month=9,
    current_url='http://www.marketwatch.com/search?q=&m=Section&rpp=100&mp=Economy+%26+Politics%7CFederal+Reserve&bd=true&bd=false&bdv=09%2F30%2F2017&rs=true',
    stop_month=5,
)

In [None]:
# Number of entries currently in `fed_dict`
len(fed_dict)

In [None]:
# Serialize `fed_dict` to fed.pkl
with open('fed.pkl', mode='wb') as out_file:
    pickle.dump(fed_dict, out_file)

### Initialize economics dictionary

In [None]:
# Empty container for the economics articles
econ = {}

### Run function for Econ articles

In [None]:
# Scrape the ECAT subject search from September (9) back to May (5)
# NOTE(review): results land in the dictionary named inside get_links -- check it before running
continuous_search(
    month=9,
    current_url='http://www.marketwatch.com/search?q=&m=Subject&rpp=100&mp=ECAT&bd=true&bd=false&bdv=09%2F30%2F2017&rs=true',
    stop_month=5,
)

In [None]:
# Number of entries currently in `econ`
len(econ)

In [None]:
# Serialize `econ` to econ.pkl
with open('econ.pkl', mode='wb') as out_file:
    pickle.dump(econ, out_file)

## Determine which econ articles are related to the Fed

In [None]:
# Snapshot of `econ` as a list of (title, attributes) pairs
econlist = list(econ.items())

# Keywords that mark an article as Fed-related; 'federal' only counts in titles
_TITLE_KEYWORDS = frozenset(['yellen', 'fed', 'federal', 'reserve', 'fomc'])
_BODY_KEYWORDS = frozenset(['yellen', 'fed', 'reserve', 'fomc'])

# Possessive suffix written with a straight or a curly apostrophe
_POSSESSIVE_RE = re.compile(r"['\u2019]s\b")
_PUNCTUATION_RE = re.compile(r'[%s]' % re.escape(string.punctuation))


def _normalize_word(word):
    '''
    Lower-case a word and strip its possessive suffix and punctuation
    '''
    # The possessive must be removed BEFORE punctuation: stripping the apostrophe
    # first would turn "fed's" into "feds", which matches no keyword.
    word = _POSSESSIVE_RE.sub('', word.lower())
    word = _PUNCTUATION_RE.sub('', word)
    return re.search(r'(\w*)', word)[1]


def _mentions(text, keywords):
    '''
    Return True when any whitespace-separated word of text normalizes to a keyword
    '''
    return any(_normalize_word(word) in keywords for word in text.split())


def search_links(econlist=None):
    '''
    This function searches every economics article and determines if it is related to the Federal Reserve
    "Strong" cases are when the words 'yellen', 'fed', 'federal', 'reserve', or 'fomc' appear in the title
    "Soft" or weak cases are when the above words (sans 'federal') appear anywhere in the article
    Words are compared after lower-casing and removing possessives and punctuation, in titles and bodies alike
    ---
    IN: list of (title, attributes) pairs for the economic articles (econlist);
        defaults to the current contents of the module-level `econ` dictionary
    OUT: strong = list of article info for strong cases, soft = list of article info for weak cases
    '''
    if econlist is None:
        econlist = list(econ.items())

    strong = []
    soft = []
    for link in econlist:
        title = link[0]

        if _mentions(title, _TITLE_KEYWORDS):
            strong.append(link)
            continue

        # The title gave no match, so download the article and look through its text
        url = link[1]['url']
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        body = soup.find(id='article-body')
        if body is None:
            # Page without an 'article-body' element: nothing to inspect, so skip it
            # instead of aborting the whole run
            continue

        # Paragraphs containing a <strong> tag are left out, as in the original filter
        texts = (
            ''.join(p.find_all(text=True))
            for p in body.find_all('p')
            if not p.find_all('strong')
        )
        if any(_mentions(text, _BODY_KEYWORDS) for text in texts):
            soft.append(link)

    return strong, soft

In [None]:
# Classify the economics articles; called without arguments, search_links uses its default article list
strong, soft = search_links()

In [None]:
# Serialize the strong matches to strong.pkl
with open('strong.pkl', mode='wb') as out_file:
    pickle.dump(strong, out_file)

In [None]:
# Serialize the soft matches to soft.pkl
with open('soft.pkl', mode='wb') as out_file:
    pickle.dump(soft, out_file)