# Scrape every MarketWatch article from June - September

### Import relevant libraries

In [293]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
from collections import defaultdict
import re
import numpy as np
import pandas as pd
import string
from datetime import datetime, timedelta
import quandl
import pickle

# Path to the ChromeDriver executable; later cells pass it to webdriver.Chrome().
# The same path is also exported through the process environment.
chromedriver = '/Applications/chromedriver'
os.environ['webdriver.chrome.driver'] = chromedriver

### Use selenium to remotely navigate marketwatch.com

In [6]:
# Start a Chrome session through ChromeDriver and load the MarketWatch home page.
url = 'http://www.marketwatch.com'
driver = webdriver.Chrome(chromedriver)
driver.get(url)

### Set search parameters

In [None]:
# Click the site's search button, then follow the "Advanced Search" link.
# No keyword is typed here; the search is left open-ended.
path = '//a[@class="btn btn--outline btn--search"]'
driver.find_element_by_xpath(path).click()
driver.find_element_by_xpath('//a[text()="Advanced Search"]').send_keys(Keys.RETURN)

# Advanced Search: click the 'refinesearchtoggle' control, then the first checkbox on the page.
# NOTE(review): presumably this reveals the refine options and enables the date filter - confirm.
driver.find_element_by_id('refinesearchtoggle').click()
driver.find_element_by_xpath('//input[@type="checkbox"]').click()

# Set Subject: pick option value 806 in the "mp" drop-down.
search_mode = '//select[@id="mp"]/option[@value="806"]'
driver.find_element_by_xpath(search_mode).click()

# Results Per Page: pick 100 in the "rpp" drop-down.
results = '//select[@id="rpp"]/option[@value="100"]'
driver.find_element_by_xpath(results).click()

# Set date: type the cut-off date into the "bdv" field.
date = driver.find_element_by_id('bdv')
date.click()
date.send_keys('09/30/2017')

# Submit the search and remember the address of the first results page.
driver.find_element_by_xpath('//input[@value="Search"]').click()
current_url = driver.current_url

### Scraping function

In [341]:
#Should be for {Keyword: All MarketWatch, Results Per Page: 100, News On Or Before: 9/30/2017}
#Only run if you have run the above code to get the 'current_url'
# NOTE(review): this assignment replaces whatever 'current_url' the Selenium cell above
# produced with a hard-coded copy - confirm that overwriting it is intended.
current_url = 'http://www.marketwatch.com/search?q=&m=Keyword&rpp=100&mp=806&bd=true&bd=false&bdv=09%2F30%2F2017&rs=true'

### Function that scrapes each article's link
### Before running, check which dictionary `get_links` stores its results in (currently `econ`) and make sure it is the one intended for this run

In [524]:
def get_links(url, store=None):
    """Scrape one MarketWatch search-results page into a dictionary of articles.

    Every element with class ``searchresult`` that contains a link sitting
    directly inside a ``div`` is recorded as
    ``store[title] = {'url': href, 'date': date_text}``, where the date text
    comes from the ``span`` of the element following the result. Entries are
    keyed by the link text, so two articles with the same title overwrite
    each other.

    Args:
        url: Address of the search-results page to download.
        store: Dictionary to fill. When omitted, the module-level ``econ``
            dictionary is used, so existing one-argument calls keep working.

    Returns:
        The date text shown for the first result on the page.

    Raises:
        IndexError: If the page contains no ``searchresult`` elements.
    """
    target = econ if store is None else store
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    results = soup.find_all(class_='searchresult')
    for result in results:
        # Use the anchor that actually carries an href, rather than assuming
        # the first <a> in the result is the same element.
        anchor = result.find('a', href=True)
        if anchor is None or anchor.parent.name != 'div':
            continue
        target[anchor.text] = {
            'url': anchor['href'],
            'date': result.next_sibling.span.text,
        }
    # Deliberately unguarded: an empty page raises here, and the caller
    # uses that failure to detect a page without results.
    return results[0].next_sibling.span.text

In [525]:
def get_date(date):
    """Turn a scraped date label into a ``datetime`` at midnight of that day.

    The label is cleaned in three steps: all punctuation except ``:`` is
    removed, everything before the first capital letter (such as a leading
    clock time) is dropped, and each alphabetic word is cut down to its first
    three letters so the month reads like ``Sep``. The remainder must then
    look like ``Mon DD YYYY``.

    Args:
        date: Date text as shown next to a search result.

    Returns:
        The parsed ``datetime``; callers format it themselves if they need
        a string.

    Raises:
        TypeError: If the text contains no capital letter.
        ValueError: If the cleaned text is not of the form ``Mon DD YYYY``.
    """
    removable = string.punctuation.replace(':', '')
    without_punctuation = re.sub('[%s]' % re.escape(removable), '', date)
    from_first_capital = re.search(r'([A-Z].*)', without_punctuation)[1]
    short_month = re.sub(r'([A-Za-z]{3})[a-z]*', r'\1', from_first_capital)
    return datetime.strptime(short_month, '%b %d %Y')

In [526]:
def _resubmit_search(driver, search_date):
    """Type *search_date* into the 'bdv' field, submit, and return the new URL."""
    date_box = driver.find_element_by_id('bdv')
    date_box.clear()
    date_box.send_keys(search_date)
    driver.find_element_by_xpath('//input[@value="Search"]').click()
    return driver.current_url


def continuous_search(month, current_url, stop_month):
    """Scrape successive search-result pages until ``stop_month`` is reached.

    Each pass scrapes ``current_url`` with ``get_links`` and derives a
    fallback cut-off date (the page's first result date plus one day). It
    then moves on by clicking "Next". When there is no "Next" link, or
    when scraping the page fails, the search form is re-submitted with the
    most recent fallback date instead.

    Args:
        month: Month number the search starts in; overwritten with the month
            of each scraped page's fallback date.
        current_url: Address of the first results page.
        stop_month: Month number at which the loop ends.

    Raises:
        Exception: Whatever the first page raised, if it fails before any
            fallback date is known (previously an ``UnboundLocalError``).

    NOTE(review): if re-submitting with the same date keeps producing a page
    that cannot be scraped, the loop never advances - consider a retry cap.
    """
    driver = webdriver.Chrome(chromedriver)
    search_date = None  # no fallback date until one page has been scraped

    try:
        driver.get(current_url)

        while month != stop_month:
            try:
                first_link_date = get_links(current_url)
                next_cutoff = get_date(first_link_date) + timedelta(days=1)
                search_date = next_cutoff.strftime('%m/%d/%Y')
                month = next_cutoff.month
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C still
                # stops the scrape.
                if search_date is None:
                    raise
                current_url = _resubmit_search(driver, search_date)
                continue

            try:
                driver.find_element_by_xpath('//a[text()="Next"]').click()
                current_url = driver.current_url
            except Exception:
                # No usable "Next" link: restart the search from the
                # fallback date.
                current_url = _resubmit_search(driver, search_date)
    finally:
        # Always release the browser, including on errors and interrupts.
        driver.quit()
            

### Call function to get all links for 4 months

## Initialize the links dictionary
## Only run this the first time; running it again replaces the dictionary with an empty one
## Verify that the dictionary used inside the get_links function is this same one

In [None]:
#links = dict()

In [345]:
# Start in month 9 and keep paging until a results page maps to month 5.
continuous_search(9, current_url, 5)

In [523]:
# Number of entries collected in `links`.
len(links)

11359

In [540]:
# Save the `links` dictionary to links.pkl with pickle.
with open('links.pkl', 'wb') as f:
    pickle.dump(links, f)

### Run function for Fed articles

In [518]:
#fed_dict = dict()

In [519]:
# Same month range as above, but on the Section search for "Economy & Politics|Federal Reserve".
# get_links must be storing into `fed_dict` when this runs.
continuous_search(9, 'http://www.marketwatch.com/search?q=&m=Section&rpp=100&mp=Economy+%26+Politics%7CFederal+Reserve&bd=true&bd=false&bdv=09%2F30%2F2017&rs=true', 5)

In [521]:
# Number of entries collected in `fed_dict`.
len(fed_dict)

300

In [539]:
# Save the `fed_dict` dictionary to fed.pkl with pickle.
with open('fed.pkl', 'wb') as f:
    pickle.dump(fed_dict, f)

### Run function for Econ articles

In [527]:
#econ = dict()

In [529]:
# Same month range again, on the Subject search with mp=ECAT.
# get_links must be storing into `econ` when this runs.
continuous_search(9, 'http://www.marketwatch.com/search?q=&m=Subject&rpp=100&mp=ECAT&bd=true&bd=false&bdv=09%2F30%2F2017&rs=true', 5)

In [530]:
# Number of entries collected in `econ`.
len(econ)

1496

In [538]:
# Save the `econ` dictionary to econ.pkl with pickle.
with open('econ.pkl', 'wb') as f:
    pickle.dump(econ, f)

In [551]:
# Snapshot of `econ` as a list of (title, attrs) pairs.
econlist = list(econ.items())

In [552]:
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
# Possessive ending written with either a straight or a typographic apostrophe.
_POSSESSIVE_RE = re.compile(r"['\u2019]s\b")
_TITLE_KEYWORDS = frozenset(['yellen', 'fed', 'federal', 'reserve', 'fomc'])
# The body list has never included 'federal'; kept as it was.
_BODY_KEYWORDS = frozenset(['yellen', 'fed', 'reserve', 'fomc'])


def _normalize_word(word):
    """Strip possessive and punctuation from a token and return its leading word."""
    # The possessive must go first: once the apostrophe has been removed as
    # punctuation, "yellen's" becomes "yellens" and can no longer be matched.
    word = _POSSESSIVE_RE.sub('', word)
    word = _PUNCTUATION_RE.sub('', word)
    return re.match(r'\w*', word)[0]


def _mentions(text, keywords):
    """Return True if any whitespace-separated word of *text* is in *keywords*."""
    return any(_normalize_word(word) in keywords for word in text.lower().split())


def search_links(econlist=econlist):
    """Split articles into those naming the Fed in the title and in the body.

    Args:
        econlist: Sequence of ``(title, attrs)`` pairs, where ``attrs`` has a
            ``'url'`` key. The default is the module-level ``econlist`` as it
            was when this function was defined.

    Returns:
        ``(strong, soft)``: ``strong`` holds the pairs whose title contains a
        keyword. ``soft`` holds the remaining pairs whose downloaded article
        body contains one, in a paragraph without a ``<strong>`` element.
        Words are compared after lowercasing and after removing possessive
        endings and punctuation, in both the title and the body.
    """
    strong = []
    soft = []
    for link in econlist:
        title = link[0]
        if _mentions(title, _TITLE_KEYWORDS):
            strong.append(link)
            continue

        # Only articles without a title hit are downloaded.
        url = link[1]['url']
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        body = soup.find(id='article-body')
        if body is None:
            # No article body on the page: nothing to match, and one odd page
            # should not abort the whole run.
            continue

        for paragraph in body.find_all('p'):
            if paragraph.find_all('strong'):
                continue
            text = ''.join(paragraph.find_all(text=True))
            if _mentions(text, _BODY_KEYWORDS):
                soft.append(link)
                break

    return strong, soft


In [553]:
# Classify every entry of `econlist`: title matches go to `strong`, body-only matches to `soft`.
strong, soft = search_links()

In [556]:
# Save the `strong` list to strong.pkl with pickle.
with open('strong.pkl', 'wb') as f:
    pickle.dump(strong, f)

In [557]:
# Save the `soft` list to soft.pkl with pickle.
with open('soft.pkl', 'wb') as f:
    pickle.dump(soft, f)

In [None]:
def breadcrumbs(url):
    """Return the breadcrumb labels found on the page at *url*.

    Args:
        url: Address of the page to download.

    Returns:
        A list with the text of every element whose class attribute is
        ``fa fa-angle-right breadcrumb_item``, newlines removed (empty when
        there are none), or the string ``'null'`` if the page could not be
        downloaded or parsed.
    """
    try:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        crumbs = soup.find_all(class_='fa fa-angle-right breadcrumb_item')
        # The matches are Tag objects, so their text has to be extracted
        # before string methods can be applied. Calling .replace() on the Tag
        # itself always failed and made this function return 'null'.
        return [crumb.get_text().replace('\n', '') for crumb in crumbs]
    except Exception:
        # Best effort: callers get the 'null' marker instead of an error.
        return 'null'