# Scrape every MarketWatch article from June through September 2017

### Import relevant libraries

In [293]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
from collections import defaultdict
import re
import numpy as np
import pandas as pd
import string
from datetime import datetime, timedelta
import quandl
import pickle

# Location of the chromedriver binary; also exported through the environment.
chromedriver = '/Applications/chromedriver'
os.environ.update({'webdriver.chrome.driver': chromedriver})

### Use selenium to remotely navigate marketwatch.com

In [6]:
#Site root; the same value is later prepended to the scraped article links
url = 'http://www.marketwatch.com'
#Start a Chrome session through the chromedriver binary and open the home page
driver = webdriver.Chrome(chromedriver)
driver.get(url)

### Set search parameters

In [None]:
#Open the site search box, then follow the "Advanced Search" link
#(no search term is typed anywhere in this cell, so the query stays empty)
path = '//a[@class="btn btn--outline btn--search"]'
driver.find_element_by_xpath(path).click()
driver.find_element_by_xpath('//a[text()="Advanced Search"]').send_keys(Keys.RETURN)

#Advanced Search: expand the refine-search panel and click the first checkbox input on the page
driver.find_element_by_id('refinesearchtoggle').click()
driver.find_element_by_xpath('//input[@type="checkbox"]').click()

#Set Subject: choose the option with value "806" in the "mp" drop-down
#NOTE(review): presumably the "All MarketWatch" scope - confirm against the page
search_mode = '//select[@id="mp"]/option[@value="806"]'
driver.find_element_by_xpath(search_mode).click()

#Results Per Page: choose 100 in the "rpp" drop-down
results = '//select[@id="rpp"]/option[@value="100"]'
driver.find_element_by_xpath(results).click()

#Set date: type 09/30/2017 into the "bdv" date field
date = driver.find_element_by_id('bdv')
date.click()
date.send_keys('09/30/2017')

#Search: submit the form and remember the resulting URL as the scraper's starting page
driver.find_element_by_xpath('//input[@value="Search"]').click()
current_url = driver.current_url

### Scraping function

In [341]:
#Hard-coded search URL for {Keyword: All MarketWatch, Results Per Page: 100,
#News On Or Before: 9/30/2017}.
#Assigning it here replaces whatever 'current_url' the Selenium cell produced,
#so the browser-driven search does not have to be repeated.
current_url = (
    'http://www.marketwatch.com/search'
    '?q=&m=Keyword&rpp=100&mp=806'
    '&bd=true&bd=false&bdv=09%2F30%2F2017&rs=true'
)

### Function that scrapes each article's link
### MUST CHECK THE DICTIONARY AT THE END OF THE LOOP

In [524]:
def get_links(url):
    """Scrape one search-results page and record every article it lists.

    The page at ``url`` is downloaded with ``requests`` and each element of
    class ``searchresult`` whose first link sits directly inside a ``div`` is
    stored in the module-level dict ``econ`` as
    ``econ[title] = {'url': href, 'date': date_text}``.  The target dict is
    hard-coded here, so it must be edited by hand to collect into another one.

    Returns the date text that follows the first result on the page.  An
    ``IndexError`` is raised when the page has no results at all.
    """
    html = requests.get(url).text
    results = BeautifulSoup(html, 'html.parser').find_all(class_='searchresult')

    for result in results:
        # Skip result blocks that carry no hyperlink.
        if not result.find('a', href=True):
            continue
        anchor = result.a
        # Only links whose direct parent is a <div> are article headlines.
        if anchor.parent.name != 'div':
            continue
        econ[anchor.text] = {  #MUST CHECK THIS DICTIONARY
            'url': anchor['href'],
            # The timestamp lives in the element right after the result.
            'date': result.next_sibling.span.text,
        }

    return results[0].next_sibling.span.text

In [525]:
def get_date(date):
    """Turn a scraped timestamp string into a ``datetime`` (date part only).

    Expects text such as ``'6:40 p.m. Sept. 29, 2017'``: everything from the
    first capital letter onwards must read "<Month> <day> <year>" once
    punctuation is removed.  The time-of-day prefix is discarded.
    """
    # Drop all punctuation except ':' (e.g. "p.m." -> "pm", "29," -> "29").
    removable = string.punctuation.replace(':', '')
    cleaned = re.sub('[%s]' % re.escape(removable), '', date)

    # The month name is the first capitalised token; keep from there on.
    from_month = re.search(r'([A-Z].*)', cleaned)[1]

    # Cut month names down to three letters so '%b' can parse them.
    abbreviated = re.sub(r'([A-Za-z]{3})[a-z]*', r'\1', from_month)

    return datetime.strptime(abbreviated, '%b %d %Y')

In [526]:
def continuous_search(month, current_url, stop_month):
    """Walk backwards through search results, scraping every page on the way.

    Starting at ``current_url`` the loop scrapes the page with ``get_links``,
    then either clicks "Next" or, when there is no usable "Next" link,
    re-runs the search with the date box set to one day after the date of the
    first result on the page just scraped.  It stops once that computed date
    falls in ``stop_month``.

    Parameters
    ----------
    month : int
        Initial value for the loop condition; overwritten after every
        successfully scraped page.
    current_url : str
        URL of the first search-results page to scrape.
    stop_month : int
        Month number (1-12) that ends the loop.

    Notes
    -----
    The loop only ends when a scraped page yields a date in ``stop_month``;
    if that month is never seen the loop keeps running.  Ctrl-C now works
    because only ``Exception`` subclasses are swallowed.
    """

    def _search_on_or_before(day_text):
        """Type ``day_text`` into the date box, search, return the new URL."""
        date_box = driver.find_element_by_id('bdv')
        date_box.clear()
        date_box.send_keys(day_text)
        driver.find_element_by_xpath('//input[@value="Search"]').click()
        return driver.current_url

    driver = webdriver.Chrome(chromedriver)
    search_date = None  # set after the first page has been scraped

    try:
        driver.get(current_url)

        while month != stop_month:
            try:
                first_link_date = get_links(current_url)
                next_day = get_date(first_link_date) + timedelta(days=1)
                search_date = next_day.strftime('%m/%d/%Y')
                month = next_day.month

                try:
                    driver.find_element_by_xpath('//a[text()="Next"]').click()
                    current_url = driver.current_url
                except Exception:
                    # No usable "Next" link: restart the search from the
                    # date just computed.
                    current_url = _search_on_or_before(search_date)

            except Exception:
                # Scraping or the fallback failed.  Without a previously
                # computed date there is nothing to retry with, so surface
                # the real error instead of an UnboundLocalError.
                if search_date is None:
                    raise
                current_url = _search_on_or_before(search_date)
    finally:
        # Always close the browser, including on errors and Ctrl-C.
        driver.quit()
            

### Call function to get all links for 4 months

## Initialize the links dictionary
## Only run the initialization the first time (re-running it empties the dictionary)
## Verify that the dictionary written to inside the get_links function is this same dictionary

In [None]:
#links = dict()

In [345]:
# Scrape from the 30 Sep 2017 search backwards until a page dated in May is reached.
# get_links stores into the dict named in its body - check it is the intended one first.
continuous_search(month=9, current_url=current_url, stop_month=5)

In [523]:
len(links)

11359

In [540]:
# Persist the scraped links so the crawl does not have to be repeated.
with open('links.pkl', mode='wb') as out_file:
    pickle.dump(links, out_file)

### Run function for Fed articles

In [518]:
#fed_dict = dict()

In [519]:
# Same crawl, restricted to the "Economy & Politics|Federal Reserve" section search.
# get_links stores into the dict named in its body - check it is the intended one first.
continuous_search(
    month=9,
    current_url='http://www.marketwatch.com/search?q=&m=Section&rpp=100&mp=Economy+%26+Politics%7CFederal+Reserve&bd=true&bd=false&bdv=09%2F30%2F2017&rs=true',
    stop_month=5,
)

In [521]:
len(fed_dict)

300

In [539]:
# Persist the Federal Reserve section results.
with open('fed.pkl', mode='wb') as out_file:
    pickle.dump(fed_dict, out_file)

### Run function for Econ articles

In [527]:
#econ = dict()

In [529]:
# Same crawl, restricted to the subject search with mp=ECAT.
# get_links stores into the dict named in its body - check it is the intended one first.
continuous_search(
    month=9,
    current_url='http://www.marketwatch.com/search?q=&m=Subject&rpp=100&mp=ECAT&bd=true&bd=false&bdv=09%2F30%2F2017&rs=true',
    stop_month=5,
)

In [530]:
len(econ)

1496

In [538]:
# Persist the economy-subject results.
with open('econ.pkl', mode='wb') as out_file:
    pickle.dump(econ, out_file)

In [551]:
# Snapshot of econ as a list of (title, {'url': ..., 'date': ...}) pairs.
econlist = list(econ.items())

In [552]:
# Keywords that mark a headline / an article body as Fed-related.
# NOTE(review): the body set has no 'federal' entry; that difference is kept
# exactly as it was - confirm it is intentional.
_TITLE_KEYWORDS = frozenset(['yellen', 'fed', 'federal', 'reserve', 'fomc'])
_BODY_KEYWORDS = frozenset(['yellen', 'fed', 'reserve', 'fomc'])

# ASCII punctuation plus curly quotes, stripped from both ends of a token.
_EDGE_CHARS = string.punctuation + '\u2018\u2019\u201c\u201d'
_POSSESSIVE_RE = re.compile("['\u2019]s$")
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))

# Used to resolve site-relative article links before fetching them.
_SITE_ROOT = 'http://www.marketwatch.com'


def _normalize_word(word):
    """Reduce one lower-cased token to a bare word for keyword matching."""
    word = word.strip(_EDGE_CHARS)        # "fed," / "(fed)" -> "fed"
    word = _POSSESSIVE_RE.sub('', word)   # "fed's" / "fed’s" -> "fed"
    word = _PUNCT_RE.sub('', word)        # "u.s" -> "us"
    return re.match(r'\w*', word)[0]


def _mentions(text, keywords):
    """Return True when any whitespace-separated word of ``text`` is a keyword."""
    return any(_normalize_word(w) in keywords for w in text.lower().split())


def search_links(econlist=econlist):
    """Split scraped articles into headline hits and body-only hits.

    Parameters
    ----------
    econlist : list of (title, {'url': ..., 'date': ...}) tuples
        Articles to classify.

    Returns
    -------
    (strong, soft) : tuple of lists
        ``strong`` holds the entries whose headline contains a title keyword.
        ``soft`` holds the remaining entries whose article body (paragraphs
        without a ``<strong>`` child) contains a body keyword.  Entries that
        match neither are dropped.

    Words are normalised the same way for headlines and bodies, so trailing
    punctuation and possessives ("Fed's", "Yellen.") no longer hide a match.
    Articles whose page has no ``article-body`` element are skipped instead
    of aborting the whole run.
    """
    from urllib.parse import urljoin

    strong = []
    soft = []

    for link in econlist:
        title = link[0]

        if _mentions(title, _TITLE_KEYWORDS):
            strong.append(link)
            continue

        # Stored hrefs may be site-relative; urljoin leaves absolute URLs as-is.
        url = urljoin(_SITE_ROOT, link[1]['url'])
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')

        body = soup.find(id='article-body')
        if body is None:
            # Page layout without an article body: nothing to search.
            continue

        for paragraph in body.find_all('p'):
            # Paragraphs containing <strong> are excluded from the search.
            if paragraph.find_all('strong'):
                continue
            text = ''.join(paragraph.find_all(text=True))
            if _mentions(text, _BODY_KEYWORDS):
                soft.append(link)
                break  # one hit is enough to classify the article

    return strong, soft


In [553]:
#Classify the econ articles: strong = keyword in the headline, soft = keyword only in the article body
strong, soft = search_links()

In [556]:
# Persist the headline matches.
with open('strong.pkl', mode='wb') as out_file:
    pickle.dump(strong, out_file)

In [557]:
# Persist the body-only matches.
with open('soft.pkl', mode='wb') as out_file:
    pickle.dump(soft, out_file)

In [493]:
#Empty frame; its columns are filled from the scraped links in the following cells
df = pd.DataFrame()

In [494]:
# Snapshot of links as a list of (title, {'url': ..., 'date': ...}) pairs.
dictlist = list(links.items())

In [495]:
# One row per scraped article: lower-cased title, raw timestamp text, link.
df['article'] = [title.lower() for title, _ in dictlist]
df['datetime'] = [meta['date'] for _, meta in dictlist]
df['url'] = [meta['url'] for _, meta in dictlist]

In [496]:
# Every punctuation mark except ':' is removed so the clock time stays intact.
regex = re.compile('[%s]' % re.escape(string.punctuation.replace(':', '')))


def _clock_text(stamp):
    """Return the first two whitespace-separated tokens, e.g. '6:40 pm'."""
    return re.search(r'^(.+?\s.+?)\s', stamp)[1]


def _clock_time(text):
    """Parse a 12-hour clock string into a ``datetime.time``."""
    return datetime.strptime(text, '%I:%M %p').time()


def _calendar_day(stamp):
    """Parse the '<Month> <day> <year>' tail of a timestamp into a datetime."""
    from_month = re.search(r'([A-Z].*)', stamp)[1]
    # Month names are cut to three letters so '%b' can read them.
    short_month = re.sub(r'([A-Za-z]{3})[a-z]*', r'\1', from_month)
    return datetime.strptime(short_month, '%b %d %Y')


df['datetime'] = df['datetime'].apply(lambda raw: regex.sub('', raw))

# 'time' keeps the text form; 'time_string' holds datetime.time objects.
df['time'] = df['datetime'].apply(_clock_text)
df['time_string'] = df['time'].apply(_clock_time)

# 'date' holds midnight datetimes; 'date_string' holds datetime.date objects.
df['date'] = df['datetime'].apply(_calendar_day)
df['date_string'] = df['date'].apply(lambda day: day.date())

In [497]:
def _combine_row(row):
    """Join a row's calendar date and clock time into one datetime."""
    return datetime.combine(row['date_string'], row['time_string'])


# Replace the raw timestamp text with a real datetime per article.
df['datetime'] = df.apply(_combine_row, axis=1)

In [498]:
# Calendar month (1-12) and weekday (Monday == 0) of each article.
df['month'] = [stamp.month for stamp in df['datetime']]
df['day'] = [stamp.weekday() for stamp in df['datetime']]

In [499]:
# Prefix site-relative links with the site root.  Rows that already start
# with 'http' are left alone, so re-running this cell no longer prepends the
# host a second time.
df['url'] = df['url'].where(
    df['url'].str.startswith('http', na=False),
    url + df['url'],
)

In [500]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11359 entries, 0 to 11358
Data columns (total 9 columns):
article        11359 non-null object
datetime       11359 non-null datetime64[ns]
url            11359 non-null object
time           11359 non-null object
time_string    11359 non-null object
date           11359 non-null datetime64[ns]
date_string    11359 non-null object
month          11359 non-null int64
day            11359 non-null int64
dtypes: datetime64[ns](2), int64(2), object(5)
memory usage: 798.8+ KB


In [501]:
df.head()

Unnamed: 0,article,datetime,url,time,time_string,date,date_string,month,day
0,the world’s cryptocurrencies are bigger than p...,2017-09-29 18:40:00,http://www.marketwatch.com/story/the-worlds-cr...,6:40 pm,18:40:00,2017-09-29,2017-09-29,9,4
1,sec charges two initial coin offerings with fraud,2017-09-29 18:33:00,http://www.marketwatch.com/story/sec-charges-t...,6:33 pm,18:33:00,2017-09-29,2017-09-29,9,4
2,activist nelson peltz wins more backing in bat...,2017-09-29 18:24:00,http://www.marketwatch.com/story/activist-nels...,6:24 pm,18:24:00,2017-09-29,2017-09-29,9,4
3,aig no longer a threat to u.s. financial stabi...,2017-09-29 18:22:00,http://www.marketwatch.com/story/aig-no-longer...,6:22 pm,18:22:00,2017-09-29,2017-09-29,9,4
4,the americans most likely to fear financial cr...,2017-09-29 17:36:00,http://www.marketwatch.com/story/the-americans...,5:36 pm,17:36:00,2017-09-29,2017-09-29,9,4


In [502]:
# Chronological order first, then index the rows by calendar date.
df = df.sort_values(by='datetime')
df = df.set_index('date')

In [505]:
# Keep only rows dated 1 June 2017 or later.
df = df.loc['2017-06-01':]

In [506]:
df

Unnamed: 0_level_0,article,datetime,url,time,time_string,date_string,month,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-06-01,china daily yuan fix sees biggest jump in months,2017-06-01 00:47:00,http://www.marketwatch.com/story/china-daily-y...,12:47 am,00:47:00,2017-06-01,6,3
2017-06-01,australian retail sales rebound sharply in april,2017-06-01 00:48:00,http://www.marketwatch.com/story/australian-re...,12:48 am,00:48:00,2017-06-01,6,3
2017-06-01,"japan stocks change tack, gain as other asian ...",2017-06-01 01:55:00,http://www.marketwatch.com/story/japan-stocks-...,1:55 am,01:55:00,2017-06-01,6,3
2017-06-01,the dirty secret wall street tries to hide as ...,2017-06-01 02:28:00,http://www.marketwatch.com/story/the-dirty-sec...,2:28 am,02:28:00,2017-06-01,6,3
2017-06-01,marc andreessen says the idea that robots will...,2017-06-01 02:30:00,http://www.marketwatch.com/story/marc-andreess...,2:30 am,02:30:00,2017-06-01,6,3
2017-06-01,these commodities clocked the biggest moves in...,2017-06-01 02:30:00,http://www.marketwatch.com/story/these-commodi...,2:30 am,02:30:00,2017-06-01,6,3
2017-06-01,"facebook, snap are sitting on a $16 billion op...",2017-06-01 02:32:00,http://www.marketwatch.com/story/facebook-snap...,2:32 am,02:32:00,2017-06-01,6,3
2017-06-01,ohio sues 5 drug companies over opioid epidemic,2017-06-01 02:56:00,http://www.marketwatch.com/story/ohio-sues-5-d...,2:56 am,02:56:00,2017-06-01,6,3
2017-06-01,ecb urged by germans to prepare to unwind qe,2017-06-01 03:09:00,http://www.marketwatch.com/story/ecb-urged-by-...,3:09 am,03:09:00,2017-06-01,6,3
2017-06-01,johnson matthey profit rises 19% for year,2017-06-01 03:10:00,http://www.marketwatch.com/story/johnson-matth...,3:10 am,03:10:00,2017-06-01,6,3


In [507]:
len(df)

10948

In [396]:
df.groupby(['month', 'date']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,article,url,time
month,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,Jun 1 2017,126,126,126
6,Jun 10 2017,17,17,17
6,Jun 11 2017,26,26,26
6,Jun 12 2017,106,106,106
6,Jun 13 2017,126,126,126
6,Jun 14 2017,112,112,112
6,Jun 15 2017,120,120,120
6,Jun 16 2017,89,89,89
6,Jun 17 2017,29,29,29
6,Jun 18 2017,33,33,33


In [None]:
# Daily quotes read from a local CSV with 'Date' and 'Close' columns
# (presumably the S&P 500, going by the file name - confirm).
spx = pd.read_csv('/Users/samfunk/Downloads/GSPC.csv')

spx['Date'] = pd.to_datetime(spx['Date'], format='%Y-%m-%d')
spx = spx.sort_values(by='Date').set_index('Date')

# Row positions of the dates nearest to 1 Jan and 1 Jul 2017.
# Index.get_loc(..., method='nearest') was removed in pandas 2.0;
# get_indexer is the supported replacement.
# NOTE(review): neither value is used in the code visible here.
start = spx.index.get_indexer([pd.to_datetime('2017-01-01')], method='nearest')[0]
end = spx.index.get_indexer([pd.to_datetime('2017-07-01')], method='nearest')[0]

# Copy the slice so the column assignments below write to an independent
# frame rather than to a view of the full history (avoids SettingWithCopy).
spx = spx['09-15-2017':].copy()

# Absolute and relative day-over-day change of the closing price.
spx['First Difference'] = spx['Close'].diff()
spx['Daily Difference'] = spx['Close'].diff() / spx['Close'].shift()