In [1]:
# Setup (Imports)
from datetime import datetime, timedelta
from collections import defaultdict

import requests
import random
import os
import re

from Database import add_stock_ticks, add_headlines

In [2]:

def consume_ticker_csv(stock, filename):
    """Load daily price rows from a CSV file into the database.

    The file is read from ``../data/<filename>`` (relative to the working
    directory). Every data row must hold exactly seven comma-separated
    fields: date, open, high, low, close, adjusted close and volume. Any line
    containing the text "Date" is treated as the header and skipped, as are
    blank lines. All fields are passed on as strings, unconverted.

    Args:
        stock: Ticker symbol stored alongside every row.
        filename: Name of the CSV file inside the ``../data`` directory.

    Raises:
        ValueError: If a data row does not split into exactly seven fields.
    """
    entries = []

    with open(os.path.join('..', 'data', filename), 'r') as tick_csv:

        for line in tick_csv:

            # Drop the line terminator (and surrounding whitespace); otherwise
            # it stays attached to the last field and volume becomes "123\n".
            line = line.strip()

            # Skip the header row and empty lines (e.g. a trailing newline at
            # the end of the file), which would break the 7-way unpack below.
            if not line or "Date" in line:
                continue

            date, open_, high, low, close, adj_close, volume = line.split(',')

            entries.append((stock, date, open_, high, low, close, adj_close, volume))

    # Single bulk hand-off to the project's Database module.
    add_stock_ticks(entries)


In [3]:

def get_reddit_news(subs, search_terms, limit=None, praw_config='StockMarketML'):
    """Get headlines from Reddit, grouped by day.

    Searches the combined subreddits once per search term and keeps each
    distinct submission title that contains more than four spaces (i.e. at
    least roughly five words).

    Args:
        subs: Subreddit names; they are joined with '+' into one multi-subreddit.
        search_terms: Terms to search for, one search per term.
        limit: Passed through to the PRAW search call as ``limit``.
        praw_config: Site/config name handed to ``praw.Reddit``.

    Returns:
        defaultdict(list) mapping 'YYYY-MM-DD' -> list of submission titles.
    """
    print('Downloading Reddit Posts: ' + ", ".join(subs))

    # Imported lazily so the notebook can be used without praw installed
    # as long as this source is not requested.
    from praw import Reddit

    reddit = Reddit(praw_config)

    # The multi-subreddit handle does not depend on the search term.
    subreddit = reddit.subreddit('+'.join(subs))

    articles = defaultdict(list)

    # A set gives O(1) duplicate checks; titles repeat across search terms.
    seen_titles = set()

    for term in search_terms:

        for submission in subreddit.search(term, limit=limit):

            title = submission.title

            if title.count(' ') > 4 and title not in seen_titles:

                seen_titles.add(title)

                # NOTE(review): fromtimestamp() converts using the machine's
                # local timezone, so the day bucket depends on where this runs.
                # Presumably `created_utc` would be the stable choice - confirm.
                date_key = datetime.fromtimestamp(submission.created).strftime('%Y-%m-%d')

                articles[date_key].append(title)

    return articles

def get_reuters_news(stock, pages=70):
    """Get headlines from Reuters, grouped by day.

    Requests the Reuters company-news page for `stock` once per calendar day,
    starting today and walking backwards, and extracts the text of every
    ``<h2><a ...>`` headline. A leading upper-case tag ending in '-'
    (e.g. "BRIEF-") is removed and each distinct headline is kept only once,
    under the first (most recent) day on which it was seen.

    Args:
        stock: Reuters instrument code, e.g. 'GOOG.O'.
        pages: Number of daily pages (i.e. days) to fetch.

    Returns:
        defaultdict(list) mapping 'YYYY-MM-DD' -> list of headlines.
    """
    print('Downloading Reuters: ' + stock)

    # Set instead of list: duplicate checks stay O(1) as headlines accumulate.
    seen_headlines = set()

    articles = defaultdict(list)

    # Raw strings: the patterns are unchanged, but '\s', '\d', '\/' and '\-'
    # are invalid escape sequences in a normal string literal and trigger
    # warnings on current Python versions.
    pattern_headline = re.compile(r'<h2><a [\s\S]+?>([\s\S]+?)<\/a>[\s\S]*?<\/h2>')
    pattern_prefix = re.compile(r'^[A-Z]+[A-Z\d\s]*\-')

    date_current = datetime.now()

    while pages > 0:

        # Every headline found on this page is filed under the requested day.
        date_key = date_current.strftime('%Y-%m-%d')

        text = requests.get('http://www.reuters.com/finance/stocks/company-news/{}?date={}'.format(stock, date_current.strftime('%m%d%Y')),  headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}).text

        for match in pattern_headline.finditer(text):

            headline = match.group(1)

            # Strip zero-width joiner / zero-width space characters.
            headline = headline.replace('\u200d', '').replace('\u200b', '')

            headline = pattern_prefix.sub('', headline)

            if headline not in seen_headlines:

                seen_headlines.add(headline)
                articles[date_key].append(headline)

        pages -= 1

        date_current -= timedelta(days=1)

    return articles

def get_twitter_news(querys, limit=100):
    """Fetch popular English tweets for each query and group them by day.

    URLs are removed from the tweet text, then every run of characters that
    is not a word character, whitespace, ':' or '/' is removed. A tweet is
    kept only if the cleaned text has no newline, is longer than the query
    itself and contains at least one space.

    Args:
        querys: Search queries (handles / hashtags), one search per query.
        limit: Requested tweets per query; values above 100 are capped at 100.

    Returns:
        defaultdict(list) mapping 'YYYY-MM-DD' -> list of cleaned tweet texts.
    """
    print('Downloading Tweets: ' + ", ".join(querys))

    from twitter import Twitter, OAuth
    import twitter_creds as c # Self-Created Python file with Creds

    oauth = OAuth(c.ACCESS_TOKEN, c.ACCESS_SECRET, c.CONSUMER_KEY, c.CONSUMER_SECRET)
    twitter = Twitter(auth=oauth)

    per_query_count = min(limit, 100)

    articles = defaultdict(list)

    for query in querys:

        response = twitter.search.tweets(q=query, result_type='popular', lang='en', count=per_query_count)

        for tweet in response['statuses']:

            without_urls = re.sub(r'https?:\/\/\S+', '', tweet['text'])
            text = re.sub(r'[^\w\s:/]+', '', without_urls)

            created_at = tweet['created_at']

            # Discard multi-line tweets and ones with nothing but the query term.
            if '\n' in text or len(text) <= len(query) or ' ' not in text:
                continue

            parsed = datetime.strptime(created_at, "%a %b %d %H:%M:%S %z %Y" )

            articles[parsed.strftime('%Y-%m-%d')].append(text)

    return articles

def _parse_seekingalpha_date(label, today):
    """Turn a SeekingAlpha date label into a datetime, relative to `today`."""
    label = label.replace('.', '')

    if 'Today' in label:
        return today

    if 'Yesterday' in label:
        return today - timedelta(days=1)

    parts = label.split(',')

    if len(parts[0]) == 3:
        # First field is three characters long (presumably a weekday
        # abbreviation such as "Mon" - confirm against the live markup), so
        # the label carries no year. Parse month/day together with the
        # current year: parsing "%b %d" alone defaults to the year 1900,
        # which makes "Feb 29" raise ValueError.
        # NOTE(review): a year-less label seen across a year boundary would
        # get the new year - verify whether the site can produce that.
        return datetime.strptime('{} {}'.format(parts[1].strip(), today.year), "%b %d %Y")

    # Otherwise the first two fields are "<Mon> <day>" and " <year>".
    return datetime.strptime("".join(parts[0:2]), "%b %d %Y")


def get_seekingalpha_news(stock, pages=500):
    """Get headlines from SeekingAlpha, grouped by day.

    Fetches the symbol's news page and then the paginated "more news"
    endpoint, extracting headline links and their date labels with regular
    expressions. Headlines and dates are paired positionally; if a page
    yields different numbers of each, the surplus is silently dropped.

    Args:
        stock: Ticker symbol, e.g. 'GOOG'.
        pages: Number of pages to request (page 1 is the main news page).

    Returns:
        defaultdict(list) mapping 'YYYY-MM-DD' -> list of headlines.

    Raises:
        ValueError: If a date label does not match one of the expected formats.
    """
    print('Downloading SeekingAlpha: ' + stock)

    articles = defaultdict(list)

    # Raw strings: same patterns, without invalid '\s' / '\/' string escapes.
    re_headline = re.compile(r'<a class="market_current_title" [\s\S]+?>([\s\S]+?)<\/a>')
    re_dates = re.compile(r'<span class="date pad_on_summaries">([\s\S]+?)<\/span>')

    cookies = None

    for i in range(1, pages + 1):

        if i == 1:
            url = 'https://seekingalpha.com/symbol/{}/news'.format(stock)
        else:
            url = 'https://seekingalpha.com/symbol/{}/news/more_news_all?page={}'.format(stock, i)

        r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}, cookies=cookies)

        # Un-escape quotes so the same patterns match the escaped markup too.
        text = r.text.replace('\\"', '"')
        cookies = r.cookies # SeekingAlpha wants cookies.

        headlines = [match.group(1) for match in re_headline.finditer(text)]
        dates = [match.group(1) for match in re_dates.finditer(text)]

        for headline, date in zip(headlines, dates):

            headline = headline.replace('(update)', '')

            parsed = _parse_seekingalpha_date(date, datetime.today())

            articles[parsed.strftime('%Y-%m-%d')].append(headline)

    return articles


In [4]:

def clean_headline(headline, replacements=None):
    """
    Clean headline

    Lower-cases the text, replaces every percentage ("<digits>%") with the
    token **STATISTIC**, removes every character other than a-z and space,
    collapses runs of spaces and applies the keyword replacements.

    Args:
        headline: Raw headline text.
        replacements: Optional mapping of lower-case keyword -> replacement
            token (e.g. {'apple': '**COMPANY**'}), applied as plain substring
            replacements in the mapping's iteration order.

    Returns:
        The cleaned headline, stripped of leading/trailing whitespace.
    """
    allowed = "abcdefghijklmnopqrstuvwxyz "

    # Split on percentages first and filter each piece separately. Inserting
    # the upper-case 'STAT' marker *before* the a-z filter (as was done
    # previously) deleted the marker, so **STATISTIC** was never produced.
    pieces = re.split(r'\d+%', headline.lower())
    pieces = [''.join(c for c in piece if c in allowed) for piece in pieces]
    headline = 'STAT'.join(pieces)

    headline = re.sub(r'\s+', ' ', headline)

    # `replacements or {}` also avoids a shared mutable default argument.
    for original, replacement in (replacements or {}).items():
        headline = headline.replace(original, replacement)

    # Expanded only now so the lower-case keyword replacements above cannot
    # touch the marker text.
    headline = headline.replace('STAT', '**STATISTIC**')

    headline = headline.replace('****', '** **') # Separate joined kwords

    return headline.strip()


In [11]:

def save_headlines(headlines, kword_replacements=None):
    """Clean all headlines and store them via `add_headlines`, one stock at a time.

    Args:
        headlines: Nested mapping ``{stock: {source: {date: [headline, ...]}}}``.
        kword_replacements: Optional mapping ``{stock: {keyword: token}}``
            forwarded per stock to `clean_headline`. Stocks without an entry
            are cleaned without keyword replacements.
    """
    # None default instead of a shared mutable `{}`.
    kword_replacements = kword_replacements or {}

    for stock, sources in headlines.items():

        # .get(): a missing stock (always the case with the default argument)
        # used to raise KeyError here.
        stock_replacements = kword_replacements.get(stock, {})

        entries = [
            (stock, date, source, clean_headline(headline, stock_replacements))
            for source, by_date in sources.items()
            for date, day_headlines in by_date.items()
            for headline in day_headlines
        ]

        add_headlines(entries)
    

In [6]:

if __name__ == "__main__":

    # Download every source for every stock. Each row of the table below is
    # (ticker, subreddits, Reddit search terms, Twitter queries); the Reuters
    # code is the ticker plus '.O' and SeekingAlpha uses the bare ticker.
    # Result shape: {ticker: {source: {date: [headline, ...]}}}.
    headlines = {
        ticker: {
            'reddit': get_reddit_news(subreddits, reddit_terms),
            'reuters': get_reuters_news(ticker + '.O'),
            'twitter': get_twitter_news(twitter_queries),
            'seekingalpha': get_seekingalpha_news(ticker)
        }
        for ticker, subreddits, reddit_terms, twitter_queries in (
            ('GOOG',
             ['google', 'Android', 'GooglePixel', 'news'],
             ['Google', 'pixel', 'android', 'stock'],
             ['@Google', '#Google', '#googlepixel', '#Alphabet']),
            ('AAPL',
             ['apple', 'ios', 'AAPL', 'news'],
             ['apple', 'iphone', 'ipad', 'ios', 'stock'],
             ['@Apple', '#Apple', '#IPhone', '#ios']),
            ('MSFT',
             ['microsoft', 'windowsphone', 'windows'],
             ['microsoft', 'phone', 'windows', 'stock'],
             ['@Microsoft', '#Windows', '#Microsoft', '#windowsphone']),
            ('AMD',
             ['Amd', 'AMD_Stock', 'pcmasterrace'],
             ['AMD', 'radeon', 'ryzen', 'stock'],
             ['@AMD', '#AMD', '#Ryzen', '#radeon']),
            ('AMZN',
             ['amazon', 'amazonprime', 'amazonecho'],
             ['amazon', 'echo', 'prime', 'stock'],
             ['@amazon', '#Amazon', '#jeffbezos', '@amazonecho', '#amazonprime']),
        )
    }

    # Keyword -> placeholder token, per stock, used to generalize headlines.
    # Company keywords come first, then product keywords, so the resulting
    # dicts keep the same replacement order as a hand-written literal would.
    kword_replacements = {
        ticker: {
            kword: token
            for token, kwords in (('**COMPANY**', company_kwords), ('**PRODUCT**', product_kwords))
            for kword in kwords
        }
        for ticker, company_kwords, product_kwords in (
            ('GOOG', ['google', 'alphabet'], ['android', 'pixel', 'maps', 'youtube', 'chromecast']),
            ('AAPL', ['apple'], ['macbook', 'iphone', 'ipad', 'ios', 'icloud']),
            ('MSFT', ['microsoft'], ['windows']),
            ('AMD', ['amd'], ['ryzen', 'radeon']),
            ('AMZN', ['amazon'], ['echo', 'prime', 'alexa', 'firetv']),
        )
    }


Downloading Reddit Posts: google, Android, GooglePixel, news
Downloading Reuters: GOOG.O
Downloading Tweets: @Google, #Google, #googlepixel, #Alphabet
Downloading SeekingAlpha: GOOG
Downloading Reddit Posts: apple, ios, AAPL, news
Downloading Reuters: AAPL.O
Downloading Tweets: @Apple, #Apple, #IPhone, #ios
Downloading SeekingAlpha: AAPL
Downloading Reddit Posts: microsoft, windowsphone, windows
Downloading Reuters: MSFT.O
Downloading Tweets: @Microsoft, #Windows, #Microsoft, #windowsphone
Downloading SeekingAlpha: MSFT
Downloading Reddit Posts: Amd, AMD_Stock, pcmasterrace
Downloading Reuters: AMD.O
Downloading Tweets: @AMD, #AMD, #Ryzen, #radeon
Downloading SeekingAlpha: AMD
Downloading Reddit Posts: amazon, amazonprime, amazonecho
Downloading Reuters: AMZN.O
Downloading Tweets: @amazon, #Amazon, #jeffbezos, @amazonecho, #amazonprime
Downloading SeekingAlpha: AMZN


In [12]:

if __name__ == "__main__":

    # Clean the downloaded headlines and hand them to the database layer.
    # Requires the download cell (which defines both names) to have run first.
    save_headlines(headlines, kword_replacements=kword_replacements)


**COMPANY** shifts thermostat maker nest into **COMPANY**
**COMPANY** eyes gaming with yeti streaming service report
**COMPANY** names former time warner cable executive to lead internet unit
**COMPANY** names former time warner cable exec to lead internet unit
**COMPANY** eyes chinese esports market with investment in chushou
wall st week aheadwhere netflix goes big tech may follow
wall st week aheadwhere netflix goes big tech may follow despite us government shutdown
**COMPANY** unveils new **PRODUCT** software in india to power cheap smartphones
**PRODUCT** to expand teams reviewing extremist content
uber rejected million settlement with waymo earlier this week
us stockswall street rising facebook **COMPANY** lululemon gain
murdoch calls for fee for trusted news publishers on facebook
**COMPANY** to buy chelsea market building for over bln report
**COMPANY** to expand cloud infrastructure with new regions submarine cables
**COMPANY** says will commission three subsea cables in
**COM

**COMPANY** is preparing to drop support for the physical wallet card on june th
shocked that stock doesnt have builtin screenshot editing
driver ticketed for wearing **COMPANY** glass goes on trial today
fbi snatches **COMPANY** glass off the face of innocent amc moviegoer
**COMPANY** allo drops off the top apps chart on the play store
ios vs **PRODUCT** m visual comparison screenshots
antipiracy group hits indie creators for using the word **PRODUCT**s
report cyanogen inc turns down acquisition attempt by **COMPANY** seeks billion valuation
**COMPANY** play htc stock camera app now available on **COMPANY** play
reasons why **COMPANY** should buy radioshack **COMPANY** could immediately have a bigger retail presence than apple with us stores a rejuvenated workforce and a lucrative business model selling carrier **PRODUCT** devices and accessories
**PRODUCT** m is a big deal
are there any music players for **PRODUCT** that let you access your **COMPANY** music on the cloud besides the 

new leak suggests **COMPANY** will finally kill gb **PRODUCT**s
today in **COMPANY** history scott forstall forced out of **COMPANY**
robots to help stock shelves at walmart stores
adobe finally brings flash to **PRODUCT** and **PRODUCT**
i wish the **PRODUCT** leaks would stop and the new rmbp leaks would start
**COMPANY** issues **PRODUCT** beta for **PRODUCT** **PRODUCT** and ipod touch to developers
nintendo becomes most traded japanese stock in any one day this century
buying an **PRODUCT** from a rd party please check the **PRODUCT** for an activation lock with this website first too many people are still buying expensive paperweights and stolen phones
shkreli is nothing like the stock market genius and savvy entrepreneur he had portrayed say federal prosecutors
gif of **COMPANY**s website in
samsung strikes a b supply deal with **COMPANY** for oled panels
**COMPANY**s stock sets first new alltime high closing price in nearly two years
**PRODUCT** developer goes into extended sup

worlds biggest pension fund loses billion in stock rout the worlds biggest pension fund posted a billion loss last quarter as stocks tumbled and the yen surged wiping out all investment gains since it overhauled its strategy by boosting shares and cutting bonds
**COMPANY** patents a way to make allglass **PRODUCT**s **PRODUCT**s monitors and tvs
kgi new **PRODUCT**s to debut next quarter will slow decline in sales inch model wildcard
yesterday i bought an **PRODUCT** from a blind girl working at the **COMPANY** store
eas day of reckoning is here after star wars game uproar billion in stock value wiped out
**PRODUCT** snivelers put up or stfu a call to reason
dear **COMPANY** let us merge several **COMPANY** ids into one account sincerely multiple account holders
having a widget in the mac notification center showing battery percentage of devices in **PRODUCT** **PRODUCT** **PRODUCT** airpods **COMPANY** watch etc like on **PRODUCT** would be nice
am i the only person who actually enjoy

**PRODUCT** software crimson relive edition release notes
**COMPANY** **PRODUCT** cpu with cores and threads spotted
**COMPANY** **PRODUCT** rx vega is just around the corner videocardzcom
**PRODUCT** ghz vs k ghz retest with faster ddr windows update **PRODUCT** is faster oo
**PRODUCT** mhz ram closing the k gap in gaming mindblank tech
newegg visiontek **PRODUCT** rx screenshot leak
probable rx public stock fs ultra score
new **PRODUCT** build with a twist i was just about to pull the trigger for ti bring vega already look at what happened to me for being loyal
hmm wonder if i need **PRODUCT**
i guess were sharing our **COMPANY** xfx giveaway entries
sapphire **PRODUCT** nitro scheduled for th july
**COMPANY** confirms **PRODUCT** rx and rx specifications
when the hell are the rx s gonna be back in stock
your mission to push the **PRODUCT** rebellion forward with your **PRODUCT**powered station on pcpartpicker every enthusiast who participates in this challenge will receive a limited

it could be recalled that on the th of june kaduna state government signed a year enterprise agreement w
join us to learn why **COMPANY** azure is the best cloud platform for your **PRODUCT** server workloads register now
one of the challenges when demonstrating value of an itinvestment is that not the right people are at the table wh
avanade provides it consulting and services focused on businessanalytics business applications and cloud through
safcsp and **COMPANY** singed an mou that includes knowledge exchange technology transfer and localization of cap
soon you wont have to be a **PRODUCT** insider to test **COMPANY**s newest apps
he loves playing xbox he loves his surface hes a total **COMPANY** kid and anoojs wishday is meeting ceo of
mypov the salesforce **COMPANY** oracle sap strategy is to take transactional data and traverse the stages to
learn more about serverless containers and seamless dev environments with kubernetesio and azure at bitnamis se
**COMPANY**s cortana comes

In [8]:

if __name__ == "__main__":

    # Load each ticker's price history from ../data/<TICKER>.csv.
    for ticker_symbol in ('AAPL', 'AMZN', 'AMD', 'GOOG', 'MSFT'):
        consume_ticker_csv(ticker_symbol, ticker_symbol + '.csv')
