## Research Project 1
```text
- Source: Reuters
- Goal: Build headline aggregator (e.g. Google News)
- Techniques: TFIDF, Word Embeddings, Cosine Similarity
- Tools: GloVe
- Lines of code: ~100
```

In [4]:
# Pick one company-news page on reuters.com to explore by hand.
# The ticker (AAPL.O) is part of the path and the day is passed in the
# `date` query parameter.
url = 'https://www.reuters.com/finance/stocks/company-news/AAPL.O?date=05102018'

In [9]:
# Download the page over HTTP and peek at the first 1000 bytes of the raw
# (undecoded) response body.
import requests
res = requests.get(url)
res.content[:1000]

b'<!--[if !IE]> This has NOT been served from cache <![endif]-->\n<!--[if !IE]> Request served from apache server: produs--i-0211636dda8f9f281 <![endif]-->\n<!--[if !IE]> token: f6fab749-fff3-4017-a510-dfd50d24e039 <![endif]-->\n<!--[if !IE]> App Server /produs--i-0211636dda8f9f281/ <![endif]-->\n\n<!doctype html><html lang="en"><head>\n<title>Apple Inc (AAPL.O)  News| Reuters.com</title>\n    <meta http-equiv="X-UA-Compatible" content="IE=edge"><meta charset="utf-8"><meta http-equiv="x-dns-prefetch-control" content="on"><link rel="dns-prefetch" href="//s1.reutersmedia.net"/><link rel="dns-prefetch" href="//s2.reutersmedia.net"/><link rel="dns-prefetch" href="//s3.reutersmedia.net"/><link rel="dns-prefetch" href="//s4.reutersmedia.net"/><link rel="dns-prefetch" href="//static.reuters.com"/><link rel="dns-prefetch" href="//www.googletagservices.com"/><link rel="dns-prefetch" href="//www.googletagmanager.com"/><link rel="dns-prefetch" href="//www.google-analytics.com"/><link rel="dns-pre

In [19]:
# Parse the raw HTML bytes into an lxml element tree and list the direct
# children of the root element.
from lxml import html
tree = html.fromstring(res.content)
tree.getchildren()

[<Element head at 0x128e98368>, <Element body at 0x11f6a1318>]

In [20]:
# Which elements do we need? Select every div, anywhere in the document,
# whose class attribute is exactly "feature".
children = tree.xpath('//div[@class="feature"]')

In [23]:
# Inspect every text fragment contained in the first matching div.
list(children[0].itertext())

['US STOCKS-Wall St rallies and Apple approaches $1 trillion value',
 '\n\t',
 '* Indexes up: Dow 0.80 pct, S&P 500 0.94 pct, Nasdaq 0.89\npct\n(Updates to close)',
 '\n\t']

In [22]:
# Get headlines: keep only the first text fragment of each matching div,
# then display the resulting list.
headlines = [list(child.itertext())[0] for child in children]
headlines

['US STOCKS-Wall St rallies and Apple approaches $1 trillion value',
 'UPDATE 1-Goldman Sachs, Apple to launch joint credit card - WSJ',
 'Goldman Sachs, Apple to launch joint credit card - WSJ',
 'BRIEF-Apple, Goldman Sachs Team Up On New Credit Card - WSJ',
 'Apple scraps $1 billion Irish data center over planning delays',
 'Apple drops plans for data centre in Ireland due to planning delays - RTE']

### Final code

In [3]:
# Standard library
import datetime
from collections import defaultdict

# Third-party
import spacy
import requests
import numpy as np
from lxml import html
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the pre-trained 50-dimensional GloVe vectors into a {token: vector} map.
# Every non-blank line is a token followed by its whitespace-separated
# components. The file is read explicitly as UTF-8 instead of relying on the
# locale-dependent default encoding (assumes the GloVe file is UTF-8 as
# distributed).
W2V = {}
with open("glove.6B/glove.6B.50d.txt", "r", encoding="utf-8") as lines:
    for line in lines:
        parts = line.split()
        if not parts:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        W2V[parts[0]] = np.array(parts[1:], dtype=float)

# spaCy pipeline used by tokenize(); loaded once at import time.
NLP = spacy.load('en_core_web_sm')

def tokenize(text, lower=True):
    """Split *text* into a list of token strings using the spaCy pipeline ``NLP``.

    When *lower* is true (the default) the text is lower-cased before it is
    handed to the pipeline; otherwise it is processed unchanged.
    """
    prepared = text.lower() if lower else text
    return [tok.text for tok in NLP(prepared)]

class TfidfEmbeddingVectorizer:
    """Embed tokenized documents as IDF-weighted averages of word vectors.

    ``word2vec`` maps a token to its vector. ``fit`` learns one IDF weight per
    token from a corpus of token lists; ``transform`` then averages, for each
    document, the vectors of its known tokens scaled by those weights.
    """

    def __init__(self, word2vec):
        """Store the token -> vector mapping and read the vector size from it."""
        self.word2vec = word2vec
        # Filled in by fit(); maps token -> IDF weight.
        self.word2weight = None
        # Dimensionality is taken from the first vector in the mapping.
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X):
        """Learn IDF weights from ``X``, an iterable of token lists.

        Returns ``self`` so that calls can be chained, following the
        scikit-learn estimator convention.
        """
        # The documents are already tokenized, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # A token never seen during fit is treated as at least as rare as the
        # rarest known token: it gets the largest IDF observed.
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
        return self

    def transform(self, X):
        """Return one embedding row per document in ``X``.

        Tokens missing from ``word2vec`` are ignored. A document with no known
        token maps to the zero vector. The result always has shape
        ``(len(X), dim)``, including for an empty ``X``.
        """
        rows = []
        for words in X:
            weighted = [self.word2vec[w] * self.word2weight[w]
                        for w in words if w in self.word2vec]
            if weighted:
                rows.append(np.mean(weighted, axis=0))
            else:
                rows.append(np.zeros(self.dim))
        # reshape keeps the output 2-D even when there are no documents.
        return np.array(rows, dtype=float).reshape(-1, self.dim)

def get_ticker_headlines(ticker, date, 
                         base_url=('https://www.reuters.com/finance/stocks/'
                                   'company-news/%s?date=%s'),
                         timeout=10):
    """Scrape the headlines listed for one ticker on one day.

    Args:
        ticker: Ticker string inserted into the URL path (e.g. 'AAPL.O').
        date: Object with ``strftime``; rendered as MMDDYYYY in the URL and
            stored unchanged on every returned headline.
        base_url: %-style template taking the ticker and the formatted date.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection fails instead of blocking forever.

    Returns:
        A list of dicts with keys 'original' (headline text), 'tokenized'
        (output of ``tokenize``) and 'date'. Empty when the page has no
        matching elements or the response body is empty.
    """
    str_date = date.strftime('%m%d%Y')
    url = base_url % (ticker, str_date)
    res = requests.get(url, timeout=timeout)
    if not res.content.strip():
        # lxml raises a ParserError on an empty document; an empty body
        # simply means there is nothing to report for this day.
        return []
    tree = html.fromstring(res.content)
    headlines = []
    for div in tree.xpath('//div[@class="feature"]'):
        # The headline is the first text fragment inside the div; skip divs
        # that contain no text at all rather than failing on them.
        headline = next(div.itertext(), None)
        if headline is None:
            continue
        headlines.append({'original': headline,
                          'tokenized': tokenize(headline),
                          'date': date})
    return headlines

def get_all_headlines(tickers,
                      start=datetime.datetime(2018, 4, 1), 
                      days=30):
    """Collect headlines for several tickers over a run of consecutive days.

    For every ticker, ``get_ticker_headlines`` is queried once per day for
    ``days`` days beginning at ``start``. The combined results are returned
    as a single list ordered by each headline's 'date'.
    """
    collected = []
    for ticker in tickers:
        for offset in range(days):
            when = start + datetime.timedelta(days=offset)
            found = get_ticker_headlines(ticker, when)
            if found:
                collected.extend(found)
    collected.sort(key=lambda item: item['date'])
    return collected

def get_similarities(total):
    """Return the pairwise cosine-similarity matrix of the given headlines.

    Args:
        total: List of headline dicts; only the 'tokenized' entry is read.

    Returns:
        A square array where entry [i][j] is the cosine similarity between
        the embeddings of headlines i and j. For an empty ``total`` an empty
        (0, 0) array is returned.
    """
    if not total:
        # Nothing was scraped: fitting TF-IDF on an empty corpus would raise,
        # so return an empty matrix instead.
        return np.zeros((0, 0))
    vect = TfidfEmbeddingVectorizer(word2vec=W2V)
    tokenized = [i['tokenized'] for i in total]
    vect.fit(tokenized)
    transformed = vect.transform(tokenized)
    return cosine_similarity(transformed, transformed)

def cluster(headlines, sims, threshold=0.89, tolerance_days=3):
    """Print the headlines grouped with their near-duplicates.

    Each headline not yet assigned to a group is printed as a group leader
    ('index) text (date)'). Every later headline that is more similar than
    ``threshold`` and fewer than ``tolerance_days`` days apart is printed
    below it, indented with a tab, and marked as assigned.

    Args:
        headlines: List of dicts with 'original' (text) and 'date' (datetime).
        sims: Square matrix indexable as sims[i][j] with pairwise similarities.
        threshold: Similarity strictly above which two headlines are grouped.
        tolerance_days: Two headlines are grouped only when their dates differ
            by strictly fewer than this many days.

    Returns:
        None; the grouping is written to standard output.
    """
    done = set()
    count = len(headlines)
    # Visit every index, including the last one: a final headline that matched
    # nothing earlier must still be printed as its own group leader.
    for num1 in range(count):
        first = headlines[num1]
        if num1 not in done:
            print('%d) %s (%s)' % (num1, first['original'], 
                                   first['date'].date()))
            done.add(num1)
        # NOTE: this scan also runs for headlines that already belong to an
        # earlier group, so matches can be attached transitively.
        for num2 in range(num1 + 1, count):
            if num2 in done:
                continue
            second = headlines[num2]
            diff = (first['date'] - second['date']).days
            if sims[num1][num2] > threshold and abs(diff) < tolerance_days:
                print('\t%d) %s (%s)' % (num2, second['original'], 
                                         second['date'].date()))
                done.add(num2)
            
# Run the whole pipeline for the ticker AAPL.O: scrape headlines over the
# default date window of get_all_headlines, compute their pairwise
# similarities, and print the grouped result.
headlines = get_all_headlines(tickers=['AAPL.O'])
sims = get_similarities(headlines)
cluster(headlines, sims)

0) UPDATE 3-Apple plans to replace Intel chips in Macs with its own -Bloomberg (2018-04-02)
1) Apple discloses gender pay gap at UK operations (2018-04-03)
	2) BRIEF-Apple Reports Mean Pay Gap In UK Is 5 Pct Lower For Women (2018-04-03)
3) Tech, trade fears weighed on European shares (2018-04-03)
	4) UPDATE 2-Tech, trade fears weighed on European shares (2018-04-03)
5) BOLSAS EUROPA-Tecnológicas, receios comércio pesam nas acções europeias (2018-04-03)
6) Apple Pay arrives in Brazil, partnering with Itaú Unibanco (2018-04-04)
	7) Apple Pay arrives in Brazil, partnering with Itaú Unibanco (2018-04-04)
8) Apple says repeal of U.S. EPA carbon plan would threaten investments (2018-04-06)
9) Apple says all its facilities now powered by clean energy (2018-04-09)
10) BRIEF-Apple Introduces iPhone 8 And iPhone 8 Plus Red Special Edition (2018-04-09)
	14) CORRECTED-Apple Music appoints new head, hits 48 mln subscribers (2018-04-11)
11) Xiaomi pushes for smartphone component suppliers to invest 