## Research Project 3
---
```text
- Source: Reuters
- Goal: Build headline aggregator (e.g. Google News)
- Techniques: Word Embeddings, Cosine Similarity
- Tools: Tensorflow Hub
- Lines of code: ~70
```

In [11]:
# Standard library
import logging
import datetime

# Third-party
import requests
from lxml import html
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity

# Raise TensorFlow's logging threshold to ERROR so lower-severity messages
# do not clutter the output printed below.
tf.logging.set_verbosity(tf.logging.ERROR)

def get_ticker_headlines(ticker, date,
                         base_url=('https://www.reuters.com/finance/stocks/'
                                   'company-news/%s?date=%s'),
                         timeout=10):
    """Scrape the headlines listed for one ticker on one day.

    Args:
        ticker: Ticker symbol substituted into ``base_url``
            (e.g. ``'AAPL.O'``).
        date: Object providing ``strftime``; rendered as ``MMDDYYYY`` for
            the query string and stored unchanged in every result.
        base_url: %-style template receiving ``(ticker, date_string)``.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            raises instead of blocking the whole crawl forever.

    Returns:
        A list of ``{'text': ..., 'date': ...}`` dicts, one per
        ``<div class="feature">`` that contains text, in document order.
        The list is empty when the page has no such divs or no body.
    """
    url = base_url % (ticker, date.strftime('%m%d%Y'))
    res = requests.get(url, timeout=timeout)
    # NOTE(review): the HTTP status is deliberately not checked (as before);
    # an error page simply yields no matching divs.
    if not res.content:
        # lxml refuses to parse an empty document; treat it as "no news".
        return []

    tree = html.fromstring(res.content)
    headlines = []
    for div in tree.xpath('//div[@class="feature"]'):
        # Only the first text node is the headline; pull it lazily rather
        # than materialising every text node, and skip text-less divs
        # instead of failing with IndexError.
        headline = next(div.itertext(), None)
        if headline is None:
            continue
        headlines.append({'text': headline,
                          'date': date})
    return headlines

def get_all_headlines(tickers,
                      start=datetime.datetime(2018, 4, 1),
                      days=30):
    """Gather headlines for every ticker over a run of consecutive days.

    Args:
        tickers: Iterable of ticker symbols to query.
        start: First day of the window.
        days: Number of consecutive days to query, beginning at ``start``.

    Returns:
        One flat list of headline dicts from all tickers, ordered by their
        ``'date'`` value (ties keep the order in which they were fetched).
    """
    collected = []
    for symbol in tickers:
        for offset in range(days):
            when = start + datetime.timedelta(days=offset)
            # Extending with an empty result is a no-op, so no guard needed.
            collected.extend(get_ticker_headlines(symbol, when))
    collected.sort(key=lambda item: item['date'])
    return collected

def get_similarities(headlines):
    """Return the pairwise cosine-similarity matrix of the headline texts.

    Each headline's ``'text'`` is embedded with the Universal Sentence
    Encoder module loaded from TensorFlow Hub, and the resulting vectors
    are compared against each other.

    Args:
        headlines: Sequence of dicts that each carry a ``'text'`` entry.

    Returns:
        The result of ``cosine_similarity`` applied to the embeddings with
        themselves; entry ``[i][j]`` relates headline ``i`` to headline ``j``.
    """
    texts = [entry['text'] for entry in headlines]
    encoder = hub.Module(
        "https://tfhub.dev/google/universal-sentence-encoder/1")
    embedding_op = encoder(texts)

    with tf.Session() as sess:
        # Variables and lookup tables must be initialised before the
        # embedding op can be evaluated.
        init_ops = [tf.global_variables_initializer(),
                    tf.tables_initializer()]
        sess.run(init_ops)
        vectors = sess.run(embedding_op)

    return cosine_similarity(vectors, vectors)

def cluster(headlines, sims, threshold, tolerance_days=3):
    """Print headlines grouped greedily by similarity and date proximity.

    Headlines are visited in order. Each one that has not yet been assigned
    starts a new group and is printed as its header; every later unassigned
    headline whose similarity to that header exceeds ``threshold`` and whose
    date lies within ``tolerance_days`` is printed beneath it (tab-indented,
    text cut to 80 characters) and marked as assigned.

    Args:
        headlines: Sequence of dicts with ``'text'`` and ``'date'`` entries;
            subtracting two dates must give a ``timedelta``-like value.
        sims: Square matrix indexable as ``sims[i][j]`` holding the
            similarity between headline ``i`` and headline ``j``.
        threshold: Similarity a pair must strictly exceed to be grouped.
        tolerance_days: Pairs this many whole days apart or more are never
            grouped.

    Returns:
        None. The grouping is written to standard output.
    """
    done = set()
    # Visit every index, including the last one, so a trailing headline
    # that matched nothing is still printed as its own group.
    for num1, seed in enumerate(headlines):
        if num1 in done:
            # Already listed as a member of an earlier group. Scanning from
            # it would print its matches under an unrelated header.
            continue
        print('\n%d) %s' % (num1, seed['text']))
        done.add(num1)
        for num2 in range(num1 + 1, len(headlines)):
            if num2 in done:
                continue
            # Take abs() of the timedelta before reading .days: .days
            # floors toward negative infinity, so abs(delta.days) would
            # treat -2.5 days as 3 but +2.5 days as 2.
            gap_days = abs(seed['date'] - headlines[num2]['date']).days
            if sims[num1][num2] > threshold and gap_days < tolerance_days:
                print('\t%d) %s' % (num2, headlines[num2]['text'][:80]))
                done.add(num2)

# Fetch Apple (AAPL.O) headlines over the default window of
# get_all_headlines (30 days starting 2018-04-01).
headlines = get_all_headlines(tickers=['AAPL.O'])
# Pairwise cosine similarity between the embedded headline texts.
sims = get_similarities(headlines)
# Print groups of headlines whose similarity exceeds 0.68 (default date
# tolerance of 3 days applies).
cluster(headlines, sims, threshold=0.68)


0) UPDATE 3-Apple plans to replace Intel chips in Macs with its own -Bloomberg

1) Apple discloses gender pay gap at UK operations
	2) BRIEF-Apple Reports Mean Pay Gap In UK Is 5 Pct Lower For Women

3) Tech, trade fears weighed on European shares
	4) UPDATE 2-Tech, trade fears weighed on European shares

5) BOLSAS EUROPA-Tecnológicas, receios comércio pesam nas acções europeias

6) Apple Pay arrives in Brazil, partnering with Itaú Unibanco
	7) Apple Pay arrives in Brazil, partnering with Itaú Unibanco

8) Apple says repeal of U.S. EPA carbon plan would threaten investments

9) Apple says all its facilities now powered by clean energy

10) BRIEF-Apple Introduces iPhone 8 And iPhone 8 Plus Red Special Edition

11) Xiaomi pushes for smartphone component suppliers to invest in India
	12) UPDATE 1-Xiaomi pushes for smartphone component suppliers to invest in India

13) Apple adds Isaac Asimov sci-fi series to TV development list

14) CORRECTED-Apple Music appoints new head, hits 48 mln su