## Research Project 1
```text
- Source: Reuters
- Goal: Scrape news about Apple Inc, cluster them
- Techniques: TFIDF, Word Embeddings, Cosine Similarity
- Tools: Word2Vec```

In [91]:
# Standard library
import datetime
from pprint import PrettyPrinter

# Third-party
import requests
import numpy as np
from lxml import html

In [92]:
pp = PrettyPrinter(width=120)
with open("glove.6B/glove.6B.50d.txt", "rb") as lines:
    w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
           for line in lines}

In [94]:
def get_ticker_headlines(ticker, date):
    str_date = date.strftime('%m%d%Y')
    url = 'https://www.reuters.com/finance/stocks/company-news/%s?date=%s' % (ticker, str_date)
    res = requests.get(url)
    tree = html.fromstring(res.content)
    headlines = []
    for div in tree.xpath('//div[@class="feature"]'):
        headline = list(div.itertext())[0]
        headlines.append(headline)
    return headlines

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(word2vec.itervalues().next())
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [90]:
total = {}
start = datetime.datetime(2018, 4, 1)
for day in range(30):
    date = start + datetime.timedelta(day)
    headlines = get_ticker_headlines('AAPL.O', date)
    if headlines:
        total[date.strftime('%Y-%m-%d')] = headlines
pp.pprint(total)

{'2018-04-02': ['UPDATE 3-Apple plans to replace Intel chips in Macs with its own -Bloomberg'],
 '2018-04-03': ['Apple discloses gender pay gap at UK operations',
                'BRIEF-Apple Reports Mean Pay Gap In UK Is 5 Pct Lower For Women',
                'Tech, trade fears weighed on European shares',
                'UPDATE 2-Tech, trade fears weighed on European shares',
                'BOLSAS EUROPA-Tecnológicas, receios comércio pesam nas acções europeias'],
 '2018-04-04': ['Apple Pay arrives in Brazil, partnering with Itaú Unibanco',
                'Apple Pay arrives in Brazil, partnering with Itaú Unibanco'],
 '2018-04-06': ['Apple says repeal of U.S. EPA carbon plan would threaten investments'],
 '2018-04-09': ['Apple says all its facilities now powered by clean energy',
                'BRIEF-Apple Introduces iPhone 8 And iPhone 8 Plus Red Special Edition',
                'Xiaomi pushes for smartphone component suppliers to invest in India',
                'UPDATE 1-

In [95]:
vect = TfidfEmbeddingVectorizer(word2vec=w2v)

AttributeError: 'dict' object has no attribute 'itervalues'