# Description:
In this notebook we request 100 news articles from the NewsAPI (the maximum allowed) and use their truncated content to build a corpus. We then preprocess the corpus with the project's CorpusPreprocess scikit-learn-like transformer. We train a gensim doc2vec model on the preprocessed corpus and assess it in two ways: by checking the self-similarity rank of each document (each document should be most similar to itself), and by comparing the content of random documents with the content of their most similar documents. Finally, we request new documents from NewsAPI, apply the same preprocessing, infer their vectors and assess their quality by retrieving their most similar documents.

# TODO:
- add date, price, weekday, ... token to CorpusPreprocess
- webscrape full content from urls provided by api

In [None]:
from src.utils import CorpusPreprocess, check_random_doc_similarity, compare_documents, similarity_query
import os
from dotenv import load_dotenv, find_dotenv
from datetime import datetime, timedelta
import random
import collections
from newsapi import NewsApiClient
from string import punctuation
from nltk.corpus import stopwords
from gensim import models
# import numpy as np
# from scipy.spatial.distance import pdist, squareform

In [None]:
# Locate the .env file by walking up the directory tree, then load its
# entries as environment variables.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# NewsAPI key read from the environment (None when the variable is not set).
NEWSAPIKEY = os.getenv("NEWSAPIKEY")

In [None]:
# NewsAPI client authenticated with the key loaded from the environment.
newsapi = NewsApiClient(api_key=NEWSAPIKEY)

# Request the current US sports top headlines, 100 per page.
# Other categories: 'business', 'entertainment', 'general', 'health', 'science', 'technology'.
articles = newsapi.get_top_headlines(
    language='en',
    category='sports',
    country='us',
    page_size=100,
)

# Keep every non-empty article body exactly once. dict.fromkeys de-duplicates
# while preserving the order of the API response, so corpus indices are stable
# for a given response (a set would reorder the texts on every kernel restart).
corpus = list(dict.fromkeys(
    article['content'] for article in articles['articles'] if article['content']
))

# Guard the preview so an empty response is reported instead of raising IndexError.
if corpus:
    print("Example of article content:\n\n{}".format(corpus[0]))
else:
    print("No article content was returned by NewsAPI.")

In [None]:
# Train/test split: hold out a random 10% of the documents for testing.
TEST_FRACTION = 0.1
SPLIT_SEED = 42

# Dedicated, seeded generator: the split is reproducible for a given corpus
# and does not disturb the global `random` state used by other cells.
split_rng = random.Random(SPLIT_SEED)
test_idx = split_rng.sample(range(len(corpus)), int(len(corpus) * TEST_FRACTION))
held_out = set(test_idx)  # set for O(1) membership checks below

test_corpus = [corpus[i] for i in test_idx]
# Split by index rather than by text value: this keeps the training documents
# in corpus order and does not depend on every text being unique.
train_corpus = [doc for i, doc in enumerate(corpus) if i not in held_out]

In [None]:
# Preprocessing configuration: stop-word removal, lowercasing, accent and
# punctuation stripping, stemming, and max_df / min_df thresholds.
prep_params = dict(
    stop_words=stopwords.words('english'),
    lowercase=True,
    strip_accents=True,
    strip_punctuation=punctuation,
    stemmer=True,
    max_df=0.2,
    min_df=2,
)
prep = CorpusPreprocess(**prep_params)

# Fit on the training documents only; the fitted transformer is then applied
# unchanged to the held-out documents.
processed_train_corpus = prep.fit_transform(train_corpus)
processed_test_corpus = prep.transform(test_corpus)

sample_processed_doc = processed_train_corpus[0]
print("Example of preprocessed article content:\n\n{}".format(sample_processed_doc))

In [None]:
# Wrap each preprocessed training document in a TaggedDocument (the input
# format of doc2vec), tagged with its position in the training corpus.
TaggedDocument = models.doc2vec.TaggedDocument
tagged_corpus = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(processed_train_corpus)
]

# Doc2Vec model: build the vocabulary first, then train for the configured epochs.
doc2vec_params = {"vector_size": 40, "min_count": 2, "epochs": 200}
model = models.doc2vec.Doc2Vec(**doc2vec_params)
model.build_vocab(tagged_corpus)
model.train(
    tagged_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# Words in the model vocabulary.
# gensim >= 4.0 removed `wv.vocab` in favour of `wv.key_to_index`; fall back to
# the old attribute so the cell works on both major versions.
word_index = model.wv.key_to_index if hasattr(model.wv, "key_to_index") else model.wv.vocab
word_index.keys()

In [None]:
# Assessing the Doc2Vec model: re-infer a vector for every training document
# and record the position at which the document itself shows up in the
# similarity ranking over all trained document vectors.
n_docs = len(model.docvecs)
ranks = []
for doc_id, tagged_doc in enumerate(tagged_corpus):
    self_vector = model.infer_vector(tagged_doc.words)
    sims = model.docvecs.most_similar([self_vector], topn=n_docs)
    ranked_ids = [tag for tag, _ in sims]
    ranks.append(ranked_ids.index(doc_id))

# Ideally as many documents as possible are most similar to themselves (i.e. rank 0).
rank_counts = collections.Counter(ranks)
print(collections.OrderedDict(sorted(rank_counts.items())))

### Observation:
Above we can see the distribution of self-document similarity ranks (i.e. ~53 documents have themselves as the most similar document - rank 0, ~2 documents have themselves as the second most similar document - rank 1, ...)

In [None]:
# Training-corpus check: pick a random training document, infer its vector and
# show it next to the training documents it is most similar to.
train_doc_id, train_sims = check_random_doc_similarity(model, tagged_corpus)
compare_documents(train_doc_id, train_corpus, train_sims, train_corpus)

print("---------------------------------------------------------------------------------------------------------------------------------\n")

# Test-corpus check: same procedure for a random held-out document, still
# compared against the training documents.
test_doc_id, test_sims = check_random_doc_similarity(model, tagged_corpus, processed_test_corpus)
compare_documents(test_doc_id, test_corpus, test_sims, train_corpus)

In [None]:
# Get new news articles: a small batch of BBC articles from the window between
# 30 and 20 days ago.
now = datetime.today()  # evaluated once so both window bounds share one reference time
new_articles = newsapi.get_everything(language='en',
                                      domains='bbc.co.uk',
                                      from_param=now - timedelta(30),
                                      to=now - timedelta(20),
                                      page_size=10)

# Unique, non-empty article bodies, kept in the order the API returned them.
new_corpus = list(dict.fromkeys(
    article['content'] for article in new_articles['articles'] if article['content']
))

if new_corpus:
    # Apply the preprocessing fitted on the training corpus.
    # Assumes transform() returns one processed document per input, in the same order.
    new_processed_corpus = prep.transform(new_corpus)

    # Similarity query. The index must be drawn from new_corpus itself: sizing it
    # by test_corpus (as before) could select a document that does not exist here
    # or never reach some of the new documents.
    doc_id = random.randrange(len(new_corpus))
    unknown_doc = new_processed_corpus[doc_id]
    sims = similarity_query(model, unknown_doc)
    compare_documents(doc_id, new_corpus, sims, train_corpus)
else:
    print("No new articles with content were returned; skipping the similarity query.")