In [26]:
import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
from collections import Counter
from collections import defaultdict
import spacy
import en_core_web_lg
import pandas as pd 
import re
nlp = en_core_web_lg.load()

In [18]:
newspaper_dict = {
  "SEC": {
    "rss": "https://www.sec.gov/news/pressreleases.rss"
  }
}

In [19]:
AriticlesToDownload = 25
# Iterate through each news company
for company, value in newspaper_dict.items():
    count = 1
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            "articles": []
        }
        if len(d.entries) == 0:
            print('Cannot get the data')
        for entry in d.entries:
            if hasattr(entry, 'published'):
                if count > AriticlesToDownload:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
                article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
                try:
                    content = Article(entry.link)
                    content.download()
                    content.parse()
                except:
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                article['link'] = content.url
                newsPaper['articles'].append(article)
                print(count, "articles downloaded from", company, ", url: ", entry.link)
                count += 1

Downloading articles from  SEC
1 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-259
2 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-258
3 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-257
4 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-256
5 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-255
6 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-254
7 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-253
8 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-252
9 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-251
10 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-250
11 articles downloaded from SEC , url:  https://www.sec.gov/news/press-release/2020-249
12 article

In [27]:
# Create keyword list
keyword_example_list = ['compliance', 'cybersecurity', 'surveillance', 'market structure', 
'Regulation', 'short sales', 'funds', 'Securit ies', 'investment']
keyword_example_string = nlp(' '.join(keyword_example_list))

In [28]:
# Calculate the similarity score
all_news_list = [nlp(newsPaper['articles'][i]['text']) for i in range(len(newsPaper['articles']))]
all_news_list_token = [[token.text for token in news if token.pos_ == 'NOUN'] for news in all_news_list]
all_news_title = [newsPaper['articles'][i]['title'] for i in range(len(newsPaper['articles']))]
all_news_date = [newsPaper['articles'][i]['published'] for i in range(len(newsPaper['articles']))]
all_news_url = [newsPaper['articles'][i]['link'] for i in range(len(newsPaper['articles']))]
similarity_score = [nlp(' '.join(news)).similarity(keyword_example_string) for news in all_news_list_token]

In [30]:
result = pd.DataFrame(zip(all_news_date, all_news_title, all_news_url, similarity_score), columns=['Published Date', 'Title', 'Link', 'Similarity Score'])
result_ranked = result.sort_values('Similarity Score', ascending=False)
result_ranked

Unnamed: 0,Published Date,Title,Link,Similarity Score
11,2020-10-07T17:05:48,SEC Proposes Conditional Exemption for Finders...,https://www.sec.gov/news/press-release/2020-248,0.884801
13,2020-10-05T22:30:00,SEC Charges John McAfee With Fraudulently Tout...,https://www.sec.gov/news/press-release/2020-246,0.875059
12,2020-10-07T14:00:00,SEC Updates Regulatory Framework for Fund of F...,https://www.sec.gov/news/press-release/2020-247,0.87097
14,2020-10-05T22:00:00,SEC Staff Releases Report on U.S. Credit Marke...,https://www.sec.gov/news/press-release/2020-245-0,0.866023
18,2020-09-30T21:08:00,SEC Charges HP Inc. With Disclosure Violations...,https://www.sec.gov/news/press-release/2020-241,0.865334
23,2020-09-29T20:30:00,SEC Charges Former Real Estate Executive With ...,https://www.sec.gov/news/press-release/2020-236,0.860091
21,2020-09-30T13:55:00,Morgan Stanley Agrees to Pay $5 Million for Re...,https://www.sec.gov/news/press-release/2020-238,0.851052
5,2020-10-14T20:15:00,SEC Charges Brazilian Meat Producers With FCPA...,https://www.sec.gov/news/press-release/2020-254,0.849427
22,2020-09-30T00:00:48,SEC Charges Manitex International and Three Fo...,https://www.sec.gov/news/press-release/2020-237,0.847541
1,2020-10-15T22:40:00,SEC Charges Andeavor for Inadequate Controls A...,https://www.sec.gov/news/press-release/2020-258,0.841825


In [34]:
# Try different word vector 
# https://fasttext.cc/docs/en/english-vectors.html
nlp2 = spacy.load('/tmp/en_vectors_wiki_lg')
keyword_example_string = nlp2(' '.join(keyword_example_list))

In [35]:
all_news_list = [nlp2(newsPaper['articles'][i]['text']) for i in range(len(newsPaper['articles']))]
all_news_list_token = [[token.text for token in news if not token.is_stop] for news in all_news_list]
all_news_title = [newsPaper['articles'][i]['title'] for i in range(len(newsPaper['articles']))]
all_news_date = [newsPaper['articles'][i]['published'] for i in range(len(newsPaper['articles']))]
similarity_score2 = [nlp2(' '.join(news)).similarity(keyword_example_string) for news in all_news_list_token]
result['similarity_score_wiki'] = similarity_score2
result.sort_values('Similarity Score', ascending=False)

Unnamed: 0,Published Date,Title,Similarity Score,similarity_score_wiki
6,2020-10-07T17:05:48,SEC Proposes Conditional Exemption for Finders...,0.884801,0.838762
8,2020-10-05T22:30:00,SEC Charges John McAfee With Fraudulently Tout...,0.875059,0.824958
7,2020-10-07T14:00:00,SEC Updates Regulatory Framework for Fund of F...,0.87097,0.842015
9,2020-10-05T22:00:00,SEC Staff Releases Report on U.S. Credit Marke...,0.866023,0.838683
13,2020-09-30T21:08:00,SEC Charges HP Inc. With Disclosure Violations...,0.865334,0.841059
22,2020-09-29T16:15:10,SEC Charges Swedish National with Global Schem...,0.861425,0.827589
18,2020-09-29T20:30:00,SEC Charges Former Real Estate Executive With ...,0.860091,0.842572
20,2020-09-29T18:00:00,SEC Charges Unregistered Brokers in Penny Stoc...,0.859748,0.835083
16,2020-09-30T13:55:00,Morgan Stanley Agrees to Pay $5 Million for Re...,0.851052,0.829623
24,2020-09-28T21:55:00,Fiat Chrysler Agrees to Pay $9.5 Million Penal...,0.849773,0.849859


In [6]:
test_dict = {'SEC':
        {'link': 'https://www.sec.gov/news/pressreleases'}
       }

In [32]:
AriticlesToDownload = 10
newsPaper = defaultdict(dict)
for company, value in test_dict.items():
    count = 1
    if 'link' in value:
        # This is the fallback method if a RSS-feed link is not provided.
        # It uses the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper[company] = {
            "link": value['link'],
            "articles": []
        }
    
        noneTypeCount = 0
        for content in paper.articles:
            if count > AriticlesToDownload:
                break
            if len(re.findall(r'.*press-release.*', content.url)) == 1:
                print(content.url)
                try:
                    content.download()
                    content.parse()
                except Exception as e:
                    print(e)
                    print("continuing...")
                    continue
                if content.publish_date is None:
                    print(count, " Article has date of type None...")
                    noneTypeCount += 1
                    if noneTypeCount > 10:
                        print("Too many noneType dates, aborting...")
                        noneTypeCount = 0
                        break
                    count += 1
                    continue
                article = {}
                article['title'] = content.title
                article['text'] = content.text
                article['link'] = content.url
                article['published'] = content.publish_date.isoformat()
                newsPaper[company]['articles'].append(article)
                print(count, "articles downloaded from", company, "url: ", content.url)
                count += 1
                noneTypeCount = 0

Building site for  SEC
https://www.sec.gov/news/press-release/2020-259
1 articles downloaded from SEC url:  https://www.sec.gov/news/press-release/2020-259
https://www.sec.gov/news/press-release/2020-258
2 articles downloaded from SEC url:  https://www.sec.gov/news/press-release/2020-258
https://www.sec.gov/news/press-release/2020-257
3 articles downloaded from SEC url:  https://www.sec.gov/news/press-release/2020-257
https://www.sec.gov/news/press-release/2020-256
4 articles downloaded from SEC url:  https://www.sec.gov/news/press-release/2020-256
https://www.sec.gov/news/press-release/2020-255
5 articles downloaded from SEC url:  https://www.sec.gov/news/press-release/2020-255
https://www.sec.gov/news/press-release/2020-254
6 articles downloaded from SEC url:  https://www.sec.gov/news/press-release/2020-254
https://www.sec.gov/news/press-release/2020-253
7 articles downloaded from SEC url:  https://www.sec.gov/news/press-release/2020-253
https://www.sec.gov/news/press-release/2020-25

In [40]:
newsPaper['SEC']['articles']

[{'title': 'SEC and CFTC to Hold a Joint Open Meeting on October 22',
  'text': 'FOR IMMEDIATE RELEASE\n\n2020-259\n\nThe U.S. Securities and Exchange Commission and the Commodity Futures Trading Commission today announced their agencies will hold a joint open meeting on Thursday, Oct. 22, 2020, at 10:00 a.m. (EDT). The meeting will be held virtually.\n\nThe Commissions will consider the following:\n\nJoint Final Rule: Customer Margin Rules Relating to Security Futures – The Commissions will consider whether to adopt rule amendments to align the minimum margin required on security futures with other similar financial products.\n\nRequest for comment: Portfolio Margining of Uncleared Swaps and Non-Cleared Security-Based Swaps – The Commissions will consider whether to issue a request for comment on the portfolio margining of uncleared swaps and non-cleared security-based swaps. The request for comment would solicit comment on all aspects of the portfolio margining of uncleared swaps, no

In [45]:
url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
article = Article(url)
article.

'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'