## Notebook for Loading data from NewsAPI 

In [None]:
small_dataset = True
verbose = True

#### Imports

In [43]:
import os
import requests
from newspaper import Article
import hashlib 
import sklearn
from sklearn.model_selection import train_test_split

#### Get NewsAPI key

In [70]:
api_key = os.environ['NEWS_API_KEY']
print(api_key)

5be0c655294c4f2091c64c2fcfa6abd3


#### Set up NewsAPI request 

In [161]:
url = 'https://newsapi.org/v2/everything'
country = 'us'
pageSize = '100'
page = '1'
from_date = '2024-04-20'
to_date = '2024-04-22'
domains = 'cnn.com,bbc.co.uk,nbc.com,nprnews.com,foxnews.com,washingtonpost.com,reuters.com,nytimes.com'
excludeDomains = 'cnnespanol.cnn.com,arabic.cnn.com'
sortBy = 'popularity'
request = f'%s?domains=%s&excludeDomains=%s&from=%s&to=%s&sortBy=%s&pageSize=%s&page=%s&apiKey=%s' % (url, domains, excludeDomains, from_date, to_date, sortBy, pageSize, page, api_key)

print(request)


https://newsapi.org/v2/everything?domains=cnn.com,bbc.co.uk,nbc.com,nprnews.com,foxnews.com,washingtonpost.com,reuters.com,nytimes.com&excludeDomains=cnnespanol.cnn.com,arabic.cnn.com&from=2024-04-20&to=2024-04-22&sortBy=popularity&pageSize=100&page=1&apiKey=5be0c655294c4f2091c64c2fcfa6abd3


#### Perform request

In [162]:
response = requests.get(request)

#### Get number of articles 

In [163]:
print(response.json()['totalResults'])

699


#### Data processing 

In [164]:
data = response.json()['articles']

In [165]:
dataset = []
count = 0

if small_dataset:
    max_articles = 10
else:
    max_articles = 100
    

for article in data:
    url = article['url']

    # remove articles that NewsAPI did not return
    if url == "https://removed.com":
        continue 
        
    # remove foreign articles 
    if "espanol" in url or "arabic" in url:
        continue
        
    if verbose:
        print(str(count) + " " + url)    
    
    # remove articles that can't be scraped
    news_article = Article(url)
    try: 
        news_article.download()
        news_article.parse()
    except:
        print("failed to download")
        continue 
    
    content = news_article.text
    
    # remove articles that have a length of 0 
    if len(content) == 0 or len(article['description']) == 0:
        continue
    
    # save all other articles 
    dataset.append({'id' : hashlib.md5(url.encode()).hexdigest(), 'highlights' : article['description'], 'article' : content} )
    count += 1
    
    if count == max_articles: 
        break 

0 https://www.bbc.co.uk/news/world-asia-68867725
1 https://www.bbc.co.uk/news/uk-politics-68870117
2 https://www.bbc.co.uk/sport/live/football/68650698
3 https://www.bbc.co.uk/news/live/world-us-canada-68861011
4 https://www.bbc.co.uk/news/blogs-the-papers-68867084
5 https://www.bbc.co.uk/sport/athletics/68868160
6 https://www.bbc.co.uk/sport/av/rugby-union/68870733
7 https://www.bbc.co.uk/news/world-africa-68846770
8 https://www.bbc.co.uk/news/uk-politics-68721389
9 https://www.bbc.co.uk/news/uk-northern-ireland-68865821
10 https://www.bbc.co.uk/news/uk-england-derbyshire-68868136
11 https://www.bbc.co.uk/news/entertainment-arts-68869153
12 https://www.bbc.co.uk/news/live/world-middle-east-68861338
13 https://www.bbc.co.uk/sport/disability-sport/68868155
14 https://www.bbc.co.uk/news/world-us-canada-68867732
15 https://www.bbc.co.uk/sport/live/formula1/67917668
16 https://www.bbc.co.uk/news/live/uk-politics-68861335
17 https://www.bbc.co.uk/news/blogs-the-papers-68861416
18 https://ww

In [166]:
# verify total number of articles is accurate
len(dataset)

100

#### Train Test Split

In [167]:
train, test = train_test_split(dataset, test_size=.2)
train, val = train_test_split(train, test_size=.25)

In [None]:
if small_dataset:
    np.savez('../data/news_api_data_small.npz' , train=train, test=test, val=val)
else:
    np.savez('../data/news_api_data.npz' , train=train, test=test, val=val)

#### Example Code for Loading Data

In [None]:
# How to load the data
data = np.load('../data/news_api_data_small.npz', allow_pickle=True)
train = data['train']
test = data['test']
val = data['val']
