## Notebook for Loading data from NewsAPI 

In [18]:
# Notebook-wide switches, read by the cells further down.
#   verbose       - when True, the scraping loop prints a running count and the URL of each article.
#   small_dataset - when True, the scraping loop keeps at most 10 articles (instead of 100) and the
#                   result is saved to the "_small" .npz file.
verbose = False
small_dataset = False

#### Imports

In [19]:
import os
import requests
from newspaper import Article
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#### Get NewsAPI key

In [20]:
# Read the NewsAPI key from the environment; raises KeyError if NEWS_API_KEY is not set.
api_key = os.environ['NEWS_API_KEY']

# Do not print the key itself: cell output is stored with the notebook, so printing it
# would leak the credential to anyone the notebook is shared with.
print('NEWS_API_KEY loaded (%d characters)' % len(api_key))

<redacted>


#### Set up NewsAPI request 

In [21]:
# Endpoint and query parameters for the NewsAPI "everything" search.
url = 'https://newsapi.org/v2/everything'
country = 'us'  # NOTE(review): not used when building the request below - confirm whether it is still needed
pageSize = '100'
page = '1'
from_date = '2024-04-20'
to_date = '2024-04-22'
domains = 'cnn.com,bbc.co.uk,nbc.com,nprnews.com,foxnews.com,washingtonpost.com,reuters.com,nytimes.com'
excludeDomains = 'cnnespanol.cnn.com,arabic.cnn.com'
sortBy = 'popularity'

# Query string without the credential, so it can be shown safely.
query = (
    f'domains={domains}&excludeDomains={excludeDomains}'
    f'&from={from_date}&to={to_date}&sortBy={sortBy}'
    f'&pageSize={pageSize}&page={page}'
)

# Full request URL (including the API key) used by the request cell.
request = f'{url}?{query}&apiKey={api_key}'

# Show the request with the key masked: cell output is saved with the notebook,
# so printing `request` itself would leak the credential.
print(f'{url}?{query}&apiKey=<redacted>')


https://newsapi.org/v2/everything?domains=cnn.com,bbc.co.uk,nbc.com,nprnews.com,foxnews.com,washingtonpost.com,reuters.com,nytimes.com&excludeDomains=cnnespanol.cnn.com,arabic.cnn.com&from=2024-04-20&to=2024-04-22&sortBy=popularity&pageSize=100&page=1&apiKey=<redacted>


#### Perform request

In [22]:
# Send the GET request. Without a timeout, requests waits indefinitely on an
# unresponsive server and the cell would hang.
response = requests.get(request, timeout=30)

# Stop here with the HTTP status if the API rejected the request (bad key, rate limit, ...),
# instead of failing later with a KeyError while reading the JSON body.
response.raise_for_status()

#### Get number of articles 

In [23]:
# Report the total number of matching articles stated in the response body.
total_results = response.json()['totalResults']
print(total_results)

1174


#### Data processing 

In [24]:
# Decode the JSON body and keep only its list of article records.
response_body = response.json()
data = response_body['articles']

In [25]:
# Scrape the full text of each article returned by NewsAPI and pair it with the
# API-provided description, which is stored as the reference summary ("highlights").
dataset = []
count = 0

# Upper bound on the number of usable articles to keep.
max_articles = 10 if small_dataset else 100

for article in data:
    # Loop-local name, so the endpoint stored in `url` by the request cell is not overwritten.
    article_url = article['url']

    # remove articles that NewsAPI did not return
    if article_url == "https://removed.com":
        continue

    # remove foreign articles
    if "espanol" in article_url or "arabic" in article_url:
        continue

    if verbose:
        print(str(count) + " " + article_url)

    # remove articles that can't be scraped
    news_article = Article(article_url)
    try:
        news_article.download()
        news_article.parse()
    except Exception:
        # `Exception` instead of a bare `except:` so that interrupting the kernel
        # (KeyboardInterrupt) still stops the loop rather than being swallowed.
        print("failed to download")
        continue

    content = news_article.text

    # The description can be null (None) in the JSON payload; calling len() on it
    # would raise TypeError, so treat a missing description as empty.
    description = article.get('description') or ''

    # remove articles that have no text or no description
    if not content or not description:
        continue

    # save all other articles
    dataset.append({'article': content, 'highlights': description})
    count += 1

    if count == max_articles:
        break

In [26]:
# Turn the collected records into a table with one row per article; the columns
# come from the record keys ('article', 'highlights').
data = pd.DataFrame(dataset)

# Number of articles kept (displayed as the cell's output).
data.shape[0]

100

#### Train Test Split

In [14]:
# Fixed seed so that the train/validation/test membership is the same on every run.
SEED = 42

# First hold out 20% as the test set, then take 25% of the remaining 80% as the
# validation set, giving a 60/20/20 train/val/test split.
train, test = train_test_split(data, test_size=.2, random_state=SEED)
train, val = train_test_split(train, test_size=.25, random_state=SEED)

# Column 0 holds the article text (model input), column 1 the description (target).
X_train = train.iloc[:, 0]
y_train = train.iloc[:, 1]
X_test = test.iloc[:, 0]
y_test = test.iloc[:, 1]
X_val = val.iloc[:, 0]
y_val = val.iloc[:, 1]

In [17]:
# Choose the archive name that matches the configured dataset size.
output_path = '../data/news_api_data_small.npz' if small_dataset else '../data/news_api_data.npz'

# Write all six splits into one .npz archive, keyed by their variable names.
np.savez(
    output_path,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    X_val=X_val,
    y_val=y_val,
)

#### Example Code for Loading Data

In [27]:
# How to load the data
# Read back the archive that matches `small_dataset`, i.e. the one the save cell writes.
# When copying this snippet into another notebook, replace the expression with the literal path.
npz_path = '../data/news_api_data_small.npz' if small_dataset else '../data/news_api_data.npz'

# allow_pickle=True is kept from the original call - presumably the text splits are stored
# as pickled object arrays. Unpickling can execute arbitrary code, so only load archives
# you created yourself.
# The `with` block closes the underlying file once the arrays have been read; a separate
# name is used so the `data` DataFrame is not replaced by the archive handle.
with np.load(npz_path, allow_pickle=True) as npz_file:
    X_train = npz_file['X_train']
    y_train = npz_file['y_train']
    X_test = npz_file['X_test']
    y_test = npz_file['y_test']
    X_val = npz_file['X_val']
    y_val = npz_file['y_val']
