In [1]:
import datetime
import logging


from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import dateutil.parser
from tqdm import tqdm
import json
import time
import requests
import praw

import azure.functions as func



In [7]:

def mask_df(in_df, col='', filt=''):
    """ Filters dataframe using user defined column and filter"""
    mask = in_df[col] == filt
    return in_df[mask]


def get_all_article_urls(url, website_attributes):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    base_url = re.match('^.+?[^\/:](?=[?\/]|$)', url).group(0)
    print('base url is', base_url)

    if base_url in website_attributes:
        print('web parameters found, parsing')
        params = website_attributes[base_url]

        all_articles = soup.find(params['all_article_tag'],
                                 attrs={'class': params['all_article_class']})
        all_article_urls = all_articles.find_all(params['article_tag'])
        return_urls = []
        for article_url in all_article_urls:
            if 'bbc' in base_url:
                try:
                    return_urls.append(base_url+article_url.find('a')['href'])
                except TypeError:
                    "error grabbing article url - maybe the article type is nonstandard"
            else:
                return_urls.append(article_url.find('a')['href'])
            time.sleep(1) # limit num requests in short period of time
        print('number of article URLs found:', len(return_urls))
    return return_urls


def get_article_details(url, website_attributes):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    base_url = re.match('^.+?[^\/:](?=[?\/]|$)', url).group(0)
#     print('base url is', base_url)

    article_title = soup.title.text.strip()

    if base_url in website_attributes:
        # print('web parameters found, parsing')
        params = website_attributes[base_url]

        try:
            article_date = soup.find(params['date_tag']
                                 , attrs={'class': params['date_class']})
            article_date = article_date.text.strip()
        except AttributeError:
            print('error - article date could not be found')
            return None, None, None, None

        try:
            postContent = soup.find(params['content_tag']
                                    , attrs={'class': params['content_class']})
        except AttributeError:
            print('error - article content could not be found')
            return None, None, None, None

        article_text = []
        for para in postContent.find_all('p'):
            article_text.append(para.text.strip())

        return article_title, article_date, article_text, base_url
    else:
        print('url not found in website parameters. returning None types')
        return None, None, None, None



In [3]:
def reddit_auth(auth_json):

    reddit = praw.Reddit(username=auth_json['username'],
                         password=auth_json['password'],
                         client_id=auth_json['client_id'],
                         client_secret=auth_json['client_secret'],
                         user_agent=auth_json['user_agent'])
    return reddit


def get_top_reddit_posts(reddit, subreddit, top, lim):
    posts = reddit.subreddit(subreddit).top(top)
    df = pd.DataFrame()
    cols = ['url', 'title', 'date', 'text', 'root_url']

    for post in posts:
        utc = post.created_utc
        fmt = '%d %B, %Y'
        date = datetime.datetime.utcfromtimestamp(utc).strftime(fmt)
        root_url = re.match('^.+?[^\/:](?=[?\/]|$)', post.url).group(0)

        data = [post.url, post.title, date, '', root_url]

        df = df.append(pd.DataFrame([data], columns=cols))

    return df.head(lim)

In [10]:

# we note down the HTML class types and names that contain the news. 
# This works by first looking at a page containing many news stories,
# such as today's news. Then it looks for URLs using all_article_{var}
# key:value pairs. Then it navigates to each article, pulls out the 
# title, date and core text content (although the text is unused as of 
# 27/08/2020 due to copyright concerns)
website_attributes = {
    'https://www.bbc.co.uk': {
        'all_article_tag': 'div',
        'all_article_class': 'gel-layout gel-layout--center',
        'article_tag': 'article',
        'date_tag': 'div',
        'date_class': 'date date--v2',
        'content_tag': 'div',
        'content_class': 'story-body__inner'
    }
}
bbc_url = 'https://www.bbc.co.uk/news/topics/cx2pk70323et/uplifting-stories'

df_scraped = pd.DataFrame()
cols = ['url', 'title', 'date', 'text', 'root_url']

# todays_bbc_articles = get_all_article_urls(bbc_url, website_attributes)

# for url in tqdm(todays_bbc_articles):
#     article_title, article_date, article_text, base_url = get_article_details(url, website_attributes)
#     df_scraped = df_scraped.append(pd.DataFrame([[url, article_title, article_date, 
#                                                   article_text, base_url]], 
#                                         columns=cols))

# df_scraped = df_scraped.dropna(how='any')

# df_scraped['date_parsed'] = [dateutil.parser.parse(x).date() for x in df_scraped['date']]
# df_scraped['provider_parsed'] = [x.split('.')[1] for x in df_scraped['root_url']]

# add in reddit /r/upliftingnews to dataframe
reddit_data = {"client_id": "9sIhMkT4rrMQjA",
    "client_secret": "LzQR8Qkql1FueFxrQ-5wxK5Fq9E",
    "user_agent": "dona_lic_app",
    "username": "dona_lic",
    "password": "Kgvv9LTy%%8@5WlY"}

reddit = reddit_auth(reddit_data)
reddit_df = get_top_reddit_posts(reddit, 'upliftingnews', 'day', 3)
df_scraped = df_scraped.append(reddit_df)



Version 7.0.0 of praw is outdated. Version 7.1.0 was released Tuesday June 23, 2020.


In [11]:
df_scraped

Unnamed: 0,url,title,date,text,root_url
0,https://www.autocar.co.uk/car-news/industry/an...,Volvo reports that EVs make up for their produ...,"04 October, 2020",,https://www.autocar.co.uk
0,https://www.bbc.com/news/world-australia-54417343,Tasmanian devils have been reintroduced into t...,"05 October, 2020",,https://www.bbc.com
0,https://www.nytimes.com/2020/09/27/opinion/pbs...,"Happy anniversary, PBS: today marks 50 years o...","04 October, 2020",,https://www.nytimes.com
