In [5]:
import datetime
import logging


from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import dateutil.parser
from tqdm import tqdm
import json
import time
import requests
import praw

import azure.functions as func


def mask_df(in_df, col='', filt=''):
    """ Filters dataframe using user defined column and filter"""
    mask = in_df[col] == filt
    return in_df[mask]


def get_all_article_urls(url, website_attributes):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    base_url = re.match('^.+?[^\/:](?=[?\/]|$)', url).group(0)
    print('base url is', base_url)

    if base_url in website_attributes:
        print('web parameters found, parsing')
        params = website_attributes[base_url]

        all_articles = soup.find(params['all_article_tag'],
                                 attrs={'class': params['all_article_class']})
        all_article_urls = all_articles.find_all(params['article_tag'])
        return_urls = []
        for article_url in all_article_urls:
            print('base url is', base_url)
            if 'bbc' in base_url:
                try:
                    return_urls.append(base_url+article_url.find('a')['href'])
                except TypeError:
                    "error grabbing article url - maybe the article type is nonstandard"
            else:
                return_urls.append(article_url.find('a')['href'])
        print('number of article URLs found:', len(return_urls))
    return return_urls


def get_article_details(url, website_attributes):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    base_url = re.match('^.+?[^\/:](?=[?\/]|$)', url).group(0)
#     print('base url is', base_url)

    article_title = soup.title.text.strip()

    if base_url in website_attributes:
        # print('web parameters found, parsing')
        params = website_attributes[base_url]

        try:
            article_date = soup.find(params['date_tag']
                                 , attrs={'class': params['date_class']})
            article_date = article_date.text.strip()
        except AttributeError:
            print('error - article date could not be found')
            return None, None, None, None

        try:
            postContent = soup.find(params['content_tag']
                                    , attrs={'class': params['content_class']})
        except AttributeError:
            print('error - article content could not be found')
            return None, None, None, None

        article_text = []
        for para in postContent.find_all('p'):
            article_text.append(para.text.strip())

        return article_title, article_date, article_text, base_url
    else:
        print('url not found in website parameters. returning None types')
        return None, None, None, None


def reddit_auth(auth_json):

    reddit = praw.Reddit(username=auth_json['username'],
                         password=auth_json['password'],
                         client_id=auth_json['client_id'],
                         client_secret=auth_json['client_secret'],
                         user_agent=auth_json['user_agent'])
    return reddit


def get_top_reddit_posts(reddit, subreddit, top, lim):
    posts = reddit.subreddit(subreddit).top(top)
    df = pd.DataFrame()
    cols = ['url', 'title', 'date', 'text', 'root_url']

    for post in posts:
        utc = post.created_utc
        fmt = '%d %B, %Y'
        date = datetime.datetime.utcfromtimestamp(utc).strftime(fmt)
        root_url = re.match('^.+?[^\/:](?=[?\/]|$)', post.url).group(0)

        data = [post.url, post.title, date, '', root_url]

        df = df.append(pd.DataFrame([data], columns=cols))

    return df.head(lim)



In [8]:

# def main(mytimer: func.TimerRequest) -> None:
#     utc_timestamp = datetime.datetime.utcnow().replace(
#         tzinfo=datetime.timezone.utc).isoformat()

    # we note down the HTML class types and names that contain the news. 
    # This works by first looking at a page containing many news stories,
    # such as today's news. Then it looks for URLs using all_article_{var}
    # key:value pairs. Then it navigates to each article, pulls out the 
    # title, date and core text content (although the text is unused as of 
    # 27/08/2020 due to copyright concerns)
    
website_attributes = {
    'https://www.bbc.co.uk': {
        'all_article_tag': 'div',
        'all_article_class': 'gel-layout gel-layout--center',
        'article_tag': 'article',
        'date_tag': 'div',
        'date_class': 'date date--v2',
        'content_tag': 'div',
        'content_class': 'story-body__inner'
    }
}
bbc_url = 'https://www.bbc.co.uk/news/topics/cx2pk70323et/uplifting-stories'

df_scraped = pd.DataFrame()
cols = ['url', 'title', 'date', 'text', 'root_url']

todays_bbc_articles = get_all_article_urls(bbc_url, website_attributes)

for url in tqdm(todays_bbc_articles):
    article_title, article_date, article_text, base_url = get_article_details(url, website_attributes)
    df_scraped = df_scraped.append(pd.DataFrame([[url, article_title, article_date, 
                                                  article_text, base_url]], 
                                        columns=cols))


# add in reddit /r/upliftingnews to dataframe
reddit_data = {"client_id": "9sIhMkT4rrMQjA",
    "client_secret": "LzQR8Qkql1FueFxrQ-5wxK5Fq9E",
    "user_agent": "dona_lic_app",
    "username": "dona_lic",
    "password": "Kgvv9LTy%%8@5WlY"}

reddit = reddit_auth(reddit_data)
reddit_df = get_top_reddit_posts(reddit, 'upliftingnews', 'day', 3)
df_scraped = df_scraped.append(reddit_df)

df_scraped = df_scraped.dropna(how='any')
df_scraped['date_parsed'] = [dateutil.parser.parse(x).date() for x in df_scraped['date']]
df_scraped['provider_parsed'] = [x.split('.')[1] for x in df_scraped['root_url']]

# Now we have all the data we read in the titles from Podio to
# check if any articles already exist. If not, they are posted to Podio.

# using james' shim layer to define api URLs
base_url = 'https://goodnewsmicroapp.azurewebsites.net/api'
auth_url = f'{base_url}/PodioAuth'

# store podio auth info locally and get auth token here 
podio_data = {
    "app_id": "25058801",
    "app_token": "683029008df9495a8947c90a38f75ce9",
    "client_id": "goodnews",
    "client_secret": "wrCUCZSxFuPmPZpm7f9iRWm9J4mS6VshbDuXxjNYAHL5RAMTKOFy4VSwHZ4w3csk",
    "grant_type": "client_credentials"
}
app_id = podio_data['app_id']
podio_resp = requests.post(auth_url, data=podio_data)

if not podio_resp.ok:
    raise Exception("Auth failed", podio_resp)

podio_resp = podio_resp.json()
token = podio_resp['access_token']

headers = {
"content-type": "application/json",
"authorization": "Bearer " + token,
"x-podio-client-id": 'x',
"x-podio-client-secret": 'x'
}

# now we have auth token in a header we can return item data
item_url = f'{base_url}/PodioProxy/item/app/{app_id}/filter/'
data = {
    "limit":500,
    "offset":0,
    "filters":{
        }
    }

all_data = requests.post(item_url, headers=headers, json=data).json()
df_from_podio = pd.json_normalize(all_data)
df_from_podio = df_from_podio.dropna()

# we treat the article titles as UIDs and check against them
# to see if they exist, and if not post to Podio
try:
    all_titles = df_from_podio['Title'].values
except KeyError:
    print('Error with the return JSON from Podio.\
        Have you sent in the correct format, or are you rate limited?')

# since James' shim doesn't work for posting, we redefine our request URL
# and also remove the two proprietary header entries
post_headers = {
    "content-type": "application/json",
    "authorization": "Bearer " + token,
    }
post_item_url = f"https://api.podio.com/item/app/{app_id}/"

for index, row in df_scraped.iterrows():  
    if row.title.strip() in all_titles:
        print(f'{row.title} already in podio... ignoring')
        pass
    else:    
        item = {
            "title": str(row.title),
            "url-3": str(row.url),
            "date-3": str(row.date_parsed),
            "provider": str(row.provider_parsed),
            "upvotes": 0,
            "downvotes": 0,
        }
        data = {
            "fields": item
        }

        item_resp = requests.post(post_item_url, json=data, headers=post_headers)
        if not item_resp.ok:
            raise Exception("Post failed", item_resp)

#     if mytimer.past_due:
#         logging.info('The timer is past due!')

#     logging.info('Python timer trigger function ran at %s', utc_timestamp)


  0%|                                                                                            | 0/8 [00:00<?, ?it/s]

base url is https://www.bbc.co.uk
web parameters found, parsing
base url is https://www.bbc.co.uk
base url is https://www.bbc.co.uk
base url is https://www.bbc.co.uk
base url is https://www.bbc.co.uk
base url is https://www.bbc.co.uk
base url is https://www.bbc.co.uk
base url is https://www.bbc.co.uk
base url is https://www.bbc.co.uk
base url is https://www.bbc.co.uk
base url is https://www.bbc.co.uk
number of article URLs found: 8


 12%|██████████▌                                                                         | 1/8 [00:00<00:02,  2.75it/s]

error - article date could not be found


 25%|█████████████████████                                                               | 2/8 [00:00<00:02,  2.90it/s]

error - article date could not be found


 38%|███████████████████████████████▌                                                    | 3/8 [00:01<00:02,  2.41it/s]

error - article date could not be found


 50%|██████████████████████████████████████████                                          | 4/8 [00:01<00:01,  2.22it/s]

error - article date could not be found


 62%|████████████████████████████████████████████████████▌                               | 5/8 [00:02<00:01,  2.34it/s]

error - article date could not be found


 88%|█████████████████████████████████████████████████████████████████████████▌          | 7/8 [00:04<00:00,  1.24it/s]

error - article date could not be found


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.53it/s]

error - article date could not be found





Rhyl lifeboat crew rescue seagull-chasing dog from sea - BBC News already in podio... ignoring


In [9]:
df_scraped

Unnamed: 0,url,title,date,text,root_url,date_parsed,provider_parsed
0,https://www.bbc.co.uk/news/uk-wales-53974775?i...,Rhyl lifeboat crew rescue seagull-chasing dog ...,31 August 2020,[A lifeboat crew had to rescue a dog after it ...,https://www.bbc.co.uk,2020-08-31,bbc
0,https://kval.com/news/local/national-guard-loa...,National Guard loaded helicopters 'to the abso...,"15 September, 2020",,https://kval.com,2020-09-15,com
0,https://www.latimes.com/california/story/2020-...,L.A. County coronavirus numbers fall back to p...,"15 September, 2020",,https://www.latimes.com,2020-09-15,latimes
0,https://www.msn.com/en-us/news/good-news/911-d...,911 Dispatcher Saves Lives of Baby and 71-Year...,"15 September, 2020",,https://www.msn.com,2020-09-15,msn
