In [1]:
import datetime
import json
import logging
import os
import re
import time
from urllib.parse import urljoin

import azure.functions as func
import dateutil.parser
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


In [2]:
def mask_df(in_df, col='', filt=''):
    """Return only the rows of ``in_df`` whose ``col`` value equals ``filt``."""
    keep_rows = in_df[col] == filt
    return in_df.loc[keep_rows]

In [3]:
def get_all_article_urls(url, website_attributes):
    """Collect the article links listed on a news index page.

    Args:
        url: Address of the page that lists several articles.
        website_attributes: Mapping of site root URL (scheme + host) to the
            tag/class names used to locate articles on that site. The keys
            read here are 'all_article_tag', 'all_article_class' and
            'article_tag'.

    Returns:
        list: Absolute article URLs. Empty when the site has no entry in
        ``website_attributes`` or the article container is not on the page.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Raw string: '\/' is an invalid escape sequence in a normal literal.
    base_shim_url = re.match(r'^.+?[^\/:](?=[?\/]|$)', url).group(0)
    print('base url is', base_shim_url)

    # Defined up front so an unknown site returns [] instead of raising
    # UnboundLocalError at the final return.
    return_urls = []
    if base_shim_url not in website_attributes:
        return return_urls

    print('web parameters found, parsing')
    params = website_attributes[base_shim_url]

    all_articles = soup.find(params['all_article_tag'],
                             attrs={'class': params['all_article_class']})
    if all_articles is None:
        # soup.find returns None (it does not raise) when nothing matches.
        print('error - article container could not be found')
        return return_urls

    for article in all_articles.find_all(params['article_tag']):
        link = article.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            # Article element without a usable link; nothing to follow.
            continue
        # urljoin leaves absolute hrefs untouched and resolves site-relative
        # ones (as served by the BBC page) against the page address.
        return_urls.append(urljoin(url, href))

    print('number of article URLs found:', len(return_urls))
    return return_urls

In [4]:
def get_article_details(url, website_attributes):
    """Scrape the title, date and paragraph text of one article.

    Args:
        url: Address of the article page.
        website_attributes: Mapping of site root URL (scheme + host) to the
            tag/class names used on that site. The keys read here are
            'date_tag', 'date_class', 'content_tag' and 'content_class'.

    Returns:
        tuple: ``(article_title, article_date, article_text, base_shim_url)``
        where ``article_text`` is a list with one stripped string per ``<p>``
        inside the content element. ``(None, None, None, None)`` is returned
        when the site is unknown or the title, date or content element is
        missing from the page.
    """
    no_details = (None, None, None, None)

    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Raw string: '\/' is an invalid escape sequence in a normal literal.
    base_shim_url = re.match(r'^.+?[^\/:](?=[?\/]|$)', url).group(0)

    if base_shim_url not in website_attributes:
        print('url not found in website parameters. returning None types')
        return no_details
    params = website_attributes[base_shim_url]

    if soup.title is None:
        print('error - article title could not be found')
        return no_details
    article_title = soup.title.text.strip()

    date_node = soup.find(params['date_tag'],
                          attrs={'class': params['date_class']})
    if date_node is None:
        print('error - article date could not be found')
        return no_details
    article_date = date_node.text.strip()

    # soup.find returns None rather than raising, so the miss has to be
    # checked explicitly before calling find_all on the result.
    post_content = soup.find(params['content_tag'],
                             attrs={'class': params['content_class']})
    if post_content is None:
        print('error - article content could not be found')
        return no_details

    article_text = [para.text.strip() for para in post_content.find_all('p')]

    return article_title, article_date, article_text, base_shim_url

In [5]:
# For each supported site we record the HTML tags and class names that hold
# the news. The scraper first opens a page listing many stories (such as
# today's news) and collects article URLs using the all_article_* / article_tag
# entries. It then visits each article and pulls out the title, date and core
# text content (although the text is unused as of 27/08/2020 due to copyright
# concerns).
website_attributes = {
    'https://www.optimistdaily.com': {
        'all_article_tag': 'div',
        'all_article_class': 'single-post-content-sidebar-wrap',
        'article_tag': 'article',
        'date_tag': 'time',
        'date_class': 'entry-time',
        'content_tag': 'div',
        'content_class': 'postContent'
    },
    'https://www.bbc.co.uk': {
        'all_article_tag': 'div',
        'all_article_class': 'gel-layout gel-layout--center',
        'article_tag': 'article',
        'date_tag': 'div',
        'date_class': 'date date--v2',
        'content_tag': 'div',
        'content_class': 'story-body__inner'
    }
}
url1 = 'https://www.optimistdaily.com/todays-solutions/'
url2 = 'https://www.bbc.co.uk/news/topics/cx2pk70323et/uplifting-stories'

cols = ['url', 'title', 'date', 'text', 'root_url']

todays_articles_1 = get_all_article_urls(url1, website_attributes)
todays_articles_2 = get_all_article_urls(url2, website_attributes)

todays_articles = todays_articles_1 + todays_articles_2

# Collect one record per article and build the frame once: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every iteration.
scraped_records = []
for url in tqdm(todays_articles):
    article_title, article_date, article_text, base_url = get_article_details(url, website_attributes)
    scraped_records.append(dict(zip(cols, (url, article_title, article_date,
                                           article_text, base_url))))

# Passing columns= keeps the expected columns even when nothing was scraped,
# so the derived-column steps below do not fail on an empty frame.
df_scraped = pd.DataFrame(scraped_records, columns=cols)

# Articles that could not be parsed come back as all-None and are dropped here.
df_scraped = df_scraped.dropna(how='any')
df_scraped['date_parsed'] = [dateutil.parser.parse(x).date() for x in df_scraped['date']]
# Keep the part of the <title> before the first '|'.
df_scraped['title_parsed'] = [x.split('|')[0] for x in df_scraped['title']]
# Second dot-separated label of the root URL, e.g. 'bbc' from 'https://www.bbc.co.uk'.
df_scraped['provider_parsed'] = [x.split('.')[1] for x in df_scraped['root_url']]

# Now that all the scraped data is available, we read the existing titles from
# Podio to check whether any articles already exist. If not, they are posted
# to Podio.


# using james' shim layer to define api URLs
base_shim_url = 'https://goodnewsmicroapp.azurewebsites.net/api'
auth_url = f'{base_shim_url}/PodioAuth'

# SECURITY(review): these credentials are committed in the notebook. Each value
# can now be supplied through an environment variable; the literals remain only
# as fallbacks so existing runs keep working. They should be rotated and the
# fallbacks removed.
podio_data = {
    "app_id": os.environ.get("PODIO_APP_ID", "25058801"),
    "app_token": os.environ.get("PODIO_APP_TOKEN", "683029008df9495a8947c90a38f75ce9"),
    "client_id": os.environ.get("PODIO_CLIENT_ID", "goodnews"),
    "client_secret": os.environ.get(
        "PODIO_CLIENT_SECRET",
        "wrCUCZSxFuPmPZpm7f9iRWm9J4mS6VshbDuXxjNYAHL5RAMTKOFy4VSwHZ4w3csk"),
    "grant_type": "client_credentials"
}

# Request an access token from the auth endpoint.
app_id = podio_data['app_id']
auth_resp = requests.post(auth_url, data=podio_data)

if not auth_resp.ok:
    raise Exception("Auth failed", auth_resp)

# podio_resp holds the decoded JSON body of the auth response.
podio_resp = auth_resp.json()
token = podio_resp['access_token']

headers = {
    "content-type": "application/json",
    "authorization": "Bearer " + token,
    "x-podio-client-id": 'x',
    "x-podio-client-secret": 'x'
}

# Now that the auth token is in a header we can request the item data.
# NOTE(review): only the first 500 items are requested; presumably this needs
# paging once the app holds more than that - confirm against the Podio app.
item_url = f'{base_shim_url}/PodioProxy/item/app/{app_id}/filter/'
data = {
    "limit": 500,
    "offset": 0,
    "filters": {},
}

items_resp = requests.post(item_url, headers=headers, json=data)
# Fail loudly on an HTTP error, matching the auth and post steps, instead of
# trying to parse an error body as item data.
if not items_resp.ok:
    raise Exception("Item fetch failed", items_resp)

all_data = items_resp.json()
podio_items = pd.json_normalize(all_data)
df_from_podio = podio_items.dropna()

# We treat the article titles as UIDs and check against them to see if they
# exist, and if not post to Podio. Titles are taken before the whole-row
# dropna so an item with some other field missing still counts as existing,
# and .get() yields an empty array when no items (no 'Title' column) come back.
all_titles = podio_items.get('Title', pd.Series(dtype=object)).dropna().values

# since James' shim doesn't work for posting, we redefine our request URL
# and also remove the two proprietary header entries
post_headers = {
    "content-type": "application/json",
    "authorization": "Bearer " + token,
}


base url is https://www.optimistdaily.com
web parameters found, parsing
number of article URLs found: 10


  0%|                                                                                           | 0/20 [00:00<?, ?it/s]

base url is https://www.bbc.co.uk
web parameters found, parsing
number of article URLs found: 10


 55%|█████████████████████████████████████████████                                     | 11/20 [00:17<00:12,  1.36s/it]

error - article date could not be found


 60%|█████████████████████████████████████████████████▏                                | 12/20 [00:18<00:08,  1.06s/it]

error - article date could not be found


 75%|█████████████████████████████████████████████████████████████▌                    | 15/20 [00:19<00:02,  1.76it/s]

error - article date could not be found


 85%|█████████████████████████████████████████████████████████████████████▋            | 17/20 [00:19<00:01,  2.35it/s]

error - article date could not be found


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.02s/it]

error - article date could not be found





In [6]:
# Re-send the item filter request; `all_data` now holds the raw response
# object rather than decoded JSON.
all_data = requests.post(
    item_url,
    json=data,
    headers=headers,
)


In [7]:
post_item_url = f"https://api.podio.com/item/app/{app_id}/"

# Titles act as unique IDs. Both sides of the comparison are stripped, and a
# set lets titles posted during this run be remembered too, so an article that
# was scraped twice is only posted once.
known_titles = {str(existing_title).strip() for existing_title in all_titles}

for index, row in df_scraped.iterrows():
    clean_title = row.title_parsed.strip()
    if clean_title in known_titles:
        print(f'{row.title_parsed} already in podio... ignoring')
        continue

    item = {
        # Post the stripped title so the stored value matches what the
        # duplicate check compares against on later runs.
        "title": clean_title,
        "url-3": str(row.url),
        "date-3": str(row.date_parsed),
        "provider": str(row.provider_parsed),
        "upvotes": 0,
        "downvotes": 0,
    }
    # A separate name keeps the module-level `data` filter payload intact.
    item_payload = {
        "fields": item
    }

    item_resp = requests.post(post_item_url, json=item_payload, headers=post_headers)
    if not item_resp.ok:
        raise Exception("Post failed", item_resp)
    known_titles.add(clean_title)


Five years on, many refugees are feeling at home in Germany  already in podio... ignoring
Craving a bedtime snack? These 9 foods promote deeper sleep  already in podio... ignoring
Finnish town gives locals free cake if they cut their carbon footprint  already in podio... ignoring
A giant virtual power plant is coming to California to avoid blackouts  already in podio... ignoring
6 ways to take back your motivation  already in podio... ignoring
Doubling down on diesel: CA regulations will slash nitrogen oxide emissions  already in podio... ignoring
Not a gram of salmon goes to waste with this new process  already in podio... ignoring
New lidar technology offers a glimpse into the unexplored depths of our oceans  already in podio... ignoring
These US firms are part of a global reforestation effort to plant 1tn trees  already in podio... ignoring
Long-awaited Giantess Geyser finally erupts  already in podio... ignoring
Rhyl lifeboat crew rescue seagull-chasing dog from sea - BBC News alre

In [8]:
# Display the items fetched from Podio (rows containing any missing value
# were dropped when this frame was built).
df_from_podio

Unnamed: 0,Title,URL,Date,Provider,Upvotes,Downvotes,_podio_item_id,_podio_app_item_id
0,"Boy, 8, cuddles West Midlands Police pups on b...",https://www.bbc.co.uk/news/uk-england-birmingh...,2020-08-09,bbc,0,0,1508029305,21084
1,Missing ring found: Merthyr Tydfil mother reun...,https://www.bbc.co.uk/news/uk-wales-53748605?i...,2020-08-12,bbc,0,0,1508029283,21083
2,"Staithes sea rescue: Boy, 11, jumps in to save...",https://www.bbc.co.uk/news/uk-england-york-nor...,2020-08-21,bbc,0,0,1508029271,21082
3,Going for a Lake District walk with Max the Mi...,https://www.bbc.co.uk/news/uk-england-cumbria-...,2020-08-31,bbc,0,0,1508029261,21081
4,Rhyl lifeboat crew rescue seagull-chasing dog ...,https://www.bbc.co.uk/news/uk-wales-53974775?i...,2020-08-31,bbc,0,0,1508029246,21080
5,Long-awaited Giantess Geyser finally erupts,https://www.optimistdaily.com/2020/09/long-awa...,2020-09-01,optimistdaily,0,0,1508029234,21079
6,These US firms are part of a global reforestat...,https://www.optimistdaily.com/2020/09/these-us...,2020-09-01,optimistdaily,0,0,1508029226,21078
7,New lidar technology offers a glimpse into the...,https://www.optimistdaily.com/2020/09/new-lida...,2020-09-01,optimistdaily,0,0,1508029187,21077
8,Not a gram of salmon goes to waste with this n...,https://www.optimistdaily.com/2020/09/not-a-gr...,2020-09-01,optimistdaily,0,0,1508029173,21076
9,Doubling down on diesel: CA regulations will s...,https://www.optimistdaily.com/2020/09/doubling...,2020-09-01,optimistdaily,0,0,1508029163,21075
