# Scraping articles from websites that only post good news

Websites that only post good news:
* https://www.positive.news/
* https://www.bbc.co.uk/news/topics/cx2pk70323et/uplifting-stories **DONE**
* https://www.optimistdaily.com/ **DONE**

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import dateutil.parser
from datetime import *
from dateutil.relativedelta import *
from tqdm.notebook import tqdm

In [2]:
website_attributes = {
    'https://www.optimistdaily.com': {
        'all_article_tag': 'div',
        'all_article_class': 'single-post-content-sidebar-wrap',
        'article_tag': 'article',
        'date_tag': 'time',
        'date_class': 'entry-time',
        'content_tag': 'div',
        'content_class': 'postContent'
    },
    'https://www.bbc.co.uk': {
        'all_article_tag': 'div',
        'all_article_class': 'gel-layout gel-layout--center',
        'article_tag': 'article',
        'date_tag': 'div',
        'date_class': 'date date--v2',
        'content_tag': 'div',
        'content_class': 'story-body__inner'
    }
}

In [3]:
def get_all_article_urls(url, website_attributes):
    r=requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    base_url = re.match('^.+?[^\/:](?=[?\/]|$)', url).group(0)
    print('base url is', base_url)
    
    if base_url in website_attributes:
        print('web parameters found, parsing')
        params = website_attributes[base_url]
        
        all_articles = soup.find(params['all_article_tag'],
                                 attrs={'class': params['all_article_class']})
        all_article_urls = all_articles.find_all(params['article_tag'])
        return_urls = []
        for article_url in all_article_urls:
            if 'bbc' in base_url:
                return_urls.append(base_url+article_url.find('a')['href'])
            else:
                return_urls.append(article_url.find('a')['href'])
        print('number of article URLs found:', len(return_urls))
    return return_urls

In [4]:
def get_article_details(url, website_attributes):
    
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    base_url = re.match('^.+?[^\/:](?=[?\/]|$)', url).group(0)
#     print('base url is', base_url)
    
    article_title = soup.title.text.strip()
    
    if base_url in website_attributes:
#         print('web parameters found, parsing')
        params = website_attributes[base_url]
        
        try:
            article_date = soup.find(params['date_tag']
                                 , attrs={'class': params['date_class']})
            article_date = article_date.text.strip()
        except AttributeError:
            print('error - article date could not be found')
            return None, None, None, None
        
        try:
            postContent = soup.find(params['content_tag']
                                    , attrs={'class': params['content_class']})
        except AttributeError:
            print('error - article content could not be found')
            return None, None, None, None
            
        article_text = []
        for para in postContent.find_all('p'):
            article_text.append(para.text.strip())
        
        return article_title, article_date, article_text, base_url
    else:
        print('url not found in website parameters. returning None types')
        return None, None, None, None

In [5]:
df_test = pd.DataFrame()
cols = ['url', 'title', 'date', 'text', 'base_url']
url1 = 'https://www.optimistdaily.com/todays-solutions/'
url2 = 'https://www.bbc.co.uk/news/topics/cx2pk70323et/uplifting-stories'

todays_articles_1 = get_all_article_urls(url1, website_attributes)
todays_articles_2 = get_all_article_urls(url2, website_attributes)

todays_articles = todays_articles_1 + todays_articles_2

for url in tqdm(todays_articles):
    article_title, article_date, article_text, base_url = get_article_details(url, website_attributes)
    df_test = df_test.append(pd.DataFrame([[url, article_title, article_date, 
                                            article_text, base_url]], 
                                          columns=cols))
    
df_test.reset_index()

base url is https://www.optimistdaily.com
web parameters found, parsing
number of article URLs found: 10
base url is https://www.bbc.co.uk
web parameters found, parsing
number of article URLs found: 10


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

error - article date could not be found



Unnamed: 0,index,url,title,date,text,base_url
0,0,https://www.optimistdaily.com/2020/08/phew-cal...,"Phew! California's 2,000-year-old redwoods sur...","August 27, 2020","[With over 1 million acres torched and 12,000 ...",https://www.optimistdaily.com
1,0,https://www.optimistdaily.com/2020/08/painting...,Why painting eyes on the buttocks of cows can ...,"August 27, 2020",[Being a cattle-owning farmer in Botswana is n...,https://www.optimistdaily.com
2,0,https://www.optimistdaily.com/2020/08/this-com...,Now pets can take part in the alternative meat...,"August 27, 2020",[The alternative meat movement is growing and ...,https://www.optimistdaily.com
3,0,https://www.optimistdaily.com/2020/08/against-...,Smithsonian National Zoo celebrates birth of p...,"August 27, 2020",[It’s been a big month for animal conservation...,https://www.optimistdaily.com
4,0,https://www.optimistdaily.com/2020/08/forget-p...,Forget plastic: Here are some greener ways to ...,"August 27, 2020",[While Ziploc bags and plastic wrap can be use...,https://www.optimistdaily.com
5,0,https://www.optimistdaily.com/2020/08/this-tin...,This tiny robot could help surgeons with preci...,"August 27, 2020","[Drawing inspiration from Origami, researchers...",https://www.optimistdaily.com
6,0,https://www.optimistdaily.com/2020/08/refill-s...,Refill station in NYC laundromat offers waste-...,"August 27, 2020",[As we’ve seen with Nivea’s recent initiative ...,https://www.optimistdaily.com
7,0,https://www.optimistdaily.com/2020/08/what-lif...,What life is actually like at NYC's hotels-tur...,"August 27, 2020",[With the coronavirus pandemic leaving hotels ...,https://www.optimistdaily.com
8,0,https://www.optimistdaily.com/2020/08/food-bra...,Food brand KIND pledges to source bee-friendly...,"August 27, 2020",[Bee pollination is essential to coaxing fruit...,https://www.optimistdaily.com
9,0,https://www.optimistdaily.com/2020/08/parachut...,Parachutist completes world's first jump from ...,"August 27, 2020",[Parachuting from an airplane in itself is a m...,https://www.optimistdaily.com


In [6]:
df_test = df_test.dropna(how='any')
df_test['date_parsed'] = [dateutil.parser.parse(x).date() for x in df_test['date']]
df_test

Unnamed: 0,url,title,date,text,base_url,date_parsed
0,https://www.optimistdaily.com/2020/08/phew-cal...,"Phew! California's 2,000-year-old redwoods sur...","August 27, 2020","[With over 1 million acres torched and 12,000 ...",https://www.optimistdaily.com,2020-08-27
0,https://www.optimistdaily.com/2020/08/painting...,Why painting eyes on the buttocks of cows can ...,"August 27, 2020",[Being a cattle-owning farmer in Botswana is n...,https://www.optimistdaily.com,2020-08-27
0,https://www.optimistdaily.com/2020/08/this-com...,Now pets can take part in the alternative meat...,"August 27, 2020",[The alternative meat movement is growing and ...,https://www.optimistdaily.com,2020-08-27
0,https://www.optimistdaily.com/2020/08/against-...,Smithsonian National Zoo celebrates birth of p...,"August 27, 2020",[It’s been a big month for animal conservation...,https://www.optimistdaily.com,2020-08-27
0,https://www.optimistdaily.com/2020/08/forget-p...,Forget plastic: Here are some greener ways to ...,"August 27, 2020",[While Ziploc bags and plastic wrap can be use...,https://www.optimistdaily.com,2020-08-27
0,https://www.optimistdaily.com/2020/08/this-tin...,This tiny robot could help surgeons with preci...,"August 27, 2020","[Drawing inspiration from Origami, researchers...",https://www.optimistdaily.com,2020-08-27
0,https://www.optimistdaily.com/2020/08/refill-s...,Refill station in NYC laundromat offers waste-...,"August 27, 2020",[As we’ve seen with Nivea’s recent initiative ...,https://www.optimistdaily.com,2020-08-27
0,https://www.optimistdaily.com/2020/08/what-lif...,What life is actually like at NYC's hotels-tur...,"August 27, 2020",[With the coronavirus pandemic leaving hotels ...,https://www.optimistdaily.com,2020-08-27
0,https://www.optimistdaily.com/2020/08/food-bra...,Food brand KIND pledges to source bee-friendly...,"August 27, 2020",[Bee pollination is essential to coaxing fruit...,https://www.optimistdaily.com,2020-08-27
0,https://www.optimistdaily.com/2020/08/parachut...,Parachutist completes world's first jump from ...,"August 27, 2020",[Parachuting from an airplane in itself is a m...,https://www.optimistdaily.com,2020-08-27


In [7]:
dateutil.parser.parse(df_test.date.iloc[0]).date()

datetime.date(2020, 8, 27)

In [8]:
df_test['date_parsed'].iloc[0]

datetime.date(2020, 8, 27)

# Now we need to send this data to podio

## First parse in the podio data we have so we don't create duplicates

In [9]:
def mask_df(in_df, col='', filt=''):
    """ Filters dataframe using user defined column and filter"""
    mask = in_df[col] == filt
    return in_df[mask]

In [10]:
import logging
import json
import azure.functions as func
import time
import requests

podio_data = {
    "app_id": "25058801",
    "app_token": "683029008df9495a8947c90a38f75ce9",
    "client_id": "goodnews",
    "client_secret": "wrCUCZSxFuPmPZpm7f9iRWm9J4mS6VshbDuXxjNYAHL5RAMTKOFy4VSwHZ4w3csk",
    "grant_type": "app"
}

app_id = podio_data['app_id']
auth_url = 'https://podio.com/oauth/token'
podio_resp = requests.post(auth_url, data=podio_data)

if not podio_resp.ok:
    raise Exception("Auth failed", podio_resp)

podio_resp = podio_resp.json()
token = podio_resp['access_token']

headers = {
    "content-type": "application/json",
    "authorization": "Bearer " + token
}
token

'e2ca5614b5a6446fa7637a6c1e5b4111'

In [11]:
today = date.today()
back_to = today + relativedelta(days=-4)

In [12]:
item_url = "https://api.podio.com/item/app/" + app_id + "/filter/"
headers = {
    "content-type": "application/json",
    "authorization": "Bearer " + token
}


data = {
    "limit":500,
    "offset":0,
    "filters":{
        }
    }

all_data = requests.post(item_url, headers=headers, json=data).json()

In [13]:
all_data

{'filtered': 1,
 'total': 1,
 'items': [{'ratings': {'like': {'average': None,
     'counts': {'1': {'total': 0, 'users': []}}}},
   'sharefile_vault_url': None,
   'last_event_on': '2020-08-27 10:08:59',
   'rights': ['rate',
    'add_file',
    'comment',
    'subscribe',
    'add_task',
    'delete',
    'update',
    'view',
    'add_conversation'],
   'app_item_id': 101,
   'fields': [{'type': 'text',
     'field_id': 212301493,
     'label': 'Title',
     'values': [{'value': "Phew! California's 2,000-year-old redwoods survived the wildfires | The Optimist Daily: Making Solutions the News"}],
     'config': {'settings': {'format': 'plain', 'size': 'small'},
      'mapping': None,
      'label': 'Title'},
     'external_id': 'title'},
    {'type': 'text',
     'field_id': 212358246,
     'label': 'URL',
     'values': [{'value': 'https://www.optimistdaily.com/2020/08/phew-californias-2000-year-old-redwoods-survived-the-wildfires/'}],
     'config': {'settings': {'format': 'plain',

In [14]:
num_items = len(all_data['items'])
df = pd.DataFrame()
for i in range(0, num_items):
    num_item_entries= len(all_data['items'][i]['fields'])
    #grab item id
    item_id = all_data['items'][i]['item_id']
    
    for j in range(0, num_item_entries):
        temp_df = pd.json_normalize(all_data['items'][i]['fields'][j])
        temp_df['item_id'] = item_id
        df = df.append(temp_df)    

df_clean = df
df_clean['values'] = [x[0]['value'] for x in df['values']]
all_titles = mask_df(df_clean, col='label', filt='Title')['values'].values
all_titles

array(["Phew! California's 2,000-year-old redwoods survived the wildfires | The Optimist Daily: Making Solutions the News"],
      dtype=object)

## Now we check if we have already written those news stories to podio, and if not, upload them

In [15]:
from pprint import pprint

for index, row in df_test.iterrows():  
    if row.title in all_titles:
        print(f'{row.title} already in podio... ignoring')
        pass
    else:    
        item = {
            "title": str(row.title),
            "url-3": str(row.url),
            "date-3": str(row.date_parsed),
            "provider": str(row.base_url),
            "upvotes": 0,
            "downvotes": 0,
        }
        pprint(item)

        data = {
            "fields": item
        }

        item_url = f"https://api.podio.com/item/app/{app_id}/"
        item_resp = requests.post(item_url, json=data, headers=headers)
        if not item_resp.ok:
            raise Exception("Post failed", item_resp)


Phew! California's 2,000-year-old redwoods survived the wildfires | The Optimist Daily: Making Solutions the News already in podio... ignoring
{'date-3': '2020-08-27',
 'downvotes': 0,
 'provider': 'https://www.optimistdaily.com',
 'title': 'Why painting eyes on the buttocks of cows can save them from '
          'predators | The Optimist Daily: Making Solutions the News',
 'upvotes': 0,
 'url-3': 'https://www.optimistdaily.com/2020/08/painting-eyes-on-the-buttocks-of-cows-can-save-them-from-ambush-predators/'}
{'date-3': '2020-08-27',
 'downvotes': 0,
 'provider': 'https://www.optimistdaily.com',
 'title': 'Now pets can take part in the alternative meat movement | The '
          'Optimist Daily: Making Solutions the News',
 'upvotes': 0,
 'url-3': 'https://www.optimistdaily.com/2020/08/this-company-is-including-pets-in-the-alternative-meat-movement/'}
{'date-3': '2020-08-27',
 'downvotes': 0,
 'provider': 'https://www.optimistdaily.com',
 'title': 'Smithsonian National Zoo celebrates

In [16]:
df_test.url.iloc[0]

'https://www.optimistdaily.com/2020/08/phew-californias-2000-year-old-redwoods-survived-the-wildfires/'