In [1]:
from GoogleNews import GoogleNews
from newspaper import Article
import numpy as np
import pandas as pd
import random

In [2]:
# Load the list of climate change denier institutions as identified by desmoblog 
# (https://www.desmogblog.com/global-warming-denier-database)
institutions = pd.read_excel('denier institutions.xlsx')['denier institutions'].tolist()

In [3]:
for inst in institutions:
    print(inst)

55 Tufton Street
60 Plus Association
Accuracy in Media
Acton Institute
Adam Smith Institute
Aegis Strategic
Africa Fighting Malaria
African Centre for Advocacy and Human Development
Alberta Prosperity Fund
Alexis de Tocqueville Institution
Alternate Solutions Institute
America Rising LLC | PAC | America Rising Squared | Definers Public Affairs
American Association of Blacks in Energy
American Chemistry Council
American Coalition for Clean Coal Electricity
American Commitment
American Conservative Union
American Council for Capital Formation
American Council on Science and Health
American Encore
American Energy Alliance (AEA)
American Enterprise Institute
American Fuel & Petrochemical Manufacturers (AFPM) 
American Future Fund
American Institute for Economic Research
American Legislative Exchange Council (ALEC)
American Petroleum Institute
American Tradition Institute
Americans for Balanced Energy Choices
Americans for Prosperity
Americans for Tax Reform
Annapolis Center for Science-Bas

In [4]:
#Functions to retrieve google news articles from search terms and input range and extract full article text

def get_google_news(terms, date_range = ['01/01/1990' , '01/01/2020'], npages = 1):
    """
    Extract google news article search data (title, media, date, datetime, link, desc)
    
    terms: google news search input terms (str)
    date_range: range of dates the search needs to cover [start_date, end_date] 
    npages: number of return pages to search for (int)
    
    returns: list of dictionaries. One dict for each search results
    """
    # Define search terms
    # Should look at 1984 to start
    googlenews = GoogleNews(start =date_range[0] , end = date_range[1])
    googlenews.search(terms)

    # Get results for first 20 pages
    results = []
    for i in range(npages):
        googlenews.getpage(i)
        result = googlenews.result()
        results.append(result)

    results_flat = [item for sublist in results for item in sublist]
    return results_flat

def get_result_data(results):
    """
    Extracts article text, author and date from google news search result
    
    results: list of dictionaries. One dict for each search results
    
    returns: articles, dictionary of articles with keys: authors, year, text, link
    
    Notes: 
    - authors doesn't work well
    - the length of the output dictionary can be shorter than the input search results because links are often dead and 
    the article cannot be retrieved    
    - article text can be hidden by paywall
    """
    articles = []
    for n, art in enumerate(results):
        dico = {}
        dico['media'] = art['media']
        article = Article(art['link'])

        try:

            article.download()
            article.parse()
            dico['authors'] = article.authors
            try:
                dico['year'] = article.publish_date
            except:
                #print(article.publish_date)
                dico['year']=None
            dico['text'] = article.text
            #article.nlp()
            #dico['keywords'] = article.keywords
            #dico['summary'] = article.summary
            dico['link'] = art['link']
            articles.append(dico)
        except:
            #print(art['media'] + ' failed')
            pass
    return articles

def process_google_news(terms, date_range = ['01/01/1990' , '01/01/2020'], npages = 1, search_terms = None):
    """Webscrapes google news and returns full texts, authors, publication years and links
    
    terms: google news search input terms (str)
    date_range: range of dates the search needs to cover [start_date, end_date] 
    npages: number of result pages to scrape (int)
    
    returns: articles, dictionary of articles with keys: authors, year, text, link
    
    Notes: 
    - authors doesn't work well
    - the length of the output dictionary can be shorter than the input search results because links are often dead and 
    the article cannot be retrieved    
    - article text can be hidden by paywall
    """
    
    search_results = get_google_news(terms, date_range, npages)

    articles = get_result_data(search_results)
    if search_terms:
        for art in articles:
            art['search_terms'] = terms
    
    return articles

In [5]:
# Define search terms as ' "[INSTITUTION NAME]" "climate change" '
search_terms = ['"%s" "climate change"' %i for i in institutions ]

In [8]:
len(search_terms)

235

In [8]:
from datetime import date
import time

# Get current date to define search date range
today = date.today()
current_date = today.strftime('%d/%m/%Y')
print(current_date)

date_range = ['01/01/1990', current_date]
npages = 1

#data = [process_google_news(terms, date_range, npages) for terms in search_terms]
data = []
for terms in search_terms:
    time.sleep(90+30*random.uniform(0, 1))
    print(terms)
    results = process_google_news(terms, date_range, npages, terms)
    data.append(results)
    
    #if not results:
    #    break

18/01/2021
"55 Tufton Street" "climate change"
"60 Plus Association" "climate change"
"Accuracy in Media" "climate change"
"Acton Institute" "climate change"
"Adam Smith Institute" "climate change"
"Aegis Strategic" "climate change"
"Africa Fighting Malaria" "climate change"
"African Centre for Advocacy and Human Development" "climate change"
"Alberta Prosperity Fund" "climate change"
"Alexis de Tocqueville Institution" "climate change"
"Alternate Solutions Institute" "climate change"
"America Rising LLC | PAC | America Rising Squared | Definers Public Affairs" "climate change"
"American Association of Blacks in Energy" "climate change"
"American Chemistry Council" "climate change"
"American Coalition for Clean Coal Electricity" "climate change"
"American Commitment" "climate change"
"American Conservative Union" "climate change"
"American Council for Capital Formation" "climate change"
"American Council on Science and Health" "climate change"
"American Encore" "climate change"
"Americ

"Mercatus Center" "climate change"
"Mercer Family Foundation" "climate change"
"Modern Miracle Network" "climate change"
"Mont Pelerin Society" "climate change"
"Murray Energy" "climate change"
"National Association of Manufacturers" "climate change"
"National Black Chamber of Commerce" "climate change"
"National Center for Policy Analysis" "climate change"
"National Center for Public Policy Research" "climate change"
"National Chicken Council" "climate change"
"National Mining Association" "climate change"
"National Pork Producers Council" "climate change"
"Natural Resources Stewardship Project" "climate change"
"NERA Economic Consulting" "climate change"
"Nevada Policy Research Institute" "climate change"
"New Hope Environmental Services" "climate change"
"New Zealand Climate Science Coalition" "climate change"
"Oil and Gas Climate Initiative" "climate change"
"Oregon Institute of Science and Medicine" "climate change"
"Oregon Petition" "climate change"
"Pacific Research Institute" "

In [9]:
data_flat = np.concatenate(data) #np.array(data).flatten()

#merge the dictionaries for individual articles into one
big_dict = {}
for k in data_flat[0].keys():
    big_dict[k] = [d[k] for d in data_flat]
    
    
df = pd.DataFrame(big_dict).astype(str)
df.to_excel('denier_inst_articles_withMedia.xlsx')

In [78]:
df

Unnamed: 0,authors,year,text,link,search_terms
0,"['Byline Times', 'Https', 'Bylinetimes.Com', '...",2020-09-03 00:00:00,"Rupert Read spent last night in the cells, aft...",https://bylinetimes.com/2020/09/03/why-i-was-w...,"""55 Tufton Street"" ""climate change"""
1,"['Byline Times', 'Https', 'Bylinetimes.Com', '...",2020-09-08 00:00:00,Stephen Colegrave talks to co-founder of the W...,https://bylinetimes.com/2020/09/08/tufton-stre...,"""55 Tufton Street"" ""climate change"""
2,['Jessica Murray'],2020-09-02 00:00:00,"A number of famous novelists, poets and playwr...",https://www.theguardian.com/environment/2020/s...,"""55 Tufton Street"" ""climate change"""
3,"['Sian Cain', 'Jessica Murray']",2020-09-02 00:00:00,A group of artists and writers including Zadie...,https://www.theguardian.com/environment/2020/s...,"""55 Tufton Street"" ""climate change"""
4,[],2016-02-10 21:10:53+00:00,Tucked away on the side streets of Westminster...,https://www.independent.co.uk/news/uk/politics...,"""55 Tufton Street"" ""climate change"""
5,['Max Wakefield'],2020-03-03 00:00:00,"On Monday, the government did something remark...",https://www.theguardian.com/commentisfree/2020...,"""55 Tufton Street"" ""climate change"""
6,[],,"55 Tufton Street, where many of the meetings t...",https://www.opendemocracy.net/en/dark-money-in...,"""55 Tufton Street"" ""climate change"""
7,[],2020-09-03 13:49:46+01:00,EXTINCTION REBELLION (XR) activists glued them...,https://morningstaronline.co.uk/article/b/xr-a...,"""55 Tufton Street"" ""climate change"""
8,[],,Tufton Street network\n\nWhen Theresa May anno...,https://www.opendemocracy.net/en/opendemocracy...,"""55 Tufton Street"" ""climate change"""
9,"['Byline Times', 'Https', 'Bylinetimes.Com', '...",2020-09-03 00:00:00,"Rupert Read spent last night in the cells, aft...",https://bylinetimes.com/2020/09/03/why-i-was-w...,"""55 Tufton Street"" ""climate change"""
