## Date: April 2022
## Project: Repression events dataset. 
## Author: Almira Sadykova-DuMond

In [8]:
# Importing libraries
import pandas as pd
import os

In [9]:
# Setting working directory
# NOTE(review): hard-coded, machine-specific absolute path. Relative paths
# used later in the notebook (e.g. the CSV export) resolve against it.
os.chdir('C:/Users/16073/Desktop/RED')

In [10]:
## Re-defining the package's GoogleNews class locally to fix a date parsing issue in the package code.
import feedparser
from bs4 import BeautifulSoup
import urllib
from dateutil.parser import parse as date_parse
import requests


class GoogleNews:
    """Small client for the Google News RSS feeds.

    Builds feed URLs under ``BASE_URL`` for the top-news, topic, geo and
    search endpoints, downloads them with ``requests`` (directly, through
    proxies, or through the ScrapingBee API) and parses them with
    ``feedparser``.
    """

    # Topic names served from the '/headlines/section/topic/' path; any
    # other topic value is looked up under '/topics/'.
    _SECTION_TOPICS = ('WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY',
                       'ENTERTAINMENT', 'SCIENCE', 'SPORTS', 'HEALTH')

    def __init__(self, lang = 'en', country = 'US'):
        """Store the language (lower-cased) and country (upper-cased)."""
        self.lang = lang.lower()
        self.country = country.upper()
        self.BASE_URL = 'https://news.google.com/rss'

    def __top_news_parser(self, text):
        """Return subarticles from the main and topic feeds.

        Every ``<li>`` element of ``text`` that has a link and a ``<font>``
        tag becomes a dict with the keys ``url``, ``title`` and
        ``publisher``. If ``text`` cannot be parsed at all it is returned
        unchanged.
        """
        try:
            bs4_html = BeautifulSoup(text, "html.parser")
            sub_articles = []
            for li in bs4_html.find_all('li'):
                try:
                    sub_articles.append({"url": li.a['href'],
                                         "title": li.a.text,
                                         "publisher": li.font.text})
                except (AttributeError, KeyError, TypeError):
                    # <li> without <a>, without href, or without <font>:
                    # not a sub-article, skip it.
                    continue
            return sub_articles
        except Exception:
            # Best effort: hand the raw summary back to the caller.
            return text

    def __ceid(self):
        """Compile correct country-lang parameters for Google News RSS URL"""
        return '?ceid={}:{}&hl={}&gl={}'.format(self.country,self.lang,self.lang,self.country)

    def __add_sub_articles(self, entries):
        """Add a ``sub_articles`` key to every entry (in place) and return the list.

        The value is the parsed ``summary`` of the entry, or ``None`` when
        the entry has no ``summary``.
        """
        for entry in entries:
            if 'summary' in entry.keys():
                entry['sub_articles'] = self.__top_news_parser(entry['summary'])
            else:
                entry['sub_articles'] = None
        return entries

    def __scaping_bee_request(self, api_key, url):
        """Fetch ``url`` through the ScrapingBee API and return the response.

        Raises:
            Exception: if ScrapingBee answers with a status other than 200.
        """
        response = requests.get(
            url="https://app.scrapingbee.com/api/v1/",
            params={
                "api_key": api_key,
                "url": url,
                "render_js": "false"
            }
        )
        if response.status_code != 200:
            raise Exception("ScrapingBee status_code: "  + str(response.status_code) + " " + response.text)
        return response

    def __parse_feed(self, feed_url, proxies=None, scraping_bee = None):
        """Download ``feed_url`` and return its parsed ``feed`` and ``entries``.

        Args:
            feed_url: full URL of the RSS feed.
            proxies: optional proxies mapping passed to ``requests.get``.
            scraping_bee: optional ScrapingBee API key; when given, the feed
                is fetched through ScrapingBee.

        Returns:
            dict with the keys ``'feed'`` and ``'entries'`` taken from the
            feedparser result.

        Raises:
            Exception: if both ``proxies`` and ``scraping_bee`` are given, or
                if the response URL points at the 'unsupported' feed.
        """
        if scraping_bee and proxies:
            raise Exception("Pick either ScrapingBee or proxies. Not both!")

        # Exactly one request per call. Previously the proxied response was
        # overwritten by a second, un-proxied request, so proxies were
        # silently ignored and the feed was downloaded twice.
        if scraping_bee:
            r = self.__scaping_bee_request(url = feed_url, api_key = scraping_bee)
        elif proxies:
            r = requests.get(feed_url, proxies = proxies)
        else:
            r = requests.get(feed_url)

        if 'https://news.google.com/rss/unsupported' in r.url:
            raise Exception('This feed is not available')

        d = feedparser.parse(r.text)

        # Direct requests only: if nothing was parsed from the downloaded
        # text, let feedparser fetch the URL itself.
        if not scraping_bee and not proxies and len(d['entries']) == 0:
            d = feedparser.parse(feed_url)

        return {k: d[k] for k in ('feed', 'entries')}

    def __search_helper(self, query):
        """Return ``query`` URL-quoted with ``quote_plus``."""
        return urllib.parse.quote_plus(query)

    def __from_to_helper(self, validate=None):
        """Parse a date string and return it formatted as ``YYYY-MM-DD``.

        Raises:
            Exception: if ``validate`` cannot be parsed as a date.
        """
        try:
            return str(date_parse(validate).strftime('%Y-%m-%d'))
        except Exception as err:
            raise Exception('Could not parse your date') from err

    def top_news(self, proxies=None, scraping_bee = None):
        """Return a list of all articles from the main page of Google News
        given a country and a language"""
        d = self.__parse_feed(self.BASE_URL + self.__ceid(), proxies=proxies, scraping_bee=scraping_bee)
        d['entries'] = self.__add_sub_articles(d['entries'])
        return d

    def topic_headlines(self, topic: str, proxies=None, scraping_bee=None):
        """Return a list of all articles from the topic page of Google News
        given a country and a language

        Raises:
            Exception: ('unsupported topic') if the feed has no entries.
        """
        if topic.upper() in self._SECTION_TOPICS:
            d = self.__parse_feed(self.BASE_URL + '/headlines/section/topic/{}'.format(topic.upper()) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee)
        else:
            # Any other value is used verbatim (case preserved) under '/topics/'.
            d = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee)

        d['entries'] = self.__add_sub_articles(d['entries'])
        if len(d['entries']) > 0:
            return d
        raise Exception('unsupported topic')

    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None):
        """Return a list of all articles about a specific geolocation
        given a country and a language"""
        d = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(geo) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee)

        d['entries'] = self.__add_sub_articles(d['entries'])
        return d

    def search(self, query: str, helper = True, when = None, from_ = None, to_ = None, proxies=None, scraping_bee=None):
        """
        Return a list of all articles given a full-text search parameter,
        a country and a language
        :param bool helper: When True helps with URL quoting
        :param str when: Sets a time range for the articles that can be found
        :param str from_: Start date, appended to the query as ``after:YYYY-MM-DD``;
            ignored when ``when`` is given
        :param str to_: End date, appended to the query as ``before:YYYY-MM-DD``;
            ignored when ``when`` is given
        """

        if when:
            query += ' when:' + when

        if from_ and not when:
            from_ = self.__from_to_helper(validate=from_)
            query += ' after:' + from_

        if to_ and not when:
            to_ = self.__from_to_helper(validate=to_)
            query += ' before:' + to_

        if helper:
            query = self.__search_helper(query)

        # The search URL already starts its query string with '?q=', so the
        # country/language parameters have to be joined with '&'.
        search_ceid = self.__ceid().replace('?', '&')

        d = self.__parse_feed(self.BASE_URL + '/search?q={}'.format(query) + search_ceid, proxies = proxies, scraping_bee=scraping_bee)

        d['entries'] = self.__add_sub_articles(d['entries'])
        return d

In [11]:
# setting up search language and country/region
# The constructor lower-cases `lang` and upper-cases `country`; both are used
# to build the ceid/hl/gl parameters appended to every feed URL.
gn = GoogleNews(lang = 'en', country='CA')

In [17]:
# search terms and search period
# from_/to_ are appended to the query as 'after:'/'before:' dates. With the
# default helper=True the whole query is URL-quoted, so the literal '+' is sent
# as '%2B' (see the feed link in the printed result).
search = gn.search('arrest+The New York Times', from_='2000-02-02', to_="2010-02-15")

In [18]:
# Dump the raw result dict (keys 'feed' and 'entries') for a quick look
print(search)

{'feed': {'generator_detail': {'name': 'NFE/5.0'}, 'generator': 'NFE/5.0', 'title': '"arrest+The New York Times after:2000-02-02 before:2010-02-15" - Google News', 'title_detail': {'type': 'text/plain', 'language': None, 'base': '', 'value': '"arrest+The New York Times after:2000-02-02 before:2010-02-15" - Google News'}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://news.google.com/search?q=arrest%2BThe+New+York+Times+after:2000-02-02+before:2010-02-15&ceid=CA:en&hl=en-CA&gl=CA'}], 'link': 'https://news.google.com/search?q=arrest%2BThe+New+York+Times+after:2000-02-02+before:2010-02-15&ceid=CA:en&hl=en-CA&gl=CA', 'language': 'en-CA', 'publisher': 'news-webmaster@google.com', 'publisher_detail': {'email': 'news-webmaster@google.com'}, 'rights': '2022 Google Inc.', 'rights_detail': {'type': 'text/plain', 'language': None, 'base': '', 'value': '2022 Google Inc.'}, 'updated': 'Wed, 04 May 2022 12:25:43 GMT', 'updated_parsed': time.struct_time(tm_year=2022, tm_mon=5, t

In [14]:
# Show the list of parsed feed entries returned by the search
print(search['entries'])

[]


In [18]:
# Print headline, publication date and raw HTML summary of every search hit
for item in search['entries']:
    print(*(item[field] for field in ('title', 'published', 'summary')))

Until Latest Arrest, Simpson Enjoyed a Mostly Placid Life of Leisure in Miami's Suburbs (Published 2007) - The New York Times Wed, 19 Sep 2007 07:00:00 GMT <a href="https://www.nytimes.com/2007/09/19/us/19oj.html" target="_blank">Until Latest Arrest, Simpson Enjoyed a Mostly Placid Life of Leisure in Miami's Suburbs (Published 2007)</a>&nbsp;&nbsp;<font color="#6f6f6f">The New York Times</font>
No Photo Ban in Subways, Yet an Arrest (Published 2009) - The New York Times Tue, 17 Feb 2009 08:00:00 GMT <a href="https://www.nytimes.com/2009/02/18/nyregion/18about.html" target="_blank">No Photo Ban in Subways, Yet an Arrest (Published 2009)</a>&nbsp;&nbsp;<font color="#6f6f6f">The New York Times</font>
Racial Disparities Found to Persist as Drug Arrests Rise (Published 2008) - The New York Times Tue, 06 May 2008 07:00:00 GMT <a href="https://www.nytimes.com/2008/05/06/us/05cnd-disparities.html" target="_blank">Racial Disparities Found to Persist as Drug Arrests Rise (Published 2008)</a>&nbs

In [19]:
# Number of entries returned by the search
print(len(search['entries']))

100


In [20]:
# Reduce every search hit to its headline, URL and publication date
newsitem = search['entries']
stories = [
    {'title': item.title, 'link': item.link, 'date': item.published}
    for item in newsitem
]
   

In [21]:
# Inspect the reduced records (title / link / date)
print(stories)

[{'title': 'Opinion | Why Arrest Roman Polanski Now? - The New York Times', 'link': 'https://www.nytimes.com/2009/09/30/opinion/30harris.html', 'date': 'Tue, 29 Sep 2009 07:00:00 GMT'}, {'title': "Lawyer's Arrest in Canada Has His Firm in Chaos (Published 2008) - The New York Times", 'link': 'https://www.nytimes.com/2008/12/06/nyregion/06lawyer.html', 'date': 'Fri, 05 Dec 2008 08:00:00 GMT'}, {'title': 'No Photo Ban in Subways, Yet an Arrest (Published 2009) - The New York Times', 'link': 'https://www.nytimes.com/2009/02/18/nyregion/18about.html', 'date': 'Tue, 17 Feb 2009 08:00:00 GMT'}, {'title': 'Racial Disparities Found to Persist as Drug Arrests Rise (Published 2008) - The New York Times', 'link': 'https://www.nytimes.com/2008/05/06/us/05cnd-disparities.html', 'date': 'Tue, 06 May 2008 07:00:00 GMT'}, {'title': '2 Men Scale New York Times Building Hours Apart (Published 2008) - The New York Times', 'link': 'https://www.nytimes.com/2008/06/06/nyregion/06climber.html', 'date': 'Fri,

In [22]:
# One row per story; the columns come from the dict keys: title, link, date
df = pd.DataFrame(stories)

In [23]:
# Preview the first rows of the table
df.head()

Unnamed: 0,title,link,date
0,Opinion | Why Arrest Roman Polanski Now? - The...,https://www.nytimes.com/2009/09/30/opinion/30h...,"Tue, 29 Sep 2009 07:00:00 GMT"
1,Lawyer's Arrest in Canada Has His Firm in Chao...,https://www.nytimes.com/2008/12/06/nyregion/06...,"Fri, 05 Dec 2008 08:00:00 GMT"
2,"No Photo Ban in Subways, Yet an Arrest (Publis...",https://www.nytimes.com/2009/02/18/nyregion/18...,"Tue, 17 Feb 2009 08:00:00 GMT"
3,Racial Disparities Found to Persist as Drug Ar...,https://www.nytimes.com/2008/05/06/us/05cnd-di...,"Tue, 06 May 2008 07:00:00 GMT"
4,2 Men Scale New York Times Building Hours Apar...,https://www.nytimes.com/2008/06/06/nyregion/06...,"Fri, 06 Jun 2008 07:00:00 GMT"


In [24]:
# Write the table to NYT_search.csv (relative path, so it lands in the current
# working directory); index=False leaves the row index out of the file
df.to_csv('NYT_search.csv',index=False)