# Functions to find all articles from a set of websites by keyword

## Websites currently included:

* CNN.com

* Bloomberg.com

* BBC.com

* Euronews.com

* RT.com

* CNBC.com

* France24.com



Idea: 
Instead of Google Searches try scraping webarchive if the XPATH keeps changing.


In [1]:
# Necessary
import requests
from bs4 import BeautifulSoup
# Webscrape
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

from urllib.parse import quote_plus

from googlesearch import search

# Processing
import numpy as np
import pandas as pd
# newspaper
import newspaper as news
# RSS
import feedparser

In [None]:
def make_keywords(string):
    '''
    Takes an input string of comma separated keywords and
    parses it into a list
    
    (string) -> (list of strings)
    '''
    keywords = string.split(",")
    return keywords

class website:
    '''
    The website class, contains the link to the search site, 
    and some parameters for performing the search on it.
    '''
    def __init__(self, link):
        '''
        Website constructor, requires a link (string) to the screen which supports searching, 
        preferably just the search page if it exists.
        (string) -> (website)
        '''
        self.address = link
        self.newspaper = news.build(link)
        

def search_website(keywords, website):
    '''
    Takes in a list of keywords from make_keywords, and a website name in the list of websites,
    
    Returns a data frame of variables about each article
    (list, website) -> (data frame)
    '''
    driver.get(website.address) # move the driver to the website address
    time.wait(20) # wait for 20 seconds for the website to load
    

#### Newspaper Example:


In [None]:
CNN_news = news.build("http://cnn.com")
print(50*"*")
print("Length:")
print(50*"*")
CNN_news.size()

In [None]:
Bloomberg_news = news.build("https://www.bloomberg.com")
print(50*"*")
print("Length:")
print(50*"*")
Bloomberg_news.size()

In [None]:
print(Bloomberg_news.size())
print(50*"*")
print("Titles:")
for article in Bloomberg_news.articles:
    print(article.title)

#### RSS Feeds Example:

In [None]:
CNN_world_news = feedparser.parse("http://rss.cnn.com/rss/cnn_world.rss")
CNN_money_top_news = feedparser.parse("http://rss.cnn.com/rss/money_topstories.rss")

In [None]:
CNN_world_news

print(50*"*")
print("Titles:")
print(50*"*")
i = 0
while i < len(CNN_world_news["entries"]):
    print(CNN_world_news["entries"][i]["title"])
    i=i+1
i = 0 
print(50*"*")
print("Titles:")
print(50*"*")
while i < len(CNN_money_top_news["entries"]):
    print(CNN_money_top_news["entries"][i]["title"])
    i=i+1
print(50*"*")
print("Structure:")
print(50*"*")
print(CNN_money_top_news["entries"][1])

#### Webscraping:

In [2]:
class Browser:

    def __init__(self, initiate=True, implicit_wait_time = 10, explicit_wait_time = 2):
        self.implicit_wait_time = implicit_wait_time    # http://www.aptuz.com/blog/selenium-implicit-vs-explicit-waits/
        self.explicit_wait_time = explicit_wait_time    # http://www.aptuz.com/blog/selenium-implicit-vs-explicit-waits/
        if initiate:
            self.start()
        return

    def start(self):
        self.driver = webdriver.Chrome("/Users/michalmalyska/Programming/chromedriver")
        self.driver.implicitly_wait(self.implicit_wait_time)
        return

    def end(self):
        self.driver.quit()
        return

    def go_to_url(self, url, wait_time = None):
        if wait_time is None:
            wait_time = self.explicit_wait_time
        self.driver.get(url)
        print('[*] Fetching results from: {}'.format(url))
        time.sleep(wait_time)
        return

    def get_search_url(self, query, page_num=0, per_page=100, lang='en'):
        query = quote_plus(query)
        url = 'https://www.google.hr/search?q={}&num={}&start={}&nl={}'.format(query, per_page, page_num*per_page, lang)
        return url

    def scrape(self, n):
        #xpath migth change in future
        i = 1
        links = []
        while i < n:
            links.append(self.driver.find_elements_by_xpath("//*[@id='rso']/div/div/div[" + str(i) + "]/div/div/div[1]/span/div/div/ol/li/a")) # searches for all links insede h3 tags with class "r"
            i += 1
            # //*[@id="rso"]/div/div/div[1]/div/div/div[1]/span/div/div/ol/li/a
            # //*[@id="rso"]/div/div/div[2]/div/div/div[1]/span/div/div/ol/li/a
        results = []
        i = 1
        for link in links:
            print("Loop Number", i)
            print("Link", link)
            try:
                print("Try 1")
                d = {'url': link[0].get_attribute('href')}
                results.append(d)
                print(d)
            except Exception as e:
                print("Exception 1")
                print(e)
            i += 1
                
        return results

    def search(self, query, page_num=0, per_page=10, lang='en', wait_time = None, n = 2):
        if wait_time is None:
            wait_time = self.explicit_wait_time
        url = self.get_search_url(query, page_num, per_page, lang)
        self.go_to_url(url, wait_time)
        results = self.scrape(n)
        return results

In [None]:
# Activate the browser
browser = Browser()
# Let it sleep for 10 seconds to make sure it loads properly
time.sleep(3)

In [None]:
# Go to google.ca
browser.go_to_url("https://www.google.ca/")

# Temporary keyword, can be later passed in a loop or sth
keyword = "Deloitte"
# We can restrict this to just certain sites by the parameter:
# site:<url address>
# (CNN example)
site = "bloomberg.com"

#elem = browser.find_element_by_name("q")  # Find the search box
#elem.send_keys("site:" + site + " " + keyword + Keys.RETURN) # Type in the stuff into the search box

query = "site:" + site + " " + keyword
# Or we can use the class above:
results = browser.search(query, per_page = 100)

In [None]:
keyword = "Deloitte"
site = "bloomberg.com/news/articles"
query = "site:" + site + " " + keyword

for term in browser.search(query):
    browser.go_to_url(term["url"])

In [15]:
def search_bloomberg(keywords, n, debug = False):
    '''
    (list) -> (data frame)
    Performs a search on bloomberg.com for news about the keyword.
    Returns a dataframe with:
    
    keyword | link | author | headline | date | website | # likes | #shares | # views | language
    
    '''
    # Add one to n to make it have expected behaviour
    n = n + 1
    
    # Start the search browser (for the class Browser defined above)
    browser = Browser()
    
    # Start another web browser (regular web driver)
    driver = webdriver.Chrome("/Users/michalmalyska/Programming/chromedriver")
    
    main = pd.DataFrame(
        {
            "keyword" : [],
            "link" : [],
            "author" : [],
            "headline" : [],
            "Topic" : [],
            "date" : [],
            "website" : [],
            "likes" : [],
            "shares" : [],
            "views" : [],
            "language" :[]
        })
    for keyword in keywords:
        # initialize the dataframe
        df = pd.DataFrame(
        {
            "keyword" : [],
            "link" : [],
            "author" : [],
            "headline" : [],
            "Topic" : [],
            "date" : [],
            "website" : [],
            "likes" : [],
            "shares" : [],
            "views" : [],
            "language" :[]
        })
        site = "bloomberg.com/news"
        query = "site:" + site + " " + keyword
        for term in browser.search(query, n = n):
            
            if debug:
                print(term)
            
            # Go to the url
            driver.get(term["url"])
            
            # /html/body/main/div/article/div[2]/div/div/address/div/a
            # Author xpath
            #Author1 = driver.find_element_by_xpath("/html/body/main/div/article/div[2]/div/div/address/div/a")
            # Currently does not work, element.text is an empty string, so I will resort to using classes.
            
            
            # class : author-v2__byline
            try:
                Author = driver.find_element_by_class_name("author-v2__byline").text
            except:
                try:
                    Author = driver.find_element_by_class_name("author").text
                except: 
                    Author = "NA"
            
            # /html/body/main/div/article/div[2]/div/div/div[2]/time
            # Date xpath
            # Date = driver.find_element_by_xpath("/html/body/main/div/article/div[2]/div/div/div[2]/time")
            
            # class : article-timestamp
            try:
                Date = driver.find_element_by_class_name("article-timestamp").text
            except:
                Date = "NA"
            # /html/body/main/div/article/div[2]/div/div/h1
            # Headline 
            # Headline = driver.find_element_by_xpath("/html/body/main/div/article/div[2]/div/div/h1")
            
            # class: lede-text-v2__hed
            try: 
                Headline = driver.find_element_by_class_name("lede-text-v2__hed").text
            except:
                try:
                    Headline = driver.find_element_by_class_name("lede-text-only__highlight").text
                except:
                    Headline = "NA"
            
            # class eyebrow-v2
            try:
                Topic = driver.find_element_by_class_name("eyebrow-v2").text
            except:
                try: 
                    Topic = driver.find_element_by_class_name("eyebrow").text
                except: 
                    Topic = "NA"
            # Shares Likes and Views unavailable
            Shares = -1
            Likes = -1
            Views = -1
            
            # Language fixed to english
            Language = "ENG"
            
            # Keyword
            Keyword = keyword
            
            # Link
            Link = term["url"]
            
            # Website 
            Website = "Bloomberg"
            
            df2 = pd.DataFrame(
            {
            "keyword" : [Keyword],
            "link" : [Link],
            "author" : [Author],
            "headline" : [Headline],
            "Topic" : [Topic],
            "date" : [Date],
            "website" : [Website],
            "likes" : [Likes],
            "shares" : [Shares],
            "views" : [Views],
            "language" :[Language]
            })
            
            df = df.append(df2)
        main = main.append(df)
    return(main)

In [16]:
search_bloomberg(keywords=["Deloitte AI"], n = 3, debug=True)

[*] Fetching results from: https://www.google.hr/search?q=site%3Abloomberg.com%2Fnews+Deloitte+AI&num=10&start=0&nl=en
Loop Number 1
Link [<selenium.webdriver.remote.webelement.WebElement (session="b4976ea0bbf92ce21d0f7dc2ed9ae73f", element="0.8907667958497989-1")>, <selenium.webdriver.remote.webelement.WebElement (session="b4976ea0bbf92ce21d0f7dc2ed9ae73f", element="0.8907667958497989-2")>]
Try 1
{'url': 'https://webcache.googleusercontent.com/search?q=cache:zBoGrZ3o0m8J:https://www.bloomberg.com/news/articles/2018-12-12/artificial-intelligence-has-some-explaining-to-do+&cd=1&hl=en&ct=clnk&gl=ca'}
Loop Number 2
Link [<selenium.webdriver.remote.webelement.WebElement (session="b4976ea0bbf92ce21d0f7dc2ed9ae73f", element="0.8907667958497989-3")>, <selenium.webdriver.remote.webelement.WebElement (session="b4976ea0bbf92ce21d0f7dc2ed9ae73f", element="0.8907667958497989-4")>]
Try 1
{'url': 'https://webcache.googleusercontent.com/search?q=cache:YM6cb-9e4ckJ:https://www.bloomberg.com/news/artic

Unnamed: 0,keyword,link,author,headline,Topic,date,website,likes,shares,views,language
0,Deloitte AI,https://webcache.googleusercontent.com/search?...,Jeremy Kahn,Artificial Intelligence Has Some Explaining to Do,,"December 12, 2018, 6:00 AM EST",Bloomberg,-1.0,-1.0,-1.0,ENG
0,Deloitte AI,https://webcache.googleusercontent.com/search?...,Emma Kinery,"AI to Reshape Finance, Say Executives Who Stru...",Technology,"August 15, 2018, 6:00 PM EDT",Bloomberg,-1.0,-1.0,-1.0,ENG
0,Deloitte AI,https://webcache.googleusercontent.com/search?...,Giles Turner,Deloitte Email Platform and Client Data Hit by...,Technology,"September 25, 2017, 10:38 AM EDT",Bloomberg,-1.0,-1.0,-1.0,ENG


In [31]:
def search_CNN(keywords, n, debug = False):
    '''
    (list) -> (data frame)
    Performs a search on CNN.com for news about the keyword.
    Returns a dataframe with:
    
    keyword | link | author | headline | date | website | # likes | #shares | # views | language
    
    '''
    # Add one to n to make it have expected behaviour
    n = n + 1
    
    # Start the search browser (for the class Browser defined above)
    browser = Browser()
    
    # Start another web browser (regular web driver)
    driver = webdriver.Chrome("/Users/michalmalyska/Programming/chromedriver")
    
    main = pd.DataFrame(
        {
            "keyword" : [],
            "link" : [],
            "author" : [],
            "headline" : [],
            "Topic" : [],
            "date" : [],
            "website" : [],
            "likes" : [],
            "shares" : [],
            "views" : [],
            "language" :[]
        })
    for keyword in keywords:
        # initialize the dataframe
        df = pd.DataFrame(
        {
            "keyword" : [],
            "link" : [],
            "author" : [],
            "headline" : [],
            "Topic" : [],
            "date" : [],
            "website" : [],
            "likes" : [],
            "shares" : [],
            "views" : [],
            "language" :[]
        })
        site = "cnn.com"
        query = "site:" + site + " " + keyword
        for term in browser.search(query, n = n):
            
            if debug:
                print(term)
            
            # Go to the url
            driver.get(term["url"])
            
            # /html/body/main/div/article/div[2]/div/div/address/div/a
            # Author xpath
            #Author1 = driver.find_element_by_xpath("/html/body/main/div/article/div[2]/div/div/address/div/a")
            # Currently does not work, element.text is an empty string, so I will resort to using classes.
            
            
            # class : author-v2__byline
            try:
                Author = driver.find_element_by_class_name("metadata__byline__author").text
            except:
                try:
                    Author = driver.find_element_by_class_name("author").text
                except: 
                    try: 
                        Author = driver.find_element_by_class_name("bold author-name").text
                    except:
                        Author = "NA"
            
            # /html/body/main/div/article/div[2]/div/div/div[2]/time
            # Date xpath
            # Date = driver.find_element_by_xpath("/html/body/main/div/article/div[2]/div/div/div[2]/time")
            
            # class : article-timestamp
            try:
                Date = driver.find_element_by_class_name("update-time").text
            except:
                try: 
                    Date = driver.find_element_by_class_name("timestamp published-date padding-12-left").text
                except:
                    Date = "NA"
            # /html/body/main/div/article/div[2]/div/div/h1
            # Headline 
            # Headline = driver.find_element_by_xpath("/html/body/main/div/article/div[2]/div/div/h1")
            
            # class: lede-text-v2__hed
            try: 
                Headline = driver.find_element_by_class_name("article-headline_articleHeadlineContainer__qoLpy_885427b2").text
            except:
                try:
                    Headline = driver.find_element_by_class_name("heading-content").text
                except:
                    Headline = "NA"
            
            # class eyebrow-v2
            try:
                Topic = driver.find_element_by_class_name("eyebrow-v2").text
            except:
                try: 
                    Topic = driver.find_element_by_class_name("eyebrow").text
                except: 
                    Topic = "NA"
            # Shares Likes and Views unavailable
            Shares = -1
            Likes = -1
            Views = -1
            
            # Language fixed to english
            Language = "ENG"
            
            # Keyword
            Keyword = keyword
            
            # Link
            Link = term["url"]
            
            # Website 
            Website = "CNN"
            
            df2 = pd.DataFrame(
            {
            "keyword" : [Keyword],
            "link" : [Link],
            "author" : [Author],
            "headline" : [Headline],
            "Topic" : [Topic],
            "date" : [Date],
            "website" : [Website],
            "likes" : [Likes],
            "shares" : [Shares],
            "views" : [Views],
            "language" :[Language]
            })
            
            df = df.append(df2)
        main = main.append(df)
    return(main)

In [32]:
search_CNN(keywords=["Deloitte AI"], n = 3, debug=True)

[*] Fetching results from: https://www.google.hr/search?q=site%3Acnn.com+Deloitte+AI&num=10&start=0&nl=en
Loop Number 1
Link [<selenium.webdriver.remote.webelement.WebElement (session="21e8f39f7fe13e630b71771509e08bd3", element="0.1860618389828277-1")>, <selenium.webdriver.remote.webelement.WebElement (session="21e8f39f7fe13e630b71771509e08bd3", element="0.1860618389828277-2")>, <selenium.webdriver.remote.webelement.WebElement (session="21e8f39f7fe13e630b71771509e08bd3", element="0.1860618389828277-3")>, <selenium.webdriver.remote.webelement.WebElement (session="21e8f39f7fe13e630b71771509e08bd3", element="0.1860618389828277-4")>]
Try 1
{'url': 'http://webcache.googleusercontent.com/search?q=cache:s5vL7lJ6aDMJ:tech.fortune.cnn.com/2016/12/22/robots-jobs-ai/+&cd=4&hl=en&ct=clnk&gl=ca'}
Loop Number 2
Link [<selenium.webdriver.remote.webelement.WebElement (session="21e8f39f7fe13e630b71771509e08bd3", element="0.1860618389828277-5")>]
Try 1
{'url': 'http://webcache.googleusercontent.com/sear

Unnamed: 0,keyword,link,author,headline,Topic,date,website,likes,shares,views,language
0,Deloitte AI,http://webcache.googleusercontent.com/search?q...,"By REUTERS December 22, 2016",Why You Shouldn’t Worry About Robots Stealing ...,,,CNN,-1.0,-1.0,-1.0,ENG
0,Deloitte AI,http://webcache.googleusercontent.com/search?q...,"By JAY SAMIT July 22, 2017",4 Ways Augmented Reality Could Change Corporat...,,,CNN,-1.0,-1.0,-1.0,ENG
0,Deloitte AI,https://webcache.googleusercontent.com/search?...,,,,,CNN,-1.0,-1.0,-1.0,ENG


In [22]:
driver = webdriver.Chrome("/Users/michalmalyska/Programming/chromedriver")

In [23]:
driver.get("http://webcache.googleusercontent.com/search?q=cache:s5vL7lJ6aDMJ:tech.fortune.cnn.com/2016/12/22/robots-jobs-ai/+&cd=4&hl=en&ct=clnk&gl=ca")

In [24]:
driver.find_element_by_class_name("metadata__byline__author").text

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"class name","selector":"metadata__byline__author"}
  (Session info: chrome=71.0.3578.98)
  (Driver info: chromedriver=2.44.609545 (c2f88692e98ce7233d2df7c724465ecacfe74df5),platform=Mac OS X 10.14.1 x86_64)


In [30]:
driver.find_element_by_class_name("heading-content").text

'Why You Shouldn’t Worry About Robots Stealing Your Jobs'