# Scrape stock news from Google News

In [9]:
import requests
import random 
from collections import OrderedDict

# List of header sets, each containing a User-Agent
def list_header():
    """Return the pool of browser-like header sets.

    Every entry is a plain ``dict`` whose first key is ``User-Agent``,
    followed by the same ``Accept``, ``Accept-Language``, ``DNT``,
    ``Connection`` and ``Upgrade-Insecure-Requests`` values; the entries
    differ only in their ``User-Agent`` string.

    Returns:
        list[dict]: a freshly built list on every call.
    """
    user_agents = (
        # Firefox 24 on Linux
        'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0',
        # Safari 13.1.1 on macOS
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    )
    # Fields that are identical for every browser profile.
    shared_fields = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    # 'User-Agent' goes first so the key order matches the literal layout.
    return [{'User-Agent': agent, **shared_fields} for agent in user_agents]

def list_dict():
    """Return the header sets from ``list_header`` as ``OrderedDict`` objects.

    Each plain dict is copied key by key, in its existing iteration order,
    into a new ``OrderedDict``.

    Returns:
        list[OrderedDict]: one ordered copy per entry of ``list_header()``.
    """
    return [OrderedDict(plain.items()) for plain in list_header()]

def list_test(timeout=10):
    """Smoke-test the header pool against httpbin.

    Sends one GET request to ``https://httpbin.org/headers`` per entry in the
    header pool. A random entry is picked for each request, so a given entry
    may be used more than once or not at all. For every request the
    ``User-Agent`` that was sent and the JSON body returned by the service
    are printed.

    Args:
        timeout: seconds to wait for each request before giving up.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    headers_list = list_dict()
    url = 'https://httpbin.org/headers'
    for i in range(len(headers_list)):
        # Pick a random browser header set
        headers = random.choice(headers_list)
        # The context manager closes the session's connections when the
        # request is done, instead of leaving one open session per iteration.
        with requests.Session() as session:
            # Assign (rather than update) so the session sends exactly this
            # header set and nothing from the library defaults.
            session.headers = headers

            response = session.get(url, timeout=timeout)
            # Fail with a clear HTTP error rather than a JSON decoding error
            # when the service answers with an error page.
            response.raise_for_status()
            print("Request #%d\nUser-Agent Sent:%s\n\nHeaders Recevied by HTTPBin:"%(i,headers['User-Agent']))
            print(response.json())
        print("-------------------")

def random_header():
    """Return one header set chosen at random from ``list_dict()``."""
    return random.choice(list_dict())

In [10]:
# INGESTION THROUGH WEB SCRAPING USING BEAUTIFULSOUP
import requests
import pandas as pd
import bs4
from datetime import date


def _publisher_name(img_tag):
    """Return the lower-cased text node that directly follows a publisher logo.

    Returns ``''`` when there is no following sibling or the sibling is a tag
    rather than text, instead of raising.
    """
    sibling = img_tag.next_sibling
    # bs4 text nodes are str subclasses; tags and None are not.
    return sibling.lower() if isinstance(sibling, str) else ''


def _fetch_article_text(url_w, headers, timeout):
    """Download ``url_w`` and return the concatenated text of its ``<p>`` tags."""
    res = requests.get(url_w, headers=headers, timeout=timeout)
    parsed_article = bs4.BeautifulSoup(res.text, 'lxml')
    return "".join(p.text for p in parsed_article.find_all('p'))


def _scrape_ticker_news(raw_ticker, timeout):
    """Scrape the Google News result page for one ticker into a list of row dicts."""
    # cleaning ticker: keep only the part before the first '.'
    t = raw_ticker.split('.', 1)[0]

    # set header by random user agent
    headers = random_header()

    # The session is closed as soon as the search page has been parsed.
    with requests.Session() as session:
        session.headers = headers
        # Passing the query through `params` lets requests URL-encode it.
        res = session.get(
            'https://www.google.com/search',
            params={
                'q': '{} stock news'.format(t),
                'tbm': 'nws',
                'lr': 'lang_en',
                'hl': 'en',
                'sort': 'date',
                'num': 5,
            },
            timeout=timeout,
        )
        soup = bs4.BeautifulSoup(res.text, "html.parser")

    links = soup.select(".dbsr a")
    publishers = [_publisher_name(s) for s in soup.select(".XTjFC g-img")]

    rows = []
    for idx, link in enumerate(links):
        url_w = link.get("href") or ''
        article_text = ''
        if url_w:
            print("Google news URL to scrape = " + url_w)
            # NOTE(review): an earlier version also called find_date(url_w)
            # here; its result was never written to the CSV and a failure in
            # it discarded the article text, so the call is left out until a
            # created_at column is actually emitted.
            try:
                article_text = _fetch_article_text(url_w, headers, timeout)
            except Exception as e:
                # Best effort: keep the row with empty text, but say why.
                print("Could not fetch article text from {}: {!r}".format(url_w, e))
        rows.append({
            'ticker': t,
            'links': url_w,
            'article_text': article_text,
            # Publishers are matched to links by position on the page; a link
            # with no matching publisher node gets ''.
            'publisher': publishers[idx] if idx < len(publishers) else '',
        })
    return rows


def ingest_google_news(ticker_list=None, output_dir='./datasets', timeout=15):
    """Scrape Google News articles for a list of tickers and save them as CSV.

    For every ticker, the Google News search page for ``"<ticker> stock news"``
    is fetched with a randomly chosen header set, each result link is
    downloaded, and the text of its ``<p>`` tags is collected. All rows are
    written to ``<output_dir>/google_news_<ddmmYYYY>.csv`` with the columns
    ``ticker``, ``links``, ``article_text`` and ``publisher``.

    Rows are built one dict per result link, so the four columns always have
    the same length even when an article download fails or a publisher node
    is missing.

    Args:
        ticker_list: iterable of ticker symbols; anything after the first
            ``'.'`` in a symbol is dropped. Defaults to the built-in list of
            nine tickers.
        output_dir: directory for the CSV file; created if it does not exist.
        timeout: seconds to wait for each HTTP request.

    Returns:
        None. The result is the CSV file on disk.

    Raises:
        requests.RequestException: if a Google search request fails or times
            out. Failures while downloading an individual article are
            reported on stdout and leave that row's ``article_text`` empty.
    """
    import os  # local import: only needed to prepare the output directory

    if ticker_list is None:
        ticker_list = ['AAPL', 'GOOG', 'FB','NFLX', 'NVDA', 'ZM', 'ADBE', 'MSFT', 'TSLA']

    records = []
    for raw_ticker in ticker_list:
        records.extend(_scrape_ticker_news(raw_ticker, timeout))

    # Build the frame once from the collected rows; the explicit column list
    # fixes the column order and also covers the no-rows case.
    df = pd.DataFrame(records, columns=['ticker', 'links', 'article_text', 'publisher'])

    # import to csv
    d1 = date.today().strftime("%d%m%Y")
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, f'google_news_{d1}.csv'))


# Run the scrape for the built-in ticker list; the result is written to
# ./datasets/google_news_<ddmmYYYY>.csv and progress is printed per article URL.
ingest_google_news()

Google news URL to scrape = https://www.cnbc.com/2021/07/21/a-key-stat-hidden-in-verizons-report-could-spell-good-news-for-apple.html
Google news URL to scrape = https://www.fool.com/investing/2021/07/19/why-apple-stock-was-falling-monday/
Google news URL to scrape = https://www.fxstreet.com/news/apple-aapl-stock-forecast-why-is-apple-falling-will-results-boost-stock-the-stock-price-202107191034
Google news URL to scrape = https://www.fool.com/investing/2021/07/20/what-disney-and-apple-investors-should-look-for-du/
Google news URL to scrape = https://www.fool.com/investing/2021/07/21/why-netflix-stock-fell-nearly-5-on-wednesday/
Google news URL to scrape = https://www.fxstreet.com/news/netflix-nflx-stock-price-and-forecast-earnings-are-mixed-but-let-the-games-begin-202107211026
Google news URL to scrape = https://www.investors.com/news/technology/netflix-stock-netflix-edges-subscriber-target-but-earnings-miss/
Google news URL to scrape = https://www.forbes.com/sites/greatspeculations/2