In [1]:
import re 
import csv
from time import sleep
from bs4 import BeautifulSoup
import requests

In [2]:
template = 'https://news.search.yahoo.com/search?p=brexit'

In [3]:
url = template.format('brexit')

In [4]:
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

In [5]:
response = requests.get(url,headers=headers)

In [7]:
soup = BeautifulSoup(response.text,'lxml')

Get the collection

In [9]:
cards = soup.find_all('div','NewsArticle')

In [10]:
len(cards)

10

### Create a prototype model

In [11]:
card = cards[0]

In [15]:
headline = card.find('h4','s-title').text
headline

'‘Worse Than Brexit’: Scottish Independence Weighs on U.K. Assets'

In [16]:
source = card.find('span','s-source').text
source 

'Bloomberg via Yahoo Finance'

In [18]:
posted = card.find('span','s-time').text.replace('.','').strip()
posted

'· 3 hours ago'

In [20]:
description = card.find('p','s-desc').text.strip()
description
print(len(description))

99


In [21]:
raw_link = card.find('a').get('href')
raw_link

'https://r.search.yahoo.com/_ylt=AwrC1DGXVJFg6GEAzRvQtDMD;_ylu=Y29sbwNiZjEEcG9zAzEEdnRpZAMEc2VjA3Ny/RV=2/RE=1620165912/RO=10/RU=https%3a%2f%2ffinance.yahoo.com%2fnews%2fscottish-hangs-over-u-k-040000427.html/RK=2/RS=1EqQdPNBWiUlyXHtYKQ4HfaPuBI-'

In [22]:
unquoted_link = requests.utils.unquote(raw_link)
unquoted_link

'https://r.search.yahoo.com/_ylt=AwrC1DGXVJFg6GEAzRvQtDMD;_ylu=Y29sbwNiZjEEcG9zAzEEdnRpZAMEc2VjA3Ny/RV=2/RE=1620165912/RO=10/RU=https://finance.yahoo.com/news/scottish-hangs-over-u-k-040000427.html/RK=2/RS=1EqQdPNBWiUlyXHtYKQ4HfaPuBI-'

In [23]:
pattern = re.compile(r'RU=(.+)\/RK')
clear_link = re.search(pattern,unquoted_link).group(1)
clear_link

'https://finance.yahoo.com/news/scottish-hangs-over-u-k-040000427.html'

### Generalize the model


In [30]:
def get_article(card):
    headline = card.find('h4','s-title').text
    source = card.find('span','s-source').text
    posted = card.find('span','s-time').text.replace('.','').strip()
    description = card.find('p','s-desc').text.strip()
    raw_link = card.find('a').get('href')
    unquoted_link = requests.utils.unquote(raw_link)
    pattern = re.compile(r'RU=(.+)\/RK')
    clear_link = re.search(pattern,unquoted_link).group(1)
    
    article = (headline,source,posted,description,clear_link)
    return article 

    

In [31]:
articles = []
links = set()

for card in cards:
    article = get_article(card)
    link = article[-1]
    if not link in links:
        links.add(link)
        articles.append(article)

In [32]:
print(articles[0])

('‘Worse Than Brexit’: Scottish Independence Weighs on U.K. Assets', 'Bloomberg via Yahoo Finance', '· 3 hours ago', 'As Scots enter a May 6 vote pitched on whether there should be a second independence referendum,...', 'https://finance.yahoo.com/news/scottish-hangs-over-u-k-040000427.html')


### Get the next page 

In [33]:
url = soup.find('a','next').get('href')
url

'https://news.search.yahoo.com/search;_ylt=AwrC1DGXVJFg6GEA7BvQtDMD;_ylu=Y29sbwNiZjEEcG9zAzEEdnRpZAMEc2VjA3BhZ2luYXRpb24-?p=brexit&b=11&pz=10&bct=0&xargs=0'

In [34]:
print(len(articles))

10


### Bringing it all Together

In [35]:
import re 
import csv
from time import sleep
from bs4 import BeautifulSoup
import requests

headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

def get_article(card):
    headline = card.find('h4','s-title').text
    source = card.find('span','s-source').text
    posted = card.find('span','s-time').text.replace('.','').strip()
    description = card.find('p','s-desc').text.strip()
    raw_link = card.find('a').get('href')
    unquoted_link = requests.utils.unquote(raw_link)
    pattern = re.compile(r'RU=(.+)\/RK')
    clear_link = re.search(pattern,unquoted_link).group(1)
    
    article = (headline,source,posted,description,clear_link)
    return article 

def get_the_news(search):
    #Run the main program
    template = 'https://news.search.yahoo.com/search?p={}'
    url = template.format(search)
    articles = []
    links = set()
    
    while True:
        response = requests.get(url,headers=headers)
        soup = BeautifulSoup(response.text,'lxml')
        cards = soup.find_all('div','NewsArticle')
        
        #extract articles from page
        for card in cards:
            article = get_article(card)
            link = article[-1]
            if not link in links:
                links.add(link)
                articles.append(article)
                
        #Find the next page 
        try:
            url = soup.find('a','next').get('href')
            sleep(2)
        except AttributeError:
            break
    
    #Save article data
    with open('newsArticle.csv','w',newline='',encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Headline','Source','Posted','Description','Link'])
        writer.writerows(articles)
    
    return articles
        

In [36]:
articles = get_the_news('brexit')

In [37]:
print(len(articles))

714
