# Web Scraping: Daily Mail

In [1]:
import re
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Base URL of the site; also used to build the article links
url = "https://www.dailymail.co.uk"

# Fetch the cover page. The timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status() stops the run on an HTTP error instead
# of silently parsing an error page.
r1 = requests.get(url, timeout=30)
r1.raise_for_status()

# We'll save in coverpage the cover page content (raw bytes)
coverpage = r1.content

# Soup creation
soup1 = BeautifulSoup(coverpage, 'html5lib')

# News identification: every <h2> carrying the 'linkro-darkred' class
coverpage_news = soup1.find_all('h2', class_='linkro-darkred')

# Number of headlines found (shown as the cell output)
len(coverpage_news)

189

## Let's extract the text from the articles

In [4]:
# Number of headlines found on the cover page
number_of_articles = len(coverpage_news)

# Empty lists for content, links and titles; they are filled in lockstep so
# that position i in each list refers to the same article
news_contents = []
list_links = []
list_titles = []

for headline in coverpage_news:

    # A headline without a usable link is skipped in all three lists, which
    # keeps them aligned row by row
    anchor = headline.find('a')
    if anchor is None or not anchor.get('href'):
        continue

    # Getting the link of the article. urljoin handles both site-relative
    # and already-absolute hrefs, where plain concatenation would break the
    # latter.
    link = urljoin(url, anchor['href'])
    list_links.append(link)

    # Getting the title
    title = anchor.get_text()
    list_titles.append(title)

    # Reading the content (it is divided in paragraphs). The timeout stops a
    # single stalled article from blocking the whole scrape.
    article = requests.get(link, timeout=30)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    body = soup_article.find_all('p', class_='mol-para-with-font')

    # Unifying the paragraphs. The join runs once per article and yields an
    # empty string when no paragraph matched, so an article can never inherit
    # the text of the previous one.
    list_paragraphs = [paragraph.get_text() for paragraph in body]
    final_article = " ".join(list_paragraphs)

    # Removing special characters (non-breaking spaces)
    final_article = re.sub("\\xa0", "", final_article)

    news_contents.append(final_article)

Let's put them into:

1. a dataset which will be the input of the models (df_features)
2. a dataset with the title and the link (df_show_info)

In [5]:
# df_features
# df_features: a single column holding the text of each scraped article
features_columns = {'Article Content': news_contents}
df_features = pd.DataFrame(data=features_columns)

# df_show_info: title and link of each article, in that column order
show_info_columns = {
    'Article Title': list_titles,
    'Article Link': list_links,
}
df_show_info = pd.DataFrame(data=show_info_columns)

In [6]:
# Show the article-content frame (the bare last expression is the cell output)
df_features

Unnamed: 0,Article Content
0,The number of coronavirus infections in South ...
1,The United States death toll from the coronavi...
2,Furious demonstrators gathered Wednesday at Mi...
3,"In early April, the CDC recommended that all A..."
4,"A pair of scientists believe that up to 270,00..."
5,The Internal Revenue Service's online tool tha...
6,A crash federal government lending program mea...
7,A Michigan doctor has claimed he will prove wh...
8,Two Democratic congressmen have proposed expan...
9,The director of the Centers for Disease Contro...


In [7]:
# Show the title/link frame (the bare last expression is the cell output)
df_show_info

Unnamed: 0,Article Title,Article Link
0,South Dakota's Republican governor Kristi Noem...,https://www.dailymail.co.uk/news/article-82224...
1,"Death toll in the US nears 30,000 after 1,500 ...",https://www.dailymail.co.uk/news/article-82234...
2,Protesters flood the streets and block traffic...,https://www.dailymail.co.uk/news/article-82230...
3,What face mask should YOU choose? As several s...,https://www.dailymail.co.uk/femail/article-822...
4,California has 10 times more coronavirus cases...,https://www.dailymail.co.uk/health/article-822...
5,"Chaos over $1,200 stimulus checks: IRS trackin...",https://www.dailymail.co.uk/news/article-82222...
6,$350 billion paycheck protection program is 'o...,https://www.dailymail.co.uk/news/article-82231...
7,Michigan doctor claims he will prove how long ...,https://www.dailymail.co.uk/news/article-82219...
8,"Two Democratic congressman propose $2,000-a-mo...",https://www.dailymail.co.uk/news/article-82229...
9,Trump's CDC director contradicts him over WHO ...,https://www.dailymail.co.uk/news/article-82219...


In [8]:
# Write both frames to comma-separated files in the working directory,
# leaving out the row index.
for frame, csv_path in (
    (df_features, "df_features.csv"),
    (df_show_info, "df_show_info.csv"),
):
    frame.to_csv(csv_path, sep=',', index=False)
