In [1]:
#Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

#Display options: widen the notebook container to 95% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# the internal `IPython.core.display` path is deprecated for these names.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

#Looping through the most-recent archive pages to get the article urls to scrape

# range() excludes its end value, so pages 1..449 are fetched (449 pages in total).
FIRST_PAGE, END_PAGE = 1, 450
# Seconds to wait for a response; without a timeout a stalled connection hangs the cell.
ARCHIVE_TIMEOUT = 30

links_full = []  # one list of urls per archive page, in page order

with requests.Session() as session:  # reuse one connection for every page request
    for each_page in range(FIRST_PAGE, END_PAGE):
        base_url = 'https://www.wired.com/most-recent/page/' + str(each_page) + '/'

        # Fetch each page exactly once. The url is a plain string, so looping over
        # it would repeat the same download once per character.
        resp = session.get(base_url, timeout=ARCHIVE_TIMEOUT)
        link = []
        if not resp.ok:
            # Keep one (empty) entry per page so links_full stays aligned with pages.
            print('Skipping ' + base_url + ' (HTTP ' + str(resp.status_code) + ')')
            links_full.append(link)
            continue

        soup = BeautifulSoup(resp.content, 'html.parser')
        for item in soup.find_all('ul', {'class': 'archive-list-component__items'}):
            for li in item.find_all('li'):
                a = li.find('a')
                # Ignore list items without a usable anchor instead of failing on None.
                if a is None or not a.get('href'):
                    continue
                link.append('https://www.wired.com' + a['href'])
        links_full.append(link)

# Collapse the per-page lists into a single flat list of urls.
flattened_links = [each for sublist in links_full for each in sublist]
len(flattened_links)

4490

In [2]:
#Keep only the article urls, i.e. the links that contain '/story/'
story_urls = list(filter(lambda link: '/story/' in link, flattened_links))
story_urls[:5]

['https://www.wired.com/story/children-emoji-language-learning/',
 'https://www.wired.com/story/tor-anonymity-easier-than-ever/',
 'https://www.wired.com/story/best-wired-photo-stories-2018/',
 'https://www.wired.com/story/best-internet-moments-2018/',
 'https://www.wired.com/story/how-big-tech-co-opted-time-well-spent/']

In [4]:
# Parallel lists: index i of every list describes the same article.
title, descr, url,  news_kw, pub_date, pub_time, text = [], [], [], [], [], [], []
# Urls that could not be downloaded or that lacked one of the expected tags.
failed_urls = []


def _meta_content(soup, attrs):
    """Return the `content` attribute of the first <meta> tag matching attrs, or None."""
    tag = soup.find('meta', attrs)
    return tag.get('content') if tag is not None else None


def _time_text(soup, css_class):
    """Return the text of the first <time> tag with the given class, or None."""
    tag = soup.find('time', {'class': css_class})
    return tag.text if tag is not None else None


with requests.Session() as session:  # reuse one connection for all article requests
    for each in story_urls:
        try:
            # The timeout stops a single stalled request from hanging the whole run.
            resp = session.get(each, timeout=30)
            resp.raise_for_status()
        except requests.RequestException:
            failed_urls.append(each)
            continue

        soup = BeautifulSoup(resp.content, 'html.parser')

        # Extract everything first and append afterwards, so an article with a
        # missing tag can never leave the seven lists with different lengths.
        fields = (
            _meta_content(soup, {'property': 'og:title'}),
            _meta_content(soup, {'property': 'og:description'}),
            _meta_content(soup, {'property': 'og:url'}),
            _meta_content(soup, {'name': 'news_keywords'}),
            _time_text(soup, "date-mdy"),
            _time_text(soup, "date-gia"),
        )
        if any(field is None for field in fields):
            # One incomplete page should not abort the remaining requests.
            failed_urls.append(each)
            continue

        for column, field in zip((title, descr, url, news_kw, pub_date, pub_time), fields):
            column.append(field)

        # The article body is stored as the list of texts of all <p> tags on the page.
        text.append([paragraph.text for paragraph in soup.find_all('p')])

if failed_urls:
    print('Skipped ' + str(len(failed_urls)) + ' urls; see failed_urls')

In [5]:
# Number of entries collected in the `title` list.
len(title)

4220

In [7]:
#Putting it into a df
# Building from a dict names the columns directly and raises if the scraped
# lists have different lengths, instead of silently padding with None.
df = pd.DataFrame({
    'title': title,
    'descr': descr,
    'url': url,
    'news_kw': news_kw,
    'pub_date': pub_date,
    'pub_time': pub_time,
    'text': text,
})
df['pub_date'] = pd.to_datetime(df['pub_date'])
df['pub_time'] = pd.to_datetime(df['pub_time'])
# Two-digit hour string ('00'-'23') taken from the parsed publication time.
df['pub_hour'] = df['pub_time'].dt.strftime('%H')
# `columns=` is used because pandas 2.0 no longer accepts `axis` positionally.
df = df.drop(columns=['pub_time'])
df.head()

Unnamed: 0,title,descr,url,news_kw,pub_date,text,pub_hour
0,Children Are Using Emoji for Digital-Age Langu...,"When preliterate kids type strings of emoji, i...",https://www.wired.com/story/children-emoji-lan...,"ideas,Emoji,children,reading,language",2019-01-01,"[A couple of months ago, NPR reporter Lulu Mil...",8
1,Tor Is Easier Than Ever. Time to Give It a Try,Been curious about Tor but worried it's too co...,https://www.wired.com/story/tor-anonymity-easi...,"security,Tor,privacy,Internet,anonymity,Year i...",2019-01-01,[You probably know about the digital anonymity...,7
2,The Top WIRED Photo Stories of 2018,Want to give your eyes a break from the news c...,https://www.wired.com/story/best-wired-photo-s...,"photo,Photography",2018-12-31,[The Hellish E-Waste Graveyards Where Computer...,9
3,The 2018 Internet Moments That Made Being Onli...,'A Star Is Born' birthed great memes and Zenda...,https://www.wired.com/story/best-internet-mome...,"culture,Year in Review,memes,internet culture",2018-12-31,"[The internet, as recent Senate reports have s...",9
4,Big Tech Is Here to Help You Fight Excessive P...,"How the large tech corporations turned ""digita...",https://www.wired.com/story/how-big-tech-co-op...,"gear,Year in Review,digital wellness,Tristan H...",2018-12-31,"[In early February, the technologist Tristan H...",7


In [21]:
#Keep only the articles published in 2018. The 'most recent' pages that were scraped
#also contain articles from other years, so those rows are filtered out here.
year_start = pd.to_datetime('2018-01-01')
year_end = pd.to_datetime('2018-12-31')
in_2018 = df['pub_date'].between(year_start, year_end)  # both bounds inclusive
df_18 = df[in_2018]

# Show the earliest and latest publication dates that remain.
for bound in (df_18['pub_date'].min(), df_18['pub_date'].max()):
    print(bound)

2018-01-01 00:00:00
2018-12-31 00:00:00


In [23]:
# (rows, columns) of the 2018-only frame.
df_18.shape

(4102, 7)