In [8]:
# Run this notebook to build a dataset of Google News headlines.

from datetime import datetime
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs4

In [72]:
%%time
# Fetch Google News search result pages for one search term and collect the
# raw text of every "slp" div (later split into publication and time) and
# every "st" div (later used as the Headline column).
searchTerm = "something"
count = 0
# quote_plus percent-encodes the term so spaces, '&', '#', or a literal
# "$count$" inside it cannot corrupt the query string or the placeholder below.
url = ("https://www.google.com/search?q=" + quote_plus(searchTerm) + "&hl=en&gl=us&authuser=0&tbs=qdr:y&tbm=nws&start=$count$&cad=h")

source_time = []
headlines = []

# Five requests in total: start offsets 0, 10, 20, 30 and 40.
while count < 50:
    new_url = url.replace("$count$", str(count))
    # A timeout stops a stalled connection from hanging the kernel forever.
    r = requests.get(new_url, timeout=10)
    # Fail loudly on HTTP errors instead of parsing an error page into an
    # empty dataset.
    r.raise_for_status()
    count = count + 10
    soup = bs4(r.text, 'html.parser')

    for d in soup.find_all("div", "slp"):
        source_time.append(d.text)

    # NOTE(review): the sample output further down suggests "st" holds the
    # article snippet rather than the headline itself - confirm the selector.
    # NOTE(review): the two lists are filled independently, so a result that
    # lacks one of the two divs would shift every later row - verify lengths.
    for h in soup.find_all("div", "st"):
        headlines.append(h.text)

Wall time: 3.34 s


In [73]:
# Wrap the scraped lists in DataFrames. The source/time column is labelled 0
# explicitly so that the column exists even when the scrape returned nothing
# (an unlabelled empty frame has no column 0 and the next lookup would fail).
sourceTime_df = pd.DataFrame(source_time, columns=[0])
headline_df = pd.DataFrame(headlines, columns=["Headline"])

In [74]:
# Split the raw source/time text (column 0) into Publication and Time.
# Partitioning on the LAST ' - ' always yields exactly three parts per row, so
# a publication name that itself contains ' - ' stays intact, and a row with no
# separator gets an empty Publication with the whole text kept in Time.
source_parts = sourceTime_df[0].str.rpartition(' - ', expand=False)
sourceTime_df['Publication'] = source_parts.str[0]
sourceTime_df['Time'] = source_parts.str[2]

In [75]:
# df1 is an intermediate frame, built before the final clean dataset:
# publication/time rows joined to headline rows on their shared index.
df1 = sourceTime_df.merge(headline_df, left_index=True, right_index=True)

In [76]:
# Remove the raw source/time text column (label 0) now that it has been split.
# Dropping by label instead of by position keeps this cell safe to re-run: a
# second run is a no-op rather than silently dropping the next column.
df1 = df1.drop(columns=[0], errors="ignore")
df1.head()

Unnamed: 0,Publication,Time,Headline
0,The San Diego Union-Tribune,16 hours ago,I am not opposed to new therapeutic modalities...
1,The Daily Times,10 hours ago,“I'm gonna' do a column about how everybody's ...
2,New York Post,3 days ago,Antarctica is a weird place. Scientists have l...
3,Devon Live,23 hours ago,If you're looking for a bargain this Christmas...
4,New York Times,2 days ago,We usually try every time to eat something wei...


In [77]:
# Make df2 an independent copy of df1. A plain `df2 = df1` only creates a
# second name for the same object, so in-place edits to df2 would also change df1.
df2 = df1.copy()
df2.head()

Unnamed: 0,Publication,Time,Headline
0,The San Diego Union-Tribune,16 hours ago,I am not opposed to new therapeutic modalities...
1,The Daily Times,10 hours ago,“I'm gonna' do a column about how everybody's ...
2,New York Post,3 days ago,Antarctica is a weird place. Scientists have l...
3,Devon Live,23 hours ago,If you're looking for a bargain this Christmas...
4,New York Times,2 days ago,We usually try every time to eat something wei...


In [78]:
# Resolve the scraped Time text into concrete dates. A value is tried both as
# an absolute date (pandas to_datetime) and as a relative age such as
# "16 hours ago" (pandas to_timedelta, subtracted from the current datetime).
# Text that parses as neither becomes NaT instead of raising.

absolute_date = pd.to_datetime(df1.Time, errors="coerce")
relative_age = pd.to_timedelta(df1.Time.str.extract("(.*) ago", expand=False), errors="coerce")
# Prefer the relative interpretation; fall back to the absolute date row by row.
relative_date = (datetime.today() - relative_age).fillna(absolute_date)

## add columns for month and year to df2
date = relative_date.dt.date
# Rebuild from a frame without Month/Year so re-running this cell does not
# raise "already exists", and so the insert never touches an aliased df1.
df2 = df2.drop(columns=['Month', 'Year'], errors='ignore')
df2.insert(loc=0, column='Month', value=relative_date.dt.month)
df2.insert(loc=1, column='Year', value=relative_date.dt.year)

In [79]:
# Preview the first rows of df2.
df2.head(n=5)

Unnamed: 0,Month,Year,Publication,Time,Headline
0,11,2017,The San Diego Union-Tribune,16 hours ago,I am not opposed to new therapeutic modalities...
1,11,2017,The Daily Times,10 hours ago,“I'm gonna' do a column about how everybody's ...
2,11,2017,New York Post,3 days ago,Antarctica is a weird place. Scientists have l...
3,11,2017,Devon Live,23 hours ago,If you're looking for a bargain this Christmas...
4,11,2017,New York Times,2 days ago,We usually try every time to eat something wei...


In [80]:
# Final dataset: df2 without the Time text column.
final_df = df2.drop(columns=['Time'])
final_df.head(n=5)

Unnamed: 0,Month,Year,Publication,Headline
0,11,2017,The San Diego Union-Tribune,I am not opposed to new therapeutic modalities...
1,11,2017,The Daily Times,“I'm gonna' do a column about how everybody's ...
2,11,2017,New York Post,Antarctica is a weird place. Scientists have l...
3,11,2017,Devon Live,If you're looking for a bargain this Christmas...
4,11,2017,New York Times,We usually try every time to eat something wei...
