In [3]:
""" This scraper was used to scrape articles from the Mother Jones news site.
I used some of Rosie's code to create this scraper
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [23]:

def get_links(result_link):
    """Gets all article links on a given page on a news site"""
    links=[]
    titles=[]
    authors=[]
    r = requests.get(result_link)
    soup = BeautifulSoup(r.text, 'html.parser')
    links_results = soup.findAll('h3', {"class":"hed"})
    byline = soup.findAll('p',{'class':'byline'})
    for result in byline:
        authors.append(result.text)
        
    for result in links_results:
        titles.append(result.text.strip())
        links.append(result.contents[1]['href'].strip())

    return links,titles,authors

def get_text(article_link):
    """Takes in a link to an article and returns the formatted article as text"""
    date=''
    article=""
    r = requests.get(article_link)
    soup = BeautifulSoup(r.text, 'html.parser')
    date=soup.find('span', {'class':'dateline'}).text
    
    #get text
    article_body = soup.find('article', {"class":"entry-content"})
    image = article_body.find('div',{'class':'is-image'})
    if image:
        image.decompose()
    for p in article_body.findAll('p'):
        excerpt = p.text.replace(u'\xa0', u' ')
        article = article + "¶" + excerpt
        
    return article, date

In [24]:
print(get_links("https://www.motherjones.com/topics/climate-change/page/10"))
# links
print(get_text('https://www.motherjones.com/environment/2019/02/stop-freaking-out-about-the-future-of-climate-change-and-start-worrying-about-the-present/'))

(['https://www.motherjones.com/environment/2019/03/this-is-how-the-age-of-plastics-began/', 'https://www.motherjones.com/environment/2019/03/environmentalists-didnt-expect-this-would-happen-when-they-busted-up-dams/', 'https://www.motherjones.com/environment/2019/03/jay-inslee-the-2020-race-has-its-first-climate-candidate/', 'https://www.motherjones.com/environment/2019/03/what-do-you-want-to-know-about-the-green-new-deal/', 'https://www.motherjones.com/environment/2019/02/andrew-wheeler-was-just-confirmed-as-the-nations-15th-epa-administrator/', 'https://www.motherjones.com/environment/2019/02/stop-freaking-out-about-the-future-of-climate-change-and-start-worrying-about-the-present/', 'https://www.motherjones.com/politics/2019/02/thanks-to-this-advocacy-group-the-trump-administration-believes-a-little-radiation-is-good-for-you/', 'https://www.motherjones.com/politics/2019/02/hundreds-of-young-protesters-confront-mcconnell-over-green-new-deal/', 'https://www.motherjones.com/environment

In [25]:
#Going through each page of the Mother Jones climate change tag to gather articles
mj_articles=[]
mj_titles=[]
mj_authors=[]
mj_dates=[]
MAX_PAGES = 35

for i in range(1, MAX_PAGES):
    print("Page: {}".format(i)) #Printing current page being scrapped for articles
    links, titles,authors = get_links("https://www.motherjones.com/topics/climate-change/page/{}/".format(i))
    mj_titles.extend(titles)
    mj_authors.extend(authors)
    for link in links:
        article,date=get_text(link)
        mj_articles.append(article)
        mj_dates.append(date)

Page: 1
Page: 2
Page: 3
Page: 4
Page: 5
Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
Page: 16
Page: 17
Page: 18
Page: 19
Page: 20
Page: 21
Page: 22
Page: 23
Page: 24
Page: 25
Page: 26
Page: 27
Page: 28
Page: 29
Page: 30
Page: 31
Page: 32
Page: 33
Page: 34


In [26]:
print(len(mj_articles))
print(len(mj_titles))
print(len(mj_dates))
print(len(mj_authors))

680
680
680
680


In [27]:
df = pd.DataFrame(mj_articles, columns=['content'])
df['title']=mj_titles
df['author']=mj_authors
df['date']=mj_dates
df['denial?'] = [0]*len(df.index)
cols=['title','author','date','content','denial?']
df=df[cols]
df = df.drop_duplicates()
df.to_csv('../../data/left/data_mj_articles.csv', index=False)

In [28]:
df

Unnamed: 0,title,author,date,content,denial?
0,"30,000 Blue Tarps, 2.4 Million Downed Trees, B...",Justin Agrelo,21 hours ago,¶Tropical Storm Dorian skirted Puerto Rico’s w...,0
1,A 23-Year-Old Sunrise Movement Founder Says th...,\nNicole Javorsky,"August 28, 2019","¶On Saturday, the Democratic National Committe...",0
2,Look No Further Than Brazil’s Amazon Fire for ...,"\nAlexander C. Kaufman, Travis Waldron, and Ch...","August 24, 2019",¶This story was originally published by HuffPo...,0
3,Democratic National Committee Rejects Call for...,\nNicole Javorsky,"August 22, 2019",¶The Democratic National Committee on Thursday...,0
4,Jay Inslee Wants to Pay Farmers to Pull Carbon...,\nTom Philpott,"August 21, 2019",¶True to his fixation on battling climate chan...,0
5,A Radical Idea for Reclaiming Our Toxic Reality,\nJordan Gass-Poore,"August 21, 2019",¶Are you having a hard time wrapping your head...,0
6,Anti-Immigration White Supremacy Has Deep Root...,\nSusie Cagle,"August 19, 2019",¶This story was originally published by the Gu...,0
7,Superstorm Sandy Was One of the Worst and Best...,\nLaura Bliss,"August 18, 2019",¶This story was originally published by CityLa...,0
8,“People Should Be Terrified”: One Teen’s Hunge...,\nEmily Holden,"August 12, 2019",¶This story was originally published by the Gu...,0
9,The Shift Toward Renewable Energy Is Coming. W...,\nCarl Segerstrom,"August 10, 2019",¶This story was originally published by High C...,0
