##### Import packages for web scraping:

In [15]:
# Import packages:
import requests
from bs4 import BeautifulSoup
import json
import unicodedata
import time
from datetime import datetime
import pandas as pd
import io
import copy
import gzip
import re

##### Read in political blog urls .csv file from dropbox and view some urls:

In [18]:
# Read in the political blogs from prioritized_blogs.csv:
url = "https://www.dropbox.com/s/i99fsdewsnfhoba/prioritized_blogs.csv?dl=1" # Didn't work until changed dl=0 to dl=1
s=requests.get(url).content
blog_urls = pd.read_csv(io.StringIO(s.decode('latin-1')))
blog_urls = blog_urls.drop('Unnamed: 0', 1)

CParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 7


In [None]:
blog_urls.head()

##### Create functions to find the archive, to find paragraphs which is used by the function to scrape an article, all of which are used by the main function which turns pages while until there are no more 2016 articles, and extracts links to 2016 articles, to scrape a blog. The last function is used to edit the found links of an article to exclude self links and to only include links to blogs in the blog_urls .csv:

In [4]:
# Create function to find top blogs from 2016:
def find_top_blogs(top_blogs_site_url):
    full_top_blog_urls = []
    req = requests.get(top_blogs_site_url)
    time.sleep(5.01)
    page = BeautifulSoup(req.text, "lxml")
    for card in page.find_all("div", class_ = "data"):
        full_top_blog_urls.append(card.find("p").a["href"])
    top_blog_urls = copy.deepcopy(full_top_blog_urls)
    after_slash = re.compile("/.*")
    for i in range(len(top_blog_urls)):
        top_blog_urls[i] = top_blog_urls[i].replace("http://","")
        top_blog_urls[i] = top_blog_urls[i].replace("https://", "")
        top_blog_urls[i] = top_blog_urls[i].replace("www.","")
        top_blog_urls[i] = after_slash.sub("",top_blog_urls[i])
    return full_top_blog_urls, top_blog_urls

In [5]:
# Create function to find archive url from blog url:
def find_prospect_archive(blog_url): 
    full_blog_url = "http://www." + blog_url
    req = requests.get(full_blog_url)
    time.sleep(5.01)
    page = BeautifulSoup(req.text, "lxml")
    archive_url = full_blog_url
    for link in page.find_all("a"):
        if link.text.lower() == "archive":
            archive_url += link["href"]
    return archive_url

In [6]:
# Create function to find field which contains paragraphs (by collecting all div 
# with class = fields-item even, and checking how many paragraphs each div has, 
# and then collecting all paragraphs from the div with the most paragraphs):
def find_paragraphs(page):
    field_items = page.find_all("div", class_="field-item even")
    num_fields = len(field_items)
    field_lengths = []
    max_field_length = 1
    longest_field_ind = 0
    for field_num in range(num_fields):
        field_length = len(field_items[field_num].find_all("p"))
        field_lengths.append(field_length)
        if field_length > max_field_length:
            longest_field_ind = field_num
    field = field_items[longest_field_ind]
    pars_iter = field.find_all("p")
    return pars_iter

In [7]:
# Create function to scrape date, domain, links, and text:
def scrape_prospect_article(link, blog_url):
    # Get html from article url provided by link:
    req = requests.get(link)
    time.sleep(5.01)
    page = BeautifulSoup(req.text, "lxml")
    # Find article date and format it in date-time format:
    try:
        article_date = page.find("div", class_="date-longform").text
    except AttributeError:
        article_date = page.find("p", class_="post-date").text
    article_date = datetime.strptime(article_date, "%B %d, %Y")
    # If year is anything aside from 2016, return an empty article:
    if article_date.year != 2016:
        article = ""
        return(article)
    # If year is 2016, extract article data:
    article = {"domain": "", "links": [], "text_body": "", "date": ""}
    # Format and extract date:
    article_date = article_date.strftime("%m/%d/%Y")
    article["date"] = article_date
    # Extract domain:
    article["domain"] = blog_url
    # Extract paragraphs:
    text_string = ""
    pars_iter = find_paragraphs(page)
    # Iterate through paragraphs:
    #count = 0
    for p in pars_iter:
        #count += 1
        #print(count)
        # Extract text by concatonating to empty string:
        text_string += "\n" + unicodedata.normalize("NFKD", p.text)
        for link in p.find_all("a"):
            # Extract links by appending them to empty list:
            article["links"].append(link["href"])
    article["text_body"] = text_string
    return article

In [8]:
# Create function to scrape articles from the prospect blog url:
def scrape_prospect(blog_url):
    articles = []
    # Find url of the articles archive:
    archive_url = find_prospect_archive(blog_url)
    more_pages = True 
    page_number = 1
    prev_page_had_2016 = False
    have_found_articles = False
    # Step through pages of the archive while there are more pages to step through AND the page number is less than 3:
    while (not(((have_found_articles == True) & (prev_page_had_2016 == False))|(more_pages == False))):
        print("page: " + str(page_number))
        page_has_2016 = False
        # Get the html from the page:
        req = requests.get(archive_url)
        time.sleep(5.01)
        page = BeautifulSoup(req.text, "lxml")
        # Loop through the article cards on the page:
        count = 0
        for article_card in page.find_all("h3"):
            count += 1 
            print("article number " + str(count))
            # Look for a hyperlink to the article:
            try:
                article_link_ext = article_card.a["href"]
                found_ext = True
            except TypeError:
                found_ext = False
            # If found, scrape the article at article_link:
            if found_ext == True:
                article_link = "http://www." + blog_url + article_link_ext
                #links.append(article_link)
                current_article = scrape_prospect_article(article_link, blog_url)
                # If current article has date == 2016, append it to the list of articles
                # and change page_has_2016 to True:
                if current_article != "":
                    print(" is from 2016")
                    have_found_articles = True
                    page_has_2016 = True
                    articles.append(current_article)
        # Set prev_page_had_2016 = page_has_2016 for checking condition to iterate through next page:            
        prev_page_had_2016 = page_has_2016
        # Look for a hyperlink to the next page of the archive:        
        try: 
            next_page_ext = page.find("li", class_="pager-next").a["href"]
            page_found = True
        except AttributeError:
            page_found = False
        # If found, change archive_url to the new page url, iterate page number, 
        # if conditions are met, proceed to next hyperlink scrape:   
        if page_found == True:
            archive_url = "http://www." + blog_url + next_page_ext
            page_number += 1
        else: 
            more_pages = False
    return articles

In [9]:
# Create function to convert links to within blog network links:
def convert_links(political_blogs, links, blog_url):
    internal_links = []
    # Loop through blogs in the blog urls csv:
    for political_blog in political_blogs:
        # And for each blog, loop through the links in the post and see if it's contained in any of them:
        for link in links:
            # If link is None, change it to "":
            if link is None:
                link = ""
            # If the blog is contained as a link and it is not a self link, append the blog url to the post's links:
            if ((political_blog in link) & (political_blog != blog_url)):
                internal_links.append(political_blog)
    internal_links = list(set(internal_links))
    return internal_links

##### Scrape urls of top political blogs from 2016 from blog.feedspot.com/political_blogs:

In [3]:
# Scrape top blog urls from site which lists top blogs:
top_blogs_site_url = "http://blog.feedspot.com/political_blogs/"
full_top_blog_urls, top_blog_urls = find_top_blogs(top_blogs_site_url)
top_blog_urls
full_top_blog_urls

['https://thinkprogress.org/',
 'http://www.dailykos.com/',
 'http://www.nytimes.com/pages/politics/index.html',
 'https://www.reddit.com/r/politics/',
 'http://www.politicususa.com/',
 'http://www.newsbusters.org/',
 'http://front.moveon.org/',
 'http://www.thegatewaypundit.com/',
 'http://michellemalkin.com/',
 'http://thepoliticalinsider.com/',
 'http://hotair.com/',
 'http://talkingpointsmemo.com/',
 'http://www.redstate.com/',
 'http://order-order.com/',
 'http://www.headlinepolitics.com/',
 'http://redalertpolitics.com/',
 'http://www.conservativehome.com/',
 'http://ipolitics.ca/',
 'http://www.politics.co.uk/',
 'http://www.realclearpolitics.com/',
 'http://www.politico.eu/',
 'https://www.jihadwatch.org/',
 'http://www.weeklystandard.com/',
 'http://crooksandliars.com/politics',
 'https://www.reddit.com/r/PoliticalDiscussion/',
 'http://www.latimes.com/politics/',
 'http://www.huffingtonpost.com/section/politics',
 'http://www.vox.com/policy-and-politics',
 'https://www.thegua

In [23]:
req = requests.get("http://redalertpolitics.com/")
page = BeautifulSoup(req.text, "lxml")
print(page)

<html>
<head><title>403 Forbidden</title></head>
<body bgcolor="white">
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>



In [17]:
for top_blog in full_top_blog_urls[0:100]: 
    req = requests.get(top_blog)
    page = BeautifulSoup(req.text, "lxml")
    num_divs = len(page.find_all("div"))
    if (num_divs > 0) == False:
        num_divs = len(page.find("body").find("div"))
    print(num_divs)

98
725
198
354
109
204
54
151
84
124
337
777
333
261
112


TypeError: object of type 'NoneType' has no len()

In [8]:
# Save 2016 blog urls to a .csv:
blogs_2016 = pd.DataFrame({"blog_url" : top_blog_urls,"full_blog_url" : full_top_blog_urls})
blogs_2016.to_csv("blogs_2016.csv")
blogs_2016.head()


Unnamed: 0,blog_url,full_blog_url
0,thinkprogress.org,https://thinkprogress.org/
1,dailykos.com,http://www.dailykos.com/
2,nytimes.com,http://www.nytimes.com/pages/politics/index.html
3,reddit.com,https://www.reddit.com/r/politics/
4,politicususa.com,http://www.politicususa.com/


##### Scrape blog posts from prospect.com:

In [None]:
# Specify blog url:
blog_url = "prospect.org"

# Scrape prospect.org blog:
articles = scrape_prospect(blog_url)

In [None]:
# Show articles:
#articles

In [None]:
# Loop through blog posts and convert links to internal links (excluding links 
# to urls which are not in blog_urls and removing self links):
articles_edited = copy.deepcopy(articles)
for article in articles_edited:
    article["links"] = convert_links(blog_urls.Blog, article["links"], blog_url)

In [None]:
# Show edited articles:
#articles_edited
#for art_ed in articles_edited:
#    print(art_ed["links"])

In [None]:
# Save blog posts to a .json.gz file:
out_path = "prospect_posts_2016.json.gz"    
with gzip.GzipFile(out_path, 'w') as fout:
    for i in range(len(articles_edited)):

        data = articles_edited[i]                    # 1. data

        json_str = json.dumps(data) + "\n"           # 2. string
        json_bytes = json_str.encode('utf-8')        # 3. bytes (i.e. UTF-8)

        fout.write(json_bytes)                       # 4. gzip

In [10]:
# Read in scraped_articles.json file provided by MaxPoint:
#with open('scraped_articles.json') as data_file:    
with open('scraped_articles_iso.json') as data_file:    
    scraped_articles_json = json.load(data_file)

In [13]:
# Loop through blog posts and convert links to internal links (excluding links 
# to urls which are not in blog_urls and removing self links):
scraped_articles_json_edited = copy.deepcopy(scraped_articles_json)
count=0
for scraped_article_json_edited in scraped_articles_json_edited[0:100]:
    scraped_article_json_edited["links"] = convert_links(blog_urls.Blog, scraped_article_json_edited["links"], scraped_article_json_edited["domain"])
    count += 1
    print(count)

NameError: name 'blog_urls' is not defined

In [None]:
#scraped_articles_json_edited = copy.deepcopy(scraped_articles_json)
#scraped_articles_json_edited[2]
#convert_links(blog_urls.Blog, scraped_articles_json_edited[1]["links"], scraped_articles_json_edited[1]["domain"])

In [None]:
# Show edited articles:
#articles_edited
for scrape_art_ed in scraped_articles_json_edited[0:100]:
    print(scrape_art_ed["links"])

In [None]:
# Convert .json file to .json.gz file:
def json_to_json_gz(path_to_json, output_path):
    
    # Read in path_to_json .json file:
    with open(path_to_json) as data_file:    
        file_json = json.load(data_file)

    # Write output_path .json.gz file, line by line, from the .json file: 
    with gzip.GzipFile(output_path, 'w') as fout:
        for i in range(len(file_json)):

            data = file_json[i]              # 1. data

            json_str = json.dumps(data) + "\n"           # 2. string
            json_bytes = json_str.encode('utf-8')        # 3. bytes (i.e. UTF-8)

            fout.write(json_bytes)                       # 4. gzip

In [None]:
# Convert scraped_articles.json file (provided by MaxPoint) to blogs_2016.json.gz using json_to_json_gz function:
scraped_articles_json = 'scraped_articles.json'
blogs_2016_json_gz = "blogs_2016.json.gz"
json_to_json_gz(scraped_articles_json, blogs_2016_json_gz)

In [None]:
# # Save blog posts to a .json file:
# out_path = "prospect_posts.json"    
# with open(out_path, "w") as of:
#     json.dump(articles, of)