In [1]:
import numpy as np
from IPython.core.debugger import set_trace
import re
import difflib
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import time
import random
import pickle
from lxml.html import fromstring
from requests.packages.urllib3.util import Retry
from requests.adapters import HTTPAdapter
from requests import Session, exceptions

# Get URL Links for writers from muckrack.com
The goal of this notebook is to scrape muckrack.com for the URLs of the articles written by each NBA writer. I am using muckrack.com as a way to get all relevant content for each NBA writer regardless of the platform hosting the article. Muckrack.com is a website for journalists, and each writer has a page with links to, and brief metadata about, that writer's most recent articles. If the writer has a login to this website they can customize their username; otherwise the webpage for that writer is first_name-last_name. I manually searched for every writer to be included in my database to make sure I had the right link to get the right articles.

In [3]:
# if already have muckrack data
# Reloads the writer table from the same path the username fix-up cell pickles it to,
# so the CSV-reading and username fix-up cells can be skipped.
# NOTE(review): pickle.load can execute arbitrary code - only load pickles you created yourself.
with open('/Users/rohanramesh/Documents/Insight/data_bball_writers/writer_df.pickle', 'rb') as handle:
    writer_df = pickle.load(handle)

In [None]:
# source of names of NBA writers - from NBA writer poll on twitter
# The 'Idea Text' column of this CSV is what the next cell reads as each writer's name.
# Running this cell replaces any writer_df that was loaded from the pickle above.
writer_filepath = '/Users/rohanramesh/Documents/Insight/data_bball_writers/writers.csv'
writer_df = pd.read_csv(writer_filepath)

In [None]:
# Build the muckrack page name ("website_name") for every writer and save the table.
# By default a writer's page is first_name-last_name; the writers below have
# personalized usernames that I manually had to find and update.

# Overrides keyed on the default page name (lower-cased, hyphenated, dots removed).
SLUG_OVERRIDES = {
    'bill-simmons': 'billsimmons',  # bc multiple bill simmons
    'rob-mahoney': 'robmahoney',
    'scott-howard-cooper': 'showardcooper',
}

# Overrides keyed on the exact name as it appears in the 'Idea Text' column.
NAME_OVERRIDES = {
    'Zach Lowe': 'zachlowe_nba',
    'Sam Amick': 'sam_amick',
    'Jonathan Abrams': 'jpdabrams',
    'Marc Spears': 'marc-j-spears',
    "Kevin O'Connor": 'kevin-p-oconnor',
    'Marcus Thompson': 'thompsonscribe',
    'Katie Nolan': 'meet-katie-nolan',
    'Eric Pincus': 'ericpincus',
    'Tim Kawakami': 'timkawakami',
    'Josh Robbins': 'joshrobbins',
    "Ian O'Connor": 'ian-oconnor-1027184',
    'Chris Broussard': 'chris_broussard',
    'Howard Beck': 'howardbeck',
    'Jeff Zillgitt': 'jeffzillgitt',
    'Jon Krawczynski': 'apkrawczynski',
    'Bob Ryan': 'globebobryan',
    'Ric Bucher': 'ricbucher',
    'Scott Cacciola': 'scottcacciola',
    'David Morrow': 'david-a-morrow',
    'Peter Vecsey': 'petervecsey1',
    'Jimmy Spencer': 'jimmy-spencer-1',
    'Seb Dumitru': 'sebdumitru',
}


def muckrack_page_name(writer_name):
    """
    convert a writer's display name into the name used in their muckrack url
    :param: writer_name: name exactly as it appears in the 'Idea Text' column
    :return: manual override if one exists, otherwise the lower-cased name with
             spaces turned into hyphens, dots removed and anything from the
             first '/' onwards dropped
    """
    slug = writer_name.lower().replace(' ', '-').replace('.', '')
    slug = slug.split('/', 1)[0]
    # exact-name overrides win over slug overrides, matching the order of the
    # original chain of if statements
    if writer_name in NAME_OVERRIDES:
        return NAME_OVERRIDES[writer_name]
    return SLUG_OVERRIDES.get(slug, slug)


# make new column that has edited name for search and save this df
website_name = [muckrack_page_name(name) for name in writer_df['Idea Text']]
writer_df['website_name'] = website_name
# save writer df
with open('/Users/rohanramesh/Documents/Insight/data_bball_writers/writer_df.pickle', 'wb') as handle:
    pickle.dump(writer_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Scraping muckrack
In the upcoming cell I scrape up to the top 20 pages' worth of articles for each writer and store the URLs and metadata in a variable called scrapevar, which I can save and use later to scrape the actual article content. I attempted this two ways:

1. First I just randomize a wait time but use my IP address 
2. I grab other IP addresses and headers to avoid rate limiting issues

While I got kicked off initially, I found that by further randomizing my wait time I could use option one, but I have included both sets of code below.

In [4]:
# Scrape article metadata for every writer from muckrack using this machine's own IP.
# preallocate the scrapevar dict: scrapevar[category][writer page name] -> list of values
cats_collect = {'blurbs', 'links', 'titles', 'sources', 'pubdates'}
scrapevar = {category: {} for category in cats_collect}

# this is the number of pages to scrape for each writer (25 hits per page)
n_pages_to_scrape = 20

# iterate through all writer names
for curr_name in writer_df['website_name']:
    print(curr_name)
    # this is the formatting for the muckrack page and name
    page_address = 'https://muckrack.com/%s/articles?page=' % curr_name
    all_blurbs = []
    all_links = []
    all_titles = []
    all_sources = []
    all_pubdates = []
    for npage in range(1, n_pages_to_scrape + 1):
        # use requests to try and grab url; if it fails move on to next writer
        try:
            page = requests.get(page_address + str(npage), timeout=10)
        except requests.exceptions.RequestException as err:
            # say why the writer was cut short instead of silently dropping pages
            print('request failed for %s page %d: %s' % (curr_name, npage, err))
            break
        # randomized wait between requests to avoid being rate limited
        time.sleep(random.uniform(0.5, 2))
        # using beautifulsoup to parse the text for muckrack; the parser is named
        # explicitly so the result does not depend on which parsers are installed
        curr_soup = BeautifulSoup(page.text, 'lxml')
        # now digging into metadata to get urls
        allstories = curr_soup.find_all("div", {"class": "news-story-meta"})
        if not allstories:  # jump to next writer if fails
            break
        for curr_story in allstories:
            b = curr_story.find("a", {"target": "_blank"})
            timelabel = curr_story.find("a", {"class": "timeago"})
            if b is None or timelabel is None:
                # skip malformed entries as a whole so the five lists stay aligned
                continue
            # save and append info
            all_blurbs.append(b.attrs['data-description'])
            all_links.append(b.attrs['data-link'])
            all_titles.append(b.attrs['data-title'])
            all_sources.append(b.attrs['data-source'])
            all_pubdates.append(timelabel.attrs['title'])
        # only keep on searching the next page if there is an endless container
        if curr_soup.find("div", {"class": "endless_container"}) is None:
            break
    # put into scrapevar
    scrapevar['blurbs'][curr_name] = all_blurbs
    scrapevar['links'][curr_name] = all_links
    scrapevar['titles'][curr_name] = all_titles
    print(len(scrapevar['titles'][curr_name]))
    scrapevar['sources'][curr_name] = all_sources
    scrapevar['pubdates'][curr_name] = all_pubdates


zachlowe_nba
25
adrian-wojnarowski
25
lee-jenkins
25
howardbeck
25
marc-stein
25
ethan-sherwood-strauss
25
jason-concepcion
25
kevin-arnovitz
25
tom-haberstroh
25
nate-duncan
24
zach-harper
25
brian-windhorst
25
sam_amick
25
david-aldridge
25
ramona-shelburne
25
jpdabrams
25
kevin-pelton
25
marc-j-spears
25
matt-moore
25
shams-charania
25
kevin-p-oconnor
25
chris-haynes
25
dave-mcmenamin
25
shea-serrano
25
ian-begley
25
rachel-nichols
25
ben-golliver
25
mike-prada
25
robmahoney
25
tim-macmahon
25
chris-herring
25
billsimmons
25
jonathan-tjarks
25
amin-elhassen
0
bobby-marks
25
danny-leroux
25
chris-mannix
25
dan-devine
25
michael-pina
25
thompsonscribe
25
ben-golliver
25
ben-falk
25
ian-levy
25
tim-bontemps
25
henry-abbott
25
scott-rafferty
25
ja-dubin
0
michael-lee
25
alex-kennedy
25
derek-bodner
25
tom-ziller
25
chris-ballard
25
josh-eberley
25
adi-joseph
25
adam-mares
25
sam-vecenie
25
meet-katie-nolan
1
andrew-sharp
25
david-thorpe
25
royce-webb
1
ericpincus
25
nick-sciria
19
paul-

In [3]:
# functions written to get around rate limits using proxies and different user agents
# get_proxies():
def get_proxies(nprox):
    """
    grab from free-proxy-list.net to avoid rate limits
    :param: nprox: number of proxies you want out
    :return: list of proxies to use, each formatted as "<first column>:<second column>"
             of a table row (assumed to be ip address and port - TODO confirm against
             the live page). At most nprox entries; fewer if the page lists fewer rows.
    :raises: requests.exceptions.RequestException if the page cannot be downloaded
    :raises: ValueError if no proxy rows could be parsed from the page
    """
    url = 'https://free-proxy-list.net/'
    # timeout so a stalled connection cannot hang the notebook
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    curr_soup = BeautifulSoup(response.text, 'lxml')
    candidates = []
    # same candidate pool as before: table rows 1..14 (row 0 is skipped, presumably
    # the header row)
    for row in curr_soup.find_all("tr")[1:15]:
        cells = row.find_all("td")
        # read the cells by position instead of walking .next.next.next, which
        # breaks as soon as the markup contains whitespace between tags
        if len(cells) >= 2:
            candidates.append(cells[0].text + ":" + cells[1].text)
    if not candidates:
        raise ValueError('no proxies could be parsed from %s' % url)
    # never ask random.sample for more items than are available
    return random.sample(candidates, min(nprox, len(candidates)))

def get_user_agents(nagents):
    """
    grab n user agents to avoid rate limits
    :param: nagents: number of agents you want out
    :return: list of user agent strings to use; at most nagents entries, fewer if
             the page lists fewer user agents
    :raises: requests.exceptions.RequestException if the page cannot be downloaded
    :raises: ValueError if no user agents could be parsed from the page
    """
    useragent_add = 'https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/'
    # timeout so a stalled connection cannot hang the notebook
    page = requests.get(useragent_add, timeout=10)
    page.raise_for_status()
    curr_soup = BeautifulSoup(page.text, 'lxml')
    candidates = []
    # same candidate pool as before: the first 25 "useragent" cells on the page
    for cell in curr_soup.find_all("td", {"class": "useragent"})[:25]:
        link = cell.find("a")
        # the user agent string is the text of the link inside the cell
        if link is not None:
            candidates.append(link.text)
    if not candidates:
        raise ValueError('no user agents could be parsed from %s' % useragent_add)
    # never ask random.sample for more items than are available
    return random.sample(candidates, min(nagents, len(candidates)))

def build_proxy_agent(proxy_list,user_agent_list):
    """
    combine proxies and agents to build new header and userdict to scrape with
    :param: proxy_list: list of "host:port" strings, e.g. from get_proxies
    :param: user_agent_list: list of user agent strings, e.g. from get_user_agents
    :return: proxyDict: dict with "http" and "https" keys, formatted for the
             proxies argument of requests
    :return: headers: dict with a randomly chosen 'User-Agent'
    :raises: IndexError if either list is empty (from random.choice)
    """
    proxy = random.choice(proxy_list)
    # The dict keys select which *target* scheme the proxy is used for; the url
    # scheme says how to talk to the proxy itself. An "https://" proxy url makes
    # requests open a TLS connection to the proxy, which fails for a plain HTTP
    # proxy, so both entries use "http://" (HTTPS targets are then tunnelled
    # through the proxy).
    proxy_url = "http://" + proxy
    proxyDict = {
                  "http"  : proxy_url,
                  "https" : proxy_url
                }
    user_agent = random.choice(user_agent_list)
    #Set the headers
    headers = {'User-Agent': user_agent}
    return proxyDict, headers

def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """
    build (or extend) a requests session that automatically retries failed requests
    :param: retries: used as the total, read and connect retry limits
    :param: backoff_factor: backoff factor handed to Retry
    :param: status_forcelist: http status codes handed to Retry
    :param: session: existing session to configure; a new one is created if not given
    :return: session: the session for scraping, with the retrying adapter mounted
             for both http:// and https://
    """
    active_session = session if session else requests.Session()
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
    )
    # one adapter instance serves both url schemes
    for url_prefix in ('http://', 'https://'):
        active_session.mount(url_prefix, retrying_adapter)
    return active_session


In [None]:
# same process but using proxies and user agents
proxy_list = get_proxies(12)
user_agent_list = get_user_agents(12)

# preallocate the scrapevar dict: scrapevar[category][writer page name] -> list of values
cats_collect = {'blurbs', 'links', 'titles', 'sources', 'pubdates'}
scrapevar = {category: {} for category in cats_collect}

# this is the number of pages to scrape for each writer (25 hits per page)
n_pages_to_scrape = 20

# iterate through all writer names
for curr_name in writer_df['website_name']:
    print(curr_name)
    # this is the formatting for the muckrack page and name
    page_address = 'https://muckrack.com/%s/articles?page=' % curr_name
    all_blurbs = []
    all_links = []
    all_titles = []
    all_sources = []
    all_pubdates = []
    for npage in range(1, n_pages_to_scrape + 1):
        # pick a fresh proxy / user agent pair for every request. Previously
        # proxyDict and headers were never defined, and the resulting NameError was
        # swallowed by the broad except, so every writer ended up with 0 articles.
        proxyDict, headers = build_proxy_agent(proxy_list, user_agent_list)
        # use requests to try and grab url; if it fails move on to next writer
        try:
            # a new session per request (as before), closed once the page is fetched
            with requests_retry_session(retries=10) as session:
                page = session.get(
                    page_address + str(npage), headers=headers, proxies=proxyDict, timeout=10)
        except requests.exceptions.RequestException as err:
            # only network failures are treated as "give up on this writer";
            # programming errors are no longer hidden
            print('request failed for %s page %d: %s' % (curr_name, npage, err))
            break
        # randomized wait between requests to avoid being rate limited
        time.sleep(random.uniform(0.5, 2))
        # using beautifulsoup to parse the text for muckrack; the parser is named
        # explicitly so the result does not depend on which parsers are installed
        curr_soup = BeautifulSoup(page.text, 'lxml')
        # now digging into metadata to get urls
        allstories = curr_soup.find_all("div", {"class": "news-story-meta"})
        if not allstories:  # jump to next writer if fails
            break
        for curr_story in allstories:
            b = curr_story.find("a", {"target": "_blank"})
            timelabel = curr_story.find("a", {"class": "timeago"})
            if b is None or timelabel is None:
                # skip malformed entries as a whole so the five lists stay aligned
                continue
            # save and append info
            all_blurbs.append(b.attrs['data-description'])
            all_links.append(b.attrs['data-link'])
            all_titles.append(b.attrs['data-title'])
            all_sources.append(b.attrs['data-source'])
            all_pubdates.append(timelabel.attrs['title'])
        # only keep on searching the next page if there is an endless container
        if curr_soup.find("div", {"class": "endless_container"}) is None:
            break
    # put into scrapevar
    scrapevar['blurbs'][curr_name] = all_blurbs
    scrapevar['links'][curr_name] = all_links
    scrapevar['titles'][curr_name] = all_titles
    print(len(scrapevar['titles'][curr_name]))
    scrapevar['sources'][curr_name] = all_sources
    scrapevar['pubdates'][curr_name] = all_pubdates


In [7]:
# save scrapevar for later use
# scrapevar maps each category ('blurbs', 'links', 'titles', 'sources', 'pubdates')
# to a {writer page name: list of values} dict filled in by the scraping cells.
with open('/Users/rohanramesh/Documents/Insight/data_bball_writers/Attempt4_mr_scrape.pickle', 'wb') as handle:
    pickle.dump(scrapevar, handle, protocol=pickle.HIGHEST_PROTOCOL)