In [1]:
import pandas as pd
import urllib3
import requests
from bs4 import BeautifulSoup
from io import BytesIO
import gzip

In [2]:
# Index of archived pages to fetch; the cells below read the `offset`,
# `length`, `filename`, `status` and `url` columns of each row.
CC_INDEX_CSV = "input/raw/cc_mlb.csv"

df = pd.read_csv(CC_INDEX_CSV)
df.head()

Unnamed: 0.1,Unnamed: 0,charset,digest,filename,languages,length,mime,mime-detected,offset,status,timestamp,url,urlkey
0,0,UTF-8,LQMEKH2VUTYILOQRFAIPGNA25VS4AX64,crawl-data/CC-MAIN-2019-13/segments/1552912203...,eng,34907,text/html,text/html,279018202,200,20190325053439,http://www.espn.com/mlb/story/_/id/10015217/ml...,"com,espn)/mlb/story/_/id/10015217/mlb-players-..."
1,1,UTF-8,BZMV7IMCPPK7WMRASVZADI3JPPFW3QYG,crawl-data/CC-MAIN-2019-13/segments/1552912201...,eng,33236,text/html,text/html,280546961,200,20190319033258,http://www.espn.com/mlb/story/_/id/10019043/co...,"com,espn)/mlb/story/_/id/10019043/colorado-roc..."
2,2,UTF-8,4MR466DR32C5T6B3U7D2RDZMAKZYKTEZ,crawl-data/CC-MAIN-2019-13/segments/1552912202...,eng,38336,text/html,text/html,295127884,200,20190323005221,http://www.espn.com/mlb/story/_/id/10042646/ja...,"com,espn)/mlb/story/_/id/10042646/javier-lopez..."
3,3,UTF-8,7Z5TT4LMUUFGOJYXRU4FM3NKAMLNMBCJ,crawl-data/CC-MAIN-2019-13/segments/1552912203...,eng,38751,text/html,text/html,283761478,200,20190323211500,http://www.espn.com/mlb/story/_/id/10074360/ta...,"com,espn)/mlb/story/_/id/10074360/tampa-bay-ra..."
4,4,UTF-8,RHSK6WX26QBULNJTEML6V4362US6664T,crawl-data/CC-MAIN-2019-13/segments/1552912202...,eng,30356,text/html,text/html,281365425,200,20190320054445,http://www.espn.com/mlb/story/_/id/10083830/se...,"com,espn)/mlb/story/_/id/10083830/seattle-mari..."


In [3]:
def download_page(record, timeout=30):
    """Fetch one archived page and return the body of its HTTP response.

    The record's byte range is requested from the archive file over HTTPS,
    gunzipped, and split into its three blank-line-separated parts
    (WARC headers, HTTP headers, payload). Only the payload is returned.

    Args:
        record: Mapping (e.g. a DataFrame row) providing 'filename',
            'offset' and 'length'; offset and length must be int-convertible.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the caller indefinitely.

    Returns:
        The payload as ``bytes`` on success. On any failure (network/HTTP
        error, invalid or truncated gzip data, empty record, or a record that
        does not split into three parts) the error is printed and the empty
        string ``""`` is returned, so callers can test ``len(result)``.
    """
    offset, length = int(record['offset']), int(record['length'])
    # HTTP byte ranges are inclusive on both ends.
    offset_end = offset + length - 1

    # We'll get the file via HTTPS so we don't need to worry about S3 credentials
    # Getting the file on S3 is equivalent however - you can request a Range
    # NOTE(review): Common Crawl now appears to document data.commoncrawl.org
    # as its HTTP endpoint -- confirm this bucket URL is still served.
    prefix = 'https://commoncrawl.s3.amazonaws.com/'

    try:
        # We can then use the Range header to ask for just this set of bytes
        resp = requests.get(
            prefix + record['filename'],
            headers={'Range': 'bytes={}-{}'.format(offset, offset_end)},
            timeout=timeout)
        # Fail here on 4xx/5xx instead of feeding an error document to gzip.
        resp.raise_for_status()

        # The page is stored compressed (gzip) to save space.
        # Decompression happens in read(), not in the GzipFile constructor,
        # so the read must sit inside this try block for the handler below
        # to see bad or truncated data.
        with gzip.GzipFile(fileobj=BytesIO(resp.content)) as f:
            data = f.read()
    except (OSError, EOFError) as e:
        # OSError covers requests' exceptions and gzip.BadGzipFile;
        # EOFError is raised by gzip for a truncated stream.
        print(e)
        return ""

    if not data:
        return ""

    try:
        # What we have now is just the WARC response, formatted as
        # WARC headers, HTTP headers and body, each separated by a blank line.
        # maxsplit=2 keeps blank lines inside the body intact.
        warc, header, response = data.strip().split(b'\r\n\r\n', 2)
    except ValueError as e:
        # Fewer than three sections: not a complete response record.
        print(e)
        return ""

    return response

In [4]:
def get_text_from_html(html_content):
    """Extract the article paragraphs from a page as plain text.

    The markup is parsed with BeautifulSoup's "html.parser" backend and every
    ``<p>`` that is a direct child of ``div.article-body`` (under the first
    ``article`` of a ``section``) is collected in document order.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The text of each matching paragraph followed by a newline, all
        concatenated; an empty string when nothing matches.
    """
    paragraph_selector = "section > article:first-child > div.container > div.article-body > p"
    paragraphs = BeautifulSoup(
        html_content, features="html.parser").select(paragraph_selector)
    return "".join(paragraph.get_text() + "\n" for paragraph in paragraphs)

In [5]:
# Build one text corpus from every index row archived with HTTP status 200:
# download the page, log its size, and append the extracted article text.
text = ""
rows_ok = df[df['status'] == 200]

for idx, row in rows_ok.iterrows():
    html_content = download_page(row)
    print("[*] Retrieved %d bytes for %s" % (len(html_content), row['url']))

    # Nothing to parse when the download or record split failed.
    if len(html_content) == 0:
        continue

    text += get_text_from_html(html_content)

[*] Retrieved 163003 bytes for http://www.espn.com/mlb/story/_/id/10015217/mlb-players-union-head-michael-weiner-dies-51
[*] Retrieved 164616 bytes for http://www.espn.com/mlb/story/_/id/10019043/colorado-rockies-latroy-hawkins-finalize-deal
[*] Retrieved 208845 bytes for http://www.espn.com/mlb/story/_/id/10042646/javier-lopez-re-signs-san-francisco-giants-three-years-13-million
[*] Retrieved 201985 bytes for http://www.espn.com/mlb/story/_/id/10074360/tampa-bay-rays-acquire-ryan-hanigan-heath-bell-three-team-trade
[*] Retrieved 146785 bytes for http://www.espn.com/mlb/story/_/id/10083830/seattle-mariners-sign-willie-bloomquist-2-year-deal
[*] Retrieved 238636 bytes for http://www.espn.com/mlb/story/_/id/10084018/masahiro-tanaka-not-made-available-mlb-teams-rakuten-golden-eagles
[*] Retrieved 208531 bytes for http://www.espn.com/mlb/story/_/id/10107900/curt-schilling-replacing-orel-hershiser-espn-sunday-night-baseball-booth
[*] Retrieved 173374 bytes for http://www.espn.com/mlb/story/

[*] Retrieved 164844 bytes for http://www.espn.com/mlb/story/_/id/11038984/minnesota-twins-selection-nick-gordon-emotional
[*] Retrieved 160694 bytes for http://www.espn.com/mlb/story/_/id/11044704/johan-santana-baltimore-orioles-tears-achilles-season
[*] Retrieved 172700 bytes for http://www.espn.com/mlb/story/_/id/11049084/philadelphia-phillies-place-mike-adams-dl-call-kenny-giles
[*] Retrieved 226427 bytes for http://www.espn.com/mlb/story/_/id/11051832/mitch-moreland-texas-rangers-ankle-surgery-miss-three-months
[*] Retrieved 159616 bytes for http://www.espn.com/mlb/story/_/id/11056097/tampa-bay-rays-replace-closer-grant-balfour-committee
[*] Retrieved 233116 bytes for http://www.espn.com/mlb/story/_/id/11091626/hall-famer-tony-gwynn-san-diego-padres-died?utm_source=hootsuite&utm_campaign=hootsuite
[*] Retrieved 190329 bytes for http://www.espn.com/mlb/story/_/id/11120049/san-diego-padres-fire-general-manager-josh-byrnes
[*] Retrieved 228240 bytes for http://www.espn.com/mlb/story/

[*] Retrieved 172647 bytes for http://www.espn.com/mlb/story/_/id/12279487/carlos-villanueva-agrees-minor-league-deal-st-louis-cardinals
[*] Retrieved 158523 bytes for http://www.espn.com/mlb/story/_/id/12320133/aroldis-chapman-cincinnati-reds-reaches-one-year-deal-avoids-arbitration
[*] Retrieved 157609 bytes for http://www.espn.com/mlb/story/_/id/12337085/cleveland-indians-lhp-bruce-chen-reach-minor-league-deal
[*] Retrieved 158156 bytes for http://www.espn.com/mlb/story/_/id/12340701/kenley-jansen-los-angeles-dodgers-8-12-weeks-foot-surgery
[*] Retrieved 171959 bytes for http://www.espn.com/mlb/story/_/id/12351883/major-league-baseball-announce-pace-play-rules
[*] Retrieved 172961 bytes for http://www.espn.com/mlb/story/_/id/12366824/bj-upton-atlanta-braves-go-melvin-upton-jr-2015
[*] Retrieved 186510 bytes for http://www.espn.com/mlb/story/_/id/12373346/joba-chamberlain-signs-1-year-deal-detroit-tigers
[*] Retrieved 154685 bytes for http://www.espn.com/mlb/story/_/id/12394519/juan-

[*] Retrieved 201009 bytes for http://www.espn.com/mlb/story/_/id/13437738/heart-recipient-throwing-first-pitch-fenway
[*] Retrieved 208342 bytes for http://www.espn.com/mlb/story/_/id/13439652/jose-valverde-suspended-80-games-positive-drug-test-minor-leagues
[*] Retrieved 190309 bytes for http://www.espn.com/mlb/story/_/id/13449370/peter-moylan-makes-back-majors-atlanta-braves
[*] Retrieved 165380 bytes for http://www.espn.com/mlb/story/_/id/13454628/toronto-blue-jays-everything-baseball-aint
[*] Retrieved 181613 bytes for http://www.espn.com/mlb/story/_/id/13463881/juan-marichal-hit-john-roseboro-bat-ugly-baseball-brawl-50-years-ago
[*] Retrieved 153782 bytes for http://www.espn.com/mlb/story/_/id/13470907/san-diego-padres-promote-outfielder-travis-jankowski-triple-a
[*] Retrieved 187895 bytes for http://www.espn.com/mlb/story/_/id/13477400/pittsburgh-pirates-sign-travis-snider-minor-league-deal
[*] Retrieved 159749 bytes for http://www.espn.com/mlb/story/_/id/13485353/mlb-players-un

[*] Retrieved 207838 bytes for http://www.espn.com/mlb/story/_/id/14424328/lefty-ross-detwiler-agrees-minor-league-deal-indians
[*] Retrieved 235037 bytes for http://www.espn.com/mlb/story/_/id/14428286/logan-kensing-nate-schierholtz-agree-detroit-tigers
[*] Retrieved 197035 bytes for http://www.espn.com/mlb/story/_/id/14454106/reds-deal-closer-aroldis-chapman-yankees
[*] Retrieved 156613 bytes for http://www.espn.com/mlb/story/_/id/14455628/oakland-athletics-henderson-alvarez-agree-1-year-deal
[*] Retrieved 191549 bytes for http://www.espn.com/mlb/story/_/id/14515169/los-angeles-dodgers-re-sign-brandon-beachy
[*] Retrieved 197638 bytes for http://www.espn.com/mlb/story/_/id/14515952/washington-nationals-announce-deal-daniel-murphy-pays-just-8m-2016
[*] Retrieved 241622 bytes for http://www.espn.com/mlb/story/_/id/14568028/odd-troubling-state-national-league
[*] Retrieved 162896 bytes for http://www.espn.com/mlb/story/_/id/14579447/jake-arrieta-chicago-cubs-55-million-apart-salary-nego

[*] Retrieved 140478 bytes for http://www.espn.com/mlb/story/_/id/15703256/jason-heyward-returns-starting-lineup-chicago-cubs
[*] Retrieved 145320 bytes for http://www.espn.com/mlb/story/_/id/15711295/francisco-rodriguez-detroit-tigers-closer-becomes-6th-pitcher-record-400-saves
[*] Retrieved 738638 bytes for http://www.espn.com/mlb/story/_/id/16045668/madison-bumgarner-san-francisco-giants-not-worried-get-hurt-derby
[*] Retrieved 214503 bytes for http://www.espn.com/mlb/story/_/id/16046557/new-york-mets-acquire-infielder-kelly-johnson-atlanta-braves-minor-leaguer
[*] Retrieved 208229 bytes for http://www.espn.com/mlb/story/_/id/16076520/oakland-athletics-draft-left-hander-aj-puk-no-6
[*] Retrieved 189349 bytes for http://www.espn.com/mlb/story/_/id/16102889/atlanta-braves-recall-former-starting-2b-jace-peterson-minors
[*] Retrieved 240733 bytes for http://www.espn.com/mlb/story/_/id/16181987/life-ranch-future-cattle-farmer-victor-martinez
[*] Retrieved 167214 bytes for http://www.espn

[*] Retrieved 161378 bytes for http://www.espn.com/mlb/story/_/id/17688715/chicago-cubs-pitcher-jon-lester-says-team-anything-yet-100-win-season
[*] Retrieved 207063 bytes for http://www.espn.com/mlb/story/_/id/17696579/atlanta-braves-give-turner-field-rousing-send-final-game
[*] Retrieved 159674 bytes for http://www.espn.com/mlb/story/_/id/17708060/miami-marlins-fire-barry-bonds-hitting-coach-1-season
[*] Retrieved 208302 bytes for http://www.espn.com/mlb/story/_/id/17718487/detroit-tigers-exercise-option-bring-back-manager-brad-ausmus
[*] Retrieved 156687 bytes for http://www.espn.com/mlb/story/_/id/17718547
[*] Retrieved 160028 bytes for http://www.espn.com/mlb/story/_/id/17722935/new-york-yankees-resist-trading-prospects-veteran-stars
[*] Retrieved 212915 bytes for http://www.espn.com/mlb/story/_/id/17746464/los-angeles-dodgers-washington-nationals-rained-game-2-national-league-division-series-play-sunday
[*] Retrieved 190543 bytes for http://www.espn.com/mlb/story/_/id/17753880/ga

[*] Retrieved 200180 bytes for http://www.espn.com/mlb/story/_/id/19022745/donald-trump-not-throw-first-pitch-washington-nationals-opening-day
[*] Retrieved 178778 bytes for http://www.espn.com/mlb/story/_/id/19024734/cincinnati-reds-claim-2b-scooter-gennett-waivers-milwaukee-brewers
[*] Retrieved 155847 bytes for http://www.espn.com/mlb/story/_/id/19025576/baltimore-orioles-acquire-starting-pitcher-alec-asher-philadelphia-phillies
[*] Retrieved 226136 bytes for http://www.espn.com/mlb/story/_/id/19025583/pitcher-julio-urias-open-season-los-angeles-dodgers
[*] Retrieved 168146 bytes for http://www.espn.com/mlb/story/_/id/19038953/texas-rangers-2b-rougned-odor-reach-6-year-deal-option-2023
[*] Retrieved 158718 bytes for http://www.espn.com/mlb/story/_/id/19066381/st-louis-cardinals-stephen-piscotty-agree-six-year-extension
[*] Retrieved 230427 bytes for http://www.espn.com/mlb/story/_/id/19074938/where-all-mlb-superstars-gone
[*] Retrieved 179586 bytes for http://www.espn.com/mlb/story/

[*] Retrieved 192412 bytes for http://www.espn.com/mlb/story/_/id/20210036/chicago-cubs-giving-world-series-ring-steve-bartman
[*] Retrieved 164081 bytes for http://www.espn.com/mlb/story/_/id/20210730/new-york-yankees-trade-sonny-gray-oakland-athletics
[*] Retrieved 168871 bytes for http://www.espn.com/mlb/story/_/id/20211142/tony-watson-traded-los-angeles-dodgers-pittsburgh-pirates
[*] Retrieved 181678 bytes for http://www.espn.com/mlb/story/_/id/20211702/philadelphia-phillies-trade-reliever-joaquin-benoit-pittsburgh-pirates
[*] Retrieved 243412 bytes for http://www.espn.com/mlb/story/_/id/20219881/the-history-houston-iconic-rainbow-uniforms-story-worth-telling
[*] Retrieved 186610 bytes for http://www.espn.com/mlb/story/_/id/20236495/dodgers-kyle-farmer-alex-wood-share-tattoo-tribute-paralyzed-former-georgia-teammate
[*] Retrieved 158211 bytes for http://www.espn.com/mlb/story/_/id/20247050/chicago-cubs-place-ss-addison-russell-10-day-disabled-list-strained-foot?sf103771546=1
[*] Re

[*] Retrieved 185910 bytes for http://www.espn.com/mlb/story/_/id/21409314/boston-red-sox-great-bobby-doerr-dies-99
[*] Retrieved 161673 bytes for http://www.espn.com/mlb/story/_/id/21426922/corey-kluber-cleveland-indians-wins-2017-al-cy-young-award
[*] Retrieved 157448 bytes for http://www.espn.com/mlb/story/_/id/21427966/scott-boras-says-mlb-teams-cut-payroll-suffer-economic-penalties
[*] Retrieved 164114 bytes for http://www.espn.com/mlb/story/_/id/21502885/joe-morgan-asks-voters-block-ped-users-baseball-hall-fame
[*] Retrieved 175707 bytes for http://www.espn.com/mlb/story/_/id/21506598/ex-atlanta-braves-gm-john-coppolella-placed-mlb-banned-list-team-loses-prospects
[*] Retrieved 211657 bytes for http://www.espn.com/mlb/story/_/id/21587602/to-fist-bump-not-fist-bump-do-players-deserve-reward-intentional-walk
[*] Retrieved 168921 bytes for http://www.espn.com/mlb/story/_/id/21587947/need-know-babe-ruth-japan-move-major-league-baseball
[*] Retrieved 199931 bytes for http://www.espn.c

[*] Retrieved 215120 bytes for http://www.espn.com/mlb/story/_/id/22425345/los-angeles-angels-plan-use-shohei-ohtani-part-six-man-rotation-manager-mike-scioscia-says
[*] Retrieved 169203 bytes for http://www.espn.com/mlb/story/_/id/22425345/los-angeles-angels-plans-use-shohei-ohtani-part-six-man-rotation-manager-mike-scioscia-says
[*] Retrieved 206686 bytes for http://www.espn.com/mlb/story/_/id/22425402/luke-gregerson-rejoins-st-louis-cardinals-decade-away
[*] Retrieved 191871 bytes for http://www.espn.com/mlb/story/_/id/22425905/new-york-mets-manager-mickey-callaway-big-expectations-debut-season
[*] Retrieved 208550 bytes for http://www.espn.com/mlb/story/_/id/22426181/collin-mchugh-houston-astros-avisail-garcia-chicago-white-sox-go-salary-arbitration-hearings
[*] Retrieved 195492 bytes for http://www.espn.com/mlb/story/_/id/22437522/former-major-league-outfielder-tito-francona-father-cleveland-indians-manager-terry-francona-dies-age-84
[*] Retrieved 158730 bytes for http://www.espn.

[*] Retrieved 201270 bytes for http://www.espn.com/mlb/story/_/id/22701222/former-boston-red-sox-closer-koji-uehara-signs-yomiuri-giants
[*] Retrieved 185389 bytes for http://www.espn.com/mlb/story/_/id/22701832/which-team-mlb-best-every-player-were-prime
[*] Retrieved 226043 bytes for http://www.espn.com/mlb/story/_/id/22701832/which-team-mlb-best-every-player-were-prime
[*] Retrieved 193897 bytes for http://www.espn.com/mlb/story/_/id/22701832/which-team-mlb-best-every-player-were-prime?src=rss
[*] Retrieved 174373 bytes for http://www.espn.com/mlb/story/_/id/22701832/which-team-mlb-best-every-player-were-prime?src=rss?src=rss
[*] Retrieved 183576 bytes for http://www.espn.com/mlb/story/_/id/22704231/the-oakland-nearly-boring-think
[*] Retrieved 158838 bytes for http://www.espn.com/mlb/story/_/id/22717252/dick-enberg-remembered-long-career-oh-my-catchphrase
[*] Retrieved 16916 bytes for http://www.espn.com/mlb/story/_/id/22717252/dick-enberg-remembered-long-career-oh-my-catchphrase?d

[*] Retrieved 143714 bytes for http://www.espn.com/mlb/story/_/id/23018584/trayce-thompson-brother-klay-thompson-joins-new-york-yankees
[*] Retrieved 164115 bytes for http://www.espn.com/mlb/story/_/id/23021154/giancarlo-stanton-new-york-yankees-strikes-five-s-gets-booed-home-crowd
[*] Retrieved 173548 bytes for http://www.espn.com/mlb/story/_/id/23040756/mlb-rank-roundtable-answering-biggest-questions-nos-50-1
[*] Retrieved 178968 bytes for http://www.espn.com/mlb/story/_/id/23041537/oakland-athletics-claim-trayce-thompson-waivers-new-york-yankees
[*] Retrieved 246611 bytes for http://www.espn.com/mlb/story/_/id/23042225/ranking-mlb-teams-week-1
[*] Retrieved 185272 bytes for http://www.espn.com/mlb/story/_/id/23042225/ranking-mlb-teams-week-1
[*] Retrieved 158213 bytes for http://www.espn.com/mlb/story/_/id/23042261/philadelphia-phillies-fans-boo-manager-gabe-kapler-home-opener
[*] Retrieved 198267 bytes for http://www.espn.com/mlb/story/_/id/23042481/chicago-white-sox-groundskeeper-

[*] Retrieved 156604 bytes for http://www.espn.com/mlb/story/_/id/23202868/under-armour-deal-make-mlb-uniforms-pushed-2020
[*] Retrieved 141335 bytes for http://www.espn.com/mlb/story/_/id/23202868/under-armour-deal-make-mlb-uniforms-pushed-2020
[*] Retrieved 196100 bytes for http://www.espn.com/mlb/story/_/id/23203677/kansas-city-royals-toronto-blue-jays-game-risk-hole-rogers-centre-roof
[*] Retrieved 147444 bytes for http://www.espn.com/mlb/story/_/id/23203677/kansas-city-royals-toronto-blue-jays-game-risk-hole-rogers-centre-roof
[*] Retrieved 17527 bytes for http://www.espn.com/mlb/story/_/id/23203677/kansas-city-royals-toronto-blue-jays-game-risk-hole-rogers-centre-roof?device=featurephone
[*] Retrieved 146724 bytes for http://www.espn.com/mlb/story/_/id/23203885/early-start-season-leaves-mlb-options-deal-rash-postponements
[*] Retrieved 172344 bytes for http://www.espn.com/mlb/story/_/id/23204582/new-york-yankees-outfielder-jacoby-ellsbury-dealing-heel-pain-addition-injured-hip
[*

[*] Retrieved 214634 bytes for http://www.espn.com/mlb/story/_/id/23516648/real-not-why-expect-more-noah-syndergaard
[*] Retrieved 158513 bytes for http://www.espn.com/mlb/story/_/id/23519497/houston-astros-sent-slumping-jake-marisnick-minors
[*] Retrieved 159099 bytes for http://www.espn.com/mlb/story/_/id/23519697/detroit-tigers-miguel-cabrera-says-done-playing-hurt
[*] Retrieved 239851 bytes for http://www.espn.com/mlb/story/_/id/23520017/los-angeles-angels-mike-trout-pace-greatest-season-mlb-history
[*] Retrieved 160404 bytes for http://www.espn.com/mlb/story/_/id/23520017/los-angeles-angels-mike-trout-pace-greatest-season-mlb-history
[*] Retrieved 157599 bytes for http://www.espn.com/mlb/story/_/id/23520072/blake-swihart-agent-asks-boston-red-sox-trade-client
[*] Retrieved 186146 bytes for http://www.espn.com/mlb/story/_/id/23520072/blake-swihart-agent-asks-boston-red-sox-trade-client?src=rss
[*] Retrieved 193200 bytes for http://www.espn.com/mlb/story/_/id/23520685/new-york-mets-

[*] Retrieved 156603 bytes for http://www.espn.com/mlb/story/_/id/23735514/pittsburgh-pirates-place-richard-rodriguez-10-day-disabled-list
[*] Retrieved 162045 bytes for http://www.espn.com/mlb/story/_/id/23735815/the-los-angeles-angels-placed-shohei-ohtani-disabled-list-sprained-ucl
[*] Retrieved 157176 bytes for http://www.espn.com/mlb/story/_/id/23742640/washington-nationals-activate-adam-eaton-two-months-dl-ankle-injury
[*] Retrieved 142650 bytes for http://www.espn.com/mlb/story/_/id/23744576/philadelphia-phillies-slugger-rhys-hoskins-homers-return-dl
[*] Retrieved 147937 bytes for http://www.espn.com/mlb/story/_/id/23745351/masahiro-tanaka-new-york-yankees-lands-disabled-list-strains-hamstrings
[*] Retrieved 159601 bytes for http://www.espn.com/mlb/story/_/id/23745392/washington-nationals-place-stephen-strasburg-dl-shoulder-inflammation
[*] Retrieved 145596 bytes for http://www.espn.com/mlb/story/_/id/23747419/walker-buehler-los-angeles-dodgers-bruised-ribs-uncertain-return
[*] R

[*] Retrieved 263293 bytes for http://www.espn.com/mlb/story/_/id/23863138/pump-brakes-universal-dh-mlb-next-big-thing
[*] Retrieved 191581 bytes for http://www.espn.com/mlb/story/_/id/23865725/detroit-tigers-play-kansas-city-royals-omaha-ahead-2019-college-world-series
[*] Retrieved 197971 bytes for http://www.espn.com/mlb/story/_/id/23878504/roberto-osuna-toronto-blue-jays-suspended-75-games-mlb
[*] Retrieved 174846 bytes for http://www.espn.com/mlb/story/_/id/23879696/party-crashing-philadelphia-phillies-going-anywhere
[*] Retrieved 214452 bytes for http://www.espn.com/mlb/story/_/id/23889530/clayton-kershaw-los-angeles-dodgers-starts-strong-struggles-3-inning-return-dl
[*] Retrieved 218182 bytes for http://www.espn.com/mlb/story/_/id/23894948/arodys-vizcaino-atlanta-braves-placed-disabled-list-shoulder-soreness
[*] Retrieved 217312 bytes for http://www.espn.com/mlb/story/_/id/23897647/gary-sanchez-new-york-yankees-likely-go-disabled-list
[*] Retrieved 230558 bytes for http://www.es

[*] Retrieved 205499 bytes for http://www.espn.com/mlb/story/_/id/24076553/your-complete-guide-all-star-week
[*] Retrieved 162749 bytes for http://www.espn.com/mlb/story/_/id/24078820/extra-innings-rule-adapted-minor-leagues-receiving-positive-reviews
[*] Retrieved 226996 bytes for http://www.espn.com/mlb/story/_/id/24078820/extra-innings-rule-adapted-minor-leagues-receiving-positive-reviews?ex_cid=espnapi_affiliate_EarthLink+Sports
[*] Retrieved 162817 bytes for http://www.espn.com/mlb/story/_/id/24078820/extra-innings-rule-adapted-minor-leagues-receiving-positive-reviews?src=rss
[*] Retrieved 189770 bytes for http://www.espn.com/mlb/story/_/id/24079223/ranking-mlb-teams-week-15
[*] Retrieved 157081 bytes for http://www.espn.com/mlb/story/_/id/24079298/boston-red-sox-put-3b-rafael-devers-dl-shoulder-inflammation
[*] Retrieved 209098 bytes for http://www.espn.com/mlb/story/_/id/24079298/boston-red-sox-put-3b-rafael-devers-dl-shoulder-inflammation?ex_cid=espnapi_affiliate_EarthLink+Spor

[*] Retrieved 203513 bytes for http://www.espn.com/mlb/story/_/id/24249622/george-springer-houston-astros-center-fielder-leaves-game-left-shoulder-soreness
[*] Retrieved 176511 bytes for http://www.espn.com/mlb/story/_/id/24256860/mike-fiers-detroit-tigers-pitcher-leaves-start-shin-injury
[*] Retrieved 177878 bytes for http://www.espn.com/mlb/story/_/id/24269478/chicago-anti-violence-marchers-take-message-cubs-wrigley-field
[*] Retrieved 166831 bytes for http://www.espn.com/mlb/story/_/id/24276796/jung-ho-kang-pittsburgh-pirates-surgery-left-wrist
[*] Retrieved 230071 bytes for http://www.espn.com/mlb/story/_/id/24280019/dexter-fowler-st-louis-cardinals-fractures-foot
[*] Retrieved 232894 bytes for http://www.espn.com/mlb/story/_/id/24286638/mickey-mantle-1955-bowman-card-found-pack-national-sports-collectors-convention
[*] Retrieved 225505 bytes for http://www.espn.com/mlb/story/_/id/24286762/nathan-eovaldi-boston-red-sox-keeps-new-york-yankees-bats-silent-4-1-win
[*] Retrieved 186486

[*] Retrieved 179338 bytes for http://www.espn.com/mlb/story/_/id/24417699/didi-gregorius-new-york-yankees-leaves-game-injury
[*] Retrieved 153275 bytes for http://www.espn.com/mlb/story/_/id/24437992/washington-nationals-trade-daniel-murphy-chicago-cubs-matt-adams-st-louis-cardinals
[*] Retrieved 145152 bytes for http://www.espn.com/mlb/story/_/id/24439216/yu-darvish-chicago-cubs-miss-rest-season-elbow-triceps-injuries
[*] Retrieved 168400 bytes for http://www.espn.com/mlb/story/_/id/24454467/the-disciplined-aggression-mookie-betts-boston-red-sox
[*] Retrieved 200302 bytes for http://www.espn.com/mlb/story/_/id/24472914/buster-posey-san-francisco-giants-season-ending-hip-surgery
[*] Retrieved 236259 bytes for http://www.espn.com/mlb/story/_/id/24478833/mlb-miguel-andujar-win-year-rookie-year-honors
[*] Retrieved 192074 bytes for http://www.espn.com/mlb/story/_/id/24511476/happy-birthday-ted-williams-our-nine-favorite-stats-no-9
[*] Retrieved 158199 bytes for http://www.espn.com/mlb/st

[*] Retrieved 174856 bytes for http://www.espn.com/mlb/story/_/id/24826487/yankees-host-al-wild-card-game-match-mlb-home-run-record-264
[*] Retrieved 168712 bytes for http://www.espn.com/mlb/story/_/id/24826548/christian-yelich-milwaukee-brewers-homers-brother-cameron-former-marine
[*] Retrieved 172339 bytes for http://www.espn.com/mlb/story/_/id/24860833/anonymous-ready-bronx-actually-pretty-damn-good-team
[*] Retrieved 157626 bytes for http://www.espn.com/mlb/story/_/id/24863365/arizona-diamondbacks-hitting-coach-dave-magadan-mutually-part
[*] Retrieved 159009 bytes for http://www.espn.com/mlb/story/_/id/24905596/braves-plan-start-kevin-gausman-game-3-dodgers-call-walker-buehler
[*] Retrieved 201298 bytes for http://www.espn.com/mlb/story/_/id/24907237/clayton-kershaw-los-angeles-dodgers-delivers-career-best-playoff-performance
[*] Retrieved 207424 bytes for http://www.espn.com/mlb/story/_/id/24915672/gary-sanchez-lifts-new-york-yankees-479-foot-moonshot
[*] Retrieved 164560 bytes fo

[*] Retrieved 201292 bytes for http://www.espn.com/mlb/story/_/id/25066444/mlb-boston-red-sox-los-angeles-dodgers-show-more-aces-kershaw-sale-duel-fizzles
[*] Retrieved 196791 bytes for http://www.espn.com/mlb/story/_/id/25066445/mlb-all-little-things-cost-los-angeles-dodgers-game-1
[*] Retrieved 159644 bytes for http://www.espn.com/mlb/story/_/id/25072934/cardinals-yadier-molina-wins-roberto-clemente-award
[*] Retrieved 222075 bytes for http://www.espn.com/mlb/story/_/id/25082574/when-was-your-city-last-mlb-nba-nfl-nhl-championship-parade
[*] Retrieved 201705 bytes for http://www.espn.com/mlb/story/_/id/25085286/mlb-boston-red-sox-all-great-team-world-series-sweep
[*] Retrieved 189988 bytes for http://www.espn.com/mlb/story/_/id/25085332
[*] Retrieved 227484 bytes for http://www.espn.com/mlb/story/_/id/25093436/mlb-celebrities-world-series-game-3-los-angeles
[*] Retrieved 191802 bytes for http://www.espn.com/mlb/story/_/id/25094012/mlb-max-muncy-gives-los-angeles-dodgers-epic-world-se

[*] Retrieved 196208 bytes for http://www.espn.com/mlb/story/_/id/25226392/chicago-cubs-open-trading-3b-kris-bryant?ex_cid=espnapi_affiliate_EarthLink+Sports
[*] Retrieved 221481 bytes for http://www.espn.com/mlb/story/_/id/25228176/special-assistant-jp-ricciardi-leaving-new-york-mets-8-seasons
[*] Retrieved 162598 bytes for http://www.espn.com/mlb/story/_/id/25228494/joe-mauer-retire-15-seasons-minnesota-twins
[*] Retrieved 161066 bytes for http://www.espn.com/mlb/story/_/id/25235880/bryce-harper-nearly-dealt-houston-astros-washington-nationals
[*] Retrieved 182278 bytes for http://www.espn.com/mlb/story/_/id/25235880/bryce-harper-nearly-dealt-houston-astros-washington-nationals
[*] Retrieved 189908 bytes for http://www.espn.com/mlb/story/_/id/25237148/jerry-remy-says-cancer-free
[*] Retrieved 184123 bytes for http://www.espn.com/mlb/story/_/id/25237982/justin-verlander-houston-astros-kate-upton-welcome-baby-girl
[*] Retrieved 208920 bytes for http://www.espn.com/mlb/story/_/id/252562

[*] Retrieved 166128 bytes for http://www.espn.com/mlb/story/_/id/25513371/mlb-predictions-last-days-winter-meetings
[*] Retrieved 168517 bytes for http://www.espn.com/mlb/story/_/id/25513371/mlb-predictions-last-days-winter-meetings
[*] Retrieved 223499 bytes for http://www.espn.com/mlb/story/_/id/25513872/los-angeles-dodgers-dave-roberts-hopeful-clayton-kershaw-regains-velocity
[*] Retrieved 214714 bytes for http://www.espn.com/mlb/story/_/id/25514002/mlb-angels-no-choice-rebuild-fly-work
[*] Retrieved 176164 bytes for http://www.espn.com/mlb/story/_/id/25514257/new-york-yankees-express-firm-support-catcher-gary-sanchez
[*] Retrieved 166329 bytes for http://www.espn.com/mlb/story/_/id/25516611/yankees-officially-ink-ja-happ-manny-machado-next
[*] Retrieved 192819 bytes for http://www.espn.com/mlb/story/_/id/25517594/al-helfer-radio-pioneer-wins-hall-fame-frick-award
[*] Retrieved 161373 bytes for http://www.espn.com/mlb/story/_/id/25518933/tony-la-russa-calls-harold-baines-detractors

[*] Retrieved 172494 bytes for http://www.espn.com/mlb/story/_/id/25715085/mlb-payrolls-dropped-18-million-2018-first-decrease-2010
[*] Retrieved 141426 bytes for http://www.espn.com/mlb/story/_/id/25715919/new-york-mets-hector-santiago-agree-minor-league-contract
[*] Retrieved 176497 bytes for http://www.espn.com/mlb/story/_/id/25722258/mlb-2019-most-interesting-player-already-hit-home-run-year
[*] Retrieved 207626 bytes for http://www.espn.com/mlb/story/_/id/25722707/jeff-passan-wild-wonky-world-mlb-salary-arbitration
[*] Retrieved 141786 bytes for http://www.espn.com/mlb/story/_/id/25722927/cleveland-indians-catcher-kevin-plawecki-agree-one-year-deal
[*] Retrieved 166357 bytes for http://www.espn.com/mlb/story/_/id/25723131/new-york-mets-travis-darnaud-agree-35-million-deal
[*] Retrieved 161661 bytes for http://www.espn.com/mlb/story/_/id/25723411/shelby-miller-texas-rangers-reach-1-year-deal
[*] Retrieved 166156 bytes for http://www.espn.com/mlb/story/_/id/25724040/dustin-ackley-re

[*] Retrieved 161383 bytes for http://www.espn.com/mlb/story/_/id/25737768/new-york-mets-jacob-degrom-agree-17-million-1-year-contract?ex_cid=espnapi_affiliate_EarthLink+Sports
[*] Retrieved 164836 bytes for http://www.espn.com/mlb/story/_/id/25737768/new-york-mets-jacob-degrom-agree-17-million-1-year-contract?src=rss
[*] Retrieved 156458 bytes for http://www.espn.com/mlb/story/_/id/25738296/pittsburgh-pirates-avoid-arbitration-corey-dickerson-reliever-keone-kela
[*] Retrieved 191509 bytes for http://www.espn.com/mlb/story/_/id/25738369/leonys-martin-cleveland-indians-feels-blessed-alive-health-scare
[*] Retrieved 159492 bytes for http://www.espn.com/mlb/story/_/id/25738369/leonys-martin-cleveland-indians-feels-blessed-alive-health-scare?ex_cid=espnapi_affiliate_EarthLink+Sports
[*] Retrieved 218167 bytes for http://www.espn.com/mlb/story/_/id/25738443/st-louis-cardinals-michael-wacha-avoid-arbitration-635m-deal
[*] Retrieved 154907 bytes for http://www.espn.com/mlb/story/_/id/25738936

[*] Retrieved 160898 bytes for http://www.espn.com/mlb/story/_/id/25976813/brett-anderson-agrees-one-year-deal-stay-oakland-as?ex_cid=espnapi_affiliate_EarthLink+Sports
[*] Retrieved 190027 bytes for http://www.espn.com/mlb/story/_/id/25980629/mlb-releases-design-150th-anniversary-logo
[*] Retrieved 222581 bytes for http://www.espn.com/mlb/story/_/id/25980650/giants-catcher-buster-posey-hoping-healthy-productive-2019-season
[*] Retrieved 164056 bytes for http://www.espn.com/mlb/story/_/id/25981518/brad-ausmus-hopes-shohei-ohtani-bat-back-angels-lineup-some-may
[*] Retrieved 234221 bytes for http://www.espn.com/mlb/story/_/id/25986579/with-jt-realmuto-camp-phillies-know-their-now
[*] Retrieved 225971 bytes for http://www.espn.com/mlb/story/_/id/25987860/philadelphia-phillies-aaron-nola-agree-four-year-extension
[*] Retrieved 228201 bytes for http://www.espn.com/mlb/story/_/id/25988739/luis-severino-new-york-yankees-changes-diet-late-season-fatigue
[*] Retrieved 229616 bytes for http://w

[*] Retrieved 164478 bytes for http://www.espn.com/mlb/story/_/id/26195127/nola-start-opening-day-phils-kapler-says
[*] Retrieved 185416 bytes for http://www.espn.com/mlb/story/_/id/26195342/loaiza-gets-3-year-prison-sentence-cocaine
[*] Retrieved 165632 bytes for http://www.espn.com/mlb/story/_/id/26195342/loaiza-gets-3-year-prison-sentence-cocaine
[*] Retrieved 188249 bytes for http://www.espn.com/mlb/story/_/id/26195806/after-surgeries-chapman-ready-help-contend-again
[*] Retrieved 184710 bytes for http://www.espn.com/mlb/story/_/id/26196748/cooking-too-long-gives-dodgers-kelly-sore-back
[*] Retrieved 183487 bytes for http://www.espn.com/mlb/story/_/id/26197547/tigers-acquire-catcher-rupp-giants-cash
[*] Retrieved 169562 bytes for http://www.espn.com/mlb/story/_/id/26205574/new-padre-machado-always-villain
[*] Retrieved 166700 bytes for http://www.espn.com/mlb/story/_/id/26209937/sources-royals-adding-maldonado-perez-sub
[*] Retrieved 174718 bytes for http://www.espn.com/mlb/story/_

[*] Retrieved 187546 bytes for http://www.espn.com/mlb/story/_/id/6808219/kansas-city-royals-sign-dominican-prospect-adalberto-mondesi
[*] Retrieved 224211 bytes for http://www.espn.com/mlb/story/_/id/6808357/pittsburgh-pirates-file-complaint-umpire-jerry-meals-call-19-inning-loss
[*] Retrieved 157097 bytes for http://www.espn.com/mlb/story/_/id/6831225/cincinnati-reds-scott-rolen-surgery-repair-left-shoulder
[*] Retrieved 157011 bytes for http://www.espn.com/mlb/story/_/id/6839125/mlb-warns-players-deer-antler-spray-report-says
[*] Retrieved 157909 bytes for http://www.espn.com/mlb/story/_/id/6841751/san-francisco-giants-bruce-rips-radio-host-racist-ramon-ramirez-remark
[*] Retrieved 155379 bytes for http://www.espn.com/mlb/story/_/id/6868616/pittsburgh-pirates-activate-left-fielder-jose-tabata-option-third-baseman-pedro-alvarez
[*] Retrieved 198162 bytes for http://www.espn.com/mlb/story/_/id/6885807/kevin-correia-pittsburgh-pirates-put-15-day-disabled-list
[*] Retrieved 207521 bytes

[*] Retrieved 161149 bytes for http://www.espn.com/mlb/story/_/id/7498279/baltimore-orioles-sign-infielder-wilson-betemit-two-year-deal
[*] Retrieved 200787 bytes for http://www.espn.com/mlb/story/_/id/7498284/detroit-tigers-reportedly-land-prince-fielder-214m-offer
[*] Retrieved 176280 bytes for http://www.espn.com/mlb/story/_/id/7499291/cincicincinnati-reds-agree-minor-league-deal-veteran-willie-harris
[*] Retrieved 211816 bytes for http://www.espn.com/mlb/story/_/id/7506666/cleveland-indians-put-fausto-carmona-roberto-hernandez-heredia-restricted-list
[*] Retrieved 225832 bytes for http://www.espn.com/mlb/story/_/id/7522833/cleveland-indians-get-international-league-mvp-russ-canzler-tampa-bay-rays
[*] Retrieved 205743 bytes for http://www.espn.com/mlb/story/_/id/7544803/caribbean-series-francisco-liriano-andy-dirks-lift-dominican-republic-escogido-leones
[*] Retrieved 192266 bytes for http://www.espn.com/mlb/story/_/id/7551464/danny-clyburn-jr-former-major-league-outfielder-shot-kil

[*] Retrieved 184584 bytes for http://www.espn.com/mlb/story/_/id/8458298/detroit-tigers-miguel-cabrera-wins-first-triple-crown-1967
[*] Retrieved 156475 bytes for http://www.espn.com/mlb/story/_/id/8461744/seattle-mariners-fire-hitting-coach-chris-chambliss
[*] Retrieved 224615 bytes for http://www.espn.com/mlb/story/_/id/8462688/kansas-city-royals-part-ways-hitting-coach-kevin-seitzer
[*] Retrieved 154391 bytes for http://www.espn.com/mlb/story/_/id/8463398/washington-nationals-gio-gonzalez-wins-warren-spahn-award
[*] Retrieved 176068 bytes for http://www.espn.com/mlb/story/_/id/8465102/jeter-era-ending
[*] Retrieved 160245 bytes for http://www.espn.com/mlb/story/_/id/8466928/minnesota-twins-gm-terry-ryan-removes-interim-tag-title
[*] Retrieved 196114 bytes for http://www.espn.com/mlb/story/_/id/8480395/2012-alds-detroit-tigers-al-alburquerque-says-kiss-meant-no-disrespect
[*] Retrieved 194993 bytes for http://www.espn.com/mlb/story/_/id/8518040/arizona-diamondbacks-fire-1b-coach-eri

[*] Retrieved 160569 bytes for http://www.espn.com/mlb/story/_/id/9387977/johnny-vander-meer-consecutive-no-hitters-75-years-ago
[*] Retrieved 188460 bytes for http://www.espn.com/mlb/story/_/id/9404248/hamstring-forces-san-diego-padres-everth-cabrera-disabled-list
[*] Retrieved 165766 bytes for http://www.espn.com/mlb/story/_/id/9442231/bryce-harper-washington-nationals-returns-missing-31-games
[*] Retrieved 246064 bytes for http://www.espn.com/mlb/story/_/id/9452014/pitcher-tomohiro-anraku-future-japanese-baseball-espn-magazine
[*] Retrieved 220514 bytes for http://www.espn.com/mlb/story/_/id/9475310/houston-astros-sign-jose-altuve-2017
[*] Retrieved 164167 bytes for http://www.espn.com/mlb/story/_/id/9482248/white-sox-pitcher-chris-sale-skinny-stature-lasting-career-espn-magazine
[*] Retrieved 177077 bytes for http://www.espn.com/mlb/story/_/id/9500252/ryan-braun-milwaukee-brewers-suspended-remainder-2013-season
[*] Retrieved 171353 bytes for http://www.espn.com/mlb/story/_/id/95037

[*] Retrieved 200722 bytes for http://www.espn.com/mlb/story/_/page/rumblings120608/five-teams-shocked-most-2012
[*] Retrieved 193663 bytes for http://www.espn.com/mlb/story/_/page/seasonpreview_mlbteampredictions/mlb-team-predictions-2016-season
[*] Retrieved 223783 bytes for http://www.espn.com/mlb/story/_/page/SeasonpreviewMLB_2017predictions/2017-mlb-experts-mlb-team-predictions
[*] Retrieved 238422 bytes for http://www.espn.com/mlb/story/_/page/springtraining_daviddahl/david-dahl-lost-spleen-outfield-collision-rising-colorado-rockies
[*] Retrieved 155648 bytes for http://www.espn.com/mlb/story/_/page/springtraining_lucasgiolito/washington-nationals-lucas-giolito-change-national-league-east
[*] Retrieved 179070 bytes for http://www.espn.com/mlb/story/_/page/springtraining_yearofthecubs/st-louis-cardinals-chime-being-year-chicago-cubs


In [6]:
#2 change file name here based on sub-topic
# Save the collected text to disk as UTF-8, overwriting any previous file.
# NOTE(review): `text` is expected to be a str built by an earlier cell — confirm.
# The context manager guarantees the handle is flushed and closed even if
# write() raises, which the manual open()/close() pair did not.
with open("./input/mr/cc_mlb.txt", "w", encoding="utf-8") as file:
    file.write(text)