In [2]:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import re
import time
import csv

In [17]:
# Only absolute http:// or https:// URLs are submitted for shortening.
HTTP_PATTERN = re.compile(r"https?://.*")
# Parser name handed to BeautifulSoup when reading the TinyURL response page.
DEFAULT_PARSER = "lxml"

def tinyurl(url):
    """Return the TinyURL short link for ``url``, or ``None`` on failure.

    The URL is submitted to ``https://tinyurl.com/create.php`` and the short
    link is read from the second ``<b>`` element of the returned HTML page.

    Args:
        url: The URL to shorten. It must start with ``http://`` or
            ``https://``; anything else is rejected without a request.

    Returns:
        The text of the second ``<b>`` element of the response page, or
        ``None`` when ``url`` does not look like an HTTP(S) URL or when the
        request or the parsing fails.
    """
    if not HTTP_PATTERN.match(url):
        return None
    try:
        query = urllib.parse.urlencode({"url": url})
        # Use the response as a context manager so the connection is closed
        # even when reading fails.
        with urllib.request.urlopen('https://tinyurl.com/create.php?' + query) as response:
            body = response.read()
        soup = BeautifulSoup(body, DEFAULT_PARSER)
        return soup.find_all("b")[1].get_text()
    except Exception:
        # Best-effort: network or parsing errors yield None. Catching
        # Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate, so a long bulk run can
        # still be interrupted from the kernel.
        return None

In [18]:
# Smoke test: shorten a single known URL. This performs a live request to
# tinyurl.com; the cell displays the returned short link (or None on failure).
tinyurl("https://tinyurl.com/")

'https://tinyurl.com/gm9e3'

In [22]:
def fetch_tinyurls(batch, sleep=1):
    """Shorten every URL in ``batch`` and collect the successful results.

    Entries are processed in order until the first falsy entry (for example
    a ``None`` padding value), which ends the batch. Each entry is stripped
    of surrounding whitespace before being passed to ``tinyurl``.

    Args:
        batch: Iterable of URL strings; a falsy entry stops processing.
        sleep: Seconds to pause after each processed entry, whether or not
            it was shortened successfully.

    Returns:
        A list of ``(short_url, original_url)`` tuples for the entries that
        ``tinyurl`` managed to shorten.
    """
    shortened = []
    for position, raw in enumerate(batch, start=1):
        if not raw:
            break
        source_url = raw.strip()
        short_url = tinyurl(source_url)
        if short_url:
            pair = (short_url, source_url)
            shortened.append(pair)
            # Carriage return keeps the progress report on a single line.
            print('\r', position, pair, end='')
        time.sleep(sleep)
    return shortened

In [6]:
import itertools

def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into consecutive tuples of length ``n``.

    The final tuple is padded with ``fillvalue`` when the input does not
    divide evenly, e.g. ``grouper('ABCDEFG', 3, 'x')`` yields
    ``ABC``, ``DEF``, ``Gxx``.

    Args:
        iterable: Any iterable to be chunked.
        n: Number of items per chunk.
        fillvalue: Padding value for the last, possibly short, chunk.

    Returns:
        An iterator of ``n``-tuples.
    """
    # Every position of the zip pulls from the SAME iterator, so each output
    # tuple consumes the next ``n`` items of the input.
    shared = iter(iterable)
    return itertools.zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)


In [10]:
def write_tinyurls(tinyurl_tuples, output_filename):
    """Append rows to a CSV file.

    Args:
        tinyurl_tuples: Iterable of row sequences; each one becomes one CSV
            row.
        output_filename: Path of the CSV file. It is opened in append mode,
            so existing rows are kept.
    """
    # newline='' lets the csv module control line endings itself.
    with open(output_filename, "a", newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(tinyurl_tuples)
        

In [25]:
def bulk_shorten(input_filename, output_filename, sleep=1, skip_to=1, batch_size=1000):
    """Shorten the URLs listed in a text file, batch by batch.

    The input file is read line by line and grouped into batches of
    ``batch_size`` lines. Each processed batch is shortened with
    ``fetch_tinyurls`` and its results are appended to the output file
    before the next batch starts.

    Args:
        input_filename: Text file with one URL per line.
        output_filename: CSV file that receives the results of each batch.
        sleep: Delay in seconds passed through to ``fetch_tinyurls``.
        skip_to: 1-based number of the first batch to process; batches with
            a lower number are read but skipped.
        batch_size: Number of input lines per batch.
    """
    with open(input_filename) as source:
        for number, chunk in enumerate(grouper(source, batch_size), start=1):
            if number >= skip_to:
                shortened = fetch_tinyurls(chunk, sleep)
                write_tinyurls(shortened, output_filename)
                status = "\nCompleted batch:"
            else:
                status = "\nSkipping batch:"
            print(status, number)
            print()

In [27]:
# Shorten every URL in sample-urls.txt with the default settings (1 s delay
# per URL, batches of 1000 lines), appending results to sample-tinyurls.csv.
# This issues one live request to tinyurl.com per valid URL.
bulk_shorten('sample-urls.txt', 'sample-tinyurls.csv')


 1000 ('https://tinyurl.com/gvzwpqr', 'http://c2.com/cgi/wiki?JouleLanguage')
Completed batch: 1

 1000 ('https://tinyurl.com/jsz5uxb', 'http://www.lib.ua.edu/wiki/digcoll/index.php?title=Austin_Dixon&diff=4435&oldid=4433')
Completed batch: 2

 1000 ('https://tinyurl.com/zbqselp', 'http://espn.go.com/blog/nflnation/tag/_/name/robert-mcclain')
Completed batch: 3

 1000 ('https://tinyurl.com/gm4wn59', 'http://freecode.com/tags/english?page=1&with=&without=838%2C3943')
Completed batch: 4

 1000 ('https://tinyurl.com/hc84rdu', 'https://techcrunch.com/tag/sling/')
Completed batch: 5

 1000 ('https://tinyurl.com/hbmkmjw', 'http://www.webmd.com/drugs/2/drug-17840/medi-seltzer-oral/details/list-interaction-medication')
Completed batch: 6

 1000 ('https://tinyurl.com/zmmyzo6', 'http://www.cnn.com/2012/08/09/business/korea-standard-chartered-hsbc/index.html')
Completed batch: 7

 1000 ('https://tinyurl.com/zag997a', 'http://www.thetimes.co.uk/tto/sport/football/clubs/chelsea/article3973847.ece')

In [26]:
# Run over the same input/output files with sleep=0.8 and skip_to=10 (passed
# positionally): batches 1-9 are skipped and processing starts at batch 10.
bulk_shorten('sample-urls.txt', 'sample-tinyurls.csv',
             0.8, 10)



Skipping batch: 1


Skipping batch: 2


Skipping batch: 3


Skipping batch: 4


Skipping batch: 5


Skipping batch: 6


Skipping batch: 7


Skipping batch: 8


Skipping batch: 9

 995 ('https://tinyurl.com/j3bjqna', 'http://www.marieclaire.com/beauty/articles/a5992/best-beauty-products/')
Completed batch: 10

