In [1]:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import re
import time
import csv

In [2]:
# Only strings that start with http:// or https:// are submitted for shortening.
HTTP_PATTERN = re.compile(r"https?://.*")
# Parser name handed to BeautifulSoup when reading the TinyURL response page.
DEFAULT_PARSER = "lxml"

def tinyurl(url):
    """Return the TinyURL short link for ``url``, or ``None`` if it is not an http(s) URL.

    The URL is submitted to TinyURL's ``create.php`` page as a query parameter
    and the short link is scraped from the returned HTML.

    Args:
        url: The URL to shorten. It must start with ``http://`` or ``https://``.

    Returns:
        The text of the second ``<b>`` element of the response page, or
        ``None`` when ``url`` does not match ``HTTP_PATTERN`` (no request is
        made in that case).

    Raises:
        IndexError: If the response page contains fewer than two ``<b>``
            elements.
        Any error raised by ``urllib.request.urlopen`` propagates unchanged.
    """
    if not HTTP_PATTERN.match(url):
        return None
    encoded_url = urllib.parse.urlencode({"url": url})
    # Use the response as a context manager so the connection is always closed,
    # even if reading the body fails.
    with urllib.request.urlopen('https://tinyurl.com/create.php?' + encoded_url) as response:
        body = response.read()
    soup = BeautifulSoup(body, DEFAULT_PARSER)
    # NOTE(review): relies on the page layout placing the short URL in the
    # second <b> element — verify if TinyURL changes its markup.
    return soup.find_all("b")[1].get_text()

In [3]:
# Quick manual check of tinyurl(); this performs a live request to tinyurl.com.
tinyurl(url="https://tinyurl.com/")

'https://tinyurl.com/gm9e3'

In [4]:
def fetch_tinyurls(batch, sleep=1):
    """Shorten every URL in ``batch`` and collect the successful results.

    Each entry is stripped of surrounding whitespace and passed to
    ``tinyurl``. Entries for which ``tinyurl`` returns a falsy value are
    skipped. The function pauses after every entry, skipped or not.

    Args:
        batch: Iterable of URL strings (for example, the lines of a file).
        sleep: Seconds to wait after processing each entry.

    Returns:
        A list of ``(short_url, original_url)`` tuples in input order.
    """
    shortened = []
    # The position is 1-based and counts every entry, including skipped ones.
    for position, raw_url in enumerate(batch, start=1):
        original = raw_url.strip()
        short = tinyurl(original)
        if short:
            pair = (short, original)
            shortened.append(pair)
            print(position, pair)
        time.sleep(sleep)
    return shortened

In [None]:
import itertools

def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into consecutive tuples of length ``n``.

    The final tuple is padded with ``fillvalue`` when the input runs out.

    Example:
        grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    source = iter(iterable)
    # The same iterator object is repeated n times, so zip_longest pulls
    # n successive items from it for every output tuple.
    slots = (source,) * n
    return itertools.zip_longest(*slots, fillvalue=fillvalue)


In [6]:
def write_tinyurls(tinyurl_tuples, output_filename):
    """Append the given rows to a CSV file.

    Args:
        tinyurl_tuples: Iterable of rows (e.g. ``(short_url, original_url)``
            tuples); each row becomes one CSV line.
        output_filename: Path of the CSV file. It is opened in append mode,
            so existing content is kept.
    """
    # newline='' lets the csv module control line endings itself.
    with open(output_filename, mode="a", newline='') as csv_file:
        row_writer = csv.writer(csv_file)
        for row in tinyurl_tuples:
            row_writer.writerow(row)
        

In [7]:
def bulk_shorten(input_filename, output_filename, sleep=1):
    """Shorten every URL listed in a text file and append the results to a CSV.

    Args:
        input_filename: Path of a text file with one URL per line.
        output_filename: Path of the CSV file that receives the
            ``(short_url, original_url)`` rows; rows are appended.
        sleep: Seconds to wait after each URL, forwarded to
            ``fetch_tinyurls``.
    """
    with open(input_filename) as file:
        # Pass the open file (an iterable of lines), not the filename:
        # iterating over the filename string would yield single characters
        # instead of URLs, so nothing would ever be shortened.
        tinyurl_tuples = fetch_tinyurls(file, sleep)
    write_tinyurls(tinyurl_tuples, output_filename)

In [None]:
# Shorten the sample URL list; rows are appended to the CSV on every run.
bulk_shorten(
    input_filename='sample-urls.txt',
    output_filename='sample-tinyurls.csv',
)

In [None]:
# NOTE(review): both paths are absolute and point into /Volumes/HDD-2, so this
# cell only runs where that volume is mounted — consider a configurable base directory.
bulk_shorten(
    input_filename='/Volumes/HDD-2/zalando-aragog-crowdflower/golden-datasets/fashion-classifier/en/actual-experiments/sampled-training-urls/fashion-urls.txt',
    output_filename='/Volumes/HDD-2/zalando-aragog-crowdflower/golden-datasets/fashion-classifier/en/actual-experiments/sampled-training-urls/fashion-tinyurls.csv',
)
