In [None]:
from __future__ import print_function

In [None]:
# Python 2 and 3: alternative 4
try:
    from urllib.parse import urlparse, urlencode
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError
except ImportError:
    from urlparse import urlparse
    from urllib import urlencode
    from urllib2 import urlopen, Request, HTTPError

# test existence of file of form:
https://raw.githubusercontent.com/GITenberg/---------------------------------------------------------------------------------------------__31854/master/31854-0.txt

In [None]:
import re

def parse_g_url(url):
    """Split a GITenberg repository URL into ``(name, id)``.

    The last path segment of ``url`` is expected to look like
    ``<name>_<digits>``. The split happens at the final ``_<digits>`` suffix,
    so underscores inside ``<name>`` are preserved.

    Args:
        url: repository URL; a trailing ``/`` is tolerated.

    Returns:
        Tuple ``(name, id)`` of two strings; ``id`` consists of digits only.

    Raises:
        ValueError: if the last path segment does not end in ``_<digits>``.
    """
    # rstrip so ".../Some-Title_123/" yields the repo segment rather than the
    # empty string that follows the final slash.
    name_id = urlparse(url).path.rstrip("/").split("/")[-1]
    # The greedy (.*) makes the split land on the last "_<digits>" suffix.
    match = re.search(r'(.*)_(\d+)$', name_id)
    if match is None:
        raise ValueError(
            "not a GITenberg repo URL (expected <name>_<id>): {0!r}".format(url))
    return match.groups()
    

In [None]:
# list(...) so the result can be iterated more than once on Python 3, where
# map() returns a one-shot iterator (on Python 2 it already is a list).
repo_name_id = list(map(
   parse_g_url,
   g_files))

In [None]:
from itertools import islice

try:
    from itertools import izip  # Python 2
except ImportError:
    izip = zip  # Python 3 has no izip; the built-in zip is already lazy

In [None]:
def gitenberg_text_url(name, id_, suffix=None):
    """Build the raw.githubusercontent.com URL of a repository's text file.

    The repository is ``GITenberg/<name>_<id_>`` on the ``master`` branch.
    With a truthy ``suffix`` the file is ``<id_>-<suffix>.txt``; otherwise it
    is ``<id_>.txt``.
    """
    if not suffix:
        plain_template = "https://raw.githubusercontent.com/GITenberg/{name}_{id_}/master/{id_}.txt"
        return plain_template.format(name=name, id_=id_)

    suffixed_template = "https://raw.githubusercontent.com/GITenberg/{name}_{id_}/master/{id_}-{suffix}.txt"
    return suffixed_template.format(name=name, id_=id_, suffix=suffix)
    
def text_url_search(name,id_):
    """Probe a GITenberg repository for an existing plain-text file.

    Tries, in order, the unsuffixed file and then the ``-0`` and ``-8``
    variants (URLs built by ``gitenberg_text_url``), issuing one HTTP HEAD
    request per candidate.

    Returns:
        ``(True, url)`` for the first candidate answering with status 200,
        otherwise ``(False, None)``.
    """
    # Imported locally (as http_head_status does) so this function does not
    # depend on a later cell having already imported requests.
    import requests

    suffix_to_try = (None, '0', '8')

    for suffix in suffix_to_try:
        url = gitenberg_text_url(name, id_, suffix)
        r = requests.head(url)
        if r.status_code == 200:
            return (True, url)

    return (False, None)
    

In [None]:
# Spot check: search for a text-file URL for the first 10 (url, (name, id_)) pairs
# and print what was found. The loop variables stay defined after the loop.
for (url,(name,id_)) in islice(izip(g_files, repo_name_id),10):
    (txt_found, txt_url) = text_url_search(name,id_)
    print (url, txt_found, txt_url)

In [None]:
# requests is imported here because no earlier cell imports it.
import requests

# Repeat the HEAD request for the txt_url left over from the spot-check loop
# and show its status code.
r = requests.head(txt_url)
r.status_code

# cert warnings

https://urllib3.readthedocs.org/en/latest/security.html#using-certifi-with-urllib3

In [None]:
import urllib3
import certifi

# Pool manager configured to verify server certificates against the CA bundle
# shipped with certifi.
http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED', # Force certificate check.
    ca_certs=certifi.where(),  # Path to the Certifi bundle.
)

# You're ready to make verified HTTPS requests.
try:
    r = http.request('GET', 'https://example.com/')
except urllib3.exceptions.SSLError as e:
    # Handle incorrect certificate error.
    # Only the error is printed; `r` is not reassigned in that case.
    print(e)

In [None]:
# https://github.com/kennethreitz/requests/issues/2214#issuecomment-72941896

# `from requests.packages... import ...` alone does not bind the name
# `requests`, so import the package explicitly before using it below.
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# NOTE: this only hides the warning about unverified HTTPS requests; it does
# not make those requests verified.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

import urllib3
urllib3.__version__

# Readability

[readability 0.1 : Python Package Index](https://pypi.python.org/pypi/readability/0.1)

```
pip install https://github.com/andreasvc/readability/tarball/master
pip install Gutenberg
```

https://github.com/c-w/Gutenberg/blob/master/gutenberg/cleanup/strip_headers.py

In [None]:
import readability
import requests
from gutenberg import cleanup

In [None]:
# Moby Dick

url = "https://github.com/GITenberg/Moby-Dick--Or-The-Whale_2701/raw/master/2701.txt"
response = requests.get(url)
# Fail loudly on a 4xx/5xx answer instead of analysing an error page as if it
# were the book.
response.raise_for_status()
# strip_headers comes from gutenberg.cleanup (see the strip_headers.py link in
# the markdown above).
text = cleanup.strip_headers(response.text)
text[:100], text[-100:]

In [None]:
# Readability measures for the header-stripped Moby Dick text (lang='en').
readability.getmeasures(text, lang='en')

In [None]:
from textblob import TextBlob
# Wrap the Moby Dick text in a TextBlob object.
moby_blob = TextBlob(text)

In [None]:
# read in from data file

import pandas as pd
from pandas import DataFrame

In [None]:
# Tab-separated table; later cells read its gitb_id, gitb_name and text_files columns.
df = pd.read_csv("GITenberg_repos_list_2.tsv", sep="\t", encoding="UTF-8")

In [None]:
# Number of rows loaded from the TSV.
len(df)

In [None]:
# what about those multiple txt
def _parse_text_files(raw):
    """Turn a bracketed, comma-separated string like ``"[a.txt, b.txt]"`` into a list.

    The first and last characters are dropped, the remainder is split on
    commas, and empty or whitespace-only pieces are discarded; each kept piece
    is stripped. A value that is already a list is returned unchanged, so
    re-running this cell does not fail on the already-parsed column.
    """
    if isinstance(raw, list):
        return raw
    return [f.strip() for f in raw[1:-1].split(",") if len(f) and not f.isspace()]

df['text_files'] = df.text_files.map(_parse_text_files)

In [None]:
from collections import defaultdict

def has_standard_txt(g_id, files):
    """Return True when ``files`` contains the plain ``<g_id>.txt`` name."""
    standard_name = "{0}.txt".format(g_id)
    return standard_name in files
    
def classify_text_files(g_id, files):
    """
    Group a repository's text files by the encoding their file name implies.

    Buckets: 'ascii' (<g_id>.txt), 'utf-8' (<g_id>-0.txt),
    '8-bit' (<g_id>-8.txt), 'big5' (<g_id>-5.txt), and 'other' for every
    remaining name.

    Args:
        g_id: id used to build the expected file names.
        files: iterable of file names; duplicates are collapsed.

    Returns:
        A list of ``(label, [file names])`` pairs, suitable for ``dict()``.
        Labels without any matching file are omitted. The 'other' list is
        sorted so the result does not depend on set iteration order.
    """

    # https://www.gutenberg.org/files/
    bucket_labels = [('ascii', "{g_id}.txt"),
                     ('utf-8', "{g_id}-0.txt"),
                     ('8-bit', "{g_id}-8.txt"), # iso-8859-1, windows-1252, MacRoman, ...
                     ('big5',  "{g_id}-5.txt")
                    ]

    buckets = defaultdict(list) # make values list to be consistent
    remaining = set(files)

    for (label, template) in bucket_labels:
        f_name = template.format(g_id=g_id)
        if f_name in remaining:
            buckets[label].append(f_name)
            remaining.remove(f_name)

    # pick up rest, in a deterministic order
    if remaining:
        buckets['other'] = sorted(remaining)

    # A real list on both Python 2 and 3 (items() is only a view on Python 3).
    return list(buckets.items())
    
# https://raw.githubusercontent.com/GITenberg/Little-Dorrit_963/master/963.txt

def gitenberg_url(title, text_file):
    """Return the raw.githubusercontent.com URL of ``text_file`` on the
    ``master`` branch of the ``GITenberg/<title>`` repository."""
    url_template = "https://raw.githubusercontent.com/GITenberg/{title}/master/{text_file}"
    return url_template.format(title=title, text_file=text_file)

In [None]:
# Per row: bucket text_files by encoding label, then convert the
# (label, files) pairs into a dict.
df['files_by_encoding'] = df.apply(lambda r:classify_text_files(r.gitb_id, r.text_files), axis=1).map(dict)

In [None]:
# Peek at the first few classification results.
df.files_by_encoding.head()

In [None]:
# study 'other'
# 'other' holds the list of unclassified files, or None when a row has none
# (dict.get returns None for a missing key).
df['other'] = df.files_by_encoding.map(lambda d: d.get('other'))
df.other.head()

In [None]:
# any other in the form of (\d+)-(\d).txt or (\d+).txt?

def unexpected_other(files):
    """Tell whether any name in ``files`` looks like a standard numbered text file.

    A name is flagged when it has the form ``<digits>.txt`` or
    ``<digits>-<digit>.txt``.

    Args:
        files: iterable of file names, or None.

    Returns:
        True if at least one name is flagged; False otherwise, including when
        ``files`` is None or empty.
    """
    import re

    if files is None:
        return False

    # The dot is escaped so only a literal ".txt" extension matches; an
    # unescaped "." would also accept names such as "12-3xtxt".
    pattern = re.compile(r'^\d+(-\d)?\.txt$')
    return any(pattern.search(f) for f in files)

# hmmm...these files deserve a closer look
# (rows whose 'other' list contains a name flagged by unexpected_other)
df[df.other.map(unexpected_other)][['gitb_id','other','text_files']]

In [None]:
# compute URL for a text file

def gitenberg_raw_url(repo_name, file_name, branch='master'):
    """Return the raw.githubusercontent.com URL of a file in a GITenberg repo.

    Yields ``None`` when ``file_name`` is ``None``; otherwise the URL for
    ``file_name`` on ``branch`` of ``GITenberg/<repo_name>``.
    """
    if file_name is None:
        return None

    raw_template = "https://raw.githubusercontent.com/GITenberg/{repo_name}/{branch}/{file_name}"
    return raw_template.format(repo_name=repo_name, file_name=file_name, branch=branch)


In [None]:
# for a given repository, return URL for "best" file to work with
#preference:  utf-8, ascii, big5, 8-bit (because of ambiguity)
# if unicode available, return it and encoding

def preferred_file(files_d):
    """Pick the first file of the most preferred encoding present in ``files_d``.

    Encodings are tried in the order utf-8, ascii, big5, 8-bit. Returns
    ``(file_name, encoding_label)`` for the first label with a non-empty list,
    or ``(None, None)`` when none of them has one.
    """
    for encoding in ('utf-8', 'ascii', 'big5', '8-bit'):
        candidates = files_d.get(encoding)
        if not candidates:
            continue
        return (candidates[0], encoding)

    return (None, None)

In [None]:
# Each value is a (file name, encoding label) tuple, or (None, None).
df['preferred_file'] = df.files_by_encoding.map(preferred_file)

In [None]:
# Inspect the first rows, including the columns added so far.
df.head()

In [None]:
# preferred_file[0] is the file name; gitenberg_raw_url returns None when it is None.
df['preferred_text_url'] = df.apply(lambda row: gitenberg_raw_url(row.gitb_name, row.preferred_file[0]), axis=1)

In [None]:
# HTTP HEAD on the file
def http_head_status(url):
    """Return the status code of an HTTP HEAD request to ``url``.

    ``None`` is passed through: no request is made and ``None`` is returned.
    """
    import requests

    if url is None:
        return None

    return requests.head(url).status_code

In [None]:
# Bind the result to an explicit name instead of relying on IPython's `_`
# (last output), which changes as soon as any other cell is run in between.
head_status = df[:100].preferred_text_url.map(http_head_status)
head_status

In [None]:
# `_` is IPython's "last cell output"; this is only correct when run right
# after the cell that displayed the HEAD status Series.
head_status = _

In [None]:
import numpy as np  # needed here: no earlier cell imports numpy

np.nan

In [None]:
import numpy as np
# Rows among the first 100 whose HEAD request returned 404, rendered as CSV
# text (to_csv() without a path returns the CSV as a string).
df[:100][head_status == 404][['preferred_text_url']].to_csv()

In [None]:
# decorator that returns a tuple with args and return value

def also_arg(func):
    """
    Wrap a single-argument function so each call returns ``(arg, result)``.

    If ``func(arg)`` raises an ``Exception``, the exception instance is
    returned in the ``result`` slot instead of being propagated, so a failing
    input can still be matched with its error (useful with unordered pool
    maps).
    """
    def wrapper(arg):
        try:
            result = func(arg)
        except Exception as e:  # "as" form works on Python 2.6+ and Python 3
            result = e

        return (arg, result)

    return wrapper

In [None]:
# http://chriskiehl.com/article/parallelism-in-one-line/
from __future__ import print_function
from multiprocessing import Pool as ProcessPool
from multiprocessing.dummy import Pool as ThreadPool 
from functools import partial
from itertools import islice

from math import factorial

# http://stackoverflow.com/questions/2348317/how-to-write-a-pager-for-python-iterators/2350904#2350904        
def grouper(iterable, page_size):
    """Yield consecutive lists of ``page_size`` items taken from ``iterable``.

    The final list may be shorter; nothing is yielded for an empty iterable.
    """
    batch = []
    for element in iterable:
        batch.append(element)
        if len(batch) != page_size:
            continue
        yield batch
        batch = []
    if batch:
        yield batch


#get_key_sizes_for_bucket = partial(get_key_sizes, bucket_name="aws-publicdatasets")

PAGE_SIZE = 10
POOL_SIZE = 8
MAX_SEGMENTS = 5 # exclusive upper bound of the inputs passed to factorial
CHUNK_SIZE = 10

pool = ThreadPool(POOL_SIZE)  # thread-based pool (multiprocessing.dummy)
results = []

try:
    # The range starts at -1 on purpose: factorial(-1) raises ValueError, and
    # also_arg turns that into an (arg, exception) result instead of failing.
    # range (not xrange) keeps this runnable on both Python 2 and 3.
    results_iter = pool.imap_unordered(also_arg(factorial),
                                  range(-1,MAX_SEGMENTS),
                                  CHUNK_SIZE)

    for (i, result) in enumerate(results_iter):
        print ('\r>> Result %d' % i, end="")
        results.append(result)
finally:
    # Release the worker threads even if collecting results fails.
    pool.close()
    pool.join()
            

In [None]:
# Show the collected (argument, result-or-exception) pairs.
results

In [None]:
# First ten preferred text URLs as a plain list.
list(islice(df.preferred_text_url, 10))

In [None]:
# http://chriskiehl.com/article/parallelism-in-one-line/
from __future__ import print_function
from multiprocessing import Pool as ProcessPool
from multiprocessing.dummy import Pool as ThreadPool 
from functools import partial
from itertools import islice

from math import factorial

# http://stackoverflow.com/questions/2348317/how-to-write-a-pager-for-python-iterators/2350904#2350904        
def grouper(iterable, page_size):
    """Yield consecutive lists of ``page_size`` items taken from ``iterable``.

    The final list may be shorter; nothing is yielded for an empty iterable.
    """
    batch = []
    for element in iterable:
        batch.append(element)
        if len(batch) != page_size:
            continue
        yield batch
        batch = []
    if batch:
        yield batch


#get_key_sizes_for_bucket = partial(get_key_sizes, bucket_name="aws-publicdatasets")

PAGE_SIZE = 10
POOL_SIZE = 8
MAX_SEGMENTS = 20 # replace with None for all segments
CHUNK_SIZE = 10

pool = ThreadPool(POOL_SIZE)  # thread-based pool (multiprocessing.dummy)
results = []

try:
    # One HEAD request per preferred URL; also_arg pairs each URL with its
    # status code (or with the exception the request raised).
    results_iter = pool.imap_unordered(also_arg(http_head_status),
                                  islice(df.preferred_text_url, MAX_SEGMENTS),
                                  CHUNK_SIZE)

    for (i, result) in enumerate(results_iter):
        print ('\r>> Result %d' % i, end="")
        results.append(result)
finally:
    # Release the worker threads even if collecting results fails.
    pool.close()
    pool.join()

In [None]:
# Show the collected (url, status-or-exception) pairs.
results

In [None]:
from textblob import TextBlob

In [None]:
# Small TextBlob built from a single sentence, used to try out the API.
wiki = TextBlob("Python is a high-level, general-purpose programming language.")

In [None]:
# Display the noun_phrases attribute of the sample blob.
wiki.noun_phrases

In [None]:
# alternative way to get Gutenberg texts
# calculate the URI
# https://github.com/c-w/Gutenberg/blob/PyPI-0.4/gutenberg/acquire/text.py#L19

from gutenberg import acquire
# NOTE(review): _format_download_uri is an underscore-prefixed (private) helper
# of gutenberg.acquire.text; the link above points at the PyPI-0.4 tag, so
# confirm it still exists in the installed version.
acquire.text._format_download_uri(2701)