In [1]:
import functools
import os
import subprocess
import time

import pandas as pd
import requests

## Explore data

In [2]:
# Excel file that is loaded into `df` below. This is an absolute path on the
# author's machine; change it before running the notebook anywhere else.
path = "/home/ricardo/Descargas/Springer_Ebooks-convertido.xlsx"

In [3]:
df = pd.read_excel(path)

In [4]:
df.head()

Unnamed: 0,S.No.,Book Title,Author,Edition,OpenURL
0,287,Electronic Commerce 2018,"King, Jae Kyu Lee, Ting-Peng Liang,\nDeborrah ...",9th ed.\n2018,http://link.springer.com/openurl?genre=book&is...
1,182,Electronic Commerce,"Lee, Ting-Peng Liang, Deborrah C.\nTurban",8th ed.\n2015,http://link.springer.com/openurl?genre=book&is...
2,51,Particles and Nuclei,"Scholz, Frank Zetsche, Werner\nRodejohann",7th ed.\n2015,http://link.springer.com/openurl?genre=book&is...
3,371,Structural Dynamics,"Mario Paz, Young Hoon Kim",6th ed.\n2019,http://link.springer.com/openurl?genre=book&is...
4,331,Proofs from THE BOOK,"Martin Aigner, Günter M. Ziegler",6th ed.\n2018,http://link.springer.com/openurl?genre=book&is...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 408 entries, 0 to 407
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   S.No.       408 non-null    int64 
 1   Book Title  408 non-null    object
 2   Author      408 non-null    object
 3   Edition     408 non-null    object
 4   OpenURL     408 non-null    object
dtypes: int64(1), object(4)
memory usage: 16.1+ KB


In [6]:
df.loc[0]

S.No.                                                       287
Book Title                             Electronic Commerce 2018
Author        King, Jae Kyu Lee, Ting-Peng Liang,\nDeborrah ...
Edition                                           9th ed.\n2018
OpenURL       http://link.springer.com/openurl?genre=book&is...
Name: 0, dtype: object

In [7]:
df.loc[0, 'OpenURL']

'http://link.springer.com/openurl?genre=book&isbn=978-3-319-58715-8'

If we go to this URL, we obtain the following:

PDF LINK
https://link.springer.com/content/pdf/10.1007%2F978-3-319-58715-8.pdf

EPUB LINK
https://link.springer.com/download/epub/10.1007%2F978-3-319-58715-8.epub

## Create URLs

In [8]:
# NOTE: this literal still contains the single quotes copied from the repr
# shown above, so the position found here is one higher than it would be for
# the bare URL, and any slice taken to the end keeps the trailing quote.
url = "'http://link.springer.com/openurl?genre=book&isbn=978-3-319-58715-8'"
url.find("isbn=")

45

In [9]:
url[45:]

"isbn=978-3-319-58715-8'"

In [10]:
url[45+5:]

"978-3-319-58715-8'"

In [11]:
def isbn_from_url(url):
    """
    Extract the ISBN from a Springer OpenURL link.

    The ISBN is the value of the ``isbn=`` query parameter: everything after
    ``isbn=`` up to the next ``&`` or, if there is none, the end of the string.

    Parameters
    ----------
    url : str
        Link such as
        ``http://link.springer.com/openurl?genre=book&isbn=978-3-319-58715-8``.

    Returns
    -------
    str
        The ISBN exactly as written in the link.

    Raises
    ------
    ValueError
        If the link contains no ``isbn=`` parameter.
    """
    token = "isbn="
    start = url.find(token)
    if start == -1:
        # str.find reports "not found" as -1; adding the token length to it
        # would silently return an unrelated slice of the URL.
        raise ValueError("no 'isbn=' parameter in URL: %r" % (url,))

    # Keep only this parameter's value and drop any parameters that follow.
    value = url[start + len(token):]
    return value.split("&", 1)[0]

In [12]:
isbn_from_url(url)

"978-3-319-58715-8'"

In [13]:
df['ISBN'] = df['OpenURL'].apply(isbn_from_url)

In [14]:
df.loc[0]

S.No.                                                       287
Book Title                             Electronic Commerce 2018
Author        King, Jae Kyu Lee, Ting-Peng Liang,\nDeborrah ...
Edition                                           9th ed.\n2018
OpenURL       http://link.springer.com/openurl?genre=book&is...
ISBN                                          978-3-319-58715-8
Name: 0, dtype: object

In [15]:
def create_pdf_url(isbn):
    """Return the Springer PDF download link for the ISBN string `isbn`."""
    # The link is a fixed prefix (ending in the URL-encoded "10.1007/"),
    # followed by the ISBN and the file extension.
    pieces = (
        "https://link.springer.com/content/pdf/10.1007%2F",
        isbn,
        ".pdf",
    )
    return "".join(pieces)

In [16]:
create_pdf_url('978-3-319-58715-8')

'https://link.springer.com/content/pdf/10.1007%2F978-3-319-58715-8.pdf'

In [17]:
test_pdf_url = "https://link.springer.com/content/pdf/10.1007%2F978-3-319-58715-8.pdf"
assert test_pdf_url == create_pdf_url('978-3-319-58715-8')

In [18]:
def create_epub_url(isbn):
    """Return the Springer EPUB download link for the ISBN string `isbn`."""
    # The link is a fixed prefix (ending in the URL-encoded "10.1007/"),
    # followed by the ISBN and the file extension.
    pieces = (
        "https://link.springer.com/download/epub/10.1007%2F",
        isbn,
        ".epub",
    )
    return "".join(pieces)

In [19]:
create_epub_url('978-3-319-58715-8')

'https://link.springer.com/download/epub/10.1007%2F978-3-319-58715-8.epub'

In [20]:
test_epub_url = "https://link.springer.com/download/epub/10.1007%2F978-3-319-58715-8.epub"
assert test_epub_url == create_epub_url('978-3-319-58715-8')

In [21]:
df['pdf_url'] = df['ISBN'].apply(create_pdf_url)

In [22]:
df['epub_url'] = df['ISBN'].apply(create_epub_url)

In [23]:
df.loc[0]

S.No.                                                       287
Book Title                             Electronic Commerce 2018
Author        King, Jae Kyu Lee, Ting-Peng Liang,\nDeborrah ...
Edition                                           9th ed.\n2018
OpenURL       http://link.springer.com/openurl?genre=book&is...
ISBN                                          978-3-319-58715-8
pdf_url       https://link.springer.com/content/pdf/10.1007%...
epub_url      https://link.springer.com/download/epub/10.100...
Name: 0, dtype: object

## Download ebooks

In [24]:
# Root folder for the downloads, with one sub-folder per file format.
output_directory = "/home/ricardo/Escritorio/springer_ebooks"
output_directory_pdf, output_directory_epub = (
    os.path.join(output_directory, subfolder) for subfolder in ("pdf", "epub")
)

In [25]:
# Create the output folders. exist_ok=True makes the cell safe to re-run and
# avoids the race between checking for the folder and creating it; if a path
# is occupied by a regular file, makedirs raises instead of skipping it.
for directory in [output_directory, output_directory_pdf, output_directory_epub]:
    os.makedirs(directory, exist_ok=True)

In [39]:
def make_output_path_pdf(row):
    """
    Build the local path where the PDF of one book is saved.

    The file name is ``<S.No.>_<Book Title>.pdf``. Spaces are replaced by
    underscores, and so are path separators, line breaks and tabs, so that a
    title can neither point into another directory nor put control characters
    into the file name.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'S.No.'`` and ``'Book Title'``.

    Returns
    -------
    str
        Path of the file inside ``output_directory_pdf``.
    """
    filename = str(row['S.No.']) + '_' + str(row['Book Title']) + ".pdf"
    # Each unsafe character maps to one underscore, so titles made only of
    # letters, digits and single spaces produce the same name as before.
    for unsafe in ("/", "\\", "\r", "\n", "\t", " "):
        filename = filename.replace(unsafe, "_")
    return os.path.join(output_directory_pdf, filename)

In [40]:
def make_output_path_epub(row):
    """
    Build the local path where the EPUB of one book is saved.

    The file name is ``<S.No.>_<Book Title>.epub``. Spaces are replaced by
    underscores, and so are path separators, line breaks and tabs, so that a
    title can neither point into another directory nor put control characters
    into the file name.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'S.No.'`` and ``'Book Title'``.

    Returns
    -------
    str
        Path of the file inside ``output_directory_epub``.
    """
    filename = str(row['S.No.']) + '_' + str(row['Book Title']) + ".epub"
    # Each unsafe character maps to one underscore, so titles made only of
    # letters, digits and single spaces produce the same name as before.
    for unsafe in ("/", "\\", "\r", "\n", "\t", " "):
        filename = filename.replace(unsafe, "_")
    return os.path.join(output_directory_epub, filename)

In [41]:
df['output_path_pdf'] = df.apply(make_output_path_pdf, axis=1)
df['output_path_epub'] = df.apply(make_output_path_epub, axis=1)

In [51]:
# Avoid overloading the server (or getting banned)

def wait_20(func):
    """
    Add 20 seconds waiting time after function call

    Decorator: the returned wrapper calls `func` with the same arguments,
    sleeps for 20 seconds and then hands back whatever `func` returned.
    If `func` raises, the exception propagates and no sleep takes place.
    """

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        time.sleep(20)
        return result

    return wrapper

In [52]:
@wait_20
def download(row, format='pdf'):
    """
    Download one e-book with ``wget`` and print a progress mark.

    Parameters
    ----------
    row : mapping
        Must provide ``'output_path_pdf'`` and ``'pdf_url'`` (used when
        `format` is ``'pdf'``) or ``'output_path_epub'`` and ``'epub_url'``
        (used for any other value of `format`).
    format : str, default 'pdf'
        ``'pdf'`` selects the PDF; every other value selects the EPUB.

    Prints ``.`` when wget exits with status 0 and ``x`` otherwise, so failed
    downloads are visible in the progress output.

    Raises
    ------
    FileNotFoundError
        If the ``wget`` executable cannot be found.
    """

    if format=='pdf':
        target, source = row['output_path_pdf'], row['pdf_url']
    else:
        target, source = row['output_path_epub'], row['epub_url']

    # Pass the arguments as a list so no shell is involved: quotes, spaces,
    # parentheses, '&' or ';' in a book title can no longer break the command
    # or be interpreted by the shell.
    completed = subprocess.run(["wget", "-O", target, source])
    print("." if completed.returncode == 0 else "x", end="")

**PDF**

In [None]:
# Download the PDF of every book in the table, reporting progress every 10 books.
total = len(df)
for done, (_, ebook) in enumerate(df.iterrows(), start=1):
    download(ebook, 'pdf')

    # Count iterations instead of using the index labels, so the progress
    # message stays correct even if the frame's index is not 0..n-1.
    if done % 10 == 0:
        print("\n%d of %d completed\n" % (done, total))

print("Done!!")

.