<a href="https://colab.research.google.com/github/sattwika/DownloaderScripts-Tools/blob/master/RecursiveGoIndexDownloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Recursive GoIndex Downloader

**Features**
*   Recursive crawler (**atlonxp**)
*   Download all folders and files in a given url (**atlonxp**)
*   Download all folders and files in sub-folders (**atlonxp**)
*   Adaptive delay in fetching url (**atlonxp**)
*   Store folders/files directly to your Google Drive (**pankaj260**)
*   Folders and files exclusion filters (**atlonxp**)
*   Download queue supported (**atlonxp**)
*   Auto-domain URL detection (**atlonxp**)
*   API-based GoIndex crawler (**atlonxp**, **ifvv**)
*   Parallel/Multiple files downloader (**atlonxp**)
*   Auto-skip password-protected folders (**cxu-fork**)



In [None]:
# Mount Google Drive at /content/drive. Skip this cell if you do not want to
# save the downloaded files to your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install dependencies
!pip install requests tqdm

In [None]:
# Import dependencies

import json
from json import JSONDecodeError

import multiprocessing
import os
from pathlib import Path
from random import randint
from time import sleep
from urllib import parse

import requests
import tqdm

In [None]:
# NOTE(review): SHOW_DOWNLOAD_PROGRESS is not referenced by any code visible
# in this notebook; confirm whether it is still needed.
SHOW_DOWNLOAD_PROGRESS = False
# Default for download_agent's OVERWRITE parameter: when False, an existing
# file that is already at least the expected size is skipped.
OVERWRITE = False

# Inclusive bounds (seconds) of the random pause crawler_v2 takes before
# re-requesting a listing.
MIN_DELAY = 3
MAX_DELAY = 5
# Maximum number of POST attempts crawler_v2 makes per folder listing.
MAX_RETRY_CRAWLING = 5

def check_exclusion(name, exclusions):
    """Tell whether ``name`` should be ignored.

    Args:
        name: File or folder name to test.
        exclusions: Iterable of substrings; a hit on any of them excludes
            the name.

    Returns:
        True if at least one exclusion occurs inside ``name``, else False.
    """
    return any(pattern in name for pattern in exclusions)


def find(key, dictionary):
    """Yield every value stored under ``key`` in a nested dict/list structure.

    The structure is walked depth-first in insertion order. When a dict entry
    matches ``key`` its value is yielded as-is and not searched any further.

    Args:
        key: The dictionary key to look for.
        dictionary: A dict (typically decoded JSON). Lists are walked too,
            and any other value is ignored, so arbitrary JSON can be passed
            safely.

    Yields:
        Each value found under ``key``.
    """
    if isinstance(dictionary, dict):
        for k, v in dictionary.items():
            if k == key:
                yield v
            elif isinstance(v, (dict, list)):
                yield from find(key, v)
    elif isinstance(dictionary, list):
        # Lists may mix dicts with scalars, None or nested lists; only
        # containers can hold the key, everything else is skipped by the
        # isinstance checks above instead of raising AttributeError.
        for item in dictionary:
            yield from find(key, item)


def crawler_v2(url, downloading_dict, path, level, exclusions, verbose=False, password=None):
    """Recursively list a GoIndex folder and collect its downloadable files.

    The folder listing is requested with an HTTP POST (retried up to
    MAX_RETRY_CRAWLING times until the body mentions ``files``). If the body
    is not valid JSON, the listing is requested once more through the
    ``curl`` command-line tool.

    Args:
        url: URL of the folder to list.
        downloading_dict: List that collected file entries are appended to
            (despite the name, it is a list of dicts).
        path: Local directory that mirrors ``url``.
        level: Recursion depth; only passed on (incremented) to sub-folders.
        exclusions: Substrings; entries whose name contains one are skipped.
        verbose: Print every file name that is queued.
        password: Optional folder password sent with the listing request.
            When omitted, the ``password`` entry of a module-level ``task``
            dict is used if one exists, which is how the calling cell
            supplied it before this parameter existed.

    Returns:
        ``downloading_dict`` with one dict per discovered file (keys
        ``folder``, ``filename``, ``filename_abs``, ``size``, ``url``). If
        the listing cannot be fetched, the entries collected so far are
        returned unchanged.
    """
    # Local import: only the curl fallback needs it.
    import subprocess

    # let slow down a bit
    # sleep(randint(MIN_DELAY, MAX_DELAY))

    url = parse.urlparse(url)
    target = url.geturl()
    print(target)

    if password is None:
        # Backward compatibility with callers that rely on the loop variable
        # `task` of the crawling cell being visible as a global.
        current_task = globals().get('task')
        if isinstance(current_task, dict):
            password = current_task.get('password')
    data = {'password': password} if password is not None else {}

    try:
        response_text = ''
        retry = 0
        while 'files' not in response_text:
            retry += 1
            if retry > MAX_RETRY_CRAWLING:
                break
            if retry > 1:
                print('retry #{}'.format(retry), target)
                sleep(randint(MIN_DELAY, MAX_DELAY))
            response = requests.post(target, data=data)
            response_text = response.text
        response_json = json.loads(response_text)
    except JSONDecodeError:
        sleep(randint(MIN_DELAY, MAX_DELAY))
        print('- Data is missing! change a plan -')
        print('- > use terminal CURL            -')
        try:
            # Argument list instead of a shell string: the URL is never
            # interpreted by a shell.
            completed = subprocess.run(
                ['curl', '--globoff', target, '-d', ''],
                stdout=subprocess.PIPE,
                universal_newlines=True,
            )
            response_json = json.loads(completed.stdout)
        except Exception as e:
            print('Nah, something went wrong!')
            print(e.args)
            # Keep what has been collected so far; returning a fresh list
            # here would make the recursive caller drop its results.
            return downloading_dict
    except Exception as e:
        print('Nah, something went wrong!')
        print(e.args)
        return downloading_dict

    try:
        files_dict = list(find('files', response_json))[0] or []
    except (IndexError, AttributeError, TypeError):
        print('Cannot fine value for the key of "files", skip this link')
        files_dict = []

    # Both sub-folder and file URLs are built from a base that is guaranteed
    # to end with exactly one slash.
    base_url = target if target.endswith('/') else target + '/'

    for file in files_dict:
        name = file['name']

        # if @name contains exclusion word, we ignore
        if check_exclusion(name, exclusions):
            continue

        if 'folder' in file['mimeType']:
            next_url = base_url + parse.quote(name) + "/"
            next_path = os.path.join(path, name)
            downloading_dict = crawler_v2(
                next_url, downloading_dict, next_path, level + 1,
                exclusions, verbose, password=password,
            )
        else:
            file_url = base_url + parse.quote(name)
            if verbose:
                print('  ' + name)
            try:
                downloading_dict.append({
                    'folder': path,
                    'filename': name,
                    'filename_abs': os.path.join(path, name),
                    'size': file['size'],
                    'url': file_url,
                })
            except (KeyError, TypeError):
                # Entry without a usable 'size' field.
                print('skipping', file_url)
                continue

    # print(json.dumps(downloading_dict, indent=2), end='\n\n')
    return downloading_dict


def _download_with_requests(url, filename_abs, size):
    """Stream ``url`` into ``filename_abs`` with requests.

    Returns 0 when the response was HTTP 200 and the written file is at least
    ``size`` bytes, otherwise 1.
    """
    with requests.get(url, stream=True) as r:
        if r.status_code != 200:
            # error --> 404, etc.
            return 1
        # 'wb', not append: the body is always the complete file, so any
        # partial or stale content already on disk must be replaced.
        with open(filename_abs, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    # error --> user rate limit, etc. shows up as a short file
    return 0 if os.path.getsize(filename_abs) >= size else 1


def download_agent(task, OVERWRITE=OVERWRITE, METHOD='curl', SILENT=True):
    """Download one crawled file.

    Args:
        task: Dict with the keys ``folder``, ``filename_abs``, ``url`` and
            ``size`` (as produced by crawler_v2).
        OVERWRITE: Download even if a large-enough file already exists.
        METHOD: ``'curl'`` to use the curl command-line tool first and fall
            back to requests; any other value uses requests only.
        SILENT: Pass ``-s`` to curl.

    Returns:
        ``{'task': task, 'status': code}`` where code is 0 (downloaded),
        -1 (skipped, already present) or 1 (failed).
    """
    # Local import: only the curl path needs it.
    import subprocess

    folder = task['folder']
    filename_abs = task['filename_abs']
    url = task['url']
    size = int(task['size'])

    result = {
        'task': task
    }

    Path(folder).mkdir(parents=True, exist_ok=True)
    try:
        already_complete = (
            not OVERWRITE
            and os.path.exists(filename_abs)
            and os.path.getsize(filename_abs) >= size
        )
        if already_complete:
            result.update({'status': -1})
            return result

        sleep(randint(3, 10))
        if METHOD == 'curl':
            # Argument list instead of a shell string, so quotes or other
            # shell metacharacters in the URL or path cannot break the call.
            command = ['curl', '--globoff', url, '--output', filename_abs]
            if SILENT:
                command.append('-s')
            subprocess.run(command, stdout=subprocess.DEVNULL)
            # check filesize again
            if os.path.exists(filename_abs) and os.path.getsize(filename_abs) >= size:
                status = 0
            else:
                # error --> 404, user rate limit, etc. --> try once more
                # with requests before reporting a failure
                status = _download_with_requests(url, filename_abs, size)
        else:
            status = _download_with_requests(url, filename_abs, size)
        result.update({'status': status})

    except Exception as e:
        print('[Exception]', e.args, url)
        result.update({'status': 1})
    return result


def get_filesize(size, power=3):
    """Scale a byte count down by ``1024 ** power``.

    With the default ``power`` of 3 the result is expressed in units of
    1024**3 bytes.
    """
    divisor = 1024 ** power
    return size / divisor

In [None]:
# Entries whose name contains any of these substrings are not crawled.
exclusions = ['__MACOSX/']

# Root directory the downloads are stored under.
destination = "/content/drive/My Drive/"
# Download queue: one entry per remote folder, with the local sub-folder
# name ('folder') and the GoIndex URL to crawl ('url').
download_tasks = [
    {
        'folder': 'FrontEndMasters - Complete Intro to Containers',
        'url': 'https://tutnetflix.mlwdl.workers.dev/FrontEndMasters%20-%20Complete%20Intro%20to%20Containers/'
    },
]

print('##################################')
print('# Crawling all downloadable urls #')
print('##################################', end='\n\n')

tasks = []
# NOTE: the loop variable has to be called `task`; crawler_v2 looks up a
# global of that name when building its request.
for task in download_tasks:
    tasks.extend(
        crawler_v2(
            task['url'],
            [],
            os.path.join(destination, task['folder']),
            0,
            exclusions,
            verbose=False,
        )
    )

# Sum of all file sizes, scaled by get_filesize's default power.
total_size = get_filesize(sum(int(entry['size']) for entry in tasks))

print('\nTotal Task:', len(tasks))
print('Total size: %.3fGB' % total_size, end='\n\n')



In [None]:
# Number of worker processes downloading in parallel.
MAX_DOWNLOAD_TASKS = 32


def _run_download_pass(pool, pending, downloaded_size):
    """Download every task in ``pending`` through ``pool`` with a progress bar.

    Returns a tuple ``(succeeded, skipped, failed, errored, downloaded_size)``
    where the first four are lists of tasks and ``downloaded_size`` is the
    running byte total including this pass. ``pending`` is never modified.
    """
    succeeded, skipped, failed, errored = [], [], [], []
    with tqdm.tqdm(total=len(pending)) as pbar:
        for result in pool.imap_unordered(download_agent, pending):
            progress = (get_filesize(downloaded_size), total_size)
            if result is None:
                pbar.set_description('[%.3f/%.3f GB] ERROR' % progress)
                pbar.update()
                continue

            status = result.get('status')
            task = result.get('task')
            name = task.get('filename')
            if status == 0:
                downloaded_size += int(task['size'])
                progress = (get_filesize(downloaded_size), total_size)
                pbar.set_description('[%.3f/%.3f GB] Downloading %s' % (progress + (name,)))
                succeeded.append(task)
            elif status == 1:
                pbar.set_description('[%.3f/%.3f GB] Fail downloading %s' % (progress + (name,)))
                failed.append(task)
            elif status == -1:
                # Already on disk: counts towards the total but not as a download.
                downloaded_size += int(task['size'])
                progress = (get_filesize(downloaded_size), total_size)
                pbar.set_description('[%.3f/%.3f GB] Skipping %s' % (progress + (name,)))
                skipped.append(task)
            else:
                # Unknown status code: report it in the summary.
                errored.append(task)
            pbar.update()
    return succeeded, skipped, failed, errored, downloaded_size


print('######################################')
print('# Downloading {} files and folders  #'.format(len(tasks)))
print('######################################', end='\n\n')

downloads = []
skips = []
failures = []
errors = []
downloaded_size = 0

# The context manager terminates the worker processes on exit, including
# when the cell is interrupted.
with multiprocessing.Pool(processes=MAX_DOWNLOAD_TASKS) as pool:
    downloads, skips, failures, errors, downloaded_size = _run_download_pass(
        pool, tasks, downloaded_size)

    # print(json.dumps(failures, indent=2))
    if failures:
        # Give rate limits a chance to reset before retrying; there is no
        # reason to wait when nothing failed.
        print('Waiting 1 minute')
        sleep(60)

        print('\n\n##################################')
        print('# Retry all {} failures          #'.format(len(failures)))
        print('##################################')
        # The pass builds fresh result lists, so `failures` is not mutated
        # while the pool is still iterating over it.
        retried, retry_skips, failures, retry_errors, downloaded_size = _run_download_pass(
            pool, failures, downloaded_size)
        downloads += retried
        skips += retry_skips
        errors += retry_errors

print('\n\n##################################')
print('# Summary                        #')
print('##################################')
print('Tasks     :', len(tasks))
print('-----------------')
print('Downloads :', len(downloads))
print('Skip      :', len(skips))
print('Failures  :', len(failures))
print('Errors    :', len(errors))

if failures:
    print('\nWait for 1 minute and Re-run this section again to download all fail tasks')
    for f in failures:
        print(f['url'])

if errors:
    print('\nError tasks')
    for f in errors:
        print(f['url'])

print('\nAll done, Voila!')