In [1]:
import re
import time
import os
import shutil
import requests
import hashlib
import json

class Hasher:
    """Registry of MD5 content digests used to spot duplicate files.

    Instance attributes:
        hashes: dict mapping hex digest -> path of the first file registered
            with that content.
        invertedHashes: dict mapping path -> hex digest (reverse of ``hashes``).
    """

    def __init__(self):
        # Built per instance. As class-level dict literals they were shared by
        # every Hasher, so one instance's registrations leaked into all others.
        self.hashes = {}
        self.invertedHashes = {}

    def updateFileHash(self, file_path):
        """Hash ``file_path`` and register it unless its content is already known.

        Args:
            file_path: path of the file to read (opened in binary mode).

        Returns:
            True if a file with the same digest is already registered (nothing
            is recorded in that case), False if the file was newly registered.
        """
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            # Read in 4 KiB chunks so the whole file is never held in memory.
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        file_hash = hash_md5.hexdigest()
        if file_hash in self.hashes:
            return True
        self.hashes[file_hash] = file_path
        self.invertedHashes[file_path] = file_hash
        return False

    def updateDirHashes(self, dir_path):
        """Register every regular file directly inside ``dir_path``.

        Sub-directories are not descended into; duplicate results are ignored.
        """
        for file in os.listdir(dir_path):
            file_path = os.path.join(dir_path, file)
            if os.path.isfile(file_path):
                self.updateFileHash(file_path)

    def loadJson(self, file_path):
        """Replace the registry with the digest -> path mapping stored as JSON.

        Raises whatever ``open``/``json.load`` raise (e.g. a missing file).
        """
        # Context manager so the handle is closed deterministically.
        with open(file_path, 'rt', encoding='utf-8') as f:
            self.hashes = json.load(f)
        self.invertedHashes = {v: k for k, v in self.hashes.items()}

    def saveJson(self, file_path):
        """Write the digest -> path mapping to ``file_path`` as JSON."""
        # Closing the file here guarantees the data is flushed to disk before
        # returning; the original left that to garbage collection.
        with open(file_path, 'wt', encoding='utf-8') as f:
            json.dump(self.hashes, f)

class Scrapper:
    """Download images linked from paginated HTML feed pages.

    All methods are static and share one class-level ``hasher`` whose registry
    is persisted in ``hashes.txt`` so already-seen content is not kept twice.
    """

    _HASH_STORE = 'hashes.txt'

    hasher = Hasher()
    # On a first run the store does not exist yet; start with an empty registry
    # instead of failing while the class is being defined.
    if os.path.isfile(_HASH_STORE):
        hasher.loadJson(_HASH_STORE)

    # Raw strings: `\?` and `\d` are invalid escapes in ordinary string literals.
    _MORE_LINK_RE = re.compile(r'href="/friends\?since=(.+?)" name="more"')
    _DAY_RE = re.compile(r'<span class="d">(.+?)</span>')
    _LIGHTBOX_RE = re.compile(r'a class="lightbox".+?href="(.+?)".+?img.+?src=".+?"')
    _ASSET_RE = re.compile(r'img.+?alt="\d{4} .+?".+?src="(.+?asset.+?)"')

    @staticmethod
    def getPage(url, since):
        """Fetch the page text, retrying until a non-empty body is received.

        Args:
            url: base page URL.
            since: pagination cursor (str) appended as ``?since=<since>``, or
                None to fetch the first page.

        Returns:
            The response body as text (never empty).

        Only timeouts are retried; other request errors propagate.
        """
        target = url if since is None else url + '?since=' + since
        while True:
            try:
                page = requests.get(target, timeout=15.0).text
            except requests.exceptions.Timeout:
                print("timeout on: " + url)
                page = ""
            if page != "":
                return page
            # Pause before every retry, including after an empty body, so the
            # server is not hit in a tight loop.
            time.sleep(1.0)

    @staticmethod
    def getImage(url):
        """Request ``url`` once with a 5 second timeout.

        Returns:
            The ``requests`` response, or None if the request timed out or
            failed with any other request-level error.
        """
        try:
            return requests.get(url, timeout=5.0)
        except requests.exceptions.Timeout:
            print("timeout on: " + url)
        except requests.exceptions.RequestException as error:
            # One unreachable image should not abort the whole scrape.
            print("request failed on: " + url + " (" + str(error) + ")")
        return None

    @staticmethod
    def _fetchAndStore(image_url, path):
        """Download one image into ``path`` unless it is already present.

        Returns:
            None  - skipped: the target file exists or its path is registered.
            True  - downloaded with HTTP 200 and written (then deleted again if
                    its content duplicates a registered file).
            False - a download was attempted but did not yield an HTTP 200.
        """
        name = image_url.split("/")[-1]
        # Plain '/' concatenation is kept on purpose: these strings are the keys
        # persisted in the hash store, so their format must not change.
        file_path = path + '/' + name
        if os.path.isfile(file_path):
            return None
        if file_path in Scrapper.hasher.invertedHashes:
            return None

        image = Scrapper.getImage(image_url)
        time.sleep(1)  # delay between consecutive image requests
        if image is None or image.status_code != 200:
            return False
        with open(file_path, 'wb') as f:
            f.write(image.content)
        if Scrapper.hasher.updateFileHash(file_path):
            # Same content already stored under another name: drop the copy.
            os.remove(file_path)
        return True

    @staticmethod
    def downloadImages(url, path, pages, since=None, until=None):
        """Walk the feed page by page and download the images it references.

        Args:
            url: feed URL.
            path: directory the images are written to (created if missing).
            pages: page budget; it is decremented per page and halved whenever
                a page triggers no download attempt.
            since: pagination cursor to start from (None = first page).
            until: stop after the page containing a ``<span class="d">`` whose
                text equals this value (None = no such stop condition).
        """
        if path:
            os.makedirs(path, exist_ok=True)

        # Iterative pagination (the original recursed once per page).
        while True:
            print('downloading url={0} since={1} page={2}, until={3}'.format(url, since, pages, until))
            page = Scrapper.getPage(url, since)
            next_since = ""
            download_attempts = 0
            reached_until = False

            for line in page.split('\n'):
                more = Scrapper._MORE_LINK_RE.search(line)
                if more:
                    next_since = more.group(1)

                if until is not None:
                    day = Scrapper._DAY_RE.search(line)
                    if day and day.group(1) == until:
                        reached_until = True

                # Try the lightbox link first; the inline asset image is only
                # tried when there is no lightbox link or its download failed.
                for pattern in (Scrapper._LIGHTBOX_RE, Scrapper._ASSET_RE):
                    found = pattern.search(line)
                    if not found:
                        continue
                    outcome = Scrapper._fetchAndStore(found.group(1), path)
                    if outcome is not None:
                        download_attempts += 1
                    if outcome is not False:
                        break

            Scrapper.hasher.saveJson(Scrapper._HASH_STORE)
            if reached_until:
                return
            if download_attempts == 0:
                # Nothing new on this page: burn through the budget faster.
                pages = pages // 2
            if not next_since or pages <= 1:
                return
            pages -= 1
            since = next_since

In [2]:
up_to = '31'

# (feed URL, page budget) pairs, processed in this order.
feeds = [
    ("http://vogel.soup.io/friends", 100),
    ("http://bercik.soup.io/friends", 100),
    ("http://inzynier.soup.io/friends", 100),
    ("http://siostra.soup.io/friends", 100),
    ("http://tfu.soup.io", 1),
    ("http://niedobrze.soup.io", 1),
    ("http://ecce.soup.io/friends", 100),
    ("http://sucznik.soup.io/friends", 100),
    ("http://bumszakalaka.soup.io/friends", 100),
    ("http://saski.soup.io/friends", 100),
]

# Every feed is saved into the "in" directory, starting from its first page
# and stopping at the `up_to` marker.
for feed_url, page_budget in feeds:
    Scrapper.downloadImages(feed_url, "in", page_budget, None, up_to)

downloading url=http://vogel.soup.io/friends since=None page=100, until=31
timeout on: http://asset-5.soupcdn.com/asset/8852/8510_50b5_640.jpeg
downloading url=http://vogel.soup.io/friends since=642883113 page=99, until=31
timeout on: http://asset-d.soupcdn.com/asset/13949/0801_d8e5_960.jpeg
downloading url=http://vogel.soup.io/friends since=642828061 page=98, until=31
downloading url=http://vogel.soup.io/friends since=642442858 page=48, until=31
downloading url=http://vogel.soup.io/friends since=642440307 page=23, until=31
downloading url=http://vogel.soup.io/friends since=642436090 page=10, until=31
downloading url=http://vogel.soup.io/friends since=642331390 page=4, until=31
downloading url=http://vogel.soup.io/friends since=642322581 page=1, until=31
downloading url=http://bercik.soup.io/friends since=None page=100, until=31
downloading url=http://bercik.soup.io/friends since=642909630 page=99, until=31
downloading url=http://bercik.soup.io/friends since=642905848 page=98, until=31