In [1]:
import re
import time
import os
import shutil
import requests
import hashlib
import json

class Hasher:
    """Index of file-content MD5 digests used to detect duplicate files.

    Two dictionaries are kept in sync:
      * ``hashes``          maps hex MD5 digest -> path of the first file seen with it
      * ``invertedHashes``  maps file path      -> hex MD5 digest
    """

    def __init__(self):
        # Per-instance dictionaries. Class-level dict attributes would be shared
        # (and mutated) by every Hasher instance.
        self.hashes = {}
        self.invertedHashes = {}

    def updateFileHash(self, file_path):
        """Hash ``file_path`` and record it unless its content is already known.

        Args:
            file_path: path of the file to hash; read in binary mode.

        Returns:
            True if a *different* path with identical content is already
            recorded (nothing is stored in that case); False if the file was
            newly recorded or is the very path already recorded for its digest.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            # Read in fixed-size chunks so large files are not loaded at once.
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        file_hash = hash_md5.hexdigest()

        known_path = self.hashes.get(file_hash)
        if known_path is not None:
            # Re-hashing the recorded file itself is not a duplicate; reporting
            # it as one would invite the caller to delete the only copy.
            return known_path != file_path

        self.hashes[file_hash] = file_path
        self.invertedHashes[file_path] = file_hash
        return False

    def updateDirHashes(self, dir_path):
        """Hash every regular file directly inside ``dir_path`` (not recursive).

        Entries are visited in sorted name order so that, among files with
        identical content, the one that gets recorded is deterministic.
        """
        for file in sorted(os.listdir(dir_path)):
            file_path = os.path.join(dir_path, file)
            if os.path.isfile(file_path):
                self.updateFileHash(file_path)

    def loadJson(self, file_path):
        """Replace the index with the digest -> path mapping stored as JSON in ``file_path``.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if the file does not contain valid JSON.
        """
        with open(file_path, 'rt') as f:
            self.hashes = json.load(f)
        # Rebuild the reverse (path -> digest) lookup from the loaded mapping.
        self.invertedHashes = {v: k for k, v in self.hashes.items()}

    def saveJson(self, file_path):
        """Write the digest -> path mapping to ``file_path`` as JSON."""
        # The context manager guarantees the data is flushed and the handle closed.
        with open(file_path, 'wt') as f:
            json.dump(self.hashes, f)

class Scrapper:
    """Downloads images referenced by listing pages, skipping known duplicates.

    All operations are static and share the class-level ``hasher``, whose
    index is loaded from and saved to ``HASHES_FILE``.
    """

    # JSON file holding the digest index, relative to the working directory.
    HASHES_FILE = 'hashes.txt'

    # Patterns are raw strings: '\?' and '\d' in a normal string literal are
    # invalid escape sequences. Each is matched against a single line of HTML.
    _NEXT_PAGE_RE = re.compile(r'href="/friends\?since=(.+?)" name="more"')
    _DATE_RE = re.compile(r'<span class="d">(.+?)</span>')
    _LIGHTBOX_RE = re.compile(r'a class="lightbox".+?href="(.+?)".+?img.+?src=".+?"')
    _ASSET_RE = re.compile(r'img.+?alt="\d{4} .+?".+?src="(.+?asset.+?)"')

    hasher = Hasher()
    # A fresh working directory has no index yet; start empty instead of
    # failing while the class is being defined.
    if os.path.isfile(HASHES_FILE):
        hasher.loadJson(HASHES_FILE)

    @staticmethod
    def getPage(url, since, max_retries=None):
        """Fetch a listing page as text, retrying until a non-empty body arrives.

        Args:
            url: page URL.
            since: value appended as ``?since=<value>``, or None for no query.
            max_retries: maximum number of attempts; None (the default) retries
                forever.

        Returns:
            The response body, or "" if ``max_retries`` attempts all timed out
            or returned an empty body.
        """
        target = url if since is None else url + '?since=' + since
        attempts = 0
        while True:
            try:
                page = requests.get(target, timeout=15.0).text
            except requests.exceptions.Timeout:
                print("timeout on: " + url)
                page = ""
            if page != "":
                return page
            attempts += 1
            if max_retries is not None and attempts >= max_retries:
                return ""
            # Pause before every retry (timeout or empty body) so the server
            # is not hit in a tight loop.
            time.sleep(1.0)

    @staticmethod
    def getImage(url):
        """Fetch ``url`` once with a 5 second timeout.

        Returns:
            The ``requests`` response, or None if the request timed out or
            failed with another ``requests`` error. One unreachable image must
            not abort the whole crawl.
        """
        try:
            return requests.get(url, timeout=5.0)
        except requests.exceptions.Timeout:
            print("timeout on: " + url)
        except requests.exceptions.RequestException as err:
            print("request failed on: {0} ({1})".format(url, err))
        return None

    @staticmethod
    def _fetchImage(image_url, path):
        """Download one image into ``path`` unless it is already known.

        Returns:
            None  - skipped: the target file exists or its path is in the index.
            False - a download was attempted but did not yield an HTTP 200 response.
            True  - the image was written (and removed again if its content
                    duplicates an already indexed file).
        """
        name = image_url.split("/")[-1]
        # Keep the literal '/' join: this exact string is the key stored in the
        # hasher index, so changing its form would defeat the lookup below.
        file_path = path + '/' + name
        if os.path.isfile(file_path) or file_path in Scrapper.hasher.invertedHashes:
            return None

        image = Scrapper.getImage(image_url)
        time.sleep(1)  # throttle between image requests
        if image is None or image.status_code != 200:
            return False

        with open(file_path, 'wb') as f:
            f.write(image.content)
        if Scrapper.hasher.updateFileHash(file_path):
            # Same content is already stored under another name.
            os.remove(file_path)
        return True

    @staticmethod
    def downloadImages(url, path, pages, since=None, until=None):
        """Crawl listing pages starting at ``url`` and store their images in ``path``.

        Args:
            url: listing URL. The "next page" link is only recognised in the
                form ``href="/friends?since=..." name="more"``.
            path: destination directory; created if missing.
            pages: page budget. It drops by one per followed page and is halved
                first whenever a page triggered no download attempt.
            since: pagination token for the first request, or None.
            until: if given, crawling stops after the first page containing a
                ``<span class="d">`` whose text equals this value (presumably a
                day-of-month marker; verify against the site markup). That page
                is still processed in full.

        The hash index is saved to ``HASHES_FILE`` after every page.
        """
        if path:
            os.makedirs(path, exist_ok=True)

        # Iterative pagination: one loop pass per page instead of one recursive
        # call per page.
        while True:
            print('downloading url={0} since={1} page={2}, until={3}'.format(url, since, pages, until))
            page = Scrapper.getPage(url, since)
            next_since = ""
            attempts = 0
            was_until = False

            for line in page.split('\n'):
                more = Scrapper._NEXT_PAGE_RE.search(line)
                if more is not None:
                    next_since = more.group(1)

                if until is not None:
                    stamp = Scrapper._DATE_RE.search(line)
                    if stamp is not None and stamp.group(1) == until:
                        was_until = True

                lightbox = Scrapper._LIGHTBOX_RE.search(line)
                if lightbox is not None:
                    outcome = Scrapper._fetchImage(lightbox.group(1), path)
                    if outcome is None:
                        continue
                    attempts += 1
                    if outcome:
                        continue
                    # Failed download: fall through and try the inline asset
                    # on the same line.

                asset = Scrapper._ASSET_RE.search(line)
                if asset is not None:
                    if Scrapper._fetchImage(asset.group(1), path) is not None:
                        attempts += 1

            Scrapper.hasher.saveJson(Scrapper.HASHES_FILE)
            if was_until:
                return
            if attempts == 0:
                # Nothing new on this page: shrink the remaining budget faster.
                pages = pages // 2
            if len(next_since) == 0 or pages <= 1:
                return
            pages -= 1
            since = next_since

In [2]:
# Stop marker passed as `until` to every crawl; later cells reuse this name.
up_to = '15'

# (listing URL, page budget) pairs, crawled in this order into the "in" directory.
# One data table and a loop replace ten copy-pasted calls.
crawl_targets = [
    ("http://vogel.soup.io/friends", 100),
    ("http://bercik.soup.io/friends", 100),
    ("http://inzynier.soup.io/friends", 100),
    ("http://siostra.soup.io/friends", 100),
    ("http://tfu.soup.io", 1),
    ("http://niedobrze.soup.io", 1),
    ("http://ecce.soup.io/friends", 100),
    ("http://sucznik.soup.io/friends", 100),
    ("http://bumszakalaka.soup.io/friends", 100),
    ("http://saski.soup.io/friends", 100),
]
for listing_url, page_budget in crawl_targets:
    Scrapper.downloadImages(listing_url, "in", page_budget, None, up_to)

downloading url=http://vogel.soup.io/friends since=None page=100, until=15
downloading url=http://vogel.soup.io/friends since=658408823 page=99, until=15
downloading url=http://vogel.soup.io/friends since=658309338 page=98, until=15
downloading url=http://vogel.soup.io/friends since=658288300 page=48, until=15
downloading url=http://vogel.soup.io/friends since=658281863 page=23, until=15
downloading url=http://vogel.soup.io/friends since=658280917 page=22, until=15
downloading url=http://vogel.soup.io/friends since=658279957 page=10, until=15
downloading url=http://vogel.soup.io/friends since=658279439 page=4, until=15
downloading url=http://vogel.soup.io/friends since=658279193 page=1, until=15
downloading url=http://bercik.soup.io/friends since=None page=100, until=15
timeout on: http://bercik.soup.io/friends
timeout on: http://asset-2.soupcdn.com/asset/14395/9645_2b64_700.jpeg
downloading url=http://bercik.soup.io/friends since=658487824 page=99, until=15
downloading url=http://berc

downloading url=http://sucznik.soup.io/friends since=658245620 page=3, until=15
downloading url=http://sucznik.soup.io/friends since=658239259 page=2, until=15
downloading url=http://bumszakalaka.soup.io/friends since=None page=100, until=15
timeout on: http://bumszakalaka.soup.io/friends
timeout on: http://bumszakalaka.soup.io/friends
timeout on: http://bumszakalaka.soup.io/friends
timeout on: http://bumszakalaka.soup.io/friends
timeout on: http://bumszakalaka.soup.io/friends
timeout on: http://bumszakalaka.soup.io/friends
timeout on: http://bumszakalaka.soup.io/friends
timeout on: http://bumszakalaka.soup.io/friends
timeout on: http://bumszakalaka.soup.io/friends
timeout on: http://bumszakalaka.soup.io/friends
timeout on: http://bumszakalaka.soup.io/friends
downloading url=http://bumszakalaka.soup.io/friends since=655950448 page=49, until=15
timeout on: http://bumszakalaka.soup.io/friends
downloading url=http://bumszakalaka.soup.io/friends since=655949243 page=23, until=15
downloadin

In [5]:
# Repeats the final four crawls of the full feed list; relies on `up_to`
# having been defined by an earlier cell.
Scrapper.downloadImages(url="http://ecce.soup.io/friends", path="in", pages=100, since=None, until=up_to)
Scrapper.downloadImages(url="http://sucznik.soup.io/friends", path="in", pages=100, since=None, until=up_to)
Scrapper.downloadImages(url="http://bumszakalaka.soup.io/friends", path="in", pages=100, since=None, until=up_to)
Scrapper.downloadImages(url="http://saski.soup.io/friends", path="in", pages=100, since=None, until=up_to)

downloading url=http://ecce.soup.io/friends since=None page=100, until=15
downloading url=http://ecce.soup.io/friends since=658138896 page=99, until=15
downloading url=http://ecce.soup.io/friends since=657480327 page=48, until=15
downloading url=http://ecce.soup.io/friends since=657418579 page=47, until=15
downloading url=http://ecce.soup.io/friends since=657415497 page=22, until=15
timeout on: http://asset-8.soupcdn.com/asset/14390/9313_8eda_600.gif
downloading url=http://ecce.soup.io/friends since=656706888 page=21, until=15
downloading url=http://ecce.soup.io/friends since=656413539 page=9, until=15
downloading url=http://ecce.soup.io/friends since=656396625 page=3, until=15
downloading url=http://sucznik.soup.io/friends since=None page=100, until=15
downloading url=http://sucznik.soup.io/friends since=658298002 page=99, until=15
downloading url=http://sucznik.soup.io/friends since=658256883 page=48, until=15
downloading url=http://sucznik.soup.io/friends since=658247933 page=47, un

KeyboardInterrupt: 

In [6]:
# Crawls only the saski feed; relies on `up_to` having been defined by an
# earlier cell.
Scrapper.downloadImages(url="http://saski.soup.io/friends", path="in", pages=100, since=None, until=up_to)

downloading url=http://saski.soup.io/friends since=None page=100, until=15
downloading url=http://saski.soup.io/friends since=658367273 page=99, until=15
downloading url=http://saski.soup.io/friends since=658313307 page=98, until=15
downloading url=http://saski.soup.io/friends since=658310136 page=97, until=15
downloading url=http://saski.soup.io/friends since=658306678 page=47, until=15
downloading url=http://saski.soup.io/friends since=658298522 page=46, until=15
downloading url=http://saski.soup.io/friends since=658282012 page=45, until=15
downloading url=http://saski.soup.io/friends since=658281411 page=44, until=15
downloading url=http://saski.soup.io/friends since=658280750 page=21, until=15
downloading url=http://saski.soup.io/friends since=658279767 page=9, until=15
downloading url=http://saski.soup.io/friends since=658279396 page=3, until=15
