In [5]:
import re
import time
import os
import shutil
import requests
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K

class Scrapper:
    """Downloads images referenced by paginated HTML pages into a directory.

    The class holds no state; every method is a static method and is called
    on the class itself, e.g. ``Scrapper.downloadImages(url, path, pages)``.
    """

    # Raw strings keep the regex escapes away from Python's own string-escape
    # processing, which warns about unknown escapes in ordinary literals.
    _NEXT_PAGE_RE = re.compile(r'href="/friends\?since=(.+?)" name="more"')
    _UNTIL_MARKER_RE = re.compile(r'<span class="d">(.+?)</span>')
    _LIGHTBOX_RE = re.compile(r'a class="lightbox".+?href="(.+?)".+?img.+?src=".+?"')
    _INLINE_IMAGE_RE = re.compile(r'img.+?alt="\d{4} .+?".+?src="(.+?asset.+?)"')

    @staticmethod
    def getPage(url, since):
        """Fetch a page and return its body as text.

        Args:
            url: Address of the page to request.
            since: Pagination token, or None. When given it is appended to
                ``url`` as a ``?since=`` query string.

        Returns:
            The response text. An empty body is returned as ``""`` instead of
            being requested again, so an empty page cannot cause an endless
            request loop.

        Timeouts are retried indefinitely with a one second pause between
        attempts; any other request error propagates to the caller.
        """
        target = url if since is None else url + '?since=' + since
        while True:
            try:
                return requests.get(target, timeout=3.0).text
            except requests.exceptions.Timeout:
                print("timeout on: " + url)
                time.sleep(1.0)

    @staticmethod
    def getImage(url):
        """Request a single image.

        Args:
            url: Address of the image.

        Returns:
            The ``requests`` response object, or None when the request timed
            out or failed. Exactly one attempt is made.
        """
        try:
            return requests.get(url, timeout=3.0)
        except requests.exceptions.Timeout:
            print("timeout on: " + url)
        except requests.exceptions.RequestException as error:
            # One unreachable image must not abort the whole crawl.
            print("request failed on: " + url + " (" + str(error) + ")")
        return None

    @staticmethod
    def downloadImages(url, path, pages, since=None, until=None):
        """Walk paginated pages starting at ``url`` and save the images found.

        Args:
            url: Address of the first page.
            path: Existing directory that receives the image files. Each file
                is named after the last path segment of its image URL, and
                images whose file already exists are skipped.
            pages: Page budget. It is decremented for every followed page and
                halved whenever a page produced no download attempt.
            since: Pagination token for the first request, or None.
            until: Optional marker text. The crawl stops as soon as a line
                contains ``<span class="d">`` with exactly this content.

        Returns:
            None.
        """
        while True:
            print('downloading url={0} since={1} page={2}, until={3}'.format(url, since, pages, until))
            page = Scrapper.getPage(url, since)
            next_since = ""
            attempted = 0
            for line in page.split('\n'):
                more = Scrapper._NEXT_PAGE_RE.search(line)
                if more:
                    next_since = more.group(1)

                if until is not None:
                    marker = Scrapper._UNTIL_MARKER_RE.search(line)
                    if marker and marker.group(1) == until:
                        return

                # Prefer the lightbox link; fall back to the inline image.
                found = Scrapper._LIGHTBOX_RE.search(line) or Scrapper._INLINE_IMAGE_RE.search(line)
                if not found:
                    continue
                image_url = found.group(1)
                target = os.path.join(path, image_url.split("/")[-1])
                if os.path.isfile(target):
                    continue

                image = Scrapper.getImage(image_url)
                time.sleep(1)  # throttle requests to the image host
                attempted += 1
                if image is not None and image.status_code == 200:
                    with open(target, 'wb') as f:
                        f.write(image.content)

            # A page with nothing new to fetch shrinks the remaining budget
            # quickly so the crawl ends soon after reaching known content.
            if attempted == 0:
                pages = pages // 2
            if len(next_since) == 0 or pages <= 1:
                return
            pages -= 1
            since = next_since
            
class Predictor:
    """Binary image classifier used to filter a directory of images.

    The network is built and its weights are loaded from ``first_try.h5``
    while the class body executes, so defining this class requires that file
    to be present in the working directory. All methods are static and are
    called on the class itself.
    """

    # Input images are resized to this size before being fed to the network.
    img_width, img_height = 150, 150
    # Channel axis position follows the active Keras backend configuration.
    input_shape = (3, img_width, img_height) if K.image_data_format() == 'channels_first' else (img_width, img_height, 3)

    # Three convolution / ReLU / max-pooling stages followed by a small dense
    # head with a single sigmoid output.
    model = Sequential()
    model.add(Conv2D(32, (3, 3), input_shape=input_shape))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.load_weights('first_try.h5')

    @staticmethod
    def predictDir(prediction_dir):
        """Run the model on one batch of images read from ``prediction_dir``.

        Args:
            prediction_dir: Directory handed to ``flow_from_directory``; the
                images are expected inside a sub-directory of it.

        Returns:
            The result of ``predict_generator`` for a single step with a batch
            size of one. The caller reads the score as ``result[0][0]``.
        """
        prediction_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
            prediction_dir,
            target_size=(Predictor.img_width, Predictor.img_height),
            batch_size=1,
            class_mode='binary')
        return Predictor.model.predict_generator(prediction_generator, steps=1)

    @staticmethod
    def getAllPredictions(input_dir, prediction_dir, output_dir):
        """Copy every image from ``input_dir`` that scores above 0.5.

        Each regular file in ``input_dir`` (gif files are skipped) is staged
        alone in ``<prediction_dir>/tmp``, scored with ``predictDir`` and, when
        the score is greater than 0.5, copied into ``output_dir``. The staged
        copy is always removed afterwards.

        Args:
            input_dir: Directory containing the candidate images.
            prediction_dir: Working directory; its ``tmp`` sub-directory is
                created if missing and should otherwise be empty.
            output_dir: Directory receiving accepted images; created if
                missing.

        Returns:
            None. A failure while scoring one file is reported on stdout and
            the remaining files are still processed.
        """
        staging_dir = os.path.join(prediction_dir, 'tmp')
        # Without these directories copy2 would create plain files named
        # "tmp" / output_dir instead of copying into them.
        os.makedirs(staging_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        for file in os.listdir(input_dir):
            source = os.path.join(input_dir, file)
            if not os.path.isfile(source) or file.split(".")[-1].lower() == 'gif':
                continue

            staged = shutil.copy2(source, staging_dir)
            try:
                prediction = Predictor.predictDir(prediction_dir)
                if prediction[0][0] > 0.5:
                    shutil.copy2(staged, output_dir)
            except Exception as error:
                # Keep going on unreadable images, but say so; KeyboardInterrupt
                # is deliberately not caught so the loop can be interrupted.
                print("prediction failed for {0}: {1}".format(file, error))
            finally:
                os.remove(staged)

In [3]:
# Score every image in 'in' (staged through 'predict') and copy the accepted ones to 'out'.
Predictor.getAllPredictions(input_dir='in', prediction_dir='predict', output_dir='out')


Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging 

Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging 

Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging 

Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging 

Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging 

Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging 

In [None]:
# Marker text passed as `until`: each crawl stops when a page shows it.
up_to = '29'

# (start page, page budget) for every feed, crawled in this order.
feeds = [
    ("http://vogel.soup.io/friends", 100),
    ("http://bercik.soup.io/friends", 100),
    ("http://inzynier.soup.io/friends", 100),
    ("http://siostra.soup.io/friends", 100),
    ("http://tfu.soup.io", 1),
    ("http://niedobrze.soup.io", 1),
    ("http://ecce.soup.io/friends", 100),
    ("http://sucznik.soup.io/friends", 100),
    ("http://bumszakalaka.soup.io/friends", 100),
    ("http://saski.soup.io/friends", 100),
]

for feed_url, page_budget in feeds:
    Scrapper.downloadImages(feed_url, "in", page_budget, None, up_to)

downloading url=http://vogel.soup.io/friends since=None page=100, until=29
timeout on: http://asset-d.soupcdn.com/asset/13945/0706_d79a_960.jpeg
downloading url=http://vogel.soup.io/friends since=638819857 page=99, until=29
downloading url=http://vogel.soup.io/friends since=638818948 page=98, until=29
downloading url=http://vogel.soup.io/friends since=638818728 page=97, until=29
downloading url=http://vogel.soup.io/friends since=638818542 page=96, until=29
timeout on: http://asset-1.soupcdn.com/asset/13138/7851_14a9_960.png
