In [1]:
import requests
import re
import time
import os
import shutil
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K

class Scrapper:
    """Downloads the images linked from paginated HTML listing pages.

    The page markup is parsed line by line with regular expressions, so the
    scraper only understands pages whose relevant tags each sit on one line.
    """

    # Seconds to wait for any single HTTP response before giving up, so a
    # stalled connection cannot hang the notebook forever.
    REQUEST_TIMEOUT = 30
    # Pause (seconds) after every image request to avoid hammering the server.
    DOWNLOAD_DELAY = 1

    # Link to the next (older) page; group 1 is the ``since`` cursor.
    _NEXT_PAGE_RE = re.compile(r'href="/friends\?since=(.+?)" name="more"')
    # Marker compared against ``until`` to decide when to stop.
    # NOTE(review): presumably a day-of-month label -- confirm against the site markup.
    _UNTIL_MARKER_RE = re.compile(r'<span class="d">(.+?)</span>')
    # Preferred image source: the target of a lightbox link.
    _LIGHTBOX_RE = re.compile(r'a class="lightbox".+?href="(.+?)".+?img.+?src=".+?"')
    # Fallback image source: an inline <img> whose alt text starts with 4 digits.
    _ASSET_RE = re.compile(r'img.+?alt="\d{4} .+?".+?src="(.+?asset.+?)"')

    @staticmethod
    def downloadImages(url, path, pages, since=None, until=None):
        """Download every not-yet-present image from ``url`` and its follow-up pages.

        Args:
            url: Address of the first listing page.
            path: Directory the images are written to; created when missing.
            pages: Page budget. It is decremented for every page followed and
                halved whenever a page yields no new image request, so runs
                over already-downloaded content stop early.
            since: Pagination cursor appended as ``?since=<cursor>``;
                ``None`` requests the first page.
            until: When given, scraping stops as soon as a line carries a
                ``<span class="d">`` marker whose text equals this value.

        Returns:
            None. Images are saved under ``path`` using the last URL segment
            as the file name; existing files are never re-downloaded.

        Raises:
            requests.exceptions.RequestException: on network failure or when
                a response takes longer than ``REQUEST_TIMEOUT`` seconds.
        """
        if path:
            # open() below would otherwise fail on a fresh checkout.
            os.makedirs(path, exist_ok=True)

        # Iterate instead of recursing once per page: same traversal, but no
        # growing call stack for long listings.
        while True:
            print('downloading url={0} since={1} page={2}, until={3}'.format(url, since, pages, until))
            page_url = url if since is None else url + '?since=' + since
            page = requests.get(page_url, timeout=Scrapper.REQUEST_TIMEOUT)

            next_since = ""
            attempted = 0
            for line in page.text.split('\n'):
                more = Scrapper._NEXT_PAGE_RE.search(line)
                if more:
                    next_since = more.group(1)

                if until is not None:
                    marker = Scrapper._UNTIL_MARKER_RE.search(line)
                    if marker and marker.group(1) == until:
                        return

                image_match = Scrapper._LIGHTBOX_RE.search(line) or Scrapper._ASSET_RE.search(line)
                if not image_match:
                    continue

                image_url = image_match.group(1)
                name = image_url.split("/")[-1]
                if not name:
                    # URL ends with '/': there is no usable file name.
                    continue
                target = os.path.join(path, name)
                if os.path.isfile(target):
                    continue

                image = requests.get(image_url, timeout=Scrapper.REQUEST_TIMEOUT)
                time.sleep(Scrapper.DOWNLOAD_DELAY)
                # Counts attempts, not successes: a failed request still
                # proves the page contained something new.
                attempted += 1
                if image.status_code == 200:
                    with open(target, 'wb') as f:
                        f.write(image.content)

            if attempted == 0:
                # Nothing new on this page: burn through the budget faster.
                pages = pages // 2
            if not next_since or pages <= 1:
                return
            pages -= 1
            since = next_since
            
class Predictor:
    """Binary image classifier used to filter a directory of images.

    The network is built and its weights are loaded from ``first_try.h5``
    while the class body executes, so defining the class already requires
    that file to exist in the working directory.
    """

    # Every image is resized to this resolution before being fed to the net.
    img_width, img_height = 150, 150
    # Channel axis position depends on the Keras backend configuration.
    input_shape = (3, img_width, img_height) if K.image_data_format() == 'channels_first' else (img_width, img_height, 3)

    # Three conv/ReLU/max-pool stages followed by a small dense head with a
    # single sigmoid output.
    model = Sequential()
    model.add(Conv2D(32, (3, 3), input_shape=input_shape))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.load_weights('first_try.h5')

    @staticmethod
    def predictDir(prediction_dir):
        """Run the model on one image found below ``prediction_dir``.

        Args:
            prediction_dir: Directory handed to ``flow_from_directory``, which
                looks for images inside its sub-directories.

        Returns:
            The array produced by ``predict_generator`` for a single step with
            batch size 1; callers read the score at ``[0][0]``.
        """
        prediction_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
            prediction_dir,
            target_size=(Predictor.img_width, Predictor.img_height),
            batch_size=1,
            class_mode='binary')
        return Predictor.model.predict_generator(prediction_generator, steps=1)

    @staticmethod
    def getAllPredictions(input_dir, prediction_dir, output_dir):
        """Copy every image in ``input_dir`` that scores above 0.5 to ``output_dir``.

        Each file is staged alone in ``<prediction_dir>/tmp``, scored with
        ``predictDir`` and removed again. Files whose extension is exactly
        ``gif`` and sub-directories of ``input_dir`` are skipped.

        Args:
            input_dir: Directory holding the candidate images.
            prediction_dir: Scratch directory; its ``tmp`` sub-directory is
                created when missing.
                NOTE(review): assumes ``tmp`` holds no leftover files, since
                only one image is scored per call -- confirm.
            output_dir: Destination for accepted images; created when missing.

        Returns:
            None.
        """
        staging_dir = os.path.join(prediction_dir, 'tmp')
        # Without these directories shutil.copy2 would create plain *files*
        # called "tmp" / output_dir instead of copying into a directory.
        os.makedirs(staging_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        for file in os.listdir(input_dir):
            source_path = os.path.join(input_dir, file)
            if not os.path.isfile(source_path) or file.split(".")[-1] == 'gif':
                continue

            staged_path = shutil.copy2(source_path, staging_dir)
            try:
                prediction = Predictor.predictDir(prediction_dir)
                if prediction[0][0] > 0.5:
                    shutil.copy2(staged_path, output_dir)
            except Exception as exc:
                # Best effort: one unreadable image must not abort the batch,
                # but the failure is reported instead of silently dropped.
                # KeyboardInterrupt is deliberately not caught.
                print('skipping {0}: {1!r}'.format(file, exc))
            finally:
                # Always clear the staging area so the next file is scored alone.
                os.remove(staged_path)

Using TensorFlow backend.


In [None]:
# Score everything under "in" (staging through "predict") and copy the
# accepted images to "out".
Predictor.getAllPredictions(
    input_dir='in',
    prediction_dir='predict',
    output_dir='out',
)


Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging 

Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging 




Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging

Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging 

In [2]:
# Marker value at which every scrape stops (passed as ``until``).
up_to = '08'

# (listing URL, page budget) pairs, scraped in this order into "in".
scrape_targets = [
    ("http://vogel.soup.io/friends", 100),
    ("http://bercik.soup.io/friends", 100),
    ("http://inzynier.soup.io/friends", 100),
    ("http://siostra.soup.io/friends", 100),
    ("http://tfu.soup.io", 1),
    ("http://niedobrze.soup.io", 1),
    ("http://ecce.soup.io/friends", 100),
    ("http://sucznik.soup.io/friends", 100),
    ("http://bumszakalaka.soup.io/friends", 100),
    ("http://saski.soup.io/friends", 100),
]

for listing_url, page_budget in scrape_targets:
    Scrapper.downloadImages(listing_url, "in", page_budget, None, up_to)

downloading url=http://vogel.soup.io/friends since=None page=100, until=08
downloading url=http://vogel.soup.io/friends since=637525305 page=99, until=08
downloading url=http://vogel.soup.io/friends since=637494678 page=98, until=08
downloading url=http://vogel.soup.io/friends since=637493082 page=97, until=08
downloading url=http://vogel.soup.io/friends since=637473226 page=47, until=08
downloading url=http://vogel.soup.io/friends since=637472763 page=22, until=08
downloading url=http://bercik.soup.io/friends since=None page=100, until=08
downloading url=http://bercik.soup.io/friends since=637549381 page=99, until=08
downloading url=http://bercik.soup.io/friends since=637549181 page=98, until=08
downloading url=http://bercik.soup.io/friends since=637548896 page=97, until=08
downloading url=http://bercik.soup.io/friends since=637545563 page=96, until=08
downloading url=http://bercik.soup.io/friends since=637541458 page=95, until=08
downloading url=http://bercik.soup.io/friends since=63

downloading url=http://bumszakalaka.soup.io/friends since=None page=100, until=08
downloading url=http://bumszakalaka.soup.io/friends since=637523654 page=99, until=08
downloading url=http://saski.soup.io/friends since=None page=100, until=08
downloading url=http://saski.soup.io/friends since=637526159 page=99, until=08
downloading url=http://saski.soup.io/friends since=637525239 page=98, until=08
downloading url=http://saski.soup.io/friends since=637524242 page=48, until=08
downloading url=http://saski.soup.io/friends since=637521084 page=23, until=08
downloading url=http://saski.soup.io/friends since=637511347 page=22, until=08
downloading url=http://saski.soup.io/friends since=637502743 page=21, until=08
downloading url=http://saski.soup.io/friends since=637501669 page=20, until=08
downloading url=http://saski.soup.io/friends since=637497600 page=19, until=08
downloading url=http://saski.soup.io/friends since=637494014 page=18, until=08
downloading url=http://saski.soup.io/friends s