## Iterative Training

This is the step-by-step iterative visual model training based on the crawled data.

Depending on how many pictures the crawl needs, a different number of models will be trained to filter the image candidates.

The precision will be measured afterwards for each iteration.

In [None]:
"""
Data samples from multiple sources

1. 100 seed images from google image
2. 1000 images from google and possibly Flickr
3. 10000 images from google, Flickr
"""

# Google image-search crawl results, one JSON record per line
# (the original note counts 485 results in this file).
results_google = '../tasks/03.image-crawl/result-wse_google_bird.jsonl'

# Crawl results used as negative examples
# (the original note counts 119784 results with good variety).
results_non_samples = '../tasks/03.image-crawl/result-wse_non_samples-100k.jsonl'


In [None]:
# plot multiple images

"""
plot image matrix
"""

# plot 3x3 picture matrix

%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import requests
import StringIO
from matplotlib.pyplot import figure, show, axes, sci
from PIL import Image


def thumbnail(img, size=250):
    """
    Centre-crop img to a square and shrink it to a thumbnail.

    img:  a PIL image (anything exposing .size, .crop and .thumbnail)
    size: edge length handed to Image.thumbnail as (size, size)

    Returns the cropped image after Image.thumbnail has resized it in place.
    """

    width, height = img.size

    # the square's edge is the shorter side; the longer side loses
    # half of the surplus at each end
    side = min(width, height)
    x_off = int((width - side) / 2)
    y_off = int((height - side) / 2)

    square = img.crop((x_off, y_off, x_off + side, y_off + side))
    square.thumbnail((size, size), Image.ANTIALIAS)

    return square


def plot_3x3(images, title = ''):
    """
    Download up to 9 images and plot them in a 3x3 matrix.

    images: iterable of image URLs, each fetched with requests.get
    title:  optional figure title; nothing is drawn when it is empty

    URLs that cannot be downloaded, decoded or thumbnailed are skipped,
    so the grid may end up with fewer than 9 pictures. Plotting stops
    once the grid is full.
    """

    Nr = 3
    Nc = 3

    plt.figure(figsize=(10,10))
    if title:
        plt.suptitle(title)

    i = 0
    for image in images:
        # the grid has only Nr * Nc cells; asking for one more subplot fails
        if i >= Nr * Nc:
            break

        try:
            img_io = requests.get(image)
            picture = Image.open(StringIO.StringIO(img_io.content))
            # thumbnailing is what actually decodes the pixel data, so a
            # truncated download surfaces here rather than in Image.open
            picture = thumbnail(picture)
        except Exception:
            # best effort: skip this URL, but do not swallow
            # KeyboardInterrupt so the cell can still be interrupted
            continue

        i += 1
        plt.subplot(Nr, Nc, i)
        plt.imshow(picture)

    plt.show()

def plot_images(images, title = '', nrow=3, ncol=3):
    """
    Download images and plot them as 100px thumbnails in an nrow x ncol matrix.

    images: sequence of image URLs; only the first nrow * ncol are used
    title:  optional figure title; nothing is drawn when it is empty
    nrow:   number of rows in the grid
    ncol:   number of columns in the grid

    URLs that cannot be downloaded, decoded or thumbnailed are skipped,
    so the grid may end up with fewer than nrow * ncol pictures.
    """

    # force truncate the image array so the subplot index never leaves the grid
    images = images[:nrow*ncol]

    plt.figure(figsize=(20,20))
    if title:
        plt.suptitle(title)

    i = 0
    for image in images:
        try:
            img_io = requests.get(image)
            picture = Image.open(StringIO.StringIO(img_io.content))
            picture = thumbnail(picture, size=100)
        except Exception:
            # best effort: skip this URL, but do not swallow
            # KeyboardInterrupt so the cell can still be interrupted
            continue

        i += 1
        plt.subplot(nrow, ncol, i)
        plt.imshow(picture)

    plt.show()


In [None]:
#
# load samples
#

import os
import json

def load_samples(filename, num=None, labels=None):
    """
    Read crawl records from a JSON-lines file.

    filename: path of the file; every line is parsed as one JSON object
    num:      stop after this many accepted records (None = read everything,
              0 = return an empty list)
    labels:   when non-empty, keep only records whose 'tags' share at least
              one entry with labels

    A record is accepted only if it is a JSON object whose 'embeds' value is
    a list of exactly 1024 entries. Lines that are not valid JSON are skipped.

    Returns the list of accepted records in file order. A missing file is
    reported on stdout and yields an empty list, so callers can always
    iterate over the result.
    """

    if not os.path.exists(filename):
        print('file not exist %s' % filename)
        return []

    items = []
    if num == 0:
        return items

    wanted = set(labels) if labels else None

    with open(filename) as fdr:
        for line in fdr:
            try:
                item = json.loads(line.strip())
            except ValueError:
                # not a JSON document (blank or truncated line)
                continue
            if not isinstance(item, dict):
                continue

            embeds = item.get('embeds')
            if not isinstance(embeds, list) or len(embeds) != 1024:
                continue

            # a record without tags can never match a label filter
            if wanted is not None and not wanted.intersection(item.get('tags') or []):
                continue

            items.append(item)
            if num is not None and len(items) == num:
                break

    return items

# positive seeds: the first 100 usable google records
results = load_samples(results_google, 100)

print('number of samples loaded:')
print(len(results))

seed_urls, seed_tags, seed_samples = [], [], []
for sample in results:
    seed_urls.append(sample['url'])
    seed_tags.append(sample['tags'])
    seed_samples.append(sample['embeds'])
seed_samples = np.array(seed_samples)
print(seed_samples.shape)

# negatives: the first 100 usable records of the negative crawl
results = load_samples(results_non_samples, 100)
neg_urls = [sample['url'] for sample in results]
neg_samples = np.array([sample['embeds'] for sample in results])
print(neg_samples.shape)

# test set: the last 50 google records followed by 50 negative records
# NOTE(review): these 50 negatives are the first 50 records of the same file
# the 100 training negatives were read from, so they overlap with the
# training data - confirm whether a held-out slice was intended.
results = load_samples(results_google)[-50:]
results2 = load_samples(results_non_samples, 50)
results.extend(results2)

test_urls = [sample['url'] for sample in results]
test_samples = np.array([sample['embeds'] for sample in results])

print(test_samples.shape)

In [None]:
"""
run K-means to cluster the seed crawl to 3 clusters
then rule out the clusters with less than 5% population
"""

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster import KMeans
from sklearn import datasets

# seed numpy's global RNG before clustering
# (presumably so the k-means initialisation is repeatable - TODO confirm)
np.random.seed(5)

from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# scale the seed embeddings, then split them into k clusters

k = 3
data = scale(seed_samples)
n_samples, n_features = data.shape

k_means = KMeans(n_clusters=k, n_init=10, init='k-means++')
k_means.fit(data)


In [None]:
# check the clustered data

# one bucket of seed URLs per cluster id; labels_[i] is the cluster
# assigned to the i-th seed sample
clustered_urls = [[] for _ in range(k)]
for label, url in zip(k_means.labels_, seed_urls):
    clustered_urls[label].append(url)

for idx in range(k):
    print('count for %d: %d' % (idx, len(clustered_urls[idx])))

# show the first nine pictures of every cluster for a visual check
for cat in range(k):
    img_urls = clustered_urls[cat][:9]
    print('cluster %d' % cat)
    plot_3x3(img_urls)


In [None]:
# clean and rebalance the samples

import numpy as np

# a cluster holding less than 5% of the samples is dropped entirely
min_population = len(k_means.labels_) * 0.05

ids_excl = []
for idx in range(k):
    members = np.where(k_means.labels_ == idx)[0]
    if len(members) < min_population:
        ids_excl.extend(members)

# remove from the highest index down so the remaining indices stay valid
ids_excl.sort(reverse=True)

for idid in ids_excl:
    seed_tags.pop(idid)
    seed_urls.pop(idid)
    seed_samples = np.delete(seed_samples, idid, axis=0)

# check the samples
print(seed_samples.shape)
print(len(seed_tags))
print(len(seed_urls))


In [None]:
# plot the positive seed images for visual inspection:
# at most the first 100 URLs in seed_urls, as a 10x10 grid of thumbnails

plot_images(seed_urls, nrow=10, ncol=10)

In [None]:
# train the model0

import numpy as np
from sklearn.linear_model import LogisticRegression

# both matrices must be 2-D: (number of samples, embedding length)
(npos, _), (nneg, _) = seed_samples.shape, neg_samples.shape

print('%s %s' % (npos, nneg))

# positives first, negatives after; label 1 = positive, label 2 = negative
train_data = np.concatenate((seed_samples, neg_samples), axis=0)
train_label = np.repeat([1, 2], [npos, nneg])

print(train_data.shape)
print(train_label.shape)

print(len(train_data))
print(len(train_label))

assert train_data.shape[0] == train_label.shape[0]

clf = LogisticRegression()
clf.fit(train_data, train_label)


In [None]:
"""
test the model0
with positive and negative samples
"""

# test_samples holds the positive test samples first and the 50 negative
# ones at the end. predict_proba expects a 2-D (n_samples, n_features)
# array, so each embedding is presented as a one-row matrix.

print('test positive samples:')
for test_feature in test_samples[:50]:
    print(clf.predict_proba(test_feature.reshape(1, -1)))
plot_3x3(test_urls[:9])

print('test negative samples:')
for test_feature in test_samples[-50:]:
    print(clf.predict_proba(test_feature.reshape(1, -1)))
plot_3x3(test_urls[-9:])


In [None]:
"""
2nd iteration

1. predict 1k+ until get 1k positive
2. train on the 1k positives
3. test
"""

# results from flickr
results_flickr_10k = '../tasks/03.image-crawl/result-wse_flickr_bird_10k.jsonl'

# candidate pool: the google records after the first 100 (those fed the
# seed set) followed by the flickr crawl
results = load_samples(results_google)
results = results[100:]

results_flickr = load_samples(results_flickr_10k)
results += results_flickr

seed_urls = [sample['url'] for sample in results]
seed_tags = [sample['tags'] for sample in results]
seed_samples = [sample['embeds'] for sample in results]
seed_samples = np.array(seed_samples)

print('%d number of samples loaded:' % len(results))

# Score all candidates in one call: predict_proba expects a 2-D
# (n_samples, n_features) array. Column 0 belongs to the smallest class
# label, i.e. label 1, which the training cell assigns to the positives.
probs = clf.predict_proba(seed_samples)[:, 0] if len(results) else []

# walk the candidates in order and stop at 1000 accepted ones;
# neg_ids only collects the rejects seen before that point
cnt = 0
n_checked = 0
pos_ids = []
neg_ids = []
for idx, prob in enumerate(probs):
    n_checked = idx + 1
    if prob > 0.80:
        pos_ids.append(idx)
        cnt += 1
        if cnt == 1000:
            break
    else:
        neg_ids.append(idx)

import random
import copy

# shuffled copies are only used for the visual spot checks;
# pos_ids itself keeps the original candidate order
random.shuffle(neg_ids)
pos_idx_copy = copy.copy(pos_ids)
random.shuffle(pos_idx_copy)

# report how many candidates were examined (not the last zero-based index)
print("%d positive samples from %d candidates" % (cnt, n_checked))


In [None]:
def make_data_samples(datasets, idsets):
    """ return the entries of datasets selected by idsets, in idsets order """
    return [datasets[pos] for pos in idsets]

# URLs of the accepted candidates (in shuffled order) and of the rejected ones
pos_urls = [seed_urls[i] for i in pos_idx_copy]
neg_urls = [seed_urls[i] for i in neg_ids]

# look at nine pictures of each group
for url_group in (pos_urls, neg_urls):
    plot_3x3(url_group[:9])


In [None]:
# show at most the first 100 URLs in pos_urls as a 10x10 grid of thumbnails
plot_images(pos_urls, nrow=10, ncol=10)

In [None]:
# make the training data for 
# iteration 2
# with 1k positive and 1k negative

# positives: the candidates picked out by pos_ids (indices into results)
results_pos = make_data_samples(results, pos_ids)

seed_urls, seed_tags, seed_samples = [], [], []
for sample in results_pos:
    seed_urls.append(sample['url'])
    seed_tags.append(sample['tags'])
    seed_samples.append(sample['embeds'])
seed_samples = np.array(seed_samples)

print(seed_samples.shape)

# negatives: the first 1000 usable records of the negative crawl
results_neg = load_samples(results_non_samples, 1000)
neg_urls = [sample['url'] for sample in results_neg]
neg_samples = np.array([sample['embeds'] for sample in results_neg])
print(neg_samples.shape)


In [None]:
#
# train the 2nd model with 1k+1k
#

import numpy as np
from sklearn.linear_model import LogisticRegression

# both matrices must be 2-D: (number of samples, embedding length)
(npos, _), (nneg, _) = seed_samples.shape, neg_samples.shape

print('%s %s' % (npos, nneg))

# positives first, negatives after; label 1 = positive, label 2 = negative
train_data = np.concatenate((seed_samples, neg_samples), axis=0)
train_label = np.repeat([1, 2], [npos, nneg])

print(train_data.shape)
print(train_label.shape)

print(len(train_data))
print(len(train_label))

assert train_data.shape[0] == train_label.shape[0]

clf2 = LogisticRegression()
clf2.fit(train_data, train_label)


In [None]:
"""
test the model1
with positive and negative samples
"""

# test_samples holds the positive test samples first and the 50 negative
# ones at the end. predict_proba expects a 2-D (n_samples, n_features)
# array, so each embedding is presented as a one-row matrix.

print('test positive samples:')
for test_feature in test_samples[:50]:
    print(clf2.predict_proba(test_feature.reshape(1, -1)))
# nine URLs, one per cell of the 3x3 grid
plot_3x3(test_urls[10:19])

print('test negative samples:')
for test_feature in test_samples[-50:]:
    print(clf2.predict_proba(test_feature.reshape(1, -1)))
plot_3x3(test_urls[-9:])


In [None]:
"""
3rd iteration, to 10k positives
"""

# results from flickr
# NOTE: the variable is named *_100k but the file it points to is the 200k crawl
results_flickr_100k = '../tasks/03.image-crawl/result-wse_flickr_bird_200k.jsonl'

# candidate pool: every usable google record followed by the first
# 20000 usable flickr records
results = load_samples(results_google)

results_flickr = load_samples(results_flickr_100k, 20000)
results += results_flickr

seed_urls = [sample['url'] for sample in results]
seed_tags = [sample['tags'] for sample in results]
seed_samples = [sample['embeds'] for sample in results]
seed_samples = np.array(seed_samples)

print('%d number of samples loaded:' % len(results))

# Score all candidates in one call: predict_proba expects a 2-D
# (n_samples, n_features) array. Column 0 belongs to the smallest class
# label, i.e. label 1, which the training cell assigns to the positives.
probs = clf2.predict_proba(seed_samples)[:, 0] if len(results) else []

# walk the candidates in order and stop at 10000 accepted ones
cnt = 0
n_checked = 0
pos_ids = []
for idx, prob in enumerate(probs):
    n_checked = idx + 1
    if prob > 0.99:
        pos_ids.append(idx)
        cnt += 1
        if cnt == 10000:
            break

# report how many candidates were examined (not the last zero-based index)
print("%d positive samples from %d candidates" % (cnt, n_checked))


In [None]:
# make the training data for 
# iteration 3
# with 10k positive and 10k negative

# positives: the candidates picked out by pos_ids (indices into results)
results_pos = make_data_samples(results, pos_ids)

seed_urls, seed_tags, seed_samples = [], [], []
for sample in results_pos:
    seed_urls.append(sample['url'])
    seed_tags.append(sample['tags'])
    seed_samples.append(sample['embeds'])
seed_samples = np.array(seed_samples)

print(seed_samples.shape)

# negatives: as many records from the negative crawl as there are positives
results_neg = load_samples(results_non_samples, len(seed_urls))
neg_urls = [sample['url'] for sample in results_neg]
neg_samples = np.array([sample['embeds'] for sample in results_neg])
print(neg_samples.shape)


In [None]:
# show at most the first 100 URLs in seed_urls as a 10x10 grid of thumbnails
plot_images(seed_urls, nrow=10, ncol=10)

In [None]:
#
# train the 3rd model with 10k+10k
#

import numpy as np
from sklearn.linear_model import LogisticRegression

# both matrices must be 2-D: (number of samples, embedding length)
(npos, _), (nneg, _) = seed_samples.shape, neg_samples.shape

print('%s %s' % (npos, nneg))

# positives first, negatives after; label 1 = positive, label 2 = negative
train_data = np.concatenate((seed_samples, neg_samples), axis=0)
train_label = np.repeat([1, 2], [npos, nneg])

print(train_data.shape)
print(train_label.shape)

print(len(train_data))
print(len(train_label))

assert train_data.shape[0] == train_label.shape[0]

clf3 = LogisticRegression()
clf3.fit(train_data, train_label)


In [None]:
"""
test the model2 (clf3)
with positive and negative samples
"""

# test_samples holds the positive test samples first and the 50 negative
# ones at the end. predict_proba expects a 2-D (n_samples, n_features)
# array, so each embedding is presented as a one-row matrix.

print('test positive samples:')
for test_feature in test_samples[:50]:
    print(clf3.predict_proba(test_feature.reshape(1, -1)))
# nine URLs, one per cell of the 3x3 grid
plot_3x3(test_urls[10:19])

print('test negative samples:')
for test_feature in test_samples[-50:]:
    print(clf3.predict_proba(test_feature.reshape(1, -1)))
plot_3x3(test_urls[-9:])


In [None]:
"""
4th iteration, to 100k positives
"""

# results from flickr
# NOTE: the variable is named *_100k but the file it points to is the 200k crawl
results_flickr_100k = '../tasks/03.image-crawl/result-wse_flickr_bird_200k.jsonl'

# candidate pool: every usable google record followed by every usable
# flickr record
results = load_samples(results_google)

results_flickr = load_samples(results_flickr_100k)
results += results_flickr

seed_urls = [sample['url'] for sample in results]
seed_tags = [sample['tags'] for sample in results]
seed_samples = [sample['embeds'] for sample in results]
seed_samples = np.array(seed_samples)

print('%d number of samples loaded:' % len(results))

# Score all candidates in one call: predict_proba expects a 2-D
# (n_samples, n_features) array. Column 0 belongs to the smallest class
# label, i.e. label 1, which the training cell assigns to the positives.
probs = clf3.predict_proba(seed_samples)[:, 0] if len(results) else []

# walk the candidates in order and stop at 100000 accepted ones
cnt = 0
n_checked = 0
pos_ids = []
for idx, prob in enumerate(probs):
    n_checked = idx + 1
    if prob > 0.99:
        pos_ids.append(idx)
        cnt += 1
        if cnt == 100000:
            break

# report how many candidates were examined (not the last zero-based index)
print("%d positive samples from %d candidates" % (cnt, n_checked))


In [None]:
# make the training data for 
# iteration 4
# with 100k positive and 100k negative

# positives: the candidates picked out by pos_ids (indices into results)
results_pos = make_data_samples(results, pos_ids)

seed_urls, seed_tags, seed_samples = [], [], []
for sample in results_pos:
    seed_urls.append(sample['url'])
    seed_tags.append(sample['tags'])
    seed_samples.append(sample['embeds'])
seed_samples = np.array(seed_samples)

print(seed_samples.shape)

# negatives: as many records from the negative crawl as there are positives
results_neg = load_samples(results_non_samples, len(seed_urls))
neg_urls = [sample['url'] for sample in results_neg]
neg_samples = np.array([sample['embeds'] for sample in results_neg])
print(neg_samples.shape)


In [None]:
#
# train the 4th model with 100k+100k
#

import numpy as np
from sklearn.linear_model import LogisticRegression

# both matrices must be 2-D: (number of samples, embedding length)
(npos, _), (nneg, _) = seed_samples.shape, neg_samples.shape

print('%s %s' % (npos, nneg))

# positives first, negatives after; label 1 = positive, label 2 = negative
train_data = np.concatenate((seed_samples, neg_samples), axis=0)
train_label = np.repeat([1, 2], [npos, nneg])

print(train_data.shape)
print(train_label.shape)

print(len(train_data))
print(len(train_label))

assert train_data.shape[0] == train_label.shape[0]

clf4 = LogisticRegression()
clf4.fit(train_data, train_label)


In [None]:
"""
test the model3 (clf4)
with positive and negative samples
"""

# test_samples holds the positive test samples first and the 50 negative
# ones at the end. predict_proba expects a 2-D (n_samples, n_features)
# array, so each embedding is presented as a one-row matrix.

print('test positive samples:')
for test_feature in test_samples[:50]:
    print(clf4.predict_proba(test_feature.reshape(1, -1)))
# nine URLs, one per cell of the 3x3 grid
plot_3x3(test_urls[10:19])

print('test negative samples:')
for test_feature in test_samples[-50:]:
    print(clf4.predict_proba(test_feature.reshape(1, -1)))
plot_3x3(test_urls[-9:])


In [None]:
import copy
import random

# pick 100 of the positive URLs at random for a visual spot check;
# the shuffle runs on a copy so seed_urls keeps its order
pos_urls = copy.copy(seed_urls)
random.shuffle(pos_urls)
pos_urls = pos_urls[:100]

print(pos_urls)

print(len(pos_urls))



In [None]:
# show at most the first 100 URLs in pos_urls as a 10x10 grid of thumbnails
plot_images(pos_urls, nrow=10, ncol=10)