In [13]:
%matplotlib inline
import os
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from PIL import Image, ImageFile
from urllib2 import urlopen
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from sklearn.externals import joblib


import keras
from keras import backend as K
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input
import math
import pickle

In [2]:
# Tell PIL to load image files that are truncated instead of raising an error.
ImageFile.LOAD_TRUNCATED_IMAGES = True


def delete_model(model, clear_session=True):
    """Release a model.

    Drops this function's reference to `model`, runs the garbage collector
    and, unless `clear_session` is falsy, clears the Keras backend session.
    """
    del model
    gc.collect()
    if not clear_session:
        return
    K.clear_session()


def is_url(url):
    """Return True if Django's URLValidator accepts `url`, False if it
    raises ValidationError."""
    validate = URLValidator()
    try:
        validate(url)
    except ValidationError:
        return False
    else:
        return True
    
def plot_channels(img):
    """Plot the red, green and blue channels of `img` side by side.

    `img` is converted with `np.asarray`; each panel shows a copy of the
    pixels with the two other colour channels zeroed out.
    Assumes the converted array is (height, width, channels) with at least
    three channels -- TODO confirm against callers.
    """
    # Work from the argument itself rather than from notebook-level globals.
    pixels = np.asarray(img)

    _, ax = plt.subplots(1, 3, sharex='col', sharey='row', figsize=(24, 6))
    plt.suptitle('RGB Channels of an Image', size=20)

    for keep, panel in enumerate(ax):
        single_channel = pixels.copy()
        for other in range(3):
            if other != keep:
                # Broadcasting a scalar zero clears the whole channel plane.
                single_channel[:, :, other] = 0
        panel.imshow(single_channel)
        panel.axis('off')


def read_img_url(url):
    """Open `url` with urlopen and return the image it serves, converted to
    RGB mode when it is not already RGB."""
    picture = Image.open(urlopen(url))
    return picture if picture.mode == 'RGB' else picture.convert('RGB')

def read_img_file(f):
    """Open the image at `f` with PIL and return it, converted to RGB mode
    when it is not already RGB."""
    picture = Image.open(f)
    if picture.mode == 'RGB':
        return picture
    return picture.convert('RGB')


def read_img(f):
    """Load an RGB image from `f`: fetched via `read_img_url` when
    `is_url(f)` is true, read via `read_img_file` otherwise."""
    loader = read_img_url if is_url(f) else read_img_file
    return loader(f)

def resize_img_to_array(img, img_shape=(244, 244)):
    """Resize `img` to `img_shape` with the ANTIALIAS filter and return the
    resized image as a NumPy array.

    NOTE(review): the default is 244x244; ResNet50 is commonly fed 224x224 --
    confirm the default is intentional.
    """
    resized = img.resize(img_shape, Image.ANTIALIAS)
    return np.array(resized)

In [3]:
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input

def make_resnet_conv(input_shape):
    """Build a frozen ResNet50 feature extractor.

    The network is created with ImageNet weights and `include_top=False`,
    and every layer is marked non-trainable before it is returned.

    `input_shape` is passed straight through to `ResNet50`.
    """
    resnet = ResNet50(include_top=False,
                      weights='imagenet',
                      input_shape=input_shape)
    for frozen_layer in resnet.layers:
        frozen_layer.trainable = False
    return resnet

# Read Stock/Train Embeddings from File, Fit KNN

In [4]:
# TODO: remove all afrikrea rows
meta_path = 'trial.csv'
X = pd.read_csv(meta_path)
print(X.shape)

  interactivity=interactivity, compiler=compiler, result=result)


(15695, 2053)


In [5]:
# Store product ids as strings so the .str accessor can be used on them.
X['product_id'] = X['product_id'].astype(str)

In [35]:
# Drop rows whose product_id contains 'afrikrea', then rows whose gender
# contains 'others'.
not_afrikrea = ~X['product_id'].str.contains('afrikrea')
X_train = X.loc[not_afrikrea]
known_gender = ~X_train['gender'].str.contains('others')
X_train = X_train.loc[known_gender]
print(X_train.shape)

(4229, 2053)


In [36]:
# Feature columns are those whose name contains "x_".
# `np.float` was only an alias for the builtin float (i.e. float64) and has
# been removed from recent NumPy releases, so name the dtype explicitly.
feature_cols = [col for col in X_train.columns if "x_" in col]
X_train_conv_2d = X_train[feature_cols].values.astype(np.float64)
X_train_conv_2d.shape

(4229, 2048)

In [37]:
col_names = ['filename', "gender", "category", "product_id"]
datadir = '/Users/robelmengistu/Documents/CS230_project/data/'
# Columns are selected in the DataFrame's own column order, not in the order
# listed in col_names; the joined path below therefore follows the CSV layout.
path_cols = [col for col in X_train.columns if col in col_names]
X_train_urls = X_train[path_cols].values

# One path per row: datadir followed by the row's fields joined with "/".
train_urls = [datadir + "/".join(parts) for parts in X_train_urls]
print(len(train_urls))

4229


In [38]:
# Timing note from the author: training KNN on 100K images w/ 2K features
# takes about 2 minutes.
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(algorithm='ball_tree', n_neighbors=30, n_jobs=8)
knn.fit(X_train_conv_2d)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=8, n_neighbors=30, p=2, radius=1.0)

In [39]:
#knn.kneighbors([X, 8, return_distance=True])

In [40]:
# Serialize the fitted KNN index to disk with joblib so it can be reloaded
# later via joblib.load(knn_file).
from sklearn.externals import joblib
knn_file = 'knn_train_2d_4229.pkl'
joblib.dump(knn, knn_file)

['knn_train_2d_4229.pkl']

# Swatch Embeddings

In [14]:
# Load the pickled triplet URL map.  The context manager closes the file
# handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("url_pickle.p", "rb") as url_file:
    tri_url_map = pickle.load(url_file)
print(len(tri_url_map))

44520


In [15]:
# Read From File
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("feats_pickle.p", "rb") as feats_file:
    tri_feat_map = pickle.load(feats_file)
print(len(tri_feat_map))

44520


In [16]:
def calc_dif_2 (index):
    """Return ||a - n||^2 - ||a - p||^2 for the triplet stored at
    `tri_feat_map[index]`, whose entries 0, 1, 2 are used as anchor (a),
    positive (p) and negative (n).

    A positive result means the anchor is farther from the negative than
    from the positive.
    """
    triplet = tri_feat_map[index]
    anchor, positive, negative = triplet[0], triplet[1], triplet[2]
    sq_dist_pos = np.linalg.norm(anchor - positive) ** 2
    sq_dist_neg = np.linalg.norm(anchor - negative) ** 2
    return sq_dist_neg - sq_dist_pos

In [17]:
diff = []
off_diff = []
off_train = off_val = off_test = 0
semi_hard_triplets = []
# Keep every triplet for which calc_dif_2 is positive, i.e. whose anchor is
# farther from the negative than from the positive.
for key, value in tri_feat_map.iteritems():
    keep = calc_dif_2(key) > 0
    if keep:
        semi_hard_triplets.append((value[0], value[1], value[2]))
    if key % 10000 == 0:
        # Progress: current key and number of triplets kept so far.
        print("%s %s" % (key, len(semi_hard_triplets)))

0 0
10000 8331
20000 17892
30000 26770
40000 35130


In [18]:
# Number of kept triplets, then the size of one triplet and of one embedding.
print(len(semi_hard_triplets))
print("%s %s" % (len(semi_hard_triplets[0]), len(semi_hard_triplets[0][0])))

38536
3 2048


# Stock Images Embeddings and Neighbors

In [41]:
# Show one stock image path as a sanity check of the path layout.
print(train_urls[0])

/Users/robelmengistu/Documents/CS230_project/data/women/women-skirts/468149363/2AmsaAfricanPrintMidiSkirtwithSashYellowBlue.jpeg


In [42]:
# Aliases used by the triplet helpers: embeddings and paths of stock images.
train_embeddings = X_train_conv_2d
stock_urls = train_urls
print("%s %s" % (train_embeddings.shape, len(stock_urls)))

(4229, 2048) 4229


In [43]:
def get_stock_n_set():
    """Query `knn` once per row of `train_embeddings`.

    Returns a list with one `knn.kneighbors(..., return_distance=True)`
    result per row, in row order.  Prints the running count every 500 rows.
    """
    results = []
    for done, row in enumerate(train_embeddings, 1):
        # kneighbors expects a 2-D query, so present the row as a 1-row matrix.
        query = row.reshape(1, row.shape[0])
        results.append(knn.kneighbors(query, return_distance=True))
        if done % 500 == 0:
            print(done)
    return results

In [44]:
# Neighbours of every stock embedding; entry i corresponds to row i of train_embeddings.
stock_n_test = get_stock_n_set()

500
1000
1500
2000
2500
3000
3500
4000


In [49]:
# Build triplets anchored on stock images.  get_triplets / generate_triplets
# are defined in the helper-function cells further down the notebook.
stock_triplets = []
for i, anchor_url in enumerate(stock_urls):
    neg_paths, pos_paths = get_triplets(anchor_url, stock_n_test[i], num_rec=30)
    generate_triplets(anchor_url, neg_paths, pos_paths, stock_triplets)
    if i % 1000 == 0:
        print(i)
print("%s %s" % ("total stock triplets: ", len(stock_triplets)))

0
1000
2000
3000
4000
total stock triplets:  96100


# Street Images Embeddings and Neighbors

In [53]:
# Load the street-image embeddings.  The context manager closes the file
# handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("test_embeddings_11194.p", "rb") as embeddings_file:
    test_eb = pickle.load(embeddings_file)
print("%s %s" % (len(test_eb), type(test_eb)))

11194 <type 'collections.defaultdict'>


In [55]:
street_urls = []
street_test_urls = []
street_2d_feat = []
street_test_2d_feat = []
i = 0

# Every 11th entry of test_eb (in iteration order) is held out as a street
# test example; all other entries go to the street training lists.
for key, value in test_eb.iteritems():
    i += 1
    held_out = (i % 11 == 0)
    url_bucket = street_test_urls if held_out else street_urls
    feat_bucket = street_test_2d_feat if held_out else street_2d_feat
    url_bucket.append(key)
    feat_bucket.append(value)

street_2d_feat = np.asarray(street_2d_feat)

In [56]:
# Persist the held-out street URLs (default pickle protocol).  The context
# manager guarantees the file is flushed and closed after the dump.
with open("street_test_urls.p", "wb") as urls_file:
    pickle.dump(street_test_urls, urls_file)

In [57]:
# Same list again, written with the highest pickle protocol available.  The
# context manager guarantees the file is flushed and closed after the dump.
with open("street_test_urls_p3.p", "wb") as urls_file:
    pickle.dump(street_test_urls, urls_file, protocol=pickle.HIGHEST_PROTOCOL)

In [58]:
def get_street_n_set():
    """Query `knn` once per row of `street_2d_feat`.

    Returns a list with one `knn.kneighbors(..., return_distance=True)`
    result per row, in row order.  Prints the running count every 500 rows.
    """
    collected = []
    for feature_row in street_2d_feat:
        # kneighbors expects a 2-D query, so present the row as a 1-row matrix.
        as_matrix = feature_row.reshape(1, feature_row.shape[0])
        collected.append(knn.kneighbors(as_matrix, return_distance=True))
        count = len(collected)
        if count % 500 == 0:
            print(count)
    return collected

In [59]:
# Neighbours (among the stock images indexed by knn) of every row of street_2d_feat.
street_n_test = get_street_n_set()

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000


In [60]:
# Alias, not a copy: both names refer to the same list of neighbour results.
street_n_train = street_n_test

In [84]:
# Build triplets anchored on street images; positives and negatives are stock
# images.  get_triplets / generate_triplets are defined in the helper-function
# cells further down the notebook.
street_triplets = []
for i, anchor_url in enumerate(street_urls):
    neg_paths, pos_paths = get_triplets(anchor_url, street_n_train[i], num_rec=30)
    generate_triplets(anchor_url, neg_paths, pos_paths, street_triplets, street=True)
    if i % 1000 == 0:
        print(i)
print("%s %s" % ("total triplets: ", len(street_triplets)))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
total triplets:  198320


# Merge All Triplets

Total Number of Triplets  = 332956 <br> 
Train/Val/Test = 319900/6528/6528

In [90]:
from random import Random, shuffle

# Fixed seed so the shuffled order -- and therefore the train/val/test
# assignment, which is made by walking this list in order -- is the same on
# every Restart & Run All.
SHUFFLE_SEED = 0

final_triplets = street_triplets + stock_triplets + semi_hard_triplets
print(len(final_triplets))
# A private Random instance keeps the module-level random state untouched.
Random(SHUFFLE_SEED).shuffle(final_triplets)

332956


In [95]:
# total, train/val/test:  319900 6528 6528
#
# Walk final_triplets in order and deal them into train / val / test lists:
# after each run of 98 train triplets, the next 2 go to val and the following
# 2 go to test.  The three components of every triplet are appended to
# s_h_map under the keys "<split>a_file.pkl", "<split>p_file.pkl" and
# "<split>n_file.pkl".
from collections import defaultdict 
triplet_count = 0
train_count, val_count, test_count = 0,0,0
t_set = "train"
PRODUCT_INDEX = 6  # not used anywhere in this cell
val_set_count, test_set_count = 0,0

s_h_map = defaultdict(list)

for tri_tuple in final_triplets:
    # train_count stops advancing while we are in val/test, so this condition
    # stays true for the whole val+test burst and t_set is re-derived from the
    # two burst counters on every iteration.
    if train_count >0 and  train_count% 98 == 0:
        t_set = "val"
        if val_set_count == 2:
            t_set = "test"
            if test_set_count == 2:
                # Burst finished: back to train (which bumps train_count and
                # makes the condition above false again).
                t_set = "train"
    
    s_h_map[t_set + "a_file.pkl"].append(tri_tuple[0])
    s_h_map[t_set + "p_file.pkl"].append(tri_tuple[1])
    s_h_map[t_set + "n_file.pkl"].append(tri_tuple[2])

    
    if t_set == "train": 
        train_count += 1 
        # Reset the burst counters so the next val/test burst starts fresh.
        val_set_count =0 
        test_set_count = 0
    elif t_set == "val": 
        test_set_count = 0
        val_set_count += 1
        val_count += 1 
    elif t_set == "test": 
        test_count += 1 
        test_set_count += 1
    
    triplet_count += 1

    # Progress report every 1000 triplets.
    if triplet_count % 1000 == 0:
        print "total/train/val/test: ", triplet_count, train_count, val_count, test_count

total/train/val/test:  1000 964 18 18
total/train/val/test:  2000 1924 38 38
total/train/val/test:  3000 2884 58 58
total/train/val/test:  4000 3844 78 78
total/train/val/test:  5000 4804 98 98
total/train/val/test:  6000 5768 116 116
total/train/val/test:  7000 6728 136 136
total/train/val/test:  8000 7688 156 156
total/train/val/test:  9000 8648 176 176
total/train/val/test:  10000 9608 196 196
total/train/val/test:  11000 10572 214 214
total/train/val/test:  12000 11532 234 234
total/train/val/test:  13000 12492 254 254
total/train/val/test:  14000 13452 274 274
total/train/val/test:  15000 14412 294 294
total/train/val/test:  16000 15376 312 312
total/train/val/test:  17000 16336 332 332
total/train/val/test:  18000 17296 352 352
total/train/val/test:  19000 18256 372 372
total/train/val/test:  20000 19216 392 392
total/train/val/test:  21000 20180 410 410
total/train/val/test:  22000 21140 430 430
total/train/val/test:  23000 22100 450 450
total/train/val/test:  24000 23060 470 47

total/train/val/test:  215000 206572 4214 4214
total/train/val/test:  216000 207532 4234 4234
total/train/val/test:  217000 208492 4254 4254
total/train/val/test:  218000 209452 4274 4274
total/train/val/test:  219000 210412 4294 4294
total/train/val/test:  220000 211376 4312 4312
total/train/val/test:  221000 212336 4332 4332
total/train/val/test:  222000 213296 4352 4352
total/train/val/test:  223000 214256 4372 4372
total/train/val/test:  224000 215216 4392 4392
total/train/val/test:  225000 216180 4410 4410
total/train/val/test:  226000 217140 4430 4430
total/train/val/test:  227000 218100 4450 4450
total/train/val/test:  228000 219060 4470 4470
total/train/val/test:  229000 220020 4490 4490
total/train/val/test:  230000 220984 4508 4508
total/train/val/test:  231000 221944 4528 4528
total/train/val/test:  232000 222904 4548 4548
total/train/val/test:  233000 223864 4568 4568
total/train/val/test:  234000 224824 4588 4588
total/train/val/test:  235000 225788 4606 4606
total/train/v

In [96]:
# Final split sizes.
print("%s %s %s %s %s" % ("total/train/val/test: ", triplet_count, train_count, val_count, test_count))

total/train/val/test:  332956 319900 6528 6528


In [105]:
# Number of positives stored for the train split.
print(len(s_h_map["train" + "p_file.pkl"]))

319900


# Save All Embeddings to File 

<b>Input for Triplet Network</b> <br>
General File Structure: <br>
- embedding_data_final <br>
    - train 
        - anchors
            - a_file.pkl
        - positives 
            - p_file.pkl
        - negatives 
            - n_file.pkl
    - test
        - (same layout as train)
    - val 
        - (same layout as train)

In [107]:

# Write the anchor / negative / positive lists of every split to
# embedding_data_final/<split>/<role>/<file>.
for t_set in ["train", "val", "test"]:
    for role_dir, file_name in [("anchors", "a_file.pkl"),
                                ("negatives", "n_file.pkl"),
                                ("positives", "p_file.pkl")]:
        out_dir = os.path.join("embedding_data_final", t_set, role_dir)
        # Only directory creation is conditional on the directory being absent.
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        # Always (re)write the pickle.  Skipping the dump whenever the
        # directory already exists would leave stale or missing files after a
        # re-run, and the three files of a split must stay aligned with each
        # other index by index.
        with open(os.path.join(out_dir, file_name), "wb") as out_file:
            pickle.dump(s_h_map[t_set + file_name], out_file,
                        protocol=pickle.HIGHEST_PROTOCOL)

# Generate hard triplets (Helper Functions)

In [26]:
def get_triplets(url, neighbors, num_rec=8, per_row=4):
    """Pick positive and negative stock images for the query image at `url`.

    The query category is component CATEGORY_INDEX of `url` split on "/".
    The first `num_rec` neighbour indices in `neighbors[1][0]` are looked up
    in `train_urls`; a neighbour whose path has the same category component
    is a positive, any other neighbour is a negative.

    Parameters
    ----------
    url : str
        Path of the query image.
    neighbors : (distances, indices)
        Result of `knn.kneighbors(..., return_distance=True)` for one query.
    num_rec : int
        Number of neighbours to inspect.  If fewer are available, all of
        them are inspected instead of raising IndexError.
    per_row : int
        Unused; kept for backward compatibility.

    Returns
    -------
    (neg_paths, pos_paths)
        The first five negatives and the last five positives, in neighbour
        order.  Assuming neighbours are ordered nearest-first (as sklearn's
        kneighbors returns them), these are the closest negatives and the
        farthest positives -- TODO confirm this is the intended selection.
    """
    CATEGORY_INDEX = 7   # TODO: change depending on data location
    query_category = url.split("/")[CATEGORY_INDEX]

    neg_paths = []
    pos_paths = []
    # Slicing bounds the walk by both num_rec and the neighbours available.
    for k in neighbors[1][0][:num_rec]:
        candidate = train_urls[k]
        if candidate.split("/")[CATEGORY_INDEX] == query_category:
            pos_paths.append(candidate)
        else:
            neg_paths.append(candidate)
    return neg_paths[:5], pos_paths[-5:]

In [83]:
def generate_triplets(anchor, neg_paths, pos_paths, triplets_list, street=False):
    """Append one (anchor, positive, negative) embedding triplet to
    `triplets_list` for every (negative, positive) pair.

    Positive and negative embeddings are the rows of `train_embeddings` at
    the position of the path in `train_urls`.  The anchor embedding is
    `test_eb[anchor]` when `street` is true, otherwise it is looked up in
    `train_embeddings` like the others.

    Triplets are appended negative-major: all positives for the first
    negative, then all positives for the second negative, and so on.
    Nothing is looked up or appended when either path list is empty.
    """
    if len(neg_paths) == 0 or len(pos_paths) == 0:
        return

    # list.index is a linear scan of train_urls, so resolve every path once
    # up front instead of once per (negative, positive) pair.
    if street:
        anchor_vec = test_eb[anchor]
    else:
        anchor_vec = train_embeddings[train_urls.index(anchor)]
    pos_vecs = [train_embeddings[train_urls.index(pos)] for pos in pos_paths]

    for neg in neg_paths:
        neg_vec = train_embeddings[train_urls.index(neg)]
        for pos_vec in pos_vecs:
            triplets_list.append((anchor_vec, pos_vec, neg_vec))

In [15]:
def accuracy_score (q_cat, r_cat, num_rec): 
    """Score one recommended category against the query category.

    Returns 1.0/num_rec when the categories are equal or both belong to one
    of the full-match groups, 1.0/(2*num_rec) when they only share a
    half-match group, and 0 otherwise.
    """
    match_score = 1.0 / num_rec
    half_match_score = 1.0 / (num_rec * 2)

    if r_cat == q_cat:
        return match_score

    full_groups = (
        ["women-dresses", "women-outerwear", "women-matching-sets"],
        ["women-tops", "women-jackets"],
    )
    half_groups = (
        ["women-jumpsuits", "women-pants-and-shorts", "women-matching-sets", "women-dresses", "women-skirts"],
        ["women-tops", "women-jackets", "women-dresses", "women-matching-sets", "women-outerwear"],
    )

    def share_group(groups):
        # The two categories differ here, so "both in the same group" is the
        # same as "the ordered pair is one of the group's distinct pairs".
        return any(r_cat in group and q_cat in group for group in groups)

    if share_group(full_groups):
        return match_score
    if share_group(half_groups):
        return half_match_score
    return 0

In [13]:
# calc the mean avg. precision
def get_map(url, neighbors, J, num_rec=8, per_row=4):
    """Accumulate a precision-style score over the recommendations for one query.

    The query category is component CATEGORY_INDEX of `url` split on "/".
    Neighbour indices in `neighbors[1][0]` are visited in order; a neighbour
    whose `J['product_id'][k]` was already scored is skipped.  Each remaining
    neighbour is scored with `accuracy_score`, and the running mean of all
    scores so far is added to the result.  Scoring stops after `num_rec`
    distinct products, or when the neighbour list is exhausted.

    Parameters
    ----------
    url : str
        Path of the query image.
    neighbors : (distances, indices)
        Result of `knn.kneighbors(..., return_distance=True)` for one query.
    J : mapping with 'product_id' and 'category' entries indexable by k.
        NOTE(review): if J is a pandas DataFrame whose index was not reset
        after filtering, `J[col][k]` is a label lookup, not a positional
        one -- confirm against the caller.
    num_rec : int
        Number of distinct products to score.
    per_row : int
        Unused; kept for backward compatibility.

    Returns
    -------
    The accumulated sum of running means (0 when nothing was scored).
    """
    CATEGORY_INDEX = 7   # TODO: change depending on data location
    query_category = url.split("/")[CATEGORY_INDEX]

    MAP = 0
    accuracy = []
    product_recommended = []
    # Iterating the neighbour list directly means that skipping duplicate
    # products can never run past its end.
    for k in neighbors[1][0]:
        if len(accuracy) == num_rec:
            break
        product = J['product_id'][k]
        if product in product_recommended:
            continue

        accuracy.append(accuracy_score(query_category, J['category'][k], num_rec))
        MAP += (sum(accuracy) / len(accuracy))
        product_recommended.append(product)

    return MAP
        
        

In [14]:
def get_neighbors_set(f_set, knn_file):
    '''
    Loads the KNN index from `knn_file` and builds a ResNet50 feature
    extractor, then, for every item `f` in `f_set`, converts it to
    convolutional features with `get_conv_feats` and queries the KNN index.

    Returns a list with one `knn.kneighbors(..., return_distance=True)`
    result per item, in input order.  The model is released with
    `delete_model` even if feature extraction or a query raises.

    Relies on `img_width`, `img_height` and `get_conv_feats` being defined
    elsewhere in the notebook.
    '''
    knn = joblib.load(knn_file)
    model = make_resnet_conv(input_shape=[img_width, img_height, 3])
    try:
        neighbors_set = [
            knn.kneighbors(get_conv_feats(f, model), return_distance=True)
            for f in f_set
        ]
    finally:
        # Always free the network, otherwise a failure mid-loop leaves the
        # model and its backend session alive in the kernel.
        delete_model(model)
    return neighbors_set