# LDA-based Recommender System
LDA is often described as the simplest topic model (Blei and Lafferty, 2009). The intuition behind this model is that documents exhibit multiple topics.

The aim of topic modelling is to automatically discover the topics from a collection of documents, which are observed, while the topic structure is hidden.

Rather than simply concatenating image and text features and
linearly combining them, PolyLDA enables us to learn two distinct
but coupled latent style representations. This allows for a versatile
interpretation of style. We then use these learned styles to make
bundle or assortment recommendations

## Reference
0. LDA package: https://github.com/lda-project/lda
1. LDA-based book recommender system: https://humboldt-wi.github.io/blog/research/information_systems_1819/is_lda_final/
2. A Multimodal Recommender System for Large-scale Assortment Generation in E-commerce: https://arxiv.org/abs/1806.11226
3. Discovering Style Trends through Deep Visually Aware Latent Item Embeddings: https://arxiv.org/abs/1804.08704
4. Text LDA code demo, https://github.com/agrawal-priyank/machine-learning-clustering-retrieval/blob/master/latent-dirichlet-allocation/latent-dirichlet-allocation.ipynb
5. Build image recommendation app: https://towardsdatascience.com/a-flask-app-for-image-recommendations-a865e1496a0d, 
https://towardsdatascience.com/image-recommendations-with-pytorch-flask-postgresql-heroku-deployment-206682d06c6b

## Prepare for raw data
**Note:** the scraping code downloaded more images than there are records in the CSV, so only the items that exist in both are kept.

In [None]:
import os
from glob import glob
import pickle
import shutil
import pandas as pd
import numpy as np
from pathlib import Path
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Project root directory; every data/model path below is derived from it.
ROOT = os.environ.get("ROOT")
if ROOT is None:
    # Fail here with an actionable message instead of a TypeError on the first `ROOT + ...`.
    raise EnvironmentError("Environment variable ROOT is not set; export it to the project root directory.")

In [None]:
# Filesystem layout, all derived from ROOT by plain string concatenation.
meta_path = ROOT + '/Meta.nosync'  # curated images, one sub-folder per category
model_path = ROOT + '/Model'
static_model_path = ROOT + '/Engine/imageRecommender/static/pickles'  # pickles written for the web app
static_image_path = ROOT + '/Engine/imageRecommender/static/site_imgs/images'  # images copied for the web app
# Category names; passed as `classes=` to flow_from_directory, so they must match the sub-folder names.
CLS = ['chairs-chaises', 'coffee-tables', 'ottomans-benches', 'tv-stands-tv-mounts', 
       'recliners', 'sofa-console-tables', 'sofas', 'end-accent-tables', 'cabinets-shelvings']
# NOTE(review): NUM_CLASSES is 6 while CLS lists 9 categories, and NUM_CLASSES is not used in the visible code - confirm.
NUM_CLASSES = 6
RANDOM_CROP_SIZE = 64  # side length of the random crops produced by crop_generator
TARGET_SIZE = 224  # images are resized to TARGET_SIZE x TARGET_SIZE when loaded
BATCH_SIZE = 8
LR = 0.01
SEED = 1
PLOT_SIZE_1 = (40, 40)  # figure size used for the per-topic image strips
PLOT_SIZE_2 = (10, 10)  # figure size used for single images and filter / feature-map grids

In [None]:
# Load the product metadata; the first CSV column is used as the index.
df_org = pd.read_csv(Path(ROOT, 'Furniture.csv'), index_col=0)
df_org['keep'] = 0 # 1 means keep the record, otherwise drop it
df_org['img_abs_path'] = None # local image path, filled in when a matching image is copied

In [None]:
# remove existing folder
if os.path.exists(meta_path):
    shutil.rmtree(meta_path)

In [None]:
# freq of each type
df_org['category'].value_counts()

In [None]:
df_org.head()

In [None]:
# Categories excluded from this study.
types_remove = ['living-room-packages', 'sectionals', 'loveseats']
img_abs_path = glob(ROOT + '/Meta_org/*/*')

# Create the target folder plus one sub-folder per retained category.
os.makedirs(meta_path, exist_ok=True)
for category in df_org['category'].unique():
    if category not in types_remove:
        os.makedirs(Path(meta_path, category), exist_ok=True)

# Build the name lookup once; membership tests against a set are O(1) per image
# instead of rebuilding and scanning a list for every file.
known_names = set(df_org['name'])

# Copy every scraped image that has a CSV record and belongs to a retained category.
for src in img_abs_path:
    src_path = Path(src)
    subfolder_name = src_path.parent.name
    # Path.stem strips only the final extension, so product names containing dots still match.
    file_name_1 = src_path.stem
    if subfolder_name not in types_remove and file_name_1 in known_names:
        print('image exist in csv so move it to meta folder')
        dest_dir = Path(meta_path, subfolder_name)
        # The sub-folder may be missing when the scraped folder name is not a CSV category;
        # without it shutil.copy would create a *file* named after the folder.
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(src, dest_dir)
        matches = df_org['name'] == file_name_1
        df_org.loc[matches, 'keep'] = 1
        # Record the path of the file actually copied (keeps its real extension).
        df_org.loc[matches, 'img_abs_path'] = str(dest_dir / src_path.name)
    else:
        print(src, 'is not requried or not match with the record in csv')

In [None]:
df_0 = df_org[df_org['keep'] == 1]
print('number of records left are', len(df_0))

In [None]:
assert len(glob(meta_path + '/*/*')) == len(df_0), 'csv record does not match with image record, need to troubleshoot'

In [None]:
# # if above has error, execute below
# # manually process unmatched records, e.g. delete imgs from meta folder
# img_abs_path = glob(meta_path + '/*/*')
# for index, row in df_0.iterrows():
#     try:
#         idx = [i for i, s in enumerate(img_abs_path) if row['name'] + '.jpg' in s]
#         if len(idx) > 1:
#             print(idx)
#             print(np.array(img_abs_path)[idx])
#             print(df.loc[df['name'] == row['name'], ['category', 'name']])
#     except Exception as e:
#         print(e)

In [None]:
# # manually process unmatched types
# img_abs_path = glob(meta_path + '/*/*')
# for index, row in df_0.iterrows():
#     for i in img_abs_path:
#         if row['name'] + '.jpg' == i.rsplit('/')[-1]:
#             assert row['category'] == i.rsplit('/')[-2], i

In [None]:
# remove image that has no white background
def find_white_background(img_abs_path, threshold=0.01):
    """Report whether an image contains enough pure-white values.

    The image is loaded at TARGET_SIZE x TARGET_SIZE and every array element
    is compared with 255; the share of matching elements is tested against
    `threshold`.

    Args:
        img_abs_path: path of the image file to inspect.
        threshold: minimum share of elements equal to 255.

    Returns:
        True when the share is at least `threshold`, otherwise False.
    """
    pixels = img_to_array(load_img(img_abs_path, target_size=(TARGET_SIZE, TARGET_SIZE)))
    white = np.array([255, 255, 255])
    # Element-wise comparison (broadcast against the 3-value white vector),
    # normalised by the total number of array elements.
    white_share = (pixels == white).sum() / pixels.size
    return True if white_share >= threshold else False
    
# Flag every retained image; wb_idx keeps only the rows whose image passed the check.
wb = df_0['img_abs_path'].apply(find_white_background)
wb_idx = wb[wb == True]
print(
    'No. of pics that have no white groud is',
    len(wb[wb != True]),
    ', remove them from the study.',
)

In [None]:
df_1 = df_0.loc[wb_idx.index, :]

## Text LDA

In [None]:
from pathlib import Path
import re
import pandas as pd
import numpy as np
import collections
from glob import glob
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
# import enchant
# from nltk.tokenize import word_tokenize, pos_tag
# The NLTK Lemmatization method is based on WordNet’s built-in morph function.
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.image import load_img
import lda
import matplotlib.pyplot as plt

In [None]:
df_1 = df_1.reset_index(drop=True)
# shuffle dataset
df = df_1.sample(frac = 1, random_state=1).reset_index(drop=True)
df.head()

In [None]:
# freq of each type
df['category'].value_counts()

In [None]:
def plot_img(img_abs_path):
    """Show the image stored at `img_abs_path` in a 7x7 figure titled with the path.

    Any exception raised while loading or drawing is printed, not propagated.
    """
    try:
        picture = load_img(img_abs_path)
        plt.figure(figsize=[7, 7])
        plt.imshow(picture)
        plt.axis('off')
        plt.title(img_abs_path)
        plt.show()
    except Exception as err:
        print(err)

In [None]:
def text_cleaner(lst: list) -> str:
    """Extract the finish/material/cover descriptions from attribute strings.

    Only entries whose lower-cased text contains 'finish', 'material' or
    'cover' are kept. Each kept entry is split on ':', broken into chunks that
    start at a digit or an upper-case letter, stripped of every character that
    is neither a letter nor whitespace, and the chunks are joined with single
    spaces.

    Args:
        lst: attribute strings, e.g. ``['Finish: Dark Brown', 'Weight: 20 lbs']``.

    Returns:
        The cleaned text with runs of whitespace collapsed; '' when nothing matches.
    """
    keywords = ('finish', 'material', 'cover')
    # Raw strings: '\s' inside a plain literal is an invalid escape sequence.
    chunk_pattern = re.compile(r'(?:[0-9]|[A-Z])[^A-Z]*')
    non_letter_pattern = re.compile(r'[^\sA-Za-z]')

    cleaned_lst = []
    for item in lst:
        lowered = item.lower()
        # only extract informative tags
        if not any(keyword in lowered for keyword in keywords):
            continue
        for part in item.split(':'):
            # chunks always start with a digit or a capital letter
            for chunk in chunk_pattern.findall(part):
                # skip chunks that start with '[' or ' b'
                if chunk.startswith(("[", " b")):
                    continue
                cleaned_lst.append(non_letter_pattern.sub(' ', chunk).lstrip(' '))
    # split()/join collapses the extra whitespace left by the substitutions
    return ' '.join(' '.join(cleaned_lst).split())

# print(df['attributes'][2])
# print(text_cleaner(df['attr_1'][2]))

In [None]:
# d = enchant.Dict("en_US")
d = words.words()
def text_lemmatizer(s: str) -> str:
    """Keep the nouns and adjectives of `s` and return their lower-cased lemmas.

    Tokens are POS-tagged; a token is kept when its tag contains 'NN' or 'JJ'
    (nouns and adjectives, per the original intent) and it is longer than
    three characters. No dictionary check is applied, so non-dictionary words
    such as 'microsuede' are kept as long as they pass the tag/length filter.

    Args:
        s: whitespace-separated text.

    Returns:
        The lemmas joined with single spaces ('' when nothing is kept).
    """
    # One lemmatizer for the whole call instead of a new instance per word.
    lemmatizer = WordNetLemmatizer()
    tagged = pos_tag(word_tokenize(s))
    # TODO: consider single and plural the same
    lemmas = [
        lemmatizer.lemmatize(word.lower())
        for word, tag in tagged
        if ('NN' in tag or 'JJ' in tag) and len(word) > 3
    ]
    return ' '.join(lemmas)

# print(df['name'][12])
# print(df['attr_1'][12])
# print(df['attr_2'][12])
# text_lemmatizer(df['attr_2'][12])

In [None]:
# Frequent but uninformative words dropped by text_remover(); built once so each
# token costs a single O(1) membership test instead of a list rebuild and scan.
_UNHELPFUL_WORDS = frozenset([
    'finish', 'material', 'materials', 'cover', 'frame', 'fabric', 'content',
    'unique', 'feature', 'features', 'provide', 'ensure', 'require', 'create',
    'look', 'use', 'color', 'colour', 'warranty', 'year', 'key',
    'construction', 'furniture',
])


def text_remover(s: str) -> str:
    """Drop the hand-picked frequent, non-helpful words from `s`.

    The comparison is case-insensitive; surviving tokens keep their original
    casing and are joined with single spaces.

    Args:
        s: text to filter.

    Returns:
        The filtered text ('' when every token is removed).
    """
    return ' '.join(w for w in word_tokenize(s) if w.lower() not in _UNHELPFUL_WORDS)

### Text cleaning

In [None]:
# plot image based on its position in the file
idx = 0
img_path = df.iloc[idx]['img_abs_path']
plot_img(img_path)

In [None]:
# plot image based on its position in the file
idx = 100
img_name = df.iloc[idx]['img_abs_path']
plot_img(img_name)

In [None]:
# 1) split the raw details on commas and drop empty fragments
df['attr_1'] = df['details'].apply(lambda details: [part for part in details.split(',') if part])
# 2) keep the finish/material/cover fragments, stripped of special characters
df['attr_2'] = df['attr_1'].apply(text_cleaner)
# 3) lemmatization
df['attr_3'] = df['attr_2'].apply(text_lemmatizer)
# 4) remove the most frequent non-meaningful words
df['attr_4'] = df['attr_3'].apply(text_remover)

In [None]:
for i in range(10):
    print(i, df['name'].iloc[i], '----->', df['attr_2'].iloc[i], '----->', df['attr_3'].iloc[i], '----->', df['attr_4'].iloc[i])

In [None]:
# visualize most frequent words, handcrafted excluded in text_remover()
# Read 'attr_3' by name: positional access on itertuples() silently picks a
# different column as soon as any column is added after 'attr_3'.
all_tokens = []
for text in df['attr_3']:
    all_tokens.extend(word_tokenize(text))

counter = collections.Counter(all_tokens)
result = counter.most_common(50)
mc = pd.DataFrame(result, columns=['Word', 'Count'])
mc.plot.bar(x='Word', y='Count', figsize=(20, 20))
# decide to remove all words up to 'Leather'

### Create bag of words
concept: https://www.freecodecamp.org/news/an-introduction-to-bag-of-words-and-how-to-code-it-in-python-for-nlp-282e87a9da04/

In [None]:
text_corpus = df['attr_4'].tolist() # if use attr_2, 2161 vocabs
vectorizer = CountVectorizer(stop_words='english', analyzer='word', lowercase=True)
X_vect = vectorizer.fit_transform(text_corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = tuple(vectorizer.get_feature_names_out())
else:
    vocab = tuple(vectorizer.get_feature_names())
X = X_vect.toarray()
X.shape

### Build model
X: array-like (samples, n_features)
vocab: tuple of string

In [None]:
class lda_model():
    """Fit an `lda.LDA` topic model on a document-term matrix and inspect the topics.

    Typical use: ``fit()``, then ``topic_path(df)`` to assign a topic to every
    row of `df`, then ``topic_representation()`` to print the top words and
    show sample images per topic.

    Args:
        X: document-term count matrix, array-like (samples, n_features).
        corpus: the documents (one string per row of `X`).
        vocab: tuple of feature names, aligned with the columns of `X`.
        n_topics: number of topics to fit.
        n_iter: number of sampling iterations passed to `lda.LDA`.
        n_top_docs: images shown per topic by `topic_representation`.
        n_top_words: words printed per topic by `topic_representation`.
        random_state: seed passed to `lda.LDA`.
    """

    def __init__(self, X, corpus, vocab, n_topics, n_iter, n_top_docs=10, n_top_words=20, random_state=1):
        self.X = X
        self.corpus = corpus
        self.vocab = vocab
        self.n_topics = n_topics
        self.n_iter = n_iter
        self.n_top_docs = n_top_docs
        self.n_top_words = n_top_words
        self.random_state = random_state
        # DataFrame labelled by topic_path(); used by topic_representation().
        self.df = None

    @staticmethod
    def jaccard_similarity(list1, list2):
        """Return |A & B| / |A | B| for the two collections, or 0.0 when both are empty."""
        s1 = set(list1)
        s2 = set(list2)
        union = s1 | s2
        if not union:
            # Avoid ZeroDivisionError for two empty inputs.
            return 0.0
        return len(s1 & s2) / len(union)

    def fit(self):
        """Fit the LDA model on `self.X` and return the fitted model."""
        self.model = lda.LDA(n_topics=self.n_topics, n_iter=self.n_iter, random_state=self.random_state)
        self.model.fit(self.X)  # model.fit_transform(X) is also available
        return self.model

    def topic_path(self, df):
        """Assign each document its most probable topic and score topic overlap.

        Writes the topic index into ``df['topic']`` (in place) and remembers
        `df` for `topic_representation`.

        Args:
            df: DataFrame with one row per document, in the same order as `self.X`.

        Returns:
            Mean pairwise Jaccard similarity between the token sets of the
            topics; 0.0 when there are fewer than two topics.
        """
        doc_topic = self.model.transform(self.X)

        # assign topics
        max_topic_idx = np.argmax(doc_topic, axis=1)
        df['topic'] = max_topic_idx
        self.df = df

        # Collect the tokens of every topic in a plain list. Writing into
        # vars()/locals() inside a function is not reliable (and stops working
        # with the Python 3.13 locals() semantics, PEP 667).
        corpus = np.array(self.corpus)
        topic_tokens = []
        for i in range(self.n_topics):
            doc_idx = np.where(max_topic_idx == i)[0]
            topic_tokens.append(' '.join(corpus[doc_idx].tolist()).split(' '))

        # calculate Jaccard similarity for every unordered pair of topics
        sims = []
        for i in range(self.n_topics):
            for j in range(i + 1, self.n_topics):
                sims.append(self.jaccard_similarity(topic_tokens[i], topic_tokens[j]))
        # No pairs exist with fewer than two topics; report 0.0 instead of dividing by zero.
        sim_score = sum(sims) / len(sims) if sims else 0.0
        print('mean of jaccard similarity is {:.2f}'.format(sim_score))
        return sim_score

    def topic_representation(self, df=None):
        """Print the top words of every topic and plot up to `n_top_docs` images.

        Args:
            df: DataFrame carrying 'topic' and 'img_abs_path' columns. Defaults
                to the DataFrame last passed to `topic_path`.

        Raises:
            RuntimeError: when no labelled DataFrame is available.
        """
        if df is None:
            df = self.df
        if df is None or 'topic' not in df.columns:
            raise RuntimeError('call topic_path(df) before topic_representation()')

        # get word prob per topic
        topic_word = self.model.components_
        vocab = np.array(self.vocab)

        for i in range(self.n_topics):
            # argsort is ascending: reverse it and keep exactly n_top_words entries
            # (the slice [:-n:-1] yields only n - 1 words).
            top_idx = np.argsort(topic_word[i])[::-1][:self.n_top_words]
            topic_words = vocab[top_idx]
            print('Topic {} has {} documents'.format(i, len(df[df['topic'] == i])))
            print('Topic {}: {}'.format(i, ' '.join(topic_words)))
            print('-------------List below is the top n furnitures for topic %s-----------------' %i)
            plt.figure(figsize = PLOT_SIZE_1)
            for j in range(self.n_top_docs):
                # Best effort: a topic with fewer documents or an unreadable
                # image only prints the error and leaves the slot empty.
                try:
                    img_path = df.loc[df['topic'] == i, 'img_abs_path'].iloc[j]
                    ax = plt.subplot(1, self.n_top_docs, j+1)
                    ax.set_xticks([])
                    ax.set_yticks([])
                    plt.imshow(load_img(img_path, target_size=(TARGET_SIZE, TARGET_SIZE)))
                except Exception as e:
                    print(e)
            plt.show()
        return None
    
# model = lda_model(X, multi_corpus, vocab, n_topics=6, n_iter=100)
# model.fit()
# sim_score = model.topic_path(df)

### Determine optimal topic number

In [None]:
N_TOPICS = [2, 3, 4, 5, 6, 7, 8]
N_ITER = 1000
RAND_STATE = 1
N_TOP_WORDS = 10
N_TOP_DOCS = 10

In [None]:
if False:
    sim_scores = []
    for n in N_TOPICS:
        model = lda_model(X, text_corpus, vocab, n_topics=n, n_iter=N_ITER)
        model.fit()
        sim_score = model.topic_path(df)
        sim_scores.append(sim_score)

In [None]:
if False:
    plt.figure()
    plt.plot(N_TOPICS, sim_scores, label='Average Overlap Between Topics', linewidth=2, markersize=12)
    plt.xlabel('Number of topics')
    plt.ylabel('Average Jaccard similarity')   
    plt.title('Average Jaccard Similarity Between Topics')
    plt.show()

### Topic representation

In [None]:
model = lda_model(X, text_corpus, vocab, n_topics=6, n_iter=1000)
model.fit()
sim_score = model.topic_path(df)
model.topic_representation()

## Visual LDA

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, optimizers
import matplotlib.pyplot as plt
import numpy as np
import math
import random
from glob import glob
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity

np.random.seed(1)

In [None]:
def map_to_labels(v):
    """Translate a one-hot class vector into its display label.

    Args:
        v: numpy array; the position of its first entry equal to 1 selects the label.

    Returns:
        The label string for that position.

    Raises:
        IndexError: when `v` has no entry equal to 1, or the position has no
            label (the if-chain this replaces left the result unbound then).
    """
    labels = (
        'chairs',
        'coffeetables',
        'ottman & benches',
        'TV stands & mounts',
        'recliners',
        'sofa & console tables',
        'sofas',
        'end & accent tables',
        'cabinets & shelvings',
    )
    hot = np.where(v == 1)[0]
    if len(hot) == 0:
        raise IndexError('map_to_labels: vector contains no entry equal to 1')
    idx = hot[0]
    if idx >= len(labels):
        raise IndexError('map_to_labels: no label defined for class index {}'.format(idx))
    return labels[idx]

In [None]:
def plots(imgs, figsize=(60,30), rows=1, interp=False, titles=None):
    """Show a batch of images in a grid with `rows` rows.

    Args:
        imgs: sequence of images. numpy arrays are cast to uint8 and, when the
            last axis is not of size 3, transposed from (N, C, H, W) to (N, H, W, C).
        figsize: matplotlib figure size.
        rows: number of grid rows.
        interp: when true, matplotlib's default interpolation is used;
            otherwise interpolation is disabled ('none').
        titles: optional one-hot label vectors, one per image, rendered
            through `map_to_labels`.
    """
    if isinstance(imgs[0], np.ndarray):
        imgs = np.array(imgs).astype(np.uint8)
        if imgs.shape[-1] != 3:
            imgs = imgs.transpose((0, 2, 3, 1))
    f = plt.figure(figsize=figsize)
    # Ceiling division: enough columns for every image whatever `rows` is
    # (testing len(imgs) % 2 under-sized the grid for rows > 2).
    cols = -(-len(imgs) // rows)
    for i in range(len(imgs)):
        sp = f.add_subplot(rows, cols, i + 1)
        sp.axis('Off')
        if titles is not None:
            sp.set_title(map_to_labels(titles[i]), fontsize=16)
        plt.imshow(imgs[i], interpolation=None if interp else 'none')

In [None]:
# random cropping in ImageDataGenerator 
def random_crop(img, random_crop_size):
    """Cut a randomly positioned (dy, dx) window out of a channels-last RGB image.

    Args:
        img: array of shape (height, width, 3).
        random_crop_size: (dy, dx) size of the window.

    Returns:
        The sliced window, shape (dy, dx, 3).
    """
    # Note: image_data_format is 'channel_last'
    assert img.shape[2] == 3
    rows, cols = img.shape[:2]
    crop_h, crop_w = random_crop_size
    # Draw the horizontal offset first, then the vertical one, so seeded runs
    # consume the global numpy RNG in the same order.
    left = np.random.randint(0, cols - crop_w + 1)
    top = np.random.randint(0, rows - crop_h + 1)
    return img[top:top + crop_h, left:left + crop_w, :]

def crop_generator(batches, crop_length):
    """Yield random square crops of the image batches produced by `batches`.

    Args:
        batches: iterator yielding ``(batch_x, batch_y)`` pairs, where
            `batch_x` has shape (N, height, width, 3), e.g. a Keras image
            iterator.
        crop_length: side length of the square crop.

    Yields:
        ``(batch_crops, batch_y)`` with `batch_crops` of shape
        (N, crop_length, crop_length, 3).
    """
    while True:
        try:
            batch_x, batch_y = next(batches)
        except StopIteration:
            # A StopIteration escaping a generator body is turned into
            # RuntimeError (PEP 479); end cleanly when the source is finite.
            return
        batch_crops = np.zeros((batch_x.shape[0], crop_length, crop_length, 3))
        for i in range(batch_x.shape[0]):
            batch_crops[i] = random_crop(batch_x[i], (crop_length, crop_length))
        yield (batch_crops, batch_y)

In [None]:
class image_representation():
    """Describe one image by the active channels of one convolutional layer.

    Call order: ``exact_feature_maps()`` -> ``exact_active_channels()`` ->
    ``create_bovw()``.

    Args:
        img_abs_path: path of the image file.
        img_size: the image is loaded at img_size x img_size.
        model: callable mapping a (1, H, W, 3) batch to a tensor exposing
            ``.numpy()`` (here a Keras sub-model ending at the layer).
        filters: weights of the layer; only ``filters.shape[-1]`` (the channel
            count) and, for plotting, ``filters[:, :, 0, k]`` are used.
        thres: absolute activation value from which a feature-map pixel counts
            as active.
    """

    def __init__(self, img_abs_path, img_size, model, filters, thres):
        self.img_abs_path = img_abs_path
        self.img_size = img_size
        self.model = model
        self.filters = filters
        self.thres = thres
        # Filled by exact_feature_maps() / exact_active_channels().
        self.feature_maps = None
        self.act_idx = None

    def load_image(self, visualize=False):
        """Load the image at the configured size; optionally display it."""
        original = load_img(self.img_abs_path, target_size=(self.img_size, self.img_size))
        if visualize:
            plt.figure(figsize=PLOT_SIZE_2)
            plt.title(self.img_abs_path)
            plt.axis('off')
            plt.imshow(original)
            plt.show()
        return original

    @staticmethod
    def _plot_channel_grid(channel_at, n_channels, show_index=False):
        """Plot the first square*square channels in a square grayscale grid."""
        square = int(math.sqrt(n_channels))
        plt.figure(figsize=PLOT_SIZE_2)
        for ix in range(square * square):
            # specify subplot and turn off axis ticks
            ax = plt.subplot(square, square, ix + 1)
            ax.set_xticks([])
            ax.set_yticks([])
            if show_index:
                plt.title("%i" % ix)
            plt.imshow(channel_at(ix), cmap='gray')
        plt.show()

    @staticmethod
    def visualize_filters(filters):
        """Plot the first input-channel slice of each filter in a square grid."""
        # filters are 3-dimensional per output channel; only slice 0 is plotted
        image_representation._plot_channel_grid(
            lambda ix: filters[:, :, 0, ix], filters.shape[-1])

    def exact_feature_maps(self, visualize):
        """Run the image through the model and store/return the feature maps.

        Args:
            visualize: when true, show the image and a grid of feature maps.

        Returns:
            The feature maps as a numpy array (indexed as [0, :, :, channel]).
        """
        original = self.load_image(visualize=visualize)
        img = img_to_array(original)
        # expand dimensions so that it represents a single 'sample', i.e. 3 dims -> 4 dims
        img = np.expand_dims(img, axis=0)
        # NOTE(review): the pixels are fed unscaled here, while calculate_threshold()
        # divides its inputs by 255 - confirm the thresholds match this scale.
        # NOTE: self.model.predict(img) does not work in tf2.4.0rc for some reason,
        # so call the model directly and convert the returned tensor to numpy.
        self.feature_maps = self.model(img).numpy()

        if visualize:
            self._plot_channel_grid(
                lambda ix: self.feature_maps[0, :, :, ix],
                self.filters.shape[-1], show_index=True)
        return self.feature_maps

    def exact_active_channels(self):
        """Return (and store) the indices of the channels considered active.

        A channel is active when the number of its pixels with absolute value
        >= `thres` exceeds ceil(n_channels / 3).

        Raises:
            RuntimeError: when `exact_feature_maps` has not been called yet.
        """
        if self.feature_maps is None:
            raise RuntimeError('exact_feature_maps() must be called before exact_active_channels()')
        fmap = np.abs(self.feature_maps[0])
        # NOTE(review): the pixel count is compared against the channel count / 3
        # (last axis), not a spatial size - confirm this is the intended cut-off.
        min_count = np.ceil(fmap.shape[-1] / 3)
        self.act_idx = [
            idx for idx in range(self.filters.shape[-1])
            if np.count_nonzero(fmap[:, :, idx] >= self.thres) > min_count
        ]
        return self.act_idx

    def create_bovw(self, abbv):
        """Return one 'visual word' ``<abbv>_<channel>`` per active channel.

        Raises:
            RuntimeError: when `exact_active_channels` has not been called yet.
        """
        if self.act_idx is None:
            raise RuntimeError('exact_active_channels() must be called before create_bovw()')
        act_words = [abbv + '_' + str(i) for i in self.act_idx]
        return act_words
    
# img_abs_path = glob(meta_path + '/*/*')
# for i in img_abs_path[:1]:
#     print('-------------------------------------')
#     print(i)
#     img_rep = image_representation(img_abs_path=i, img_size=TARGET_SIZE, model=model_54, filters=filters_54, thres=thres_54)
#     fmaps = img_rep.exact_feature_maps(visualize=True)
#     print(img_rep.exact_active_channels())

In [None]:
# find threshold per layer to decide which feature map is active
def calculate_threshold(model, imgs_num = 1000, target_size=TARGET_SIZE):
    """Return the 99th percentile of the absolute feature-map values of `model`.

    Up to `imgs_num` images are taken from a seeded shuffle of every file
    under `meta_path`, resized to target_size x target_size, scaled to [0, 1]
    and passed through `model` as one batch.

    Args:
        model: callable mapping an image batch to feature maps.
        imgs_num: maximum number of images sampled.
        target_size: side length the images are resized to.

    Returns:
        The threshold value (99th percentile of |feature maps|).
    """
    paths = glob(meta_path + '/*/*')
    # Fixed seed: the same images are sampled on every run (this reseeds the global numpy RNG).
    np.random.seed(1)
    np.random.shuffle(paths)

    # One (1, H, W, 3) array per image, scaled by 1/255, stacked along the batch axis.
    batch = np.vstack([
        np.expand_dims(img_to_array(load_img(p, target_size=(target_size, target_size))), axis=0) / 255.
        for p in paths[:imgs_num]
    ])
    feature_maps = np.array(model(batch))
    print('feature maps shape is', feature_maps.shape)
    return np.percentile(abs(feature_maps[:, :, :, :]), 99)

# calculate_threshold(model=model_2)

In [None]:
# NOTE(review): validation_split=0 is passed, so presumably the 'validation' subset below is empty - confirm this is intended.
generator = ImageDataGenerator(validation_split=0)
  
# create data generator
# class names must match names of subdirectories
train_generator = generator.flow_from_directory(meta_path, target_size=(TARGET_SIZE, TARGET_SIZE),
                                                        classes=CLS,
                                                         batch_size=BATCH_SIZE, class_mode='categorical', subset='training',
                                                         shuffle=True, seed=SEED)  # shuffled, seeded 'training' subset
val_generator = generator.flow_from_directory(meta_path, target_size=(TARGET_SIZE, TARGET_SIZE),
                                                        classes=CLS,
                                                         batch_size=BATCH_SIZE, class_mode='categorical', subset='validation',
                                                         shuffle=True, seed=SEED)  # in order to get val file name, shuffle needs to be True
# random crop: wrap both iterators so they yield RANDOM_CROP_SIZE x RANDOM_CROP_SIZE crops
train_crop_generator = crop_generator(train_generator, RANDOM_CROP_SIZE)
val_crop_generator = crop_generator(val_generator, RANDOM_CROP_SIZE)

In [None]:
imgs, labels = next(train_generator)
print(imgs.shape)
print(labels)
plots(imgs, titles=labels)

### Image representation

In [None]:
# load pre-trained resnet50 model
model = ResNet50(weights='imagenet', include_top=False, input_shape=(TARGET_SIZE, TARGET_SIZE, 3)) # include_top=True includes fully-connected layer
# summary() prints the table itself and returns None, so wrapping it in print() only adds a stray 'None'.
model.summary()

filters_name = ['conv1_conv', 'conv2_block1_2_conv', 'conv2_block2_2_conv', 'conv2_block3_2_conv',
               'conv3_block1_2_conv', 'conv3_block2_2_conv', 'conv3_block3_2_conv', 'conv3_block4_2_conv',
               'conv4_block1_2_conv', 'conv4_block2_2_conv', 'conv4_block3_2_conv', 'conv4_block4_2_conv', 'conv4_block5_2_conv', 'conv4_block6_2_conv',
               'conv5_block1_2_conv', 'conv5_block2_2_conv', 'conv5_block3_2_conv']
# summarize filter shapes
print('index, layer_name, filter_shape is as below')
for i, layer in enumerate(model.layers):
    # only the convolutional layers listed above
    if layer.name in filters_name:
        # `layer` is already the layer object; no second lookup by name is needed
        filters = layer.get_weights()[0]
        print(i, layer.name, filters.shape)

In [None]:
# redefine model to output of convolutional layer, before bn
# For each listed layer index l, publish three module-level names via vars():
# model_<l>, filters_<l> and biases_<l>.
for l in [2, 10, 22, 32, 42, 54, 64, 74, 84]:
    # Creates a model that will return this layer's outputs, given the model input
    vars()['model_' + str(l)] = Model(inputs=model.input, outputs=model.layers[l].output) 
    # the layer's get_weights() result is unpacked into its filters and biases
    vars()['filters_' + str(l)], vars()['biases_' + str(l)] = model.layers[l].get_weights()

In [None]:
# # run this cell at first time, then you can print out the result and run the next cell instead going forward to save time
# # 2, 10, 22, 32, 42, 54, 64, 74, 84, 96
# thres_2 = calculate_threshold(model=model_2)
# thres_10 = calculate_threshold(model=model_10)
# thres_22 = calculate_threshold(model=model_22)
# thres_32 = calculate_threshold(model=model_32)
# thres_42 = calculate_threshold(model=model_42)
# thres_54 = calculate_threshold(model=model_54)
# thres_64 = calculate_threshold(model=model_64)
# thres_74 = calculate_threshold(model=model_74)
# thres_84 = calculate_threshold(model=model_84)
# print(thres_2, thres_10, thres_22, thres_32, thres_42, thres_54, thres_64, thres_74, thres_84)

In [None]:
thres_2, thres_10, thres_22, thres_32, thres_42, thres_54, thres_64, thres_74, thres_84 =  3.640817880630493, 5.392679691314697, 1.8942697048187256, 1.2156983613967896, 4.521170139312744, 1.711417324543003, 1.8478505671024354, 2.2812237882614212, 3.172137269973753

In [None]:
# plot filters, should be 3-dims but only enable plot 2-dims
image_representation.visualize_filters(filters=filters_32)

In [None]:
# plot feature maps
img_abs_path = glob(meta_path + '/*/*')
for i in img_abs_path[:1]:
    img_rep = image_representation(img_abs_path=i, img_size=TARGET_SIZE, model=model_32, filters=filters_32, 
                                   thres=thres_32)
    img_rep.exact_feature_maps(visualize=True)
    act_idx = img_rep.exact_active_channels()
    print('active index is', act_idx)

### Create bag of virtual words

In [None]:
%%time
# TODO: optimize help layers
# create bag of virtual words: one space-separated document per image, made of
# 'filters_<layer>_<channel>' words for every active channel of the helper layers
# help_layers = [2, 10, 22, 32, 42, 54, 64, 74, 84]
help_layers = [10, 32, 42, 54]
image_corpus = []
for img_file in df['img_abs_path']:
    print('one image representation started')
    print(img_file)

    words_all_layer = []
    for layer_no in help_layers:
        suffix = str(layer_no)
        # model_<n>, filters_<n> and thres_<n> are module-level names created in earlier cells
        img_rep = image_representation(
            img_abs_path=img_file,
            img_size=TARGET_SIZE,
            model=vars()['model_' + suffix],
            filters=vars()['filters_' + suffix],
            thres=vars()['thres_' + suffix],
        )
        img_rep.exact_feature_maps(visualize=False)
        img_rep.exact_active_channels()
        words_all_layer.extend(img_rep.create_bovw(abbv='filters_' + suffix))

    print('number of virtual words for this image is', len(words_all_layer))
    print('one image representation finished')

    image_corpus.append(' '.join(map(str, words_all_layer)))

In [None]:
vectorizer = CountVectorizer(stop_words='english', analyzer='word', lowercase=True)
X_vect = vectorizer.fit_transform(image_corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = tuple(vectorizer.get_feature_names_out())
else:
    vocab = tuple(vectorizer.get_feature_names())
X = X_vect.toarray()
X.shape

### Determine optimal topic number

In [None]:
# TODO: jaccard score looks wrong
if False:
    sim_scores = []
    for n in N_TOPICS:
        model = lda_model(X, image_corpus, vocab, n_topics=n, n_iter=1000)
        model.fit()
        sim_score = model.topic_path(df)
        sim_scores.append(sim_score)

In [None]:
if False:
    plt.figure()
    plt.plot(N_TOPICS, sim_scores, label='Average Overlap Between Topics', linewidth=2, markersize=12)
    plt.xlabel('Number of topics')
    plt.ylabel('Average Jaccard similarity')   
    plt.title('Average Jaccard Similarity Between Topics')
    plt.show()

### Topic representation

In [None]:
model = lda_model(X, image_corpus, vocab, n_topics=5, n_iter=1000)
model.fit()
sim_score = model.topic_path(df)
model.topic_representation()

## Multimodal LDA

In [None]:
# combine image corpus with text
df['img_attr'] = image_corpus
multi_corpus = (df['attr_4'] + ' ' + df['img_attr']).tolist()

In [None]:
vectorizer = CountVectorizer(stop_words='english', analyzer='word', lowercase=True)
X_vect = vectorizer.fit_transform(multi_corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = tuple(vectorizer.get_feature_names_out())
else:
    vocab = tuple(vectorizer.get_feature_names())
X = X_vect.toarray()
X.shape

In [None]:
model = lda_model(X, multi_corpus, vocab, n_topics=6, n_iter=1000)
model.fit()
sim_score = model.topic_path(df)
model.topic_representation()

## Recommend top n assortment

In [None]:
N_TOP_ASSORT = 7
N_TOPCIS = 6

In [None]:
df['name_jpg'] = df['name'] + '.jpg'

In [None]:
# Build one cosine-similarity matrix per topic, published as module-level sim_matrix_<topic_no>.
for topic_no in range(N_TOPCIS):
    # image file names of the documents assigned to this topic
    keys = df.loc[df['topic'] == topic_no, 'name_jpg']
    # keys.index (df index labels) is used as row positions into X - assumes df still
    # has the default 0..n-1 index it got from reset_index(); TODO confirm
    values = X[keys.index]
    sim = cosine_similarity(values, values)
    # NOTE(review): drop_duplicates() removes rows that repeat an earlier row, so an image
    # name can disappear from the row index while remaining a column - confirm this is intended.
    vars()['sim_matrix_' + str(topic_no)] = pd.DataFrame(sim, columns = keys, index = keys).drop_duplicates()

In [None]:
# simple search engine
# top n assortment per image
# img_name = df['name_jpg'].iloc[-40]
img_name = df['name_jpg'].iloc[10]

topic_no = df.loc[df['name_jpg'] == img_name, 'topic'].item()
# Look the image up as a column, like the pickle export below does: rows of the
# similarity matrix may have been removed by drop_duplicates(), columns never are.
top_n_assort = vars()['sim_matrix_' + str(topic_no)][img_name].sort_values(ascending = False).head(N_TOP_ASSORT)

plt.figure(figsize = (80, 80))
# Series.items() replaces iteritems(), which was removed in pandas 2.0;
# enumerate() replaces the manual subplot counter.
for i, (name, sim_score) in enumerate(top_n_assort.items()):
    # Best effort: an image that cannot be resolved or loaded only prints the error.
    try:
        ax = plt.subplot(1, N_TOP_ASSORT, i+1)
        ax.set_xticks([])
        ax.set_yticks([])
        img_path = df.loc[df['name_jpg'] == name, 'img_abs_path'].item()
        plt.title('{}: \n sim score is {:.2f}'.format(name, sim_score))
        plt.imshow(load_img(img_path, target_size=(TARGET_SIZE, TARGET_SIZE)))
    except Exception as e:
        print(e)
plt.show()

## Before web application

In [None]:
# create pickle files for recommender engine:
# for every image, the names and scores of its N_TOP_ASSORT most similar images
# within the same topic
sim_names = []
sim_values = []

for name_jpg, topic_no in zip(df['name_jpg'], df['topic']):
    ranked = vars()['sim_matrix_' + str(topic_no)][name_jpg].sort_values(ascending=False).head(N_TOP_ASSORT)
    sim_names.append(ranked.index.tolist())
    sim_values.append(ranked.values.tolist())

# one row per image, indexed by its file name
sim_names = pd.DataFrame(sim_names, index=df['name_jpg'].values)
sim_values = pd.DataFrame(sim_values, index=df['name_jpg'].values)
sim_names.to_pickle(Path(static_model_path, 'similarNames.pkl'))
sim_values.to_pickle(Path(static_model_path, 'similarValues.pkl'))

In [None]:
# create image dict and put it in ./Engine/imageRecommeder/commands/ in def importDB()
# one {'name': <file name>, 'caption': <file name without extension>} entry per image
images = [
    {'name': name, 'caption': name.rsplit('.', 1)[0]}
    for name in df['name_jpg']
]

# The context manager guarantees the pickle file is flushed and closed.
with open(Path(static_model_path, 'image_dict'), "wb") as fh:
    pickle.dump(images, fh)

In [None]:
# remove images existing in static folder and move all from meta.nosync there
# Clear the folder only when it exists: rmtree on a missing path raises
# FileNotFoundError, and a negated check would never clear an existing folder.
if os.path.exists(static_image_path):
    shutil.rmtree(static_image_path)

# create new dir
os.makedirs(static_image_path, exist_ok=True)

for subdir in os.listdir(meta_path):
    subdir_path = Path(meta_path, subdir)
    # Skip stray files sitting next to the category folders; os.listdir() on a
    # file would raise NotADirectoryError.
    if not subdir_path.is_dir():
        continue
    for file in os.listdir(subdir_path):
        shutil.copy(subdir_path / file, static_image_path)