In [None]:
import pandas as pd
import re
from data_cleaning import *  # call our own function from a python file
from pretrain_model import * # call our own function from a python file
from IPython.display import Image  # display images

# pd.set_option('display.max_colwidth', -1) # displace all text within a col 

# data pre-processing

gensim's `simple_preprocess()` function converts a document into a list of lowercase tokens, removing punctuation and numbers. However, additional pre-processing is needed to remove stop words before the NLP analysis:

1) replace t-shirt/T-shirt with tshirt/Tshirt to be counted as one token

2) materials: 
convert (Viscose 100%) to (Viscose100%, Viscose)
convert (100% Viscose) to (Viscose100%, Viscose)

3) remove brands: 
PrettyLittleThing, ASOS DESIGN, 'ASOS', "YAS", "Ditsy", "Noisy", "May", "Ted","Baker", "River","Island", "Karen","Scott", "PrettyLittleThing", "Roxy", "DESIGN", "Chi", "Alfani", "Boohoo", "Sofie", "Schnoor", "Ellesse", "Jeannie", "TFNC", "Sacred", "Hawk", "Urban", "Bliss", "Puma", "adidas", "Stella" etc.

4) remove words: 
cm
size,
‘Web ID:’ 
Approx. model height is 5'10" and she is wearing a size 4/S
Made In USA Made In USA


In [None]:
# Read the scraped catalogue, drop the unnamed leading column (presumably an
# index saved by an earlier to_csv call) and keep one row per distinct
# 'alltext' value.
df = (
    pd.read_csv('all.csv')
    .drop(columns='Unnamed: 0')
    .drop_duplicates(subset='alltext')
)

In [None]:
# Clean the descriptions with the project's own data_cleaning helper
# (presumably star-imported from data_cleaning.py).
# NOTE(review): this rebinds `df`, so re-running the cell feeds already-cleaned
# data back into the helper.
df = data_cleaning(df)

In [None]:
# Spot-check one cleaned value: positional row 1128, first column (the same
# column that MyDocs later tokenises as the description text).
df.iloc[1128,0]

In [None]:
# Preview five randomly chosen rows (no random_state is passed, so the rows
# shown differ from run to run).
df.sample(5)


Note: there is no need to hold out test data, because the data is unlabelled (the NLP methods used here are unsupervised).

# NLP Analysis

# Method 1: pretrained model for word2vec

#### Used a pretrained model, which uses 100k or 1M words to develop each word vector
#### Did not train our own word vectors because our sample size is too small - we only have around 50k unique words in our web-scraped descriptions

In [None]:
# Load pretrained GloVe word vectors from a local text file using the
# project's loadGloveModel helper (presumably star-imported from
# pretrain_model.py).
pretrain_model = loadGloveModel('glove.42B.300d.txt')

In [None]:
# Ask the project's find_similar helper for matches to item 100 using the
# pretrained vectors.
# NOTE(review): the meaning of `100` (item id?) and `count = 20` (number of
# matches?) is inferred from this call only — confirm against the helper.
pretrain_similar = find_similar(df, 100, pretrain_model, count = 20)
pretrain_similar

In [None]:
# Show the product image of every match; element 0 of each match is used as
# the image file stem.
match_count = len(pretrain_similar)
for rank in range(match_count):
    image_path = './product_images/' + str(pretrain_similar[rank][0]) + '.jpg'
    display(Image(filename=image_path, width=200, height=200))

# Method 2: doc2vec

In [None]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

In [None]:
class MyDocs(object):
    """Re-iterable corpus yielding one gensim ``TaggedDocument`` per data-frame row.

    The corpus is handed to both ``build_vocab`` and ``train``, so it has to
    be iterable more than once; that is why this is a class with ``__iter__``
    rather than a one-shot generator.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Rows to stream. When omitted, the notebook-level ``df`` is looked up
        each time iteration starts (the original behaviour of ``MyDocs()``).
    text_col : int, default 0
        Position of the column holding the description text.
    tag_col : int, default -1
        Position of the column whose stringified value becomes the document's
        single tag (the item / image id), so that similarity queries return
        item ids directly.
    """

    def __init__(self, frame=None, text_col=0, tag_col=-1):
        self._frame = frame
        self._text_col = text_col
        self._tag_col = tag_col

    def __iter__(self):
        frame = df if self._frame is None else self._frame
        texts = frame.iloc[:, self._text_col]
        tags = frame.iloc[:, self._tag_col]
        # Walk the two columns in lockstep rather than doing a scalar
        # ``.iloc[i, j]`` lookup per cell on every pass over the corpus.
        for text, tag in zip(texts, tags):
            yield TaggedDocument(words=simple_preprocess(text), tags=['%s' % tag])
                

### Note: there is no pretrained model for doc2vec because document vectors are specific to each training set, whereas word2vec vectors can be generalized.

In [None]:
# Stop early unless gensim reports FAST_VERSION > -1 (presumably meaning its
# compiled training routines are available — see the gensim docs); per the
# assertion message, training would otherwise be very slow.
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

In [None]:
%%time
import multiprocessing
import os

# Number of CPUs reported by the OS; used as the number of training workers.
cores = multiprocessing.cpu_count()

# Location of the cached model. Training is skipped whenever this file exists.
DOC2VEC_MODEL_PATH = 'models/doc2vec.model'

if os.path.exists(DOC2VEC_MODEL_PATH):
    doc2vec_model = Doc2Vec.load(DOC2VEC_MODEL_PATH)
else:
    print("start traing doc2vec model...")
    documents = MyDocs()
    # dm=1 selects the distributed-memory (PV-DM) algorithm.
    # NOTE(review): per the gensim docs, dbow_words only affects PV-DBOW
    # training (dm=0), so it likely has no effect here; an earlier note
    # claiming that two models are trained and averaged does not match the
    # documented behaviour.
    doc2vec_model = Doc2Vec(dm=1, dbow_words=1, vector_size=200, window=3, min_count=2, workers=cores)
    doc2vec_model.build_vocab(documents)
    doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=100)
    # Create the target directory if needed, then cache the trained model.
    os.makedirs(os.path.dirname(DOC2VEC_MODEL_PATH), exist_ok=True)
    doc2vec_model.save(DOC2VEC_MODEL_PATH)

In [None]:
# Show what is fed into doc2vec_model.train(): only the first five tagged
# documents, since materialising and printing the whole corpus floods the
# notebook output.
documents = MyDocs()
[doc for _idx, doc in zip(range(5), documents)]

##### Function to display similar images to the one you specify

In [None]:
def load_img_d2v(model, item, n):
    """Print and display the ``n`` items whose doc2vec vectors are closest to ``item``.

    Parameters
    ----------
    model : str
        Path of a saved Doc2Vec model.
    item : str
        Document tag of the query item; also used as the image file stem
        under ``./product_images/``.
    n : int
        Number of neighbours to retrieve.

    Returns
    -------
    None
        The ``(tag, similarity)`` list is printed, then the query image and
        each neighbour's image are displayed in order.
    """
    doc2vec_model = Doc2Vec.load(model)
    # Run the nearest-neighbour query once and reuse the result for both the
    # printout and the image loop.
    d2v_similar = doc2vec_model.docvecs.most_similar(item, topn=n)
    print(d2v_similar)

    image_dir = './product_images/'
    # Query image first, then its neighbours from most to least similar.
    display(Image(filename=image_dir + item + '.jpg', width=200, height=200))
    for tag, _similarity in d2v_similar:
        display(Image(filename=image_dir + str(tag) + '.jpg', width=200, height=200))

In [None]:
# Show item '100' followed by its 20 nearest neighbours according to the saved
# doc2vec model (trained with dm = 1, dbow_words=1).

load_img_d2v('models/doc2vec.model', '100', 20)

### product similarity based on image analysis (to compare against doc2vec similar images)

In [None]:
# Load the image-based similarity matrix and discard the unnamed leading column.
img_score = pd.read_csv('product_similarity_matrix.csv').drop(columns='Unnamed: 0')
# Label the rows with the column names (image names) so that rows and columns
# share the same keys.
img_score.index = img_score.columns.tolist()
# Cache the labelled matrix as a pickle for later loading.
img_score.to_pickle('product_similarity_matrix.pkl')

In [None]:
# Reload the similarity matrix from the pickle written by the previous cell.
# Only unpickle files you created yourself: unpickling can run arbitrary code.
img_score = pd.read_pickle("product_similarity_matrix.pkl")

In [None]:
# A cell only renders its final expression, so the shape has to be printed
# explicitly to appear alongside the preview.
print(img_score.shape)
img_score.head()

In [None]:
item = 10609

# Take the similarity column of the query image, sort it from highest to
# lowest score and keep the first ten entries. reset_index() moves the image
# paths out of the index into an 'index' column.
similarity_column = img_score['product_images/'+str(item)+'.jpg']
img_similar = similarity_column.sort_values(ascending=False).head(10).reset_index()

print(img_similar)

# Display the ten images in ranked order.
for image_path in img_similar['index']:
    display(Image(filename='./' + image_path, width=200, height=200))



# K-means clustering with doc2vec

In [None]:
# Restrict to ASOS items so the result can be compared directly with the t-SNE
# plot generated from LDA (for LDA, style clustering was only performed on
# ASOS items). reset_index() renumbers the rows from 0 and keeps the previous
# index as a new leading 'index' column.
df1 = df.loc[df['website'] == 'ASOS'].reset_index()

In [None]:
# (rows, columns) of the ASOS-only frame.
df1.shape

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df1.head()

In [None]:
class MyDocs(object):
    """Re-iterable corpus yielding one gensim ``TaggedDocument`` per data-frame row.

    This redefinition targets the ASOS-only frame: the description text sits
    at column position 1 because ``reset_index()`` added a leading column.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Rows to stream. When omitted, the notebook-level ``df1`` is looked up
        each time iteration starts (the original behaviour of ``MyDocs()``).
    text_col : int, default 1
        Position of the column holding the description text.
    tag_col : int, default -1
        Position of the column whose stringified value becomes the document's
        single tag.
    """

    def __init__(self, frame=None, text_col=1, tag_col=-1):
        self._frame = frame
        self._text_col = text_col
        self._tag_col = tag_col

    def __iter__(self):
        frame = df1 if self._frame is None else self._frame
        texts = frame.iloc[:, self._text_col]
        tags = frame.iloc[:, self._tag_col]
        # Walk the two columns in lockstep rather than doing a scalar
        # ``.iloc[i, j]`` lookup per cell on every pass over the corpus.
        for text, tag in zip(texts, tags):
            yield TaggedDocument(words=simple_preprocess(text), tags=['%s' % tag])

In [None]:
%%time

# NOTE(review): this guard checks the same path as the earlier full-catalogue
# training cell. Once that file exists, this cell only loads it and the
# df1-based MyDocs defined above is never used for training. Use a separate
# path here if an ASOS-only model is intended.
if os.path.exists('models/doc2vec.model'):
    # There is no pretrained model for doc2vec because document vectors are
    # specific to each training set; word2vec vectors can be generalized.
    doc2vec_model = Doc2Vec.load('models/doc2vec.model')
else:
    print("start traing doc2vec model...")
    documents = MyDocs()
    doc2vec_model = Doc2Vec(dm=1, dbow_words=1, vector_size=200, window=3, min_count=2, workers=cores)
    doc2vec_model.build_vocab(documents)
    doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=100)
    # Create the target directory if needed, then cache the trained model.
    os.makedirs('models', exist_ok=True)
    doc2vec_model.save('models/doc2vec.model')

### use 6 clusters to compare against TSNE from LDA model

In [None]:
# Document vector stored under integer key 6895 in the loaded model
# (original author's note: index 6895, i.e. item 10214).
# NOTE(review): the documents are tagged with strings (see MyDocs), so an
# integer key is presumably treated as a position in training order — confirm
# that it refers to the intended item.
doc2vec_model.docvecs[6895]


In [None]:
import nltk
from nltk.cluster.kmeans import KMeansClusterer

NUM_CLUSTERS = 6

# Fetch each ASOS row's vector by its document tag (the stringified value of
# the catalogue's last column, exactly as MyDocs builds it). An integer key
# would be read as a position in the model's training order, which only lines
# up with df1 when the model was trained on df1 itself — not when the cached
# full-catalogue model is loaded.
tag_column = df.columns[-1]
vectors = [doc2vec_model.docvecs['%s' % tag] for tag in df1[tag_column]]

# k-means with cosine distance, using 25 randomised trials.
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)


In [None]:
# Number of cluster labels returned; should equal len(df1), since one vector
# per df1 row was passed to the clusterer.
len(assigned_clusters)

In [None]:
# Attach the clustering results to the ASOS frame. Both lists have one entry
# per df1 row, in row order.
df1['cluster'] = assigned_clusters  # cluster label for each description
df1['vectors'] = vectors  # document vector for each description

In [None]:
# Preview the first rows (now including 'cluster' and 'vectors') instead of
# rendering the whole frame.
df1.head()

In [None]:
# Convert every document vector to a plain list so the collection can be fed
# to t-SNE as a nested list.
vect_list = [list(vec) for vec in df1['vectors']]
# Show only the first two vectors; echoing the whole nested list would print
# every component of every document vector.
vect_list[:2]

In [None]:
from sklearn.manifold import TSNE

# Reduce the document vectors to 2 dimensions for plotting. A fixed
# random_state keeps the layout the same across reruns; without it, t-SNE
# produces a different embedding each time.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(vect_list)
X_tsne

In [None]:
# First t-SNE component goes on the x-axis, second on the y-axis.
df1['X_tsne'] = X_tsne[:, 0]
df1['Y_tsne'] = X_tsne[:, 1]

# Colour palette keyed by cluster label (for graphing purposes).
cluster_colors = {
    0: 'blue',
    1: 'green',
    2: 'yellow',
    3: 'red',
    4: 'skyblue',
    5: 'salmon',
    6: 'orange',
    7: 'maroon',
    8: 'crimson',
    9: 'black',
    10: 'gray',
}

# One colour per row, looked up from the row's cluster label.
df1['colors'] = [cluster_colors[label] for label in df1['cluster']]

In [None]:
# Bokeh pieces used for the interactive t-SNE scatter plot.
from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
# Render Bokeh output inline in the notebook.
output_notebook()

In [None]:
# Column store backing the scatter plot: coordinates, colour, and the fields
# shown in the legend and hover tooltip.
source = ColumnDataSource(data={
    'x': df1['X_tsne'],
    'y': df1['Y_tsne'],
    'color': df1['colors'],
    'topic_cluster': df1['cluster'],
    'label': df1['cluster'],
    'title': df1['id'],
    'website': df1['website'],
})

In [None]:
title = 'T-SNE visualization of topics'

# 1000x600 canvas without axes; the tool list includes 'hover' so that a
# HoverTool exists to configure below.
plot_lda = figure(
    plot_width=1000,
    plot_height=600,
    title=title,
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)

# One marker per item, coloured by cluster; the legend is driven by the
# 'label' column of the data source.
plot_lda.scatter(x='x', y='y', legend='label', source=source, color='color', alpha=0.8, size=10)

# Configure the hover tooltip and the legend position.
hover = plot_lda.select({'type': HoverTool})
hover.tooltips = {"content": "Prod_name: @title, website: @website - Topic: @topic_cluster "}
plot_lda.legend.location = "top_left"

show(plot_lda)

# Also write the plot to an HTML file named after the title.
save(plot_lda, title + '.html')