## Use NLP to discover the current product trends

### Topic modeling visualization

In [115]:
%pylab inline

import pandas as pd
import numpy as np
import pickle as pk
from scipy import sparse as sp

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [116]:
df = pd.read_csv('./all.csv')
docs = df['alltext']
df.sample(5)

Unnamed: 0.1,Unnamed: 0,alltext,brand,category,description,img_2,img_id,img_url,material,price,product_name,website,id
747,747,"Short Jersey Dress Short, fitted dress in ribb...",HM,Dress,"Short, fitted dress in ribbed viscose jersey w...",,258 Short Jersey Dress,https://lp2.hm.com/hmgoepprod?set=source[/62/0...,Fitted,$24.99,Short Jersey Dress,HM,747
4421,4421,Missguided Tropical Print Tie Front Top Club T...,,Top,Club Tropicana print vibes (all that’s missing...,,,https://images.asos-media.com/products/missgui...,"Lightweight, satin-style fabric,Super-smooth, ...",$29.00,Missguided Tropical Print Tie Front Top,ASOS,4421
2695,2695,"Brooklyn Front-Slit Dress Fits true to size, o...",LIKELY,Dress,"Fits true to size, order your normal size,Desi...",,img9733743_fpx_tif,https://images.bloomingdalesassets.com/is/imag...,,$198.00,Brooklyn Front-Slit Dress,Bloomingdales,2695
10534,10534,Emotional Tropical Dress - White Available in ...,Fashion Nova,Dress,"Available in White and Blue,Tropical Print,Pol...",https://cdn.shopify.com/s/files/1/0293/9277/pr...,,https://cdn.shopify.com/s/files/1/0293/9277/pr...,Made in USALining: 100% PolyesterÂ,29.99,Emotional Tropical Dress - White,Fashion Nova,10534
5527,5527,Honey Punch Cami Top In Sheer Metallic Two-Pie...,,Top,"Co-ord style,It’s got a BFF,Bandeau lining,Sco...",,,https://images.asos-media.com/products/honey-p...,"Fine mesh fabric ,Sheer delight,Body: 100% Pol...",$29.00,Honey Punch Cami Top In Sheer Metallic Two-Piece,ASOS,5527


### Pre-process and vectorize the documents

In [117]:
from nltk.tokenize import RegexpTokenizer

def preprocessDoc(docs):
    docs = np.array(docs)
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    docs = [[token for token in doc if not token.isdigit()] for doc in docs]  #remove numbers
    docs = [[token for token in doc if len(token) > 3] for doc in docs] #remove words that are short 
  
    return docs

docs = preprocessDoc(docs)

#### Compute bigrams/trigrams and remove less words and very common words:

In [118]:
from gensim.corpora import Dictionary
from gensim.models import Phrases
bigram = Phrases(docs, min_count=10, threshold=10)  # only ones that appear 10 times or more.
trigram = Phrases(bigram[docs])

for i in range(len(docs)):
    for token in bigram[docs[i]]:
        if '_' in token:
            docs[i].append(token)
#            print ("bi:", token)
    for token in trigram[docs[i]]:
        if token.count('_') > 2 :
            docs[i].append(token)
#            print ("tri:", token)

dictionary_ = Dictionary(docs)
print (len(dictionary_))
dictionary_.filter_extremes(no_below=10, no_above=0.2)
print (len(dictionary_))



5598
2301


### Vectorize data

- bag-of-words : frequency of words

In [119]:
corpus = [dictionary_.doc2bow(doc) for doc in docs]
print('unique tokens: %d' % len(dictionary_)) #gensim.corpora.dictionary.Dictionary
print('Number of records: %d' % len(corpus)) #List

unique tokens: 2301
Number of records: 10610


## Train LDA model

In [120]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 6
chunksize = 500 # size of the doc looked at every pass
passes = 20 # number of passes through documents
iterations = 400
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
dictionary_[0]  # This is only to "load" the dictionary. Otherwise, you will get the value error. 
id2word = dictionary_.id2token

model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, alpha='auto', eta='auto', iterations=iterations, num_topics=num_topics, passes=passes, eval_every=eval_every)

In [121]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [122]:
pyLDAvis.gensim.prepare(model, corpus, dictionary_)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [123]:
def explore_topic(lda_model, topic_number, topn, output=True):
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))   
    return terms

In [124]:
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(model,topic_number=i, topn=10, output=True )

term                 frequency

Topic 0 |---------------------

shirt                0.031
button               0.027
crew                 0.024
sleeve               0.024
long                 0.023
viscose              0.020
nylon                0.020
placket              0.019
touch                0.018
stripe               0.018
Topic 1 |---------------------

fastening            0.074
black                0.065
made                 0.050
softly               0.041
self                 0.036
available_black      0.036
shoulder             0.024
lining               0.018
sleeveless           0.015
final                0.014
Topic 2 |---------------------

lightweight          0.042
floral               0.035
midi                 0.032
stretch              0.030
lining               0.030
lace                 0.030
lightweight_woven    0.027
over                 0.025
ruffle               0.024
wrap                 0.023
Topic 3 |---------------------

kind                 0.028
kin

### Based on the above, giving a general name of each cluster.

In [125]:
top_labels = {0: 'floral', 1:'drapes', 2:'shirt', 3:'lace skirt', 4:'black bodysuit', 5:'conscious'}

In [126]:
import re
import nltk

from nltk.corpus import stopwords

stops = set(stopwords.words('english'))
def prod_to_wordlis( prod, remove_stopwords=True ):
    '''
        Function converts text to a sequence of words,
        Returns a list of words.
    '''
    # 1. Remove non-letters
    paper_text = re.sub("[^a-zA-Z]"," ", prod)
    # 2. Convert words to lower case and split them
    words = paper_text.lower().split()
    # 3. Remove stop words
    words = [w for w in words if not w in stops]
    # 4. Remove short words
    words = [t for t in words if len(t) > 2]
    
    return(words)

In [127]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvectorizer = TfidfVectorizer(input='content', analyzer = 'word', lowercase=True, stop_words='english',\
                                  tokenizer=prod_to_wordlis, ngram_range=(1, 3), min_df=40, max_df=0.20,\
                                  norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)

dtm = tvectorizer.fit_transform(df['alltext']).toarray()

In [128]:
from collections import OrderedDict
def get_doc_topic_dist(model, corpus, kwords=False):
    
    '''
    LDA transformation, for each doc only returns topics with non-zero weight
    This function makes a matrix transformation of docs in the topic space.
    '''
    top_dist =[]
    keys = []

    for d in corpus:
        tmp = {i:0 for i in range(num_topics)}
        tmp.update(dict(model[d]))
        vals = list(OrderedDict(tmp).values())
        top_dist += [array(vals)]
        if kwords:
            keys += [array(vals).argmax()]

    return array(top_dist), keys

In [129]:
top_dist, lda_keys= get_doc_topic_dist(model, corpus, True)
features = tvectorizer.get_feature_names()

In [130]:
top_ws = []
for n in range(len(dtm)):
    inds = np.intp(argsort(dtm[n])[::-1][:4])
    tmp = [features[i] for i in inds]
    top_ws += [' '.join(tmp)]
    
df['Text_Rep'] = pd.DataFrame(top_ws)
df['clusters'] = pd.DataFrame(lda_keys)
df['clusters'].fillna(10, inplace=True)

cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}

df['colors'] = df['clusters'].apply(lambda l: cluster_colors[l])

In [131]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(top_dist)

In [132]:
df['X_tsne'] =X_tsne[:, 0]
df['Y_tsne'] =X_tsne[:, 1]

In [133]:
from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

In [134]:
source = ColumnDataSource(dict(
    x=df['X_tsne'],
    y=df['Y_tsne'],
    color=df['colors'],
    label=df['clusters'].apply(lambda l: top_labels[l]),
    topic_cluster= df['clusters'],
    title= df[u'product_name'],
    img_url = df['img_url'],
    website = df['website']
))

In [135]:
title = 'T-SNE visualization of topics'

plot_lda = figure(plot_width=1000, plot_height=600,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', legend='label', source=source, color='color', alpha=0.8, size=10)

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "Prod_name: @title, website: @website - Topic: @topic_cluster "}
plot_lda.legend.location = "top_left"

show(plot_lda)

#save the plot
save(plot_lda, '{}.html'.format(title))

  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


'/Users/hueyling/anaconda3/jupyter/Tensorflow/fashionMaterials/T-SNE visualization of topics.html'