In [75]:
import numpy as np
import pandas as pd
import gensim 
import time
from gensim.scripts import glove2word2vec
from gensim.parsing.preprocessing import strip_punctuation, strip_multiple_whitespaces, preprocess_string
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, pca
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show, save
import matplotlib
matplotlib.rcParams['figure.figsize'] = [18.0, 12.0]

import numpy as np

from bokeh.layouts import widgetbox, layout
from bokeh.models.widgets import Select
from bokeh.plotting import figure, save, ColumnDataSource
from bokeh.transform import factor_cmap
from bokeh.palettes import Accent, inferno
from bokeh.models import HoverTool
from bokeh.models.widgets import Panel, Tabs
from bokeh.palettes import Category10, viridis

In [2]:
# Load the raw reviews export. NOTE: this is an absolute path on the original
# author's machine; point it at your local copy of the CSV before running.
df = pd.read_csv(
    filepath_or_buffer='/Users/bsliz/Downloads/Womens Clothing E-Commerce Reviews.csv',
)

In [3]:
# Peek at the first five rows to check how the CSV columns were parsed.
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
# Number of reviews per department, most frequent first.
# Rows with a missing department are not counted (value_counts default).
df['Department Name'].value_counts()

Tops        10468
Dresses      6319
Bottoms      3799
Intimate     1735
Jackets      1032
Trend         119
Name: Department Name, dtype: int64

# Drop missing reviews

In [5]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(23486, 11)

In [6]:
# Keep only the rows that actually have review text; the tokenising and
# embedding steps need a string to work with.
# .copy() makes the filtered result an independent frame, so adding columns to
# it afterwards does not raise pandas' SettingWithCopyWarning.
df = df[df['Review Text'].notna()].copy()
df.shape

(22641, 11)

# Load model

In [7]:
# Convert the GloVe text file to word2vec format and write it to the file
# 'hodor' in the working directory; the call's return value is the cell output.
glove2word2vec.glove2word2vec(
    glove_input_file='/Users/bsliz/Downloads/glove.840B.300d.txt',
    word2vec_output_file='hodor',
)

(2196017, 300)

In [8]:
# Load the converted vectors (text word2vec format) from the file 'hodor'.
model = gensim.models.KeyedVectors.load_word2vec_format(fname='hodor', binary=False)

# Process text

In [9]:
def prep_string(x):
    """Tokenise one raw review with gensim's ``preprocess_string``.

    The filters are applied in this order: ``strip_multiple_whitespaces``,
    ``strip_punctuation``, ``str.lower``.

    Parameters
    ----------
    x : str
        Raw review text.

    Returns
    -------
    Whatever ``preprocess_string`` returns for ``x`` with those filters
    (the cleaned tokens of the review).
    """
    return preprocess_string(
        x,
        [strip_multiple_whitespaces, strip_punctuation, str.lower],
    )
    

In [10]:
# Tokenise every review; each cell of 'clean_review' holds prep_string's output.
df['clean_review'] = df['Review Text'].map(prep_string)

# Vectorize documents

In [11]:
def vectorize_document(document, model):
    """Embed a tokenised document as the mean of its word vectors.

    Parameters
    ----------
    document : iterable of str
        Tokens of one document.
    model : object
        Word-vector model exposing ``get_vector(word)`` (raising ``KeyError``
        for unknown words) and a ``vector_size`` attribute, e.g. a gensim
        ``KeyedVectors`` instance.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of all tokens found in ``model``. Tokens that are not in
        the vocabulary are skipped. If no token is in the vocabulary
        (including an empty document) a zero vector is returned, so every
        document yields a vector of the same length.
    """
    vector_list = []
    for word in document:
        try:
            vector_list.append(model.get_vector(word))
        except KeyError:
            # Out-of-vocabulary token: it simply does not contribute.
            continue

    if not vector_list:
        # np.mean over an empty array would give a scalar NaN (plus a
        # RuntimeWarning) and break the fixed-width matrix built downstream.
        # float32 is used on the assumption that the model stores float32
        # vectors, as gensim does.
        return np.zeros(model.vector_size, dtype=np.float32)

    return np.mean(np.array(vector_list), axis=0)

In [12]:
# One mean word-vector per review, computed from its cleaned tokens.
document_vectors = df['clean_review'].map(
    lambda tokens: vectorize_document(tokens, model)
)

In [13]:
# Stack the per-review vectors into a frame: one row per review,
# one column per vector component.
vec_df = pd.DataFrame(document_vectors.tolist())

In [14]:
# Sanity check: expect one row per review and one column per vector component.
vec_df.shape

(22641, 300)

# Dimensionality reduction: PCA, then t-SNE

In [15]:
# Compress the document vectors to 50 principal components.
# random_state is pinned because PCA may select a randomized SVD solver for
# an input of this size, which would otherwise make the result differ between runs.
pca = PCA(n_components=50, random_state=42)
compressed = pca.fit_transform(vec_df)

In [16]:
# Sanity check: row count unchanged, 50 PCA components per row.
compressed.shape

(22641, 50)

In [17]:
# Project the PCA-compressed vectors to 2-D with t-SNE.
# t-SNE is stochastic; random_state is pinned so the layout (and therefore the
# saved plot) is reproducible across runs.
tsne_space = TSNE(n_components=2, random_state=42).fit_transform(compressed)

In [18]:
# Sanity check: one 2-D coordinate per row of the input.
tsne_space.shape

(22641, 2)

# Interactive Bokeh visualization

In [90]:
def derive_palette(col):
    """Choose a Bokeh palette with one colour per distinct value of ``col``.

    Parameters
    ----------
    col : pandas.Series
        Column whose distinct values need colours.

    Returns
    -------
    A palette with ``len(col.unique())`` colours:
    ``viridis`` for ``int64`` columns; otherwise the ``Category10`` palette of
    that size when one exists, falling back to ``inferno``.
    """
    n_levels = len(col.unique())

    if col.dtype == 'int64':
        return viridis(n_levels)

    # Category10 only defines palettes for some sizes; use it when it has one
    # for this many levels, otherwise generate an inferno palette of that size.
    if n_levels in Category10:
        return Category10[n_levels]
    return inferno(n_levels)

def label_colors(col):
    """Map every value of ``col`` to a colour from ``derive_palette(col)``.

    Parameters
    ----------
    col : pandas.Series
        Column to colour by.

    Returns
    -------
    pandas.Series
        ``col`` with each value replaced by its assigned colour
        (the result of ``col.map`` over the value -> colour mapping).

    Notes
    -----
    For ``int64`` columns (the case in which ``derive_palette`` uses the
    sequential viridis palette) the distinct values are sorted before colours
    are assigned, so the colour gradient follows numeric order. For other
    columns the colours are assigned in order of first appearance.
    """
    pal = derive_palette(col)
    levels = col.unique()
    if col.dtype == 'int64':
        # unique() returns values in order of first appearance; without
        # sorting, a sequential palette would be scattered arbitrarily over
        # the numeric values.
        levels = np.sort(levels)
    mapping = dict(zip(levels, pal))
    return col.map(mapping)

In [62]:
def make_source(color_var, tsne_space, df):
    """Bundle plot coordinates and review metadata into a ColumnDataSource.

    Parameters
    ----------
    color_var : str
        Name of the column in ``df`` used for colouring and for the
        ``attribute`` field.
    tsne_space : array indexed as ``tsne_space[:, k]``
        2-D embedding; column 1 is used as ``x`` and column 0 as ``y``.
    df : pandas.DataFrame
        Frame providing ``color_var`` and ``'Review Text'``.

    Returns
    -------
    ColumnDataSource
        Source with the fields ``x``, ``y``, ``attribute``, ``colors`` and
        ``review``.
    """
    attribute_values = df[color_var]
    columns = {
        # Note the axis assignment: x takes column 1, y takes column 0.
        'x': tsne_space[:, 1],
        'y': tsne_space[:, 0],
        'attribute': attribute_values,
        'colors': label_colors(attribute_values),
        'review': df['Review Text'].tolist(),
    }
    return ColumnDataSource(data=columns)

In [96]:
# Build one scatter-plot tab per metadata column, each coloured by that
# column, then write all tabs to a single standalone HTML file.
my_tabs = []
for col in ['Department Name', 'Recommended IND', 'Division Name', 'Class Name', 'Age', 'Rating', 'Positive Feedback Count']:
    p = figure(tools='box_zoom, hover, reset')
    # make_source requires the embedding and the frame as well as the column
    # name; calling it with the column name alone raises TypeError.
    source = make_source(col, tsne_space, df)
    p.scatter('x', 'y', color='colors', alpha=0.3, line_width=0, source=source)
    # Configure the hover tool requested via `tools=` above: show the review
    # text and the value of the colouring column for the hovered point.
    h = p.select_one(HoverTool)
    h.point_policy = 'follow_mouse'
    h.tooltips = [('Review', '@review'),
                  (col, '@attribute')]
    tab = Panel(child=p, title=col)
    my_tabs.append(tab)

tabs = Tabs(tabs=my_tabs)
save(tabs, 'tsne_bokeh.html')

'/Users/bsliz/tsne_bokeh.html'