In [6]:
import os, json, imp
import numpy as np
import pandas as pd
import tqdm

In [8]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

from sentence_transformers import SentenceTransformer

import umap

def get_embedding(text, n_components=None):
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    embeddings = np.vstack(model.encode(text))

    if n_components:
        return umap.UMAP(n_components=n_components).fit_transform(embeddings)
    return embeddings

In [9]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [10]:
from react_jupyter_widget_example import DietParselantro

In [11]:
url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz'
df = pd.read_json(url, compression='gzip', lines=True)

In [12]:
df['clean_text'] = df['reviewText'].str.lower()
df = df.iloc[0:250]
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,clean_text
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5,good,1393545600,"02 28, 2014","not much to write about here, but it does exac..."
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5,Jake,1363392000,"03 16, 2013",the product does exactly as it should and is q...
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5,It Does The Job Well,1377648000,"08 28, 2013",the primary job of this device is to block the...
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014",nice windscreen protects my mxl mic and preven...
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5,No more pops when I record my vocals.,1392940800,"02 21, 2014",this pop filter is great. it looks and perform...
...,...,...,...,...,...,...,...,...,...,...
245,A37AQI4AU3JWSR,B0001FTVD6,Joshua,"[0, 0]",Donr be fooled by the imitations... should be ...,5,Best rack screws for your money.,1355788800,"12 18, 2012",donr be fooled by the imitations... should be ...
246,AUK79PXTAOJP9,B0001FTVD6,~ Kyle,"[0, 0]",Great rack mount screws. Rubber washers are pe...,5,Great,1373241600,"07 8, 2013",great rack mount screws. rubber washers are pe...
247,A2PN3GY7I3EKC1,B0001FTVD6,"N. McArthur ""MyTech""","[0, 0]","Other than that, when you need 10-32 rack scre...",5,25 Screws.... why not 24? Even numbers would ...,1402963200,"06 17, 2014","other than that, when you need 10-32 rack scre..."
248,A3CSSZ6U5J4YS5,B0001FTVD6,overbybr,"[0, 0]","There are other rack screws out there, but Rax...",5,As good as it gets,1403827200,"06 27, 2014","there are other rack screws out there, but rax..."


In [13]:
n_components = 16
embedding_path = f'embeddings/parsed_page_content_{n_components}d_embedding.csv'

text = df.clean_text

if not os.path.exists(embedding_path):
    emb = pd.DataFrame(get_embedding(text), index=text.index)
    emb.to_csv(embedding_path)
else:
    emb = pd.read_csv(embedding_path).loc[text.index]
    emb = emb.drop(columns=['Unnamed: 0'])

In [17]:
# Hierarchical categories can be defined with a . separator
regex = {"filter": "filter",
         "cable": "cable(s?)",
         "cable.piano accessories": "piano|keyboard|pedal"}

In [18]:
# Instantiate DietParselantro with some regex categories defined
DietParselantro(data=df, regex=pd.Series(regex), embedding=emb)

DietParselantro(component='DietParselantro', props={'data': [{'reviewerID': 'A2IBPI20UZIR0U', 'asin': '1384719…

In [16]:
# Instantiate DietParselantro without regex categories defined
DietParselantro(data=df, regex=pd.Series({}), embedding=emb)

  """Entry point for launching an IPython kernel.


DietParselantro(component='DietParselantro', props={'data': [{'reviewerID': 'A2IBPI20UZIR0U', 'asin': '1384719…