In [1]:
import gzip
import json
import pandas as pd
import numpy as np
from operator import itemgetter

In [2]:
def load_data(file_path, product_name=None):
    """
    Load product reviews from a JSON file into a single DataFrame.

    The file is read as a JSON object whose keys are product names and whose
    values are review records in any layout ``pd.DataFrame.from_dict`` accepts.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file.
    product_name : str, optional
        Key of the product whose reviews should be loaded. When ``None``
        (the default) the reviews of every product in the file are
        concatenated into one frame with a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        The reviews, with rows whose ``reviewText`` is missing removed.

    Raises
    ------
    KeyError
        If ``product_name`` is not a key of the file, or the resulting frame
        has no ``reviewText`` column.
    ValueError
        If ``product_name`` is ``None`` and the file contains no products.
    """
    # Read through the context-managed handle so the file is closed exactly
    # once; JSON is UTF-8 by specification, so do not rely on the locale.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    if product_name is None:
        frames = [pd.DataFrame.from_dict(revs) for revs in data.values()]
        if not frames:
            raise ValueError("no products found in {!r}".format(file_path))
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame.from_dict(data[product_name])

    # Reviews without text cannot be embedded downstream, so drop them here.
    return df.dropna(subset=['reviewText'])

In [3]:
# Load the reviews of a single product from the sample file.
data = load_data(
    file_path="sample_reviews.json",
    product_name="Disney Mickey Mouse Deluxe Boys' Costume",
)

In [4]:
# Peek at the raw review texts (all rows, reviewText column only).
data.loc[:, 'reviewText']

0     This costume runs really small and is more lik...
1     As a last minute purchase for a halloween cost...
2     Very cute costume which he LOVED!! The only do...
3     This is VERY small. My son is barely in a 4t a...
4     This is adorable. my son is almost 3. 30 lbs a...
                            ...                        
63    This Mickey Mouse costume was adorable. My onl...
64    too small, poor quality... the ears don't even...
65    I bought this for my son for Halloween.  He ju...
66    Held up great through Halloween for my 2 year ...
67    Should've ordered a size larger, but the costu...
Name: reviewText, Length: 68, dtype: object

In [5]:
from sentence_transformers import SentenceTransformer

# Sentence encoder built from the pretrained 'bert-base-nli-mean-tokens'
# checkpoint; the first call may download the weights.
model = SentenceTransformer(model_name_or_path='bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [04:58<00:00, 1.36MB/s] 


In [6]:
# Toy sentences used to sanity-check the encoder before applying it to reviews.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# Encode the whole list in one call; the result is paired with `sentences`
# element-by-element in the next cell.
sentence_embeddings = model.encode(sentences)

In [7]:
# Show each demo sentence next to its embedding vector, one blank line between pairs.
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding, end="\n\n")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.04094423e-01  5.27476788e-01  1.17977321e+00  1.94365889e-01
  1.59113705e-01  5.45550585e-01  2.81809419e-01  1.21590570e-01
  2.91968167e-01 -3.36857945e-01 -2.49655485e-01  2.93202430e-01
  1.43483594e-01  3.80202472e-01 -2.52250403e-01 -2.62337863e-01
 -2.69968927e-01  4.68089581e-02  3.57253551e-01 -3.58307838e-01
 -2.89180316e-02  6.85316995e-02 -7.84853637e-01 -2.61792779e-01
  1.02695441e+00 -4.05539781e-01  1.93896770e-01 -1.65969059e-01
 -2.89741784e-01 -1.07936382e-01 -6.31145179e-01 -4.68852967e-01
  7.16446459e-01 -9.49060023e-01 -2.94927284e-02  1.37837073e-02
  7.03223273e-02 -2.72194073e-02 -6.25343993e-02 -5.84831893e-01
 -9.02268946e-01 -9.56011653e-01  4.95161116e-01  1.95363391e-04
 -1.52250719e+00 -6.86905742e-01 -9.08193231e-01 -1.79840565e-01
 -6.49180412e-01 -2.96626706e-02 -1.83349538e+00  1.43428385e-01
  1.93934143e-01  1.91225968e-02 -4.56058800e-01  9.44656789e-01
  1.66152

In [8]:
# Reuse the encoder already loaded into `model` instead of constructing a
# second copy of the same 'bert-base-nli-mean-tokens' checkpoint, which would
# repeat the load and hold the weights in memory twice.
embedder = model

In [12]:
# Plain Python list of review texts, in DataFrame row order, for the encoder.
review_text = data.loc[:, 'reviewText']
corpus = review_text.tolist()

In [13]:
# Embed every review; the result is the feature matrix passed to KMeans below.
corpus_embeddings = embedder.encode(sentences=corpus)

In [14]:
from sklearn.cluster import KMeans

# Number of review groups to look for.
num_clusters = 5

# KMeans starts from random centroids, so without a fixed seed the cluster
# ids (and sometimes the memberships) change on every run and the cells that
# inspect "cluster 1", "cluster 2", ... stop matching their commentary.
# n_init is pinned as well because its default differs between scikit-learn
# releases.
clustering_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
clustering_model.fit(corpus_embeddings)

# One integer label per review, aligned with the rows of `corpus_embeddings`.
cluster_assignment = clustering_model.labels_

In [15]:
# Display the cluster label assigned to each review (taken from clustering_model.labels_).
cluster_assignment

array([1, 0, 1, 4, 4, 0, 3, 3, 0, 4, 1, 0, 2, 4, 4, 0, 4, 2, 1, 4, 0, 4,
       1, 1, 0, 3, 4, 2, 1, 2, 4, 1, 2, 1, 0, 0, 3, 3, 2, 2, 3, 1, 4, 2,
       1, 0, 1, 1, 2, 4, 0, 2, 3, 4, 0, 1, 1, 4, 0, 4, 1, 4, 1, 1, 2, 1,
       4, 2], dtype=int32)

In [26]:
# Review texts whose cluster label is 1.
in_cluster = cluster_assignment == 1
data.loc[in_cluster, 'reviewText'].to_list()

['This costume runs really small and is more likely a 3T than a 4T. It\'s a little difficult to get off and on, too. I didn\'t really like the "ears" either. The hat-style ears are really small and the chin strap is uncomfortable. I ended up cutting off the strings and my son can still wear the "ears" part okay. Cute costume though - looked adorable with his little sister as Minnie Mouse on Halloween.',
 'Very cute costume which he LOVED!! The only downfall is the way the hat connects. He would not keep it on because the velcro scratched his ears.',
 'The hat/ears run small.  Suit OK.  In hindsight, I wish we had just bought Mickey ears from elsewhere and a Mickey t-shirt.',
 'Thus is more a 4T, will not fit 5/6 it states L/G 4-6 but untrue. Also the bow is not sequence, it\'s just red. I would recommend hand washing and line dry, otherwise it gets very linty and the coat tails curl under. My son just turned 5, I bought this 8 months ago, my son was 42" tall in a size 4-5T and he out g

In [27]:
# Review texts whose cluster label is 2.
in_cluster = cluster_assignment == 2
data.loc[in_cluster, 'reviewText'].to_list()

['Cute but thin quality/ material and my son did not like it at all. Ran true to size but since he had no interest, this was returned.',
 'This costume looks very cheap. Everything from the styrofoam mickey ears to the one piece outfit. I was disappointed in this costume. I think i would have been better off getting a $19 costume.',
 'Not so much:( I ordered a size up and it was still too small. Plus, I know costumes are typically not the same quality as regular clothing, but this looked soooo ceap! Would not recommend',
 'Cheaper than expected for the price',
 'sizes run real small so take that into account.',
 'Did not like.  I thought it had a really cheap look.',
 'The "hat" ears would maybe have fit an infant. I ordered a 3T and it fit, but barely. The ears didn\'t fit at all.',
 'Cute, but it is not something you can easily wash, also the ears do not stand up like they do in the photo.  Our son liked it for Trick or Treating though.',
 'It was what one might expect for a costume.

In [28]:
# Review texts whose cluster label is 3.
in_cluster = cluster_assignment == 3
data.loc[in_cluster, 'reviewText'].to_list()

['For the price is ok',
 'This was so cute',
 'Really cute',
 'Very cute costume. It fit perfectly. Just as described.',
 'PERFECT',
 'GreAt',
 'Beautiful!']

In [29]:
# Review texts whose cluster label is 4.
in_cluster = cluster_assignment == 4
data.loc[in_cluster, 'reviewText'].to_list()

['This is VERY small. My son is barely in a 4t and can barely fit into this costume. My son, without clothes underneath is in floods.\nAs expected, it is very cheaply put together as well.',
 "This is adorable. my son is almost 3. 30 lbs and about 31 inches tall. it's baggy on him, but the length is spot on. the ears head piece runs a little small, i think. either that or my kid has a huge head. either way i'm going to have to add an extender in it so he can wear it comfortably.",
 "Cute outfit but ears are kinda flimsy on the hat and don't always stick up straight.  Also, I think it runs a little small - my 3 year old is on the shorter side and it was barely long enough for him.  I think it might be too short for many 4 year olds.",
 'My son is three and 30 pounds\nThe costume was too short on him\nThe head piece was way too small for his head',
 "My son is 2 and I ordered the 3 4t hoping to be a little big to put clothes underneath. It's way to small. Barrely could get his shoulders 