In [5]:
import collections
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

In [6]:
# Read the raw dataset and extract the body of every review.
# A review body is whatever follows the "review/text: " marker on its line
# (the trailing newline is kept).

marker = 'review/text: '

# The context manager closes the file handle as soon as the lines are read.
with open("finefoods.txt", "r", encoding = "ISO-8859-1") as file:
  text = file.readlines()

# Filter and split on the same marker so a matching line always yields a body;
# maxsplit=1 keeps a body intact even if it happens to contain the marker again.
reviews = [line.split(marker, 1)[1] for line in text if marker in line]
len(reviews)

568454

In [7]:
# Load the stop words: every whitespace-separated token of the file, lowercased.

# The context manager closes the file once its lines are read; `data` only
# ever holds the raw lines, never the file handle.
with open("LongStop.txt", "r", encoding = "ISO-8859-1") as stop_file:
  data = stop_file.readlines()

stop_words = [word.lower() for line in data for word in line.split()]

In [8]:
# Collect every whitespace-delimited token of every review, lowercased.
# Duplicates are kept, in order of appearance.

words = [token.lower() for review in reviews for token in review.split()]

In [9]:
# Deduplicate the tokens and report how many distinct ones there are

L = [*set(words)]
print(len(L))

506062


In [10]:
# Cleaned set - drop stop words and every token that is not purely alphabetic
# (str.isalpha() rejects tokens containing punctuation or digits).

# stop_words is a list; build a set once so each membership test is a hash
# lookup instead of a scan over the whole list.
stop_word_lookup = set(stop_words)
W = [word for word in L if word.isalpha() and word not in stop_word_lookup]
print(len(W))

86738


In [11]:
# Frequency of every token, counted over the full (non-deduplicated) word list

countofwords = collections.Counter()
countofwords.update(words)
len(countofwords)

506062

In [12]:
# Keep only the counts of words that belong to the cleaned vocabulary W.

# W is a list, so testing `word not in W` directly would rescan it for every
# counted token; a set makes each lookup constant time.
clean_vocabulary = set(W)

# Iterate over a snapshot of the keys because entries are deleted in the loop.
for word in list(countofwords):
  if word not in clean_vocabulary:
    del countofwords[word]

len(countofwords)

86738

In [13]:
# Top 500 words: (word, count) pairs for the most frequent cleaned words,
# ordered from most to least frequent.

TOP_WORD_COUNT = 500  # how many of the most frequent words to keep
data = countofwords.most_common(TOP_WORD_COUNT)
data

[('good', 147853),
 ('great', 130957),
 ('taste', 127457),
 ('will', 125047),
 ('love', 121572),
 ('coffee', 115167),
 ('product', 101020),
 ('tea', 95106),
 ('flavor', 93135),
 ('food', 86883),
 ('buy', 71498),
 ('find', 69122),
 ('best', 63498),
 ('eat', 59276),
 ('dog', 58456),
 ('time', 54812),
 ('better', 54727),
 ('amazon', 52356),
 ('price', 50586),
 ('bought', 48573),
 ('chocolate', 47238),
 ('tastes', 42856),
 ('cup', 41910),
 ('drink', 41459),
 ('bag', 40613),
 ('well', 40252),
 ('bit', 40040),
 ('water', 39156),
 ('recommend', 38588),
 ('order', 38501),
 ('nice', 36066),
 ('loves', 35679),
 ('box', 35450),
 ('sweet', 35136),
 ('sugar', 34633),
 ('add', 34005),
 ('hot', 32434),
 ('cat', 32001),
 ('lot', 31898),
 ('ordered', 30685),
 ('favorite', 30010),
 ('small', 29825),
 ('mix', 29435),
 ('eating', 29250),
 ('dogs', 29091),
 ('free', 28627),
 ('definitely', 28593),
 ('store', 28283),
 ('buying', 28160),
 ('brand', 27534),
 ('easy', 27136),
 ('thought', 27091),
 ('pretty', 2

In [14]:
# Vectorize: count, for every review, the occurrences of each top word

# `data` holds (word, count) pairs; only the words make up the vocabulary.
top_500_words = [word for word, _count in data]
vectorizing = CountVectorizer(vocabulary = top_500_words)
vec = vectorizing.fit_transform(reviews)

In [15]:
# Display the word list that was handed to the vectorizer as its vocabulary
top_500_words

['good',
 'great',
 'taste',
 'will',
 'love',
 'coffee',
 'product',
 'tea',
 'flavor',
 'food',
 'buy',
 'find',
 'best',
 'eat',
 'dog',
 'time',
 'better',
 'amazon',
 'price',
 'bought',
 'chocolate',
 'tastes',
 'cup',
 'drink',
 'bag',
 'well',
 'bit',
 'water',
 'recommend',
 'order',
 'nice',
 'loves',
 'box',
 'sweet',
 'sugar',
 'add',
 'hot',
 'cat',
 'lot',
 'ordered',
 'favorite',
 'small',
 'mix',
 'eating',
 'dogs',
 'free',
 'definitely',
 'store',
 'buying',
 'brand',
 'easy',
 'thought',
 'pretty',
 'green',
 'local',
 'quality',
 'hard',
 'regular',
 'organic',
 'healthy',
 'perfect',
 'long',
 'going',
 'high',
 'day',
 'flavors',
 'big',
 'enjoy',
 'years',
 'strong',
 'stuff',
 'feel',
 'chips',
 'happy',
 'treats',
 'bad',
 'real',
 'milk',
 'people',
 'worth',
 'purchased',
 'ingredients',
 'highly',
 'bags',
 'fresh',
 'cats',
 'chicken',
 'natural',
 'thing',
 'size',
 'snack',
 'products',
 'amount',
 'work',
 'grocery',
 'delicious',
 'tasted',
 'salt',
 'l

In [16]:
# K-Means

# Cluster the reviews on their normalized top-word count vectors.
kmeans = MiniBatchKMeans(n_clusters=10, random_state = 0).fit(normalize(vec))

# From each centroid, select the top 5 words that represent the centroid

centroids = kmeans.cluster_centers_
# argsort is ascending; reversing each row puts the heaviest feature first.
sortingCenter = centroids.argsort()[:,::-1]

# output_words[i]: the 5 highest-weighted vocabulary words of centroid i
output_words = [
  [top_500_words[column] for column in ranking[:5]]
  for ranking in sortingCenter
]

# feature_values[i]: the centroid weights of those same 5 words
feature_values = [
  [centroids[cluster, column] for column in ranking[:5]]
  for cluster, ranking in enumerate(sortingCenter)
]

In [17]:
# Display the top 5 words that represent each centroid

# Walk the collected results instead of a hard-coded range(0, 10), so the
# report always covers exactly the clusters that were produced.
for cluster_number, (cluster_words, cluster_values) in enumerate(zip(output_words, feature_values), start=1):
  print("\nCluster " + str(cluster_number) + ":")
  print("Word List:", cluster_words)
  print("Feature Values:", cluster_values)


Cluster 1:
Word List: ['dog', 'treats', 'dogs', 'loves', 'treat']
Feature Values: [0.3625607220808383, 0.1711261115644266, 0.11148634211935403, 0.09853883927518238, 0.08082640804802005]

Cluster 2:
Word List: ['amazon', 'find', 'product', 'price', 'love']
Feature Values: [0.29778634268259774, 0.07557348729366777, 0.07053053203413101, 0.0693976662908947, 0.05200028908801511]

Cluster 3:
Word List: ['food', 'cat', 'cats', 'eat', 'dog']
Feature Values: [0.4221378500538321, 0.11956680069265775, 0.080429065535092, 0.0687086830360354, 0.06603196068516683]

Cluster 4:
Word List: ['tea', 'flavor', 'taste', 'good', 'green']
Feature Values: [0.547064304203827, 0.06709559478156352, 0.0666658378114378, 0.06159740960833416, 0.06023932606306848]

Cluster 5:
Word List: ['coffee', 'cup', 'flavor', 'good', 'taste']
Feature Values: [0.4944227063929971, 0.10578325086323535, 0.07649579911387093, 0.07069847102990541, 0.06697151057541496]

Cluster 6:
Word List: ['product', 'great', 'good', 'will', 'taste']