In [1]:
#In this workbook, sentiment of reviews will be measured using the stars and then the text of the review. 
#The sentiment will be mapped with dates and be visualized through three years.
#Cluster analysis - topic modeling - What are people talking about?
# What are my questions?

#Q. In last three years, what are the most popular words that people have used to describe maggie?
#Q. How has the sentiment evolved over a period of three years?
#Q. 

#Possible topics - taste, texture, packaging, time of arrival

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
#read the cleaned csv

maggie = pd.read_csv("maggie.csv")

In [3]:
maggie.head()

Unnamed: 0,stars,comment,date
0,1,Family fun pack! They supposedly forgot to men...,2018-11-02
1,1,The product is expired or something is very wr...,2019-01-16
2,1,This product was supplied in damaged condition...,2017-11-29
3,1,Can you use a product when it is tasted by mou...,2019-09-13
4,5,"Maggi is called national food of India, lol. I...",2019-06-09


In [4]:
maggie["date"].max()

'2020-06-22'

In [5]:
maggie["date"].min()

'2017-11-29'

In [48]:
#Clean text column -- remove stopwords, punctuation
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re

In [56]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'
exclusionList = ['maggie','product','would','kar','ki','noodles', 'sn', 'maggi', "pack", "noodle"]
exclusions = '|'.join(exclusionList)

# cleaning master function
def clean_text(text, bigrams = False):
    text = text.lower() # lower case
    text = re.sub('['+my_punctuation + ']+', ' ', text) # strip punctuation
    text = re.sub('\s+', ' ', text) #remove double spacing
    text = re.sub('([0-9]+)', '', text) # remove numbers
    text = re.sub(exclusions, '', text) #remove common words like maggie and noodles
    text_token_list = [word for word in text.split(' ')
                            if word not in my_stopwords] # remove stopwords

    text_token_list = [word_rooter(word) if '#' not in word else word
                        for word in text_token_list] # apply word rooter
    if bigrams:
        text_token_list = text_token_list+[text_token_list[i]+'_'+text_token_list[i+1]
                                            for i in range(len(text_token_list)-1)]
    text = ' '.join(text_token_list)
    return text

In [57]:
maggie['comment'] = maggie.comment.apply(clean_text)

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df = 25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(maggie['comment']).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

In [63]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 4

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [64]:
model.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=4, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [65]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [66]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,noodl,980.9,receiv,740.2,et,870.6,regular,850.0
1,good,808.0,tast,631.6,atta,849.5,review,849.2
2,tast,697.4,ant,567.7,realli,849.2,noodl,695.9
3,min,522.5,famili,566.3,best,631.4,damag,568.0
4,ag,458.5,groceri,566.2,rs,631.2,larg,566.2
5,like,349.2,minut,348.7,tast,630.9,plastic,566.2
6,differ,348.8,insid,283.9,cook,566.5,longer,566.2
7,make,348.6,condit,283.8,got,566.2,good,522.7
8,et,348.6,like,283.5,deliv,566.2,time,457.6
9,look,348.5,one,283.4,even,566.2,alway,348.3


In [None]:
#Classify the sentiments --Positive and negative
#Simple frequency analysis - Most popular words in reviews


In [None]:
#Stars are in the range of 1-5. Anything between below 3 will be considered a negative sentiment, 3 as neutral and 
#above 3 as positive.

#Count #of 1's, 2's and 3's and 4's and 5's in ratings and create a separate cluster

#Step 1: Create a new colummn labelling the sentiments as positive, negative and neutral
maggie["sentiment"] = 