In [1]:
import pandas as pd
import numpy as np

import re
import string
import sys, spacy, logging
from pprint import pprint

import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.models import CoherenceModel

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('stopwords')

from tqdm.notebook import tqdm

from wordcloud import WordCloud, STOPWORDS

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=UserWarning)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\shans\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shans\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Acquisition

In [2]:
# Load the cafe review data from CSV (path is relative to the working directory)
# and show its (rows, columns) shape.
reviews_df = pd.read_csv('cafes_reviews.csv')
reviews_df.shape

(3262, 6)

In [3]:
# Keep only rows with no missing values and renumber them from 0,
# discarding the old row labels instead of keeping them as a column.
reviews_df = reviews_df.dropna().reset_index(drop=True)
reviews_df.shape

(2066, 6)

In [4]:
# Preview the first rows of the NaN-free frame.
reviews_df.head()

Unnamed: 0,cafe,street,user,time,stars,reviews
0,Bud's Coffee,Queen St E,Jeffrey Taylor,2 weeks ago,5 stars,Great coffee and fresh baked goods.\nTalking w...
1,Bud's Coffee,Queen St E,Dasong Zou,2 weeks ago,5 stars,I had a great tea here. Very pleasant.
2,Bud's Coffee,Queen St E,t s,a month ago,5 stars,My Flat white was superb. It’s a tiny spot wi...
3,Bud's Coffee,Queen St E,Francesca Arkley,2 months ago,5 stars,Best coffee and lemonade and chai lattes! Bud'...
4,Bud's Coffee,Queen St E,Sean Persad,3 months ago,5 stars,Like the sidewalk ordering window. The America...


In [5]:
# Number of distinct values in each column.
reviews_df.nunique()

cafe         12
street        2
user       1949
time         34
stars         5
reviews    2048
dtype: int64

# Data Cleaning

In [6]:
# Work on a copy so the cleaning steps below do not modify reviews_df.
df = reviews_df.copy()

Convert stars to numerical values

In [7]:
# Extract the numeric rating from the star text with a regex rather than
# relying on the digit sitting at a fixed position after splitting on spaces.
# Casting to str first keeps this cell re-runnable once the column is already
# numeric (the positional split raised on the second run).
df['stars'] = (
    df['stars']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype('int64')
)
df.head()

Unnamed: 0,cafe,street,user,time,stars,reviews
0,Bud's Coffee,Queen St E,Jeffrey Taylor,2 weeks ago,5,Great coffee and fresh baked goods.\nTalking w...
1,Bud's Coffee,Queen St E,Dasong Zou,2 weeks ago,5,I had a great tea here. Very pleasant.
2,Bud's Coffee,Queen St E,t s,a month ago,5,My Flat white was superb. It’s a tiny spot wi...
3,Bud's Coffee,Queen St E,Francesca Arkley,2 months ago,5,Best coffee and lemonade and chai lattes! Bud'...
4,Bud's Coffee,Queen St E,Sean Persad,3 months ago,5,Like the sidewalk ordering window. The America...


Add sentiment label

In [8]:
# Ratings below 3 are negative, above 3 positive, exactly 3 neutral.
df['sentiment'] = np.where(
    df['stars'] < 3,
    "neg",
    np.where(df['stars'] > 3, "pos", "neu"),
)

#verify
df[df['stars'] == 2]

Unnamed: 0,cafe,street,user,time,stars,reviews,sentiment
121,Bud's Coffee,Queen St E,Milena Samra,4 years ago,2,"Hey, if you're looking to pay almost $5 for an...",neg
212,Juice & Java Cafe,Queen St E,Vivienne McCuaig,3 years ago,2,The location is great and the food and coffee ...,neg
220,Juice & Java Cafe,Queen St E,Angela Arnold,3 years ago,2,Overpriced! Same food you can make at home wit...,neg
226,Juice & Java Cafe,Queen St E,Van Nguyen,3 years ago,2,Long wait time and not as described.\n\nI orde...,neg
234,Juice & Java Cafe,Queen St E,Lorraine Syratt,3 years ago,2,Two stars for effort. Gave us the wrong order...,neg
...,...,...,...,...,...,...,...
1837,Coffee Island,Bay St,Joe Korkis,5 years ago,2,Too expensive,neg
1850,Coffee Island,Bay St,Cindy Basha,5 years ago,2,My big problem with this coffee shop is well.....,neg
1878,White Rabbit Caffe,Bay St,Ramya Gudipudi,7 months ago,2,Ordered oat latte but had weird after taste fo...,neg
1908,White Rabbit Caffe,Bay St,Timothy K,a year ago,2,"The White Rabb was really missing ""it"" (see ph...",neg


Check if the reviews are English

In [9]:
from langdetect import DetectorFactory, LangDetectException, detect_langs

# langdetect's algorithm is randomized; fixing the seed before the first
# detection makes the language labels reproducible across runs.
DetectorFactory.seed = 0

languages = []
for review in df['reviews']:
    try:
        lang_list = detect_langs(review)
        # Candidates come back ordered by probability; take the most likely
        # language code directly instead of parsing the "lang:prob" repr.
        # An empty candidate list is treated like a failed detection.
        lang_str = lang_list[0].lang if lang_list else 'unknown'
    except LangDetectException:
        # Raised when the text has no usable features (e.g. only symbols).
        lang_str = 'unknown'
    languages.append(lang_str)

df['language'] = languages
df.head()

Unnamed: 0,cafe,street,user,time,stars,reviews,sentiment,language
0,Bud's Coffee,Queen St E,Jeffrey Taylor,2 weeks ago,5,Great coffee and fresh baked goods.\nTalking w...,pos,en
1,Bud's Coffee,Queen St E,Dasong Zou,2 weeks ago,5,I had a great tea here. Very pleasant.,pos,en
2,Bud's Coffee,Queen St E,t s,a month ago,5,My Flat white was superb. It’s a tiny spot wi...,pos,en
3,Bud's Coffee,Queen St E,Francesca Arkley,2 months ago,5,Best coffee and lemonade and chai lattes! Bud'...,pos,en
4,Bud's Coffee,Queen St E,Sean Persad,3 months ago,5,Like the sidewalk ordering window. The America...,pos,en


In [None]:
# Count reviews per detected language code.
# NOTE(review): rows labelled as non-'en' are not filtered out in the cells
# visible here, even though the plan below lists that step — confirm.
df['language'].value_counts()

en         1989
it           12
ro            9
de            9
fr            7
ca            5
da            5
so            4
es            3
pt            3
unknown       3
af            2
sk            2
cy            2
cs            2
nl            2
sv            2
el            1
et            1
tl            1
pl            1
hr            1
Name: language, dtype: int64

*Basic Cleaning*
1. lowercase
2. punctuation removal
3. white space removal
4. special characters removal
5. tokenization

*Text Filtering*
1. stop words removal
2. text normalization (lemmatization)
3. text filtering: noun, adj, verb, adv (optional)
4. filter non-english reviews

In [10]:
# Start from NLTK's English stop words and append project-specific filler
# words (list kept verbatim, repeats included, so the printed sizes match).
stop_words = [
    *nltk.corpus.stopwords.words('english'),
    'get', 'go', 'always', 'back', 'would', 'also', 'one', 'and',
    'go', 'get', 'do', 'also', 'may',
]
print(len(stop_words))

# Extend stop words from spacy library
sw_spacy = spacy.load('en_core_web_sm').Defaults.stop_words
stop_words += sw_spacy
print(len(stop_words))

# POS tags (noun, adjective, verb, adverb) for optional content-word filtering.
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

192
518


In [11]:
# Translation table mapping every ASCII punctuation character to a space.
# Built once here instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))

# spaCy pipeline shared by all text_cleaning calls; created on first use.
_NLP = None


def _get_nlp():
    """Return the shared spaCy pipeline, loading it only on the first call."""
    global _NLP
    if _NLP is None:
        # Parser and NER are not needed for lemmas / POS tags.
        _NLP = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    return _NLP


def text_cleaning(sentence, postags=None):
    """Turn one raw review into a list of lemmatized tokens.

    Steps: replace punctuation with spaces, collapse repeated spaces,
    tokenize/lowercase/de-accent with gensim's ``simple_preprocess``, drop
    tokens found in the module-level ``stop_words``, then lemmatize with spaCy.

    Args:
        sentence: the raw review text (coerced with ``str``).
        postags: optional collection of spaCy coarse POS tags (e.g.
            ``['NOUN', 'ADJ', 'VERB', 'ADV']``). When given, only tokens whose
            ``pos_`` is in it are kept. ``None`` (default) keeps every token,
            which is the original behavior.

    Returns:
        list[str]: lemmas of the remaining tokens, in order. Empty when
        nothing survives tokenization and stop-word removal.
    """
    # Replace punctuation with white space (avoids gluing neighbouring words)
    result = str(sentence).translate(_PUNCT_TO_SPACE)

    # Collapse runs of spaces into a single space
    result = re.sub(' +', ' ', result)

    # Lowercase + tokenize, ignoring tokens that are too long or too short;
    # deacc=True strips accent marks.
    tokens = gensim.utils.simple_preprocess(result, deacc=True)

    # Remove stop words
    text = [word for word in tokens if word not in stop_words]
    if not text:
        # Nothing left to lemmatize; skip the spaCy call.
        return []

    # Loading the model is expensive, so reuse one pipeline across calls
    # rather than loading it again for every review.
    doc = _get_nlp()(" ".join(text))

    if postags is None:
        # Text normalization: lemma_ is the base form of the word
        return [token.lemma_ for token in doc]

    # Text filtering: keep only lemmas whose POS tag is allowed
    return [token.lemma_ for token in doc if token.pos_ in postags]

In [12]:
# Clean and lemmatize every review; each cell holds a list of tokens.
df['tokenized'] = df['reviews'].apply(text_cleaning)
df['tokenized'].head()

0    [great, coffee, fresh, baked, good, talk, sara...
1                               [great, tea, pleasant]
2    [flat, white, superb, tiny, spot, limited, sea...
3    [good, coffee, lemonade, chai, latte, bud, del...
4    [like, sidewalk, ordering, window, americano, ...
Name: tokenized, dtype: object

In [13]:
# Re-join each token list into one space-separated string.
df['cleaned_text'] = df['tokenized'].apply(
    lambda tokens: ' '.join(str(token) for token in tokens)
)
df['cleaned_text'].head()

0    great coffee fresh baked good talk sarah day love
1                                   great tea pleasant
2    flat white superb tiny spot limited seat clean...
3    good coffee lemonade chai latte bud delicious ...
4         like sidewalk ordering window americano good
Name: cleaned_text, dtype: object

In [14]:
# Preview the frame with the new 'tokenized' and 'cleaned_text' columns.
df.head()

Unnamed: 0,cafe,street,user,time,stars,reviews,sentiment,language,tokenized,cleaned_text
0,Bud's Coffee,Queen St E,Jeffrey Taylor,2 weeks ago,5,Great coffee and fresh baked goods.\nTalking w...,pos,en,"[great, coffee, fresh, baked, good, talk, sara...",great coffee fresh baked good talk sarah day love
1,Bud's Coffee,Queen St E,Dasong Zou,2 weeks ago,5,I had a great tea here. Very pleasant.,pos,en,"[great, tea, pleasant]",great tea pleasant
2,Bud's Coffee,Queen St E,t s,a month ago,5,My Flat white was superb. It’s a tiny spot wi...,pos,en,"[flat, white, superb, tiny, spot, limited, sea...",flat white superb tiny spot limited seat clean...
3,Bud's Coffee,Queen St E,Francesca Arkley,2 months ago,5,Best coffee and lemonade and chai lattes! Bud'...,pos,en,"[good, coffee, lemonade, chai, latte, bud, del...",good coffee lemonade chai latte bud delicious ...
4,Bud's Coffee,Queen St E,Sean Persad,3 months ago,5,Like the sidewalk ordering window. The America...,pos,en,"[like, sidewalk, ordering, window, americano, ...",like sidewalk ordering window americano good


In [15]:
# Persist the cleaned frame. NOTE(review): to_csv writes the row index as an
# extra unnamed column by default, and the list-valued 'tokenized' column is
# stored as its string representation, so it needs parsing when read back.
df.to_csv('cleaned_reviews.csv')

# Model Building

Identify positive and negative reviews

In [None]:
# Cleaned review strings grouped by sentiment label.
positive_reviews = df.loc[df['sentiment'] == 'pos', 'cleaned_text'].tolist()
negative_reviews = df.loc[df['sentiment'] == 'neg', 'cleaned_text'].tolist()
neutral_reviews = df.loc[df['sentiment'] == 'neu', 'cleaned_text'].tolist()

In [None]:
# First five cleaned positive reviews.
positive_reviews[:5]

In [None]:
# First five cleaned negative reviews.
negative_reviews[:5]

Calculate word frequencies for positive and negative reviews

In [None]:
def word_freq(reviews, label=1.0):
    """Count how often each word occurs across a collection of tokenized reviews.

    Args:
        reviews: iterable of reviews, each an iterable of word tokens.
        label: value stored as the second element of every key. Defaults to
            1.0, matching the keys this function has always produced; pass 0
            to build ``(word, 0)`` keys for negative reviews.

    Returns:
        dict mapping ``(word, label)`` to its occurrence count, ordered from
        most to least frequent (ties keep first-seen order).
    """
    freqs = {}

    # Iterate the reviews directly: pairing them with a squeezed array of ones
    # broke for a single review, where squeeze().tolist() yields a bare float.
    for review in reviews:
        for word in review:
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1

    # sort by frequency count, highest first
    return dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Token lists of the positive reviews and their word counts.
reviews_pos = df.loc[df['sentiment'] == 'pos', 'tokenized'].tolist()
freqs_pos = word_freq(reviews_pos)

freqs_pos

In [None]:
# Token lists of the negative reviews and their word counts.
# NOTE(review): called this way, word_freq keys every entry as (word, 1.0),
# so these negative counts are not stored under a 0 label.
reviews_neg = df.loc[df['sentiment'] == 'neg', 'tokenized'].tolist()
freqs_neg = word_freq(reviews_neg)
freqs_neg

Visualize using wordcloud

In [None]:
# Import WordCloud and STOPWORDS
from wordcloud import WordCloud
from wordcloud import STOPWORDS

def get_wordCloud(text):
    """Build a 550x550 word cloud on a white background from ``text``.

    Words in wordcloud's built-in STOPWORDS are excluded and the smallest
    rendered font size is 12.

    Args:
        text: a single string containing all the words to visualize.

    Returns:
        The generated WordCloud object (can be passed to ``imshow``).
    """
    cloud_options = {
        'width': 550,
        'height': 550,
        'background_color': 'white',
        'stopwords': set(STOPWORDS),
        'min_font_size': 12,
    }
    return WordCloud(**cloud_options).generate(text)

In [None]:
# Create a figure and subplots for each word cloud
# Create a 2x2 grid: one word cloud per (street, sentiment) combination
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
fig.suptitle('Reviews on Bay St. vs. Queen St. E', fontsize=20)

# Join reviews with a space: an empty separator would glue the last word of
# one review onto the first word of the next and invent tokens.
wc_queen_pos = get_wordCloud(' '.join(df.cleaned_text[(df.street == 'Queen St E') & (df.sentiment == 'pos')]))
wc_queen_neg = get_wordCloud(' '.join(df.cleaned_text[(df.street == 'Queen St E') & (df.sentiment == 'neg')]))
wc_bay_pos = get_wordCloud(' '.join(df.cleaned_text[(df.street == 'Bay St') & (df.sentiment == 'pos')]))
wc_bay_neg = get_wordCloud(' '.join(df.cleaned_text[(df.street == 'Bay St') & (df.sentiment == 'neg')]))

title_size = 16
panels = [
    (axes[0][0], wc_queen_pos, 'Positive Reviews on Queen St'),
    (axes[0][1], wc_queen_neg, 'Negative Reviews on Queen St'),
    (axes[1][0], wc_bay_pos, 'Positive Reviews on Bay St'),
    (axes[1][1], wc_bay_neg, 'Negative Reviews on Bay St'),
]
for ax, cloud, title in panels:
    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title(title, fontsize=title_size)

# Show the plot. tight_layout recomputes the subplot spacing, so no manual
# subplots_adjust is needed, and every axes is already switched off above.
plt.tight_layout()
plt.show()

Sigmoid Function
- Maps any real input to a value between 0 and 1

In [None]:
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + exp(-z)), evaluated without overflow.

    Args:
        z: a scalar or array-like of real numbers.

    Returns:
        Value(s) in [0, 1] with the same shape as ``z``; sigmoid(0) is 0.5.
    """
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))). logaddexp(0, -z) computes
    # log(exp(0) + exp(-z)) stably, so very negative z no longer overflows
    # inside exp(); np.negative also accepts plain Python lists.
    h = np.exp(-np.logaddexp(0, np.negative(z)))
    return h

Cost Function and Gradient Descent

In [None]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Fit logistic-regression weights with batch gradient descent.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
           (a flat length-m vector is reshaped to a column)
        theta: weight vector of dimension (n+1,1)
               (a flat vector is reshaped to a column)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the cost as a Python float. It is evaluated at the start of the
           last iteration, i.e. before the final weight update; when
           num_iters is 0 it is the cost of the initial theta.
        theta: your final weight vector, shape (n+1,1)
    '''
    x = np.asarray(x, dtype=float)
    # Force column vectors: with a 1-D y or theta, `h - y` would silently
    # broadcast to an (m, m) matrix and corrupt the update.
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    theta = np.asarray(theta, dtype=float).reshape(-1, 1)

    m = len(x)
    eps = 1e-15  # keeps log() finite when the sigmoid saturates at 0 or 1

    def _cost(h):
        """Mean cross-entropy of predictions h against y, as a float."""
        h = np.clip(h, eps, 1 - eps)
        cost = (-1/m)*(np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h)))
        # .item() extracts the single element; float() on a (1,1) array is
        # deprecated in recent NumPy releases.
        return cost.item()

    J = None
    for _ in range(num_iters):
        # predictions for the current weights
        h = sigmoid(np.dot(x, theta))

        # cost for the current weights (before this iteration's update)
        J = _cost(h)

        # update the weights theta
        theta = theta - (alpha/m)*np.dot(x.T, h-y)

    if J is None:
        # No iteration ran: report the cost of the unchanged weights instead
        # of failing on an unassigned J.
        J = _cost(sigmoid(np.dot(x, theta)))

    return J, theta

Extract features

In [None]:
def extract_features(review, freqs):
    '''
    Build the three-value feature row for one processed review.

    Input:
        review: a list of words for one review (processed)
        freqs: a dictionary mapping (word, label) tuples to frequencies
    Output:
        x: a (1,3) array [bias, positive total, negative total], where the
           totals add up freqs[(word, 1)] and freqs[(word, 0)] over every
           word occurrence in the review (missing pairs count as 0)
    '''
    positive_total = 0.0
    negative_total = 0.0

    # every occurrence contributes, so repeated words are counted repeatedly
    for token in review:
        positive_total += freqs.get((token, 1), 0)
        negative_total += freqs.get((token, 0), 0)

    # first entry is the bias term, fixed at 1
    x = np.array([[1.0, positive_total, negative_total]])
    return x

Train Test Split

In [None]:
# NLTK's VADER analyzer (its lexicon is downloaded in the import cell).
model = SentimentIntensityAnalyzer()
model

In [None]:
from tqdm import tqdm

# Score every raw review with VADER, keyed by the row label of df.
# The frame has no 'index' column (it is dropped right after loading), so the
# label yielded by .items() is used as the id instead of row['index'].
res = {
    myid: model.polarity_scores(text)
    for myid, text in tqdm(df['reviews'].items(), total=len(df))
}

# Keeping the results as Dataframe: one row per review, one column per score
Vader_res = pd.DataFrame.from_dict(res, orient='index')
Vader_res

In [None]:
# Attach the VADER scores to the review rows by matching index labels.
# NOTE(review): Vader_res is overwritten with the merged frame, so running
# this cell twice merges the already-merged result again.
Vader_res = df.merge(Vader_res, left_index=True, right_index=True)
Vader_res