In [1]:
import pandas as pd
import numpy as np
from nltk import RegexpTokenizer
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')

import gensim

from sklearn.cluster import KMeans
from sklearn import metrics
import pylab as pl
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

import emoji
import regex as re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import pairwise_distances


import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
reviewer_reviews_df = pd.read_csv('../cleaning/cleaned_reviewer_reviews.csv', index_col=0).reset_index(drop=True)
reviewer_reviews_df.head(2)

Unnamed: 0,name,address,review,date,link,reviewer,keep,cleaned_text
0,Kazoku Japanese Cuisine,"1 Goldhill Plaza, Singapore ...",\n1-for-1 Don\nKazoku Chirashi Don (S$29.90++)...,4d ago,https://www.burpple.com/kazoku-japanese-cuisin...,alamakgirl,1,kazoku chirashi s2990 thick slice tuna salmon ...
1,Tigerlily Patisserie,"350 Joo Chiat Road, Singapore ...","\nBrunch\nBeehive (S$15+)\nLemon, thyme and ly...",Feb 26 at 12:44pm,https://www.burpple.com/tigerlily-patisserie?b...,alamakgirl,1,beehive s15 lemon thyme lychee honey jelly lig...


## Doc2Vec

In [3]:
tokenizer = RegexpTokenizer(r'\w+')

def nlp_clean(data):
    new_data = []
    for d in data:
        new_str = d.lower()
        dlist = tokenizer.tokenize(new_str)
        new_data.append(dlist)
        # new_list = []
        # for token in dlist:
        #     word, pos = nltk.pos_tag([token])[0]
        #     if pos != 'NN' and pos != 'NNS' and pos != 'NNP' and pos != 'NNPS': #remove nouns that do not determine sentiments
        #         new_list.append(word)
        # new_data.append(new_list)
    return new_data

class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
              yield gensim.models.doc2vec.TaggedDocument(doc, [self.labels_list[idx]])

In [4]:
# tokenize reviews
tokenized_reviews = nlp_clean(reviewer_reviews_df['cleaned_text'])

In [5]:
bigram = gensim.models.Phrases(tokenized_reviews) # higher threshold fewer phrases.
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
sentences = bigram_mod[tokenized_reviews]

model = gensim.models.Word2Vec(vector_size=1000, min_count=5, alpha=0.025, min_alpha=0.025, seed=123) #### TO TUNE
model.build_vocab(sentences)
model.train(sentences, total_examples= model.corpus_count, epochs = 10, start_alpha=0.002, end_alpha=-0.016)


(1700825, 1942820)

## Kmeans

In [6]:
kmeans = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=model.wv.vectors.astype('double'))

In [7]:
# Positive words
model.wv.similar_by_vector(kmeans.cluster_centers_[0], topn=100, restrict_vocab=None)

[('make', 0.9999999403953552),
 ('taste', 0.9999997615814209),
 ('enjoy', 0.9999997615814209),
 ('serve', 0.9999997615814209),
 ('not', 0.9999997019767761),
 ('egg', 0.9999997019767761),
 ('cheese', 0.9999997019767761),
 ('available', 0.9999997019767761),
 ('rice', 0.9999997019767761),
 ('sweet', 0.9999996423721313),
 ('crispy', 0.9999995827674866),
 ('there', 0.9999995827674866),
 ('meat', 0.9999995827674866),
 ('fill', 0.9999995827674866),
 ('pork', 0.9999995231628418),
 ('u', 0.9999995231628418),
 ('sauce', 0.9999995231628418),
 ('cake', 0.9999995231628418),
 ('tender', 0.9999995231628418),
 ('menu', 0.9999995231628418),
 ('like', 0.9999995231628418),
 ('dish', 0.9999995231628418),
 ('flavour', 0.9999995231628418),
 ('soup', 0.999999463558197),
 ('quite', 0.999999463558197),
 ('chef', 0.999999463558197),
 ('fresh', 0.999999463558197),
 ('good', 0.999999463558197),
 ('new', 0.999999463558197),
 ('texture', 0.999999463558197),
 ('use', 0.999999463558197),
 ('didnt', 0.9999994039535522

In [8]:
# negative words
model.wv.similar_by_vector(kmeans.cluster_centers_[1], topn=100, restrict_vocab=None)

[('taste', 0.9999997615814209),
 ('serve', 0.9999997615814209),
 ('enjoy', 0.9999997019767761),
 ('not', 0.9999997019767761),
 ('rice', 0.9999997019767761),
 ('there', 0.9999997019767761),
 ('menu', 0.9999997019767761),
 ('use', 0.9999996423721313),
 ('grill', 0.9999996423721313),
 ('love', 0.9999996423721313),
 ('restaurant', 0.9999996423721313),
 ('available', 0.9999996423721313),
 ('make', 0.9999995827674866),
 ('meat', 0.9999995827674866),
 ('butter', 0.9999995827674866),
 ('cheese', 0.9999995231628418),
 ('chicken', 0.9999995231628418),
 ('sweet', 0.9999995231628418),
 ('soup', 0.9999995231628418),
 ('like', 0.9999995231628418),
 ('food', 0.9999995231628418),
 ('flavour', 0.9999995231628418),
 ('good', 0.999999463558197),
 ('best', 0.999999463558197),
 ('quite', 0.999999463558197),
 ('tender', 0.999999463558197),
 ('fill', 0.999999463558197),
 ('dessert', 0.999999463558197),
 ('fresh', 0.999999463558197),
 ('mushroom', 0.999999463558197),
 ('chef', 0.999999463558197),
 ('texture',

In [9]:
words = pd.DataFrame()
words['words'] = model.wv.index_to_key # get all words
words['vectors'] = words['words'].apply(lambda x: model.wv[x])  # get embeddings for each word 
words['cluster'] = words['vectors'].apply(lambda x: 1 if kmeans.predict([x])[0]== 0 else -1) # get kmeans cluster for each word
words['closeness_score'] = words['vectors'].apply(lambda x: 1/(kmeans.transform([x]).min())) # minimum distance from each word to the center of the cluster. Closer to center = Stronger Positive/Negative Score
words['sentiment_coeff'] = words['closeness_score'] * words['cluster'] # negative score = negative sentiment, positive score = positive sentiment

In [10]:
words

Unnamed: 0,words,vectors,cluster,closeness_score,sentiment_coeff
0,not,"[-0.20506302, 0.5709192, -0.4291008, -0.116481...",-1,0.061030,-0.061030
1,sauce,"[-0.18723677, 0.5244518, -0.39323488, -0.10695...",-1,0.070024,-0.070024
2,good,"[-0.16635017, 0.4629707, -0.34783772, -0.09442...",-1,0.086607,-0.086607
3,flavour,"[-0.16395748, 0.45727187, -0.34328416, -0.0924...",-1,0.088629,-0.088629
4,dish,"[-0.16656905, 0.46609807, -0.35059276, -0.0935...",-1,0.085496,-0.085496
...,...,...,...,...,...
6007,moresweetthansmoky,"[-0.007788965, 0.021740288, -0.016395465, -0.0...",1,0.705439,0.705439
6008,anglaise,"[-0.0069757467, 0.018018488, -0.013356992, -0....",1,0.610357,0.610357
6009,fung,"[-0.005109146, 0.0142147, -0.011513885, -0.002...",1,0.558445,0.558445
6010,tote,"[-0.008581543, 0.023909938, -0.017610073, -0.0...",1,0.723859,0.723859


In [11]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

## Document Sentiments

In [12]:
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [13]:
results = reviewer_reviews_df.copy()
results

Unnamed: 0,name,address,review,date,link,reviewer,keep,cleaned_text
0,Kazoku Japanese Cuisine,"1 Goldhill Plaza, Singapore ...",\n1-for-1 Don\nKazoku Chirashi Don (S$29.90++)...,4d ago,https://www.burpple.com/kazoku-japanese-cuisin...,alamakgirl,1,kazoku chirashi s2990 thick slice tuna salmon ...
1,Tigerlily Patisserie,"350 Joo Chiat Road, Singapore ...","\nBrunch\nBeehive (S$15+)\nLemon, thyme and ly...",Feb 26 at 12:44pm,https://www.burpple.com/tigerlily-patisserie?b...,alamakgirl,1,beehive s15 lemon thyme lychee honey jelly lig...
2,Putien (Northpoint City) ...,"930 Yishun Avenue 2, Singapore ...",\nBirthday Treat \n20% discount \nValid during...,Feb 24 at 10:47pm,https://www.burpple.com/putien-8?bp_ref=%2Ff%2...,alamakgirl,1,20 discount valid birthday month member starte...
3,Our Tampines Hub Hawker Centre (...,"1 Tampines Walk, Singapore ...",\nSet C\nSet C (S$2.50)\n‘Cos it’s Friday \nGo...,Feb 24 at 8:33am,https://www.burpple.com/our-tampines-hub?bp_re...,alamakgirl,1,set c s250 co friday s250 cashback pay paylah ...
4,Hokkaido Ramen Santouka (Clarke ...,"6 Eu Tong Sen Street, Singapore ...",\nBirthday Treat\n50% off Tokusen Toroniku Ram...,Feb 19 at 12:27pm,https://www.burpple.com/hokkaido-ramen-santouk...,alamakgirl,1,50 tokusen toroniku ramen s23 s1150 need redee...
...,...,...,...,...,...,...,...,...
5356,Paradise Dynasty (Westgate) ...,"3 Gateway Drive, Singapore ...","\nBaby Spinach Vermicelli 5.5++\nAgain, light ...","Jan 2, 2020",https://www.burpple.com/paradise-dynasty-10?bp...,thefoodcompendium,1,light somehow satisfy mom love esp sweetness s...
5357,Paradise Dynasty (Westgate) ...,"3 Gateway Drive, Singapore ...",\nChengdu Salivating Chicken 10.8++\nWow this ...,"Jan 2, 2020",https://www.burpple.com/paradise-dynasty-10?bp...,thefoodcompendium,1,wow pretty solid quite faithful classic flavou...
5358,Paradise Dynasty (Westgate) ...,"3 Gateway Drive, Singapore ...",\nStewed Bamboo Shoots 7.8++\nWow this wasnt w...,"Jan 2, 2020",https://www.burpple.com/paradise-dynasty-10?bp...,thefoodcompendium,1,wow wasnt expect lightly quite delicious mild ...
5359,Paradise Dynasty (Westgate) ...,"3 Gateway Drive, Singapore ...",\nDan Dan Mian 8.8++\nReally restaurant standa...,"Jan 2, 2020",https://www.burpple.com/paradise-dynasty-10?bp...,thefoodcompendium,1,restaurant standard basically play rather safe...


In [14]:
results['tokenized'] = tokenized_reviews
results['bigram'] = results['tokenized'].apply(lambda x: bigram_mod[x])
results['bigram'] = results['bigram'].apply(lambda x: ' '.join(x))

In [15]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
transformed = tfidf.fit_transform(results.bigram)
features = pd.Series(tfidf.get_feature_names())

In [16]:
def create_tfidf_dictionary(x, transformed_file, features):
    tfidf_words = transformed_file[x.name].tocoo() # get the tfidf values for all words in the review text
    tfidf_words.col = features.iloc[tfidf_words.col].values # get each word
    dictionary = dict(zip(tfidf_words.col, tfidf_words.data)) # dictionary of (word, tfidf value)
    return dictionary

def replace_tfidf_words(x, transformed_file, features):
    dictionary = create_tfidf_dictionary(x, transformed_file, features) 
    return list(map(lambda y:dictionary[y], x.bigram.split())) # replace each word in the review text with its tfidf value

In [17]:
replaced_tfidf_scores = results.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)#this step takes around 3-4 minutes minutes to calculate

In [18]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [19]:
replaced_closeness_scores = results.bigram.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [20]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, results.bigram, results.reviewer, results.review, results.link]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'reviewer', 'review', 'link']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')

In [21]:
replacement_df.to_csv('sentiments_generated.csv')

## Accuracy of Recommendations

In [22]:
doc2vec_recommendations = pd.read_csv('../recommendation/doc2vec_content_recommendation.csv', index_col = 0)
lda_recommendations = pd.read_csv('../recommendation/lda_content_recommendation.csv', index_col = 0)

In [23]:
import ast
doc2vec_recommendations['recommendations'] = doc2vec_recommendations['recommendations'].apply(lambda x: ast.literal_eval(x))
lda_recommendations['recommendations'] = lda_recommendations['recommendations'].apply(lambda x: ast.literal_eval(x))

In [24]:
def get_accuracy_counts(data):
    count = 0
    new_rest_count_list = []
    old_rest_correct_count_list = []
    old_rest_wrong_count_list = []

    for idx, row in data.iterrows():
        recommendations = row['recommendations']
        reviewer = row['reviewer']
        reviewer_sentiments = replacement_df[replacement_df['reviewer'] == reviewer]
        new_rest_count = 0
        old_rest_correct_count = 0
        old_rest_wrong_count = 0

        for recommendation in recommendations:
            result_df = reviewer_sentiments[reviewer_sentiments['link'] == recommendation]
            result_df = result_df.drop_duplicates(['link'])
            if len(result_df) == 0: # not inside the reviewers reviews. New restaurant
                new_rest_count+=1
            else:
                if int(result_df['prediction']) == 1:
                    old_rest_correct_count += 1

                else:
                    old_rest_wrong_count += 1

        new_rest_count_list.append(new_rest_count)
        old_rest_correct_count_list.append(old_rest_correct_count)
        old_rest_wrong_count_list.append(old_rest_wrong_count)

        count +=1
        if count%1000 == 0:
            print(count)
    data['new_rest_count'] = new_rest_count_list
    data['old_rest_correct_count'] = old_rest_correct_count_list
    data['old_rest_wrong_count'] = old_rest_wrong_count_list

    data['old_recomm_count'] = data['old_rest_correct_count']+data['old_rest_wrong_count']
    data['perc_correct_counts'] = data['old_rest_correct_count']/data['old_recomm_count']
    mean = data['perc_correct_counts'].mean()
    data['perc_new'] = data['new_rest_count']/5
    mean_new = data['perc_new'].mean()
    print(f'Percentage of Correct Counts:{mean}')
    print(f'Percentage of New Restaurants:{mean_new}')
    return data

In [25]:
doc2vec_recommendations = get_accuracy_counts(doc2vec_recommendations)

1000
2000
3000
4000
5000
Percentage of Correct Counts:0.7398706577974874
Percentage of New Restaurants:0.6086551016601368


In [26]:
doc2vec_recommendations.groupby('reviewer')[['perc_correct_counts', 'perc_new']].mean()

Unnamed: 0_level_0,perc_correct_counts,perc_new
reviewer,Unnamed: 1_level_1,Unnamed: 2_level_1
Eatravel,0.756571,0.629021
MightyFoodie,0.911364,0.60046
Vanessa_Kou,0.647126,0.866258
alamakgirl,0.728946,0.606601
juliuslim,0.87277,0.652983
thefoodcompendium,0.659996,0.488586
thiampeng,0.707316,0.679208
veronicaphua,0.930151,0.712442


In [27]:
lda_recommendations = get_accuracy_counts(lda_recommendations)

1000
2000
3000
4000
5000
Percentage of Correct Counts:0.7269715854291829
Percentage of New Restaurants:0.6030591307591884


In [28]:
lda_recommendations.groupby('reviewer')[['perc_correct_counts', 'perc_new']].mean()

Unnamed: 0_level_0,perc_correct_counts,perc_new
reviewer,Unnamed: 1_level_1,Unnamed: 2_level_1
Eatravel,0.751434,0.662238
MightyFoodie,0.817409,0.532414
Vanessa_Kou,0.627505,0.867485
alamakgirl,0.758138,0.615512
juliuslim,0.882902,0.653461
thefoodcompendium,0.657882,0.470528
thiampeng,0.652564,0.690347
veronicaphua,0.939092,0.706452


## Check robustness using VADER sentiments

In [29]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [30]:
sid_obj = SentimentIntensityAnalyzer()
results["Sentiment_score"] = results["bigram"].apply(lambda x: round(sid_obj.polarity_scores(x)['compound'],4))
results['Sentiment'] = results['Sentiment_score'].apply(lambda x:  1 if x>0  else 0 if x < 0 else 0)

In [31]:
replacement_df['vader'] = results['Sentiment']

In [32]:
len(replacement_df[replacement_df['vader'] == replacement_df['prediction']])/len(replacement_df)

0.6023130013057265