In [1]:
import pandas as pd
import numpy as np
from nltk import RegexpTokenizer
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')

import gensim

from sklearn.cluster import KMeans
from sklearn import metrics
import pylab as pl
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

import emoji
import regex as re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import pairwise_distances


import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
reviewer_reviews_df = pd.read_csv('../cleaning/cleaned_reviewer_reviews.csv', index_col=0).reset_index(drop=True)
reviewer_reviews_df.head(2)

Unnamed: 0,name,address,review,date,link,reviewer,keep,cleaned_text
0,Kazoku Japanese Cuisine,"1 Goldhill Plaza, Singapore ...",\n1-for-1 Don\nKazoku Chirashi Don (S$29.90++)...,4d ago,https://www.burpple.com/kazoku-japanese-cuisin...,alamakgirl,1,kazoku chirashi s2990 thick slice tuna salmon ...
1,Tigerlily Patisserie,"350 Joo Chiat Road, Singapore ...","\nBrunch\nBeehive (S$15+)\nLemon, thyme and ly...",Feb 26 at 12:44pm,https://www.burpple.com/tigerlily-patisserie?b...,alamakgirl,1,beehive s15 lemon thyme lychee honey jelly lig...


## Doc2Vec

In [3]:
tokenizer = RegexpTokenizer(r'\w+')

def nlp_clean(data):
    new_data = []
    for d in data:
        new_str = d.lower()
        dlist = tokenizer.tokenize(new_str)
        new_data.append(dlist)
        # new_list = []
        # for token in dlist:
        #     word, pos = nltk.pos_tag([token])[0]
        #     if pos != 'NN' and pos != 'NNS' and pos != 'NNP' and pos != 'NNPS': #remove nouns that do not determine sentiments
        #         new_list.append(word)
        # new_data.append(new_list)
    return new_data

class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
              yield gensim.models.doc2vec.TaggedDocument(doc, [self.labels_list[idx]])

In [4]:
# tokenize reviews
tokenized_reviews = nlp_clean(reviewer_reviews_df['cleaned_text'])

In [5]:
bigram = gensim.models.Phrases(tokenized_reviews) # higher threshold fewer phrases.
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
sentences = bigram_mod[tokenized_reviews]

model = gensim.models.Word2Vec(vector_size=1000, min_count=5, alpha=0.025, min_alpha=0.025, seed=123) #### TO TUNE
model.build_vocab(sentences)
model.train(sentences, total_examples= model.corpus_count, epochs = 10, start_alpha=0.002, end_alpha=-0.016)


(1745359, 1994950)

## Kmeans

In [6]:
kmeans = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=model.wv.vectors.astype('double'))

In [7]:
# positive words
model.wv.similar_by_vector(kmeans.cluster_centers_[1], topn=10, restrict_vocab=None)

[('cheese', 0.9999998211860657),
 ('taste', 0.9999998211860657),
 ('sauce', 0.9999997615814209),
 ('available', 0.9999997019767761),
 ('soup', 0.9999997019767761),
 ('serve', 0.9999996423721313),
 ('singapore_singapore', 0.9999996423721313),
 ('meat', 0.9999996423721313),
 ('like', 0.9999996423721313),
 ('restaurant', 0.9999996423721313)]

In [8]:
# negative words
model.wv.similar_by_vector(kmeans.cluster_centers_[0], topn=10, restrict_vocab=None)

[('restaurant', 0.9999998211860657),
 ('menu', 0.9999997615814209),
 ('sauce', 0.9999997615814209),
 ('crispy', 0.9999997019767761),
 ('try', 0.9999996423721313),
 ('good', 0.9999996423721313),
 ('flavour', 0.9999996423721313),
 ('serve', 0.9999996423721313),
 ('love', 0.9999996423721313),
 ('not', 0.9999996423721313)]

In [9]:
positive_cluster_index = 0
positive_cluster_center = kmeans.cluster_centers_[positive_cluster_index]
negative_cluster_center = kmeans.cluster_centers_[1-positive_cluster_index]

In [10]:
words = pd.DataFrame(model.wv.index_to_key)
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: model.wv[f'{x}'])  # words 
words['cluster'] = words.vectors.apply(lambda x: kmeans.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

In [11]:
words

Unnamed: 0,words,vectors,cluster
0,not,"[-0.19624005, 0.53075457, -0.35639185, -0.0945...",1
1,sauce,"[-0.19803816, 0.5389103, -0.36085296, -0.09584...",1
2,good,"[-0.16736849, 0.4524026, -0.30366668, -0.08051...",1
3,flavour,"[-0.17609513, 0.47711167, -0.31993493, -0.0840...",1
4,dish,"[-0.18176459, 0.4937847, -0.3317702, -0.086094...",1
...,...,...,...
6286,kacang,"[-0.007196612, 0.018632693, -0.013336318, -0.0...",0
6287,basilico,"[-0.015414443, 0.041045096, -0.027755762, -0.0...",0
6288,fuss,"[-0.010579281, 0.028247027, -0.01899261, -0.00...",0
6289,spotlight,"[-0.009490893, 0.02812152, -0.0174913, -0.0054...",0


In [12]:
words['cluster_value'] = [1 if i==positive_cluster_index else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(kmeans.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [13]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

## Document Sentiments

In [14]:
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [15]:
file_weighting = reviewer_reviews_df.copy()
file_weighting

Unnamed: 0,name,address,review,date,link,reviewer,keep,cleaned_text
0,Kazoku Japanese Cuisine,"1 Goldhill Plaza, Singapore ...",\n1-for-1 Don\nKazoku Chirashi Don (S$29.90++)...,4d ago,https://www.burpple.com/kazoku-japanese-cuisin...,alamakgirl,1,kazoku chirashi s2990 thick slice tuna salmon ...
1,Tigerlily Patisserie,"350 Joo Chiat Road, Singapore ...","\nBrunch\nBeehive (S$15+)\nLemon, thyme and ly...",Feb 26 at 12:44pm,https://www.burpple.com/tigerlily-patisserie?b...,alamakgirl,1,beehive s15 lemon thyme lychee honey jelly lig...
2,Putien (Northpoint City) ...,"930 Yishun Avenue 2, Singapore ...",\nBirthday Treat \n20% discount \nValid during...,Feb 24 at 10:47pm,https://www.burpple.com/putien-8?bp_ref=%2Ff%2...,alamakgirl,1,20 discount valid birthday month member starte...
3,Our Tampines Hub Hawker Centre (...,"1 Tampines Walk, Singapore ...",\nSet C\nSet C (S$2.50)\n‘Cos it’s Friday \nGo...,Feb 24 at 8:33am,https://www.burpple.com/our-tampines-hub?bp_re...,alamakgirl,1,set c s250 co friday s250 cashback pay paylah ...
4,Hokkaido Ramen Santouka (Clarke ...,"6 Eu Tong Sen Street, Singapore ...",\nBirthday Treat\n50% off Tokusen Toroniku Ram...,Feb 19 at 12:27pm,https://www.burpple.com/hokkaido-ramen-santouk...,alamakgirl,1,50 tokusen toroniku ramen s23 s1150 need redee...
...,...,...,...,...,...,...,...,...
5356,Paradise Dynasty (Westgate) ...,"3 Gateway Drive, Singapore ...","\nBaby Spinach Vermicelli 5.5++\nAgain, light ...","Jan 2, 2020",https://www.burpple.com/paradise-dynasty-10?bp...,thefoodcompendium,1,light somehow satisfy mom love esp sweetness s...
5357,Paradise Dynasty (Westgate) ...,"3 Gateway Drive, Singapore ...",\nChengdu Salivating Chicken 10.8++\nWow this ...,"Jan 2, 2020",https://www.burpple.com/paradise-dynasty-10?bp...,thefoodcompendium,1,wow pretty solid quite faithful classic flavou...
5358,Paradise Dynasty (Westgate) ...,"3 Gateway Drive, Singapore ...",\nStewed Bamboo Shoots 7.8++\nWow this wasnt w...,"Jan 2, 2020",https://www.burpple.com/paradise-dynasty-10?bp...,thefoodcompendium,1,wow wasnt expect lightly quite delicious mild ...
5359,Paradise Dynasty (Westgate) ...,"3 Gateway Drive, Singapore ...",\nDan Dan Mian 8.8++\nReally restaurant standa...,"Jan 2, 2020",https://www.burpple.com/paradise-dynasty-10?bp...,thefoodcompendium,1,restaurant standard basically play rather safe...


In [16]:
file_weighting['tokenized'] = tokenized_reviews
file_weighting['bigram'] = file_weighting['tokenized'].apply(lambda x: bigram_mod[x])
file_weighting['bigram'] = file_weighting['bigram'].apply(lambda x: ' '.join(x))

In [17]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.bigram)
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.bigram)

In [18]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    '''
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features) 
    print(x.bigram.split())  
    return list(map(lambda y:dictionary[f'{y}'], x.bigram.split()))

In [19]:
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)#this step takes around 3-4 minutes minutes to calculate

['kazoku', 'chirashi', 's2990', 'thick_slice', 'tuna', 'salmon', 'swordfish', 'scallop', 'octopus', 'sashimi', 'omelette', 'salmon', 'fish_roe', 'rice', 'prefers', 'thin_slice', 'sashimi', 'check', 'kazoku', 's1290', 'similar', 'dish', 'thin_slice', 'sashimi', 'plus', 'easy', 'wallet', 'aburi', 'mentai', 'salmon', 's1190', 'torched', 'salmon_sashimi', 'rice', 'topped', 'strip', 'mentaiko', 'mayonnaise', 'fish_roe', 'dont_forget', 'use', '1for1', 'bento', 'beyond', 'member', 'sirloin', 'steak', 'salmon_sashimi', 's1990', 'select', 'doneness', 'steak', 'steak', 'serve', 'medium', 'quite', 'juicy_tender', 'aburi', 'mentai', 'ebi', 'salmon', 'roll', 's1590', 'piece', 'sushi', 'roll', 'stuffed', 'prawn', 'crispy', 'bread', 'crust', 'crabstick', 'wrap', 'torched', 'salmon', 'topped', 'mentaiko', 'mayonnaise', 'fly_fish', 'roe', 'yummy', 'combination', 'spicy', 'sashimi', 'carpaccio', 's1390', 'tuna', 'salmon', 'swordfish', 'scallop', 'octopus', 'sashimi', 'topped', 'spicy', 'thai', 'seafood'

In [20]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [21]:
replaced_closeness_scores = file_weighting.bigram.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [22]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.bigram, file_weighting.reviewer, file_weighting.review, file_weighting.link]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'reviewer', 'review', 'link']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')

In [23]:
replacement_df[replacement_df['prediction'] == 0]['review'][3327] # example of negative review

"\nTruffle Fries & Cauliflower Almond Poppers\nThe deep-fried cauliflower and almond was so interesting and came with a cashew dip. The fries though were a joke. Around the price of PS cafe's but maybe one third the serving.\n"

In [29]:
replacement_df[replacement_df['prediction'] == 1]['review'][5357] # example of positive review

"\nChengdu Salivating Chicken 10.8++\nWow this was pretty solid, and quite faithful to the Classic flavours. Not a lot of sauce but it was really intense, and the cold chicken was v tender. Got a nice kick to it as well, really well executed and definitely recommended if you're looking for something authentic\n"

## Accuracy of Recommendations

In [30]:
doc2vec_recommendations = pd.read_csv('../recommendation/doc2vec_content_recommendation.csv', index_col = 0)
lda_recommendations = pd.read_csv('../recommendation/lda_content_recommendation.csv', index_col = 0)

In [31]:
import ast
doc2vec_recommendations['recommendations'] = doc2vec_recommendations['recommendations'].apply(lambda x: ast.literal_eval(x))
lda_recommendations['recommendations'] = lda_recommendations['recommendations'].apply(lambda x: ast.literal_eval(x))

In [32]:
def get_accuracy_counts(data):
    count = 0
    new_rest_count_list = []
    old_rest_correct_count_list = []
    old_rest_wrong_count_list = []

    for idx, row in data.iterrows():
        recommendations = row['recommendations']
        reviewer = row['reviewer']
        reviewer_sentiments = replacement_df[replacement_df['reviewer'] == reviewer]
        new_rest_count = 0
        old_rest_correct_count = 0
        old_rest_wrong_count = 0

        for recommendation in recommendations:
            result_df = reviewer_sentiments[reviewer_sentiments['link'] == recommendation]
            result_df = result_df.drop_duplicates(['link'])
            if len(result_df) == 0: # not inside the reviewers reviews. New restaurant
                new_rest_count+=1
            else:
                if int(result_df['prediction']) == 1:
                    old_rest_correct_count += 1

                else:
                    old_rest_wrong_count += 1

        new_rest_count_list.append(new_rest_count)
        old_rest_correct_count_list.append(old_rest_correct_count)
        old_rest_wrong_count_list.append(old_rest_wrong_count)

        count +=1
        if count%1000 == 0:
            print(count)
    data['new_rest_count'] = new_rest_count_list
    data['old_rest_correct_count'] = old_rest_correct_count_list
    data['old_rest_wrong_count'] = old_rest_wrong_count_list

    data['old_recomm_count'] = data['old_rest_correct_count']+data['old_rest_wrong_count']
    data['perc_correct_counts'] = data['old_rest_correct_count']/data['old_recomm_count']
    mean = data['perc_correct_counts'].mean()
    print(f'Percentage of Correct Counts:{mean}')

In [33]:
get_accuracy_counts(doc2vec_recommendations)

1000
2000
3000
4000
5000
Percentage of Correct Counts:0.6985944683095886


In [35]:
get_accuracy_counts(lda_recommendations)

1000
2000
3000
4000
5000
Percentage of Correct Counts:0.7003980654761867
