# Imports

In [1]:
import pandas as pd
import json
import numpy as np
import gensim
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.wrappers import LdaMallet
import operator 
import joblib
from nltk.tokenize import word_tokenize
import nltk
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import string

In [2]:
# configs
mallet_path = './Data/Mallet/bin/mallet'

# Topic Modelling with LDA Mallet
* Get the topics from every review
* takes the longest time

In [31]:
review_df = pd.read_pickle("./Data/review.pkl")  

In [32]:
def get_topic_model(df):
    
    # tokenize
    docs=df['text'].apply(word_tokenize)
    # a mapping between words and their integer ids
    d= gensim.corpora.Dictionary(docs)
    # convert docs to vec
    v = [d.doc2bow(doc) for doc in docs]
    
    # LDA mallet
    ldamallet= gensim.models.wrappers.LdaMallet(mallet_path, corpus = v, num_topics = 8, id2word = d)
    
    return ldamallet

In [None]:
# unhash if need to retrain
# ldamallet = get_topic_model(review_df)
# joblib.dump(ldamallet, 'reviews_ldamallet.jl')

# load model
ldamallet_disk = joblib.load('reviews_ldamallet.jl')

In [9]:
#map the reviews data to the model, v is the reviews transformed to word vectors by doc2bow
m = ldamallet[v[0:len(v)]]

#assign topic to each review
topic = []
for x in m:
    #find the topic with the highest proportions
    t = max(x, key = operator.itemgetter(1))
    topic.append(t[0])
    
top_10k = pd.Series(topic)

In [21]:
#map topic names to the topic numbers
topic_dict = {0:'Atmosphere', 1:'Food', 2:'Service',
             3:'Food', 4:'Waiting time', 5:'Food',
             6:'Food', 7:'Hospitality'}

top_10k = top_10k.map(topic_dict)

In [23]:
top_10k.to_csv("./Data/top_10k_mallet.csv")

# Sentiment Analysis
* score the topics from the restaurants
* create dataset with only restaurants


## df construction

In [46]:
#load top_10k_mallet.csv with sub topics
lda_mallet = pd.read_csv('./Data/top_10k_mallet.csv')
#load english data'set with around 10k rows
review_df = pd.read_pickle("./Data/review.pkl")  
business_df = pd.read_pickle("./Data/business.pkl")  

In [48]:
top_10k = review_df.merge(business_df, on='business_id', how='inner')

In [49]:
lda_mallet['Unnamed: 0'] = top_10k.name
lda_mallet.rename(columns = {'Unnamed: 0':'name', '0':'topic'}, inplace = True)
lda_mallet.head()

Unnamed: 0,name,topic
0,Pho Bistro,Food
1,Pho Bistro,Food
2,Pho Bistro,Atmosphere
3,Pho Bistro,Service
4,Pho Bistro,Hospitality


In [50]:
# leave only restaurants
top_10k = top_10k.dropna()
top_10k = top_10k[top_10k['categories'].str.contains("Restaurant")]

## Standard Sentiment Analysis

In [51]:
#Standard Sentiment Analysis

#load pos_lexicon and neg_lexicon for standard sentiment analysis
pos_lexicon = './Data/positive-words.txt'
neg_lexicon = './Data/negative-words.txt'

#subset the required columns
top_10k_standard = top_10k.loc[:, ['name','business_id', 'text']]

#split the reviews into texts by space
top_10k_standard['text_sep'] = top_10k_standard['text'].map(lambda x:x.split())

#create sentiment lexicons
def create_dictionary(path):
 
    dictionary = {}
    f = open(path, 'r', encoding = "ISO-8859-1")
    for line in f:
        line = line.strip()
        dictionary[line] = 1

    f.close()
    return dictionary
    


#calculate sentiment score
def sentiment_score(dictionary):

    score = top_10k_standard['text_sep'].map(lambda row: list(map(lambda x: 1 if x in dictionary else 0, row)))
    score_list = []

    for row in score:
        score_list.append(sum(row))
        
    score_df = pd.DataFrame(score_list)
    return score_df


In [52]:
#create positive and negative lexicons 
pos_dict = create_dictionary(pos_lexicon)
neg_dict = create_dictionary(neg_lexicon)

#calculate sentiment score for each review
top_10k_standard['pos_score'] = sentiment_score(pos_dict)
top_10k_standard['neg_score'] = sentiment_score(neg_dict)

#normalize the score to 0-5
top_10k_standard['topic_score'] = top_10k_standard['pos_score'] / (top_10k_standard['pos_score'] + abs(top_10k_standard['neg_score']))*5

In [53]:
def final_score_standard(dataset):

    reviews_score = dataset['topic_score']
    reviews_score = pd.DataFrame(reviews_score)
    
    #add new columns to lda_mallet to have individual review score
    lda_mallet['topic_score'] = reviews_score
    top_10k_score = lda_mallet.loc[:, ['name', 'topic', 'topic_score']]
    
    #calculate the average ratings for each topic
    top_10k_score = round(abs((top_10k_score.groupby(['name',  'topic']).sum() /
                    (top_10k_score.groupby(['name', 'topic']).count() + 1))), 1)
    
    pd.set_option('display.max_rows', 100)
    return top_10k_score

final_score_standard = final_score_standard(top_10k_standard)
final_score_standard

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_score
name,topic,Unnamed: 2_level_1
Joe's Throwback Barber Shop,Food,0.0
Joe's Throwback Barber Shop,Hospitality,0.0
Joe's Throwback Barber Shop,Service,0.0
Leland's Barbershop,Atmosphere,0.0
Leland's Barbershop,Food,0.0
...,...,...
ā café,Waiting time,2.0
ōLiv Tucson,Atmosphere,0.0
ōLiv Tucson,Food,0.0
ōLiv Tucson,Hospitality,0.0
