<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Model-Building-on-Reviews-to-predict-the-Aspects" data-toc-modified-id="Model-Building-on-Reviews-to-predict-the-Aspects-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Model Building on Reviews to predict the Aspects</a></span><ul class="toc-item"><li><span><a href="#Importing-libraries" data-toc-modified-id="Importing-libraries-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Importing libraries</a></span></li><li><span><a href="#Pre-processing-on-reviews" data-toc-modified-id="Pre-processing-on-reviews-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Pre-processing on reviews</a></span></li></ul></li><li><span><a href="#Sentiment-Analysis" data-toc-modified-id="Sentiment-Analysis-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Sentiment Analysis</a></span><ul class="toc-item"><li><span><a href="#Test-Codes" data-toc-modified-id="Test-Codes-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Test Codes</a></span></li></ul></li><li><span><a href="#Restaurant-Ratings" data-toc-modified-id="Restaurant-Ratings-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Restaurant Ratings</a></span></li><li><span><a href="#User-weights" data-toc-modified-id="User-weights-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>User weights</a></span></li><li><span><a href="#Aspect-Dictionary-and-User-Weights" data-toc-modified-id="Aspect-Dictionary-and-User-Weights-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Aspect Dictionary and User Weights</a></span><ul class="toc-item"><li><span><a href="#Combined-Dataset" data-toc-modified-id="Combined-Dataset-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Combined Dataset</a></span></li></ul></li></ul></div>

### Model Building on Reviews to predict the Aspects

#### Importing libraries

In [1]:
import os
#os.chdir(r"D:\Capstone Prep\Multi Label Classification")
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import warnings
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset
import numpy as np

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

from collections import Counter, defaultdict
import re
import gensim

import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
# from neuralcoref import Coref
# coref = Coref(nlp=spacy)
# nlp= spacy.load("en_core_web_sm")

warnings.filterwarnings('ignore')

In [2]:
tree = ET.parse("Restaurants_Train.xml", ET.XMLParser(encoding= "utf-8"))
root = tree.getroot()

In [3]:
labeled_reviews = []
for sentence in root.findall("sentence"):
    entry = {}
    aterms = []
    aspects = []
    sentiment = []
    if sentence.find("aspectTerms"):
        for aterm in sentence.find("aspectTerms").findall("aspectTerm"):
            aterms.append(aterm.get("term"))
    if sentence.find("aspectCategories"):
        for aspect in sentence.find("aspectCategories"):
            aspects.append(aspect.get("category"))
        for aspect in sentence.find("aspectCategories"):
            sentiment.append(aspect.get("polarity"))
            
    entry["text"], entry["terms"], entry["aspects"], entry["sentiment"] = sentence[0].text, aterms, aspects, sentiment
    labeled_reviews.append(entry)
labeled_df = pd.DataFrame(labeled_reviews)
print("We have", len(labeled_reviews), "labeled reviews.")

We have 3044 labeled reviews.


In [4]:
# Save annotated reviews in a pickle file
labeled_df.to_pickle("annotated_reviews_df.pkl")

# Read annotated reviews df -> labeled dataset for training
annotated_reviews_df = pd.read_pickle("annotated_reviews_df.pkl")
annotated_reviews_df.head()

Unnamed: 0,text,terms,aspects,sentiment
0,But the staff was so horrible to us.,[staff],[service],[negative]
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]","[positive, negative]"
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food],[positive]
3,Where Gabriela personaly greets you and recomm...,[],[service],[positive]
4,"For those that go once and don't enjoy it, all...",[],[anecdotes/miscellaneous],[positive]


In [5]:
# Convert the multi-labels into arrays
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(annotated_reviews_df.aspects) # aspects
X = annotated_reviews_df["text"] # reviews

# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size= 0.25, random_state= 0)

# save the the fitted binarizer labels
filename = 'mlb.pkl'
pickle.dump(mlb, open(filename, 'wb'))

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2283,)
(761,)
(2283, 5)
(761, 5)


#### Pre-processing on reviews

In [6]:
# Preprocessing methods: # https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# https://towardsdatascience.com/tf-idf-explained-and-python-sklearn-implementation-b020c5e83275

# Text preprocessing, tokenizing and filtering of stopwords

# CountVectorizer supports counts of N-grams of words or consecutive characters. Once fitted, the vectorizer has built a 
# dictionary of feature indices

# TF-IDF: Term Frequency—Inverse Document Frequency: defines how important a word is for a document, while also taking into 
# account the relation to other documents from the same corpus. TF-IDF is a score which is applied to every word in every 
# document in our dataset. And for every word, the TF-IDF value increases with every appearance of the word in a document, 
# but is gradually decreased with every appearance in other documents. 

# CountVectorizer: To transform a count matrix to a normalized tf or tf-idf representation

# LabelPowerset allows for multi-label classification
# Build a pipeline for multinomial naive bayes classification

text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Calculate accuracy
np.mean(predicted == y_test)

0.8662286465177398

In [7]:
# Test if SVM performs better
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', LabelPowerset(
                             SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, max_iter=6, random_state=42)))])
_ = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)

#Calculate accuracy
np.mean(predicted_svm == y_test)

0.8633377135348226

In [9]:
# Train naive bayes on full dataset and save model
text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),])
text_clf = text_clf.fit(X, y)

# save the model to disk
filename = 'naive_model1.pkl'
pickle.dump(text_clf, open(filename, 'wb'))

In [10]:
# Prediction:

#mlb.inverse_transform(predicted)
pred_df = pd.DataFrame(
    {'reviews': X_test,
     'pred_category': mlb.inverse_transform(predicted)
    })

In [11]:
# pd.set_option('display.max_colwidth', -1)
pred_df

Unnamed: 0,reviews,pred_category
453,It's better than being on the roof of Sutton P...,"(anecdotes/miscellaneous,)"
1611,"Don't expect to sit down inside though, there ...","(ambience,)"
2078,"Again, if you are in this neighborhood - by al...","(anecdotes/miscellaneous,)"
2715,Go there to relax and feel like your somewhere...,"(anecdotes/miscellaneous,)"
2602,"As far as the service goes, the waitresses wer...","(service,)"
...,...,...
1903,Its worth the wait though.,"(anecdotes/miscellaneous,)"
2262,"Saul is pretty good, but definitely not great.","(anecdotes/miscellaneous,)"
666,"The food was very good, a great deal, and the ...","(food,)"
2441,"The vibe is very relaxed and cozy, service was...","(food, service)"


In [12]:
pred_df.to_csv('pred_df.csv',index=False)

### Sentiment Analysis

In [13]:
# Loading positive and negative words

neg_file = open("neg_words.txt", encoding = "ISO-8859-1")
pos_file = open("pos_words.txt", encoding = "ISO-8859-1")

neg = [line.strip() for line in neg_file.readlines()] # Readlines returns a list of the lines in the file
pos = [line.strip() for line in pos_file.readlines()]

opinion_words = neg + pos

# Run below code only if running for the first time

# Word2Vec consists of models for generating word embedding. 
# Words that occur in similar context tend to be closer to each other in vector space


 
# glove_input_file = 'glove.6B.100d.txt' # A pre-trained model for sentiment analysis #comment this after running once

glove_vec_file = "glove.6B.100d.txt.word2vec" 
word2vec = gensim.models.KeyedVectors.load_word2vec_format(glove_vec_file, binary= False)
# KeyedVectors:  a mapping between keys and vectors.

pickle.dump(word2vec, open("word2vec_glove.pkl", "wb"))
    
# load above saved word embedding
word2vec = pickle.load(open("word2vec_glove.pkl", "rb"))

# load the multi label binarizer from the aspect model that we've build above
mlb = pickle.load(open("mlb.pkl", "rb"))

# load the fitted Naive Bayes Model
naive_model1 = pickle.load(open("naive_model1.pkl", "rb"))

In [14]:
# Classes in Multi Label Binarizer
mlb.classes_

array(['ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service'],
      dtype=object)

In [15]:
def check_similarity(aspects, word):
    # checks for similarity between the aspect and given word and returns the most similar aspect for the given word
    similarity = []
    for aspect in aspects:
        similarity.append(word2vec.n_similarity([aspect], [word]))
        
        # setting threshold for maximum similarity
    if max(similarity) > 0.18:
        return aspects[np.argmax(similarity)] # Returns the indices of the maximum value
    else:
        return None
    
    
def assign_term_to_aspect(aspect_sent, terms_dict, sent_dict, pred):
    
    # This function takes in a Sentiment dictionary and appends the aspect dictionary 
    # aspect_sent: Total sentiment tally
    # terms_dict: Dictionary with individual aspects and their associated sentiments
    # sent_dict: Counter of the form : Counter(term: sentiment score)
    # returns two types of aspect dictionaries: updated terms_dict and aspect_sent
    
    aspects = ["ambience", "food", "price", "service"]
    
    # checking word2vec
    
    for term in sent_dict:
        try:
            
            if check_similarity(aspects, term.split()[-1]): # use .split() because word2vec can't process Compund Nouns
                terms_dict[check_similarity(aspects, term.split()[-1])][term] += sent_dict[term]
#                 sent_dict[term]
                if sent_dict[term] > 0:
                    aspect_sent[check_similarity(aspects, term.split)]["pos"] += sent_dict[term]
                else:
                    aspect_sent[check_similarity(aspects, term.split)]["neg"] += abs(sent_dict[term])
            elif(pred[0] ==  "anecdotes/miscellaneous"):
                continue
            elif(len(pred) == 1):
                terms_dict[pred[0]][term] += sent_dict[term]
#                 sent_dict[term]
                if sent_dict[term] > 0:
                    aspect_sent[pred[0]]["pos"] += sent_dict[term]
                else:
                    aspect_sent[pred[0]]["neg"] += abs(sent_dict[term])
                    
            # if unable to classify then put it in miscellaneous bucket
            
            else:
                terms_dict["misc"][term] += sent_dict[term]
#                 sent_dict[term]
                if sent_dict[term] > 0:
                    aspect_sent["misc"]["pos"] += sent_dict[term]
                else:
                    aspect_sent["misc"]["neg"] += abs(sent_dict[term])
                    
        except:
#             print(term, "Not in vocab")
            continue
    return aspect_sent, terms_dict


def feature_sentiment(sentence):
    # input: dictionary and sentence
    # this function appends dictionary with new features if features didn't exist previously then updates sentiments
    # to each of the new and oexistingld features
    # returns updated dictionary
    
    sent_dict = Counter()
    sentence = nlp(sentence)
    debug = 0
    for token in sentence:
        # check if word is an opinion word then assign a sentiment
        if token.text in opinion_words:
            sentiment = 1 if token.text in pos else -1
            # if target is an adverb modifier (eg: pretty, highly, etc.) but also an opinion word, ignore and pass
            if token.dep_ == "advmod":
                continue
            elif token.dep_ == "amod": # opinion words that are adjectives, verbs, adverbs, etc.
                sent_dict[token.head.text] += sentiment
                
            else:
                for child in token.children:
                    #  It checks for child tokens for each adjective and picks up the adverbs
                    # if there is an adjective modifier (eg: pretty, very, etc.), then add more weight to sentiment
                    if((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # add negative sentiment for negative words
                    if child.dep_ == "neg":
                        sentiment *= -1
                    
                for child in token.children:
                    # if it's a verb then check if it's a direct object
                    # direct object is the noun or noun phrase that's receiving the action of the verb
                    if (token.pos_ == "VERB") & (child.dep_ == "dobj"):
                        sent_dict[child.text] += sentiment
                    
                        # check for conjugates (both a and b) and add them to dictionary
                        subchildren = []
                        conj = 0
                        for subchild in child.children:
                            if subchild.text == "and":
                                conj = 1
                            if (conj == 1) and (subchild.text != "and"):
                                subchildren.append(subchild.text)
                                conj = 0
                        for subchild in subchildren:
                            sent_dict[subchild] += sentiment
                            
                # check for negation
                for child in token.head.children:
                    noun = ""
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if (child.dep_ == "neg"): 
                        sentiment *= -1
                        
                # check for nouns
                for child in token.head.children:
                    noun = ""
                    if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                        noun = child.text
                        # Also, check for compound nouns
                        for subchild in child.children:
                            if subchild.dep_ == "compound":
                                noun = subchild.text + " " + noun
                        sent_dict[noun] += sentiment
                    debug += 1
                
    return sent_dict

def classify_and_sent(sentence, aspect_sent, terms_dict):
    # classifies the sentence into a category and assign a sentiment
    # aspect_dict: parent dictionary with all aspects
    # input the sentence and aspect dictionary which is going to be updated
    # output will be the updated aspect dictionary
    
    # classify the sentence using NB Classifier
    predicted = naive_model1.predict([sentence])
    pred = mlb.inverse_transform(predicted)
#     print(pred)
    # this will take your labels and transform them back to the classes with the encoding. 
    
    # get aspect names and repective sentiments in dictionary form
    sent_dict = feature_sentiment(sentence)
    
    # categorize the aspect names into given 4 aspects in aspect_dict
    aspect_sent, terms_dict = assign_term_to_aspect(aspect_sent, terms_dict, sent_dict, pred[0])
    return aspect_sent, terms_dict

# def replace_pronouns(text):
#     coref.one_shot_coref(text)
#     return coref.get_resolved_utterances()[0]



def split_sentence(text):
    # splits review into list of sentences using spacy's sentence parser
    
    review = nlp(text)
#     print(review)
    bag_sentence = []
    start = 0
    for token in review:
#         print(token) # words
#         print(token.i)
        if token.sent_start == 1:
#             print(token)
            bag_sentence.append(review[start: (token.i-1)])
            start = token.i # index
#             print(bag_sentence)
#             print("a")
            
        if token.i == len(review)-1:
            bag_sentence.append(review[start: (token.i+1)])
#             print("b")
#         print(bag_sentence)
    return bag_sentence

# remove special characters using regex
def remove_special_char(sentence):
    return re.sub(r"[^a-zA-Z0-9.',:;?]+", " ", sentence)

def review_pipe(review, aspect_sent, terms_dict= {"ambience": Counter(), "food": Counter(), "price": Counter(), 
                                                  "service": Counter(), "misc": Counter()}):
#     review = replace_pronouns(review)
    sentences = split_sentence(review)
#     print(sentences)
    for sentence in sentences:
        sentence = remove_special_char(str(sentence))
#         print("Sentences:", sentences)
#         print("Review:", review)
        aspect_sent, terms_dict = classify_and_sent(sentence.lower(), aspect_sent, terms_dict)
    return aspect_sent, terms_dict
    

#### Test Codes

In [16]:
# test code for feature_sentiment

# test1
# sentence = "As far as the service goes, the waitresses were not particularly friendly, but they got the job done."
sentence = "I came here with my friends on a Tuesday night. The sushi here is amazing. Our waiter was very helpful, but the music was terrible."
feature_sentiment(sentence)

Counter({'sushi': 1, 'waiter': 1, 'music': -1})

In [17]:
# test2

sentence= "This is place is in the bottom 1% of my Chinese food experiences.  The atmosphere in the restaurant is creepy and the food tastes bad."
feature_sentiment(sentence)

Counter({'atmosphere': -1, 'food': -1})

In [18]:
# test case1
terms_dict = {"ambience": Counter(), "food": Counter(), "price": Counter(), "service": Counter(), "misc": Counter()}
aspect_sent = {"ambience": Counter(), "food": Counter(), "price": Counter(), "service": Counter(), "misc": Counter()}
review = "This is place is in the bottom 1% of my Chinese food experiences.  The atmosphere in the restaurant is creepy and the food tastes bad."
review_pipe(review, aspect_sent, terms_dict)

({'ambience': Counter(),
  'food': Counter(),
  'price': Counter(),
  'service': Counter(),
  'misc': Counter()},
 {'ambience': Counter({'atmosphere': 1}),
  'food': Counter({'food': 1}),
  'price': Counter(),
  'service': Counter(),
  'misc': Counter()})

In [19]:
# test case 2
test1={'ambience':Counter(), 'food':Counter(), 'price':Counter(), 'service':Counter(),'misc':Counter()}
test2={'ambience':Counter(), 'food':Counter(), 'price':Counter(), 'service':Counter(),'misc':Counter()}
# review = "top notch"
review = "This place is amazing.  I love the food and the staff, everybody is so nice especially the owner. The food is fresh and very good quality."
review_pipe(review, test1, test2)

({'ambience': Counter(),
  'food': Counter(),
  'price': Counter(),
  'service': Counter(),
  'misc': Counter()},
 {'ambience': Counter(),
  'food': Counter({'food': 2, 'quality': 1}),
  'price': Counter({'owner': 1}),
  'service': Counter({'place': 1, 'staff': 1}),
  'misc': Counter()})

In [20]:
# split_sentence("top notch")
split_sentence("I came here with my friends on a Tuesday night. The sushi here is amazing. Our waiter was very helpful, but the music was terrible.")

[I came here with my friends on a Tuesday night,
 The sushi here is amazing,
 Our waiter was very helpful, but the music was terrible.]

In [21]:
# use the word embeddings from word2vec to supplement the Naive Bayes Categorization of Aspects

# word2vec.n_similarity(["food"], ["sushi"]) # similarity b/w 2 terms

In [22]:
os.getcwd()

'C:\\Users\\Chief\\Documents\\Praxis\\Capstone Project\\Notebooks\\threshold'

### Restaurant Ratings

In [17]:
# for 1 lakh reviews
dataset = pd.read_csv("cleaned_reviews.csv")

In [18]:
# os.getcwd()

In [19]:
def scoringFunc(ratingsDict):
    if(ratingsDict['pos']+ratingsDict['neg'] == 0.0):
        return 0.0
    else:
        return (ratingsDict['pos']/(ratingsDict['pos']+ratingsDict['neg']))*5.0

restaurant_ratings = pd.DataFrame(columns=['restaurant','food_ratings','service_ratings','ambience_ratings','price_ratings',
                                           'misc_ratings'])

for res_id in dataset["business_id"].unique():
    
    terms_dict = {'ambience':Counter(), 'food':Counter(), 'price':Counter(), 'service':Counter(),'misc':Counter()}
    aspect_sent = {'ambience':Counter(), 'food':Counter(), 'price':Counter(), 'service':Counter(),'misc':Counter()}
    
    for review in dataset[dataset["business_id"] == res_id]["text"]:
        aspect_sent, terms_dict = review_pipe(review, aspect_sent, terms_dict)
    
    restaurant_ratings = restaurant_ratings.append({"restaurant" : res_id,
                                                    "food_ratings" : scoringFunc(aspect_sent['food']),
                                                    "service_ratings" : scoringFunc(aspect_sent['service']),
                                                    "ambience_ratings" : scoringFunc(aspect_sent['ambience']),
                                                    "price_ratings" : scoringFunc(aspect_sent['price'])
                                                    },ignore_index=True)

KeyboardInterrupt: 

### User weights

In [20]:
def scoringFunc(ratingsDict):
    if(ratingsDict['pos']+ratingsDict['neg'] == 0.0):
        return 0.0
    else:
        return (ratingsDict['pos']/(ratingsDict['pos']+ratingsDict['neg']))*100.0


user_weights = pd.DataFrame(columns=['user','food_weight','service_weight','ambience_weight','price_weight'
                                          ,'misc_weight'])

for user_id in dataset["user_id"].unique():
    
    terms_dict = {'ambience':Counter(), 'food':Counter(), 'price':Counter(), 'service':Counter(),'misc':Counter()}
    aspect_sent = {'ambience':Counter(), 'food':Counter(), 'price':Counter(), 'service':Counter(),'misc':Counter()}
    
    for review in dataset[dataset["user_id"] == user_id]["text"]:
        aspect_sent, terms_dict = review_pipe(review, aspect_sent, terms_dict)
    
    user_weights = user_weights.append({"user" : user_id,
                                                    "food_weight" : scoringFunc(aspect_sent['food']),
                                                    "service_weight" : scoringFunc(aspect_sent['service']),
                                                    "ambience_weight" : scoringFunc(aspect_sent['ambience']),
                                                    "price_weight" : scoringFunc(aspect_sent['price']),
                                                    "misc_weight" : scoringFunc(aspect_sent['misc'])
                                                    },ignore_index=True)
    
    
user_weights['Total'] = user_weights['food_weight'] + user_weights['service_weight'] + user_weights['ambience_weight'] + user_weights['price_weight']
user_weights['food_weight'] = round((user_weights['food_weight']/user_weights['Total']) * 100,2)
user_weights['service_weight'] = round((user_weights['service_weight']/user_weights['Total']) * 100,2)
user_weights['ambience_weight'] = round((user_weights['ambience_weight']/user_weights['Total']) * 100,2)
user_weights['price_weight'] = round((user_weights['price_weight']/user_weights['Total']) * 100,2)
del user_weights['Total']

In [21]:
user_weights.to_csv('user_weights_16052021_0.18.csv',index=False)

In [17]:
def scoringFunc(ratingsDict):
    if(ratingsDict['pos']+ratingsDict['neg'] == 0.0):
        return 0.0
    else:
        return (ratingsDict['pos']/(ratingsDict['pos']+ratingsDict['neg']))*100.0


user_weights = pd.DataFrame(columns=['user','food_weight','service_weight','ambience_weight','price_weight'
                                          ,'misc_weight'])

for user_id in dataset["user"].unique():
    
    terms_dict = {'ambience':Counter(), 'food':Counter(), 'price':Counter(), 'service':Counter(),'misc':Counter()}
    aspect_sent = {'ambience':Counter(), 'food':Counter(), 'price':Counter(), 'service':Counter(),'misc':Counter()}
    
    for review in dataset[dataset["user"] == user_id]["text"]:
        aspect_sent, terms_dict = review_pipe(review, aspect_sent, terms_dict)
    
    user_weights = user_weights.append({"user" : user_id,
                                                    "food_weight" : scoringFunc(aspect_sent['food']),
                                                    "service_weight" : scoringFunc(aspect_sent['service']),
                                                    "ambience_weight" : scoringFunc(aspect_sent['ambience']),
                                                    "price_weight" : scoringFunc(aspect_sent['price']),
                                                    "misc_weight" : scoringFunc(aspect_sent['misc'])
                                                    },ignore_index=True)
    
    
user_weights['Total'] = user_weights['food_weight'] + user_weights['service_weight'] + user_weights['ambience_weight'] + user_weights['price_weight']
user_weights['food'] = round((user_weights['food_weight']/user_weights['Total']) * 100,2)
user_weights['service'] = round((user_weights['service_weight']/user_weights['Total']) * 100,2)
user_weights['ambience_weight'] = round((user_weights['ambience_weight']/user_weights['Total']) * 100,2)
user_weights['price_weight'] = round((user_weights['price_weight']/user_weights['Total']) * 100,2)
del user_weights['Total']

### Aspect Dictionary and User Weights

In [25]:
# To get Sorted Aspect Dictionary and User Weights

aspectDictionary = {"ambience":{},"food":{},"price":{},"service":{},"misc":{}}
sorted_aspectDictionary = {"ambience":{},"food":{},"price":{},"service":{},"misc":{}}
user_weights = pd.DataFrame(columns=['user','food_weight','service_weight','ambience_weight','price_weight'])

def scoringFunc(ratingsDict):
    total = 0.0
    total = total + ratingsDict['pos'] + ratingsDict['neg']
    return round(total, 2)

def addToDictionary(terms_dict):
    aspects = list(aspectDictionary.keys())
    for aspect in aspects:
        for word in list(terms_dict[aspect].keys()):
            if word not in list(aspectDictionary[aspect].keys()):
                aspectDictionary[aspect][word] = terms_dict[aspect][word]
            else:
                aspectDictionary[aspect][word] += terms_dict[aspect][word]

for user_id in dataset["user_id"].unique():
    
    terms_dict = {'ambience':Counter(), 'food':Counter(), 'price':Counter(), 'service':Counter(),'misc':Counter()}
    aspect_sent = {'ambience':Counter(), 'food':Counter(), 'price':Counter(), 'service':Counter(),'misc':Counter()}
    
    for review in dataset[dataset["user_id"] == user_id]["text"]:
        aspect_sent, terms_dict = review_pipe(review, aspect_sent, terms_dict)
    
    user_weights = user_weights.append({"user" : user_id,
                                                    "food_weight" : scoringFunc(aspect_sent['food']),
                                                    "service_weight" : scoringFunc(aspect_sent['service']),
                                                    "ambience_weight" : scoringFunc(aspect_sent['ambience']),
                                                    "price_weight" : scoringFunc(aspect_sent['price'])
                                                    },ignore_index=True)
    addToDictionary(terms_dict)

for aspect in list(sorted_aspectDictionary.keys()):
    sorted_aspectDictionary[aspect] = dict(sorted(aspectDictionary[aspect].items(),key = lambda item: item[1],reverse= True))
    
user_weights['Total'] = user_weights['food_weight'] + user_weights['service_weight'] + user_weights['ambience_weight'] + user_weights['price_weight']
user_weights['food_weight'] = round((user_weights['food_weight']/user_weights['Total']) * 100,2)
user_weights['service_weight'] = round((user_weights['service_weight']/user_weights['Total']) * 100,2)
user_weights['ambience_weight'] = round((user_weights['ambience_weight']/user_weights['Total']) * 100,2)
user_weights['price_weight'] = round((user_weights['price_weight']/user_weights['Total']) * 100,2)
del user_weights['Total']

user_weights.to_csv('user_weights_15052021_0.30.csv',index=False)

import pickle
f = open("sorted_aspectDictionary.pkl","wb")
pickle.dump(sorted_aspectDictionary,f)
f.close()

In [27]:
user_weights2 = pd.read_pickle("sorted_aspectDictionary.pkl")
user_weights2_df = pd.DataFrame.from_dict(user_weights2)
user_weights2_df.to_csv("dictionary_0.3_dictionary.csv")
user_weights2

{'ambience': {'atmosphere': 5935,
  'flavor': 2875,
  'fun': 1896,
  'flavors': 1837,
  'decor': 1808,
  'ambiance': 1747,
  'seating': 1461,
  'vibe': 1438,
  'perfection': 1070,
  'texture': 1069,
  'brunch': 887,
  'hype': 873,
  'ambience': 827,
  'cafe': 684,
  'sweetness': 606,
  'delight': 526,
  'mess': 510,
  'smell': 504,
  'noise': 479,
  'lighting': 459,
  'patio': 458,
  'pleasure': 361,
  'liking': 356,
  'style': 317,
  'charm': 293,
  'diner': 266,
  'blend': 266,
  'feeling': 265,
  'quaint': 263,
  'brew': 259,
  'consistency': 244,
  'bathrooms': 230,
  'layout': 208,
  'chewy': 208,
  'vibes': 205,
  'mood': 185,
  'spicy': 182,
  'hassle': 177,
  'seasoning': 173,
  'amuse': 165,
  'cheesy': 163,
  'cleanliness': 160,
  'richness': 160,
  'bland': 155,
  'impression': 154,
  'chill': 148,
  'quieter': 147,
  'tastes': 140,
  'sense': 138,
  'ale': 137,
  'warmth': 129,
  'crowd': 127,
  'excitement': 123,
  'gran gusto': 115,
  'casual': 115,
  'hole': 112,
  'humo

In [18]:
# user_weights.to_csv('user_weights10_01052021_0.20.csv',index=False)

In [20]:
restaurant_ratings = pd.read_csv("restaurant_ratings_05052021_0.20.csv")
user_ratings = pd.read_csv("user_weights_glove_05052021_0.20.csv")
dataset.head()

Unnamed: 0,review_id,business_id,user_id,stars,text,date,useful,funny,cool,lang,businessname
0,0mOuimxEtWuv63REW07u5g,PYDzfxSLUCCwQysbOkFSVA,k0d3Jnxulohu1HdJj1Hfkg,4.0,This a great place to go to be social with a g...,2014-02-23 22:02:44,1,0,0,en,State Park
1,kCyJd4kZ_nzf67myR6ZbBA,CoZmZKv2lCYd-UoAsAUobA,u0x3SXagjYDbI2N4sgJ0Tw,4.0,This restaurant is delicious! I've had Ethiopi...,2015-10-07 21:21:56,0,0,0,en,Asmara Restaurant
2,vwaXupz9HVDC5hLqV5M6Eg,ch1ercqwoNLpQLxpTb90KQ,80MUDP_Ny_J8jeShVxzdlw,5.0,Great spot. Comfortable little joint smack dab...,2014-06-11 15:27:40,1,1,2,en,Boston Tea Stop
3,SjyigQPHeo5DEMt3Y0xXyQ,HxegWRjhi7m73mXRI8qQIg,p8yQsVA51dzkc9cecDpvrw,4.0,"I mostly eat here at lunch, for the tremendous...",2007-05-17 13:48:57,3,0,2,en,Rangzen Tibetan Place
4,0pjigXXVTpXvfdO4phZoGg,ch1ercqwoNLpQLxpTb90KQ,byro3oSQQ1gRESKlfiAqtQ,2.0,Highlights: Recently revisited BTS -- one of m...,2010-11-22 07:06:31,5,2,8,en,Boston Tea Stop


#### Combined Dataset

In [21]:
combined = dataset.merge(restaurant_ratings,left_on='business_id',right_on='restaurant').merge(user_ratings,left_on='user_id',right_on='user')
combined = combined[["review_id","business_id","food_ratings","service_ratings","ambience_ratings","price_ratings","user_id","food_weight","service_weight","ambience_weight","price_weight"]]
combined.to_csv("combined.csv",index=False)

In [22]:
pd.read_csv("combined.csv")

Unnamed: 0,review_id,business_id,food_ratings,service_ratings,ambience_ratings,price_ratings,user_id,food_weight,service_weight,ambience_weight,price_weight
0,0mOuimxEtWuv63REW07u5g,PYDzfxSLUCCwQysbOkFSVA,2.475728,2.000000,3.000000,0.000000,k0d3Jnxulohu1HdJj1Hfkg,100.0,0.0,0.0,0.0
1,5nodndnhlTRSNBtDtXb70w,PYDzfxSLUCCwQysbOkFSVA,2.475728,2.000000,3.000000,0.000000,x9Tu88OtnHpD4jZivjQH2w,,,,
2,JMp7GaSu3OAguLZQWc1BOg,PKnj9SK8M9aSrWpf0rQtPg,3.348624,3.518519,4.285714,5.000000,x9Tu88OtnHpD4jZivjQH2w,,,,
3,Ua0-TPKLFshpNOKQaVXBKw,FxveeHL_B0Kkz1KjPKyF3A,2.763158,1.702128,2.500000,0.000000,x9Tu88OtnHpD4jZivjQH2w,,,,
4,mMKHamzU0MvMoUKyXNthQA,A3D8gBGNmt51kvcI0GkaGQ,3.517007,2.068966,0.789474,1.666667,x9Tu88OtnHpD4jZivjQH2w,,,,
...,...,...,...,...,...,...,...,...,...,...,...
109098,qGDjnd8rcQLITdebxESRNw,ovIxZ7qvNT76tilujesmIA,3.200000,5.000000,0.000000,0.000000,z5EZMQOjb4O1GFWhX_L41w,100.0,0.0,0.0,0.0
109099,Fr08TOAfEzIIPLN6VF84ig,ovIxZ7qvNT76tilujesmIA,3.200000,5.000000,0.000000,0.000000,FJKJQEehh4oTTXzttoUF3Q,100.0,0.0,0.0,0.0
109100,IMZ9_xIOoxW9adXZTx1HWw,ovIxZ7qvNT76tilujesmIA,3.200000,5.000000,0.000000,0.000000,s_GaSif_DbS5pAEUxSoWwQ,,,,
109101,YcJTiZGLQtwxEm7MNsqrYQ,ovIxZ7qvNT76tilujesmIA,3.200000,5.000000,0.000000,0.000000,8Cku0Kg2QXWkTbgXRY2u-g,,,,
