In [None]:
import json
from nltk.corpus import wordnet as wn
import nltk
import spacy
import en_core_web_sm
# Load the spaCy English pipeline once so it can be reused for every sentence.
nlp = en_core_web_sm.load()

# Input file; it is read line by line, one JSON object per line.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DATA_PATH = "C:/Users/utkar/Downloads/large_data.json" # Change this for data

# VADER sentiment analyser used to score each review sentence
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [None]:
from collections import defaultdict
# WordNet synset names (every part of speech) for each of the three aspect words
temp_dict = defaultdict(list)
for aspect in ("ambiance", "food", "service"):
    synset_names = [synset.name() for synset in wn.synsets(aspect)]
    if synset_names:
        # Only touch the defaultdict when there is something to store, so an
        # aspect word without synsets gets no entry at all.
        temp_dict[aspect].extend(synset_names)

# Category name -> integer id, plus the inverse lookup (id -> name)
m = dict(food=0, service=1, ambiance=2, other=3)
m_back = {category_id: name for name, category_id in m.items()}

In [None]:
def categorized_sentence(sentence):
    """Assign a review sentence to one or more aspect categories.

    The sentence is lemmatised with spaCy; pronoun lemmas, stop words and
    punctuation are dropped, and every character that is neither a letter
    nor a space is removed.  Each remaining word is looked up in WordNet as
    a noun, and the synset names on the first hypernym path of each of its
    senses are collected.  A category matches when any synset name stored
    for it in the module-level ``temp_dict`` is among the collected names.

    Args:
        sentence (str): One sentence of review text.

    Returns:
        set[int]: Ids (values of the module-level mapping ``m``) of every
        matching category, or ``{m['other']}`` when nothing matched.
    """
    # Parser and NER are disabled for this call; only lemmas and the
    # stop-word / punctuation flags are read from the tokens.
    doc = nlp(sentence.strip(), disable=['parser', 'ner'])
    lemmas = [
        tok.lemma_.lower().strip()
        for tok in doc
        # '-PRON-' is the placeholder lemma spaCy 2.x assigns to pronouns.
        if tok.lemma_ != '-PRON-' and not tok.is_stop and not tok.is_punct
    ]

    # Keep letters and spaces only (drops digits and leftover symbols).
    cleaned = ''.join(ch for ch in ' '.join(lemmas) if ch.isalpha() or ch == ' ')

    # Collect hypernym names over all words at once: a category matches if
    # any word matches, so a single union gives the same labels as checking
    # word by word.
    hypernym_names = set()
    for word in cleaned.split():
        for synset in wn.synsets(word, pos=wn.NOUN):
            # Only the first hypernym path of each sense is considered.
            hypernym_names.update(h.name() for h in synset.hypernym_paths()[0])

    labels = {
        m[category]
        for category, synset_names in temp_dict.items()
        if not hypernym_names.isdisjoint(synset_names)
    }
    return labels if labels else {m['other']}

In [None]:
# Load every restaurant (one JSON object per line) and compute, per category,
# the mean VADER scores of the review sentences assigned to that category.
# Result: `all_restaurants_reviews`, a list with one flat dict per restaurant
# holding 'id' plus '<category>_<score>' entries.
SCORE_KEYS = ("neg", "neu", "pos", "compound")

all_restaurants_reviews = []
index = 0  # number of restaurants processed so far

# The context manager closes the file even when a line fails to parse.
# JSON text is UTF-8 by specification, so decode explicitly instead of
# relying on the platform default encoding.
with open(DATA_PATH, "r", encoding="utf-8") as inputfile:
    for line in inputfile:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        index += 1
        restaurant = json.loads(line)

        # Prefer the Google place id; fall back to the Yelp id.
        if "google" in restaurant:
            restaurant_id = restaurant["google"]["place_id"]
        else:
            restaurant_id = restaurant["yelp"]["id"]

        # Initialize the restaurant record: 'id' first, then one zeroed
        # accumulator per category/score pair.  Insertion order matters
        # because it becomes the CSV column order downstream.
        r_dict = {'id': restaurant_id}
        for c in m:
            for s in SCORE_KEYS:
                r_dict[c + "_" + s] = 0

        # Number of sentences assigned to each category.
        count = dict.fromkeys(m, 0)

        # Score the Yelp reviews sentence by sentence.
        restaurant_yelpui = restaurant['uiyelp']
        for review in restaurant_yelpui['reviews']:
            for sentence in nltk.sent_tokenize(review['text']):
                category = categorized_sentence(sentence)
                snt = analyser.polarity_scores(sentence)

                # A sentence may belong to several categories; its scores
                # are added to each of them.
                for c in category:
                    name = m_back[c]
                    count[name] += 1
                    for s, value in snt.items():
                        r_dict[name + "_" + s] += value

        # Turn the sums into means; categories without sentences are divided
        # by 1 so they stay at zero instead of dividing by zero.
        for c in m:
            divisor = max(count[c], 1)
            for s in SCORE_KEYS:
                r_dict[c + "_" + s] /= divisor

        all_restaurants_reviews.append(r_dict)

        print("Restaurant " + str(index) + " processed.")
        

In [None]:
# Sanity check: display the aggregated record of the first restaurant.
all_restaurants_reviews[0]

In [None]:
import csv
# Export one CSV row per restaurant.  The column order follows the key order
# of the first record.  It is read before the file is opened so that an empty
# result list fails without truncating an existing output file.
fieldnames = list(all_restaurants_reviews[0].keys())

# newline='' is required by the csv module: the writer emits its own '\r\n'
# line endings, and without it text mode on Windows turns them into '\r\r\n',
# which shows up as a blank line after every row.
with open('C:/Users/utkar/Downloads/output.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, fieldnames)
    w.writeheader()
    w.writerows(all_restaurants_reviews)