In [16]:
import json
import sys

# Input file holding the restaurant reviews; it is read line by line as JSON.
REVIEW_DATA_FILE = "Phoenix_restaurant_reviews.json"

# To resolve the ascii encoding error.
# NOTE(review): Python 2 only -- `reload` is not a builtin in Python 3 and
# `sys.setdefaultencoding` is hidden at interpreter start-up; `reload(sys)`
# re-exposes it. This changes the process-wide implicit str/unicode encoding.
reload(sys)
sys.setdefaultencoding('utf-8')


# Code to read json data by line
def decode_json(line):
    """Decode one line of JSON text.

    Args:
        line: A string holding a single JSON document.

    Returns:
        The decoded Python object, or None when `line` is not valid JSON
        (or is not a string at all).
    """
    try:
        return json.loads(line)
    except (ValueError, TypeError):
        # ValueError covers malformed JSON (JSONDecodeError subclasses it);
        # TypeError covers non-string input. Anything else, in particular
        # KeyboardInterrupt/SystemExit, is deliberately allowed to propagate.
        return None

def read_reviews(file_name):
    """Read a file containing one JSON document per line.

    Lines that cannot be decoded (for which decode_json returns None) are
    skipped, so the returned list never contains None placeholders that
    would break code iterating over each decoded entry.

    Args:
        file_name: Path of the line-delimited JSON file.

    Returns:
        A list with the decoded object of every valid line, in file order.
    """
    restaurant_reviews = []
    with open(file_name) as f:
        for line in f:
            decoded = decode_json(line)
            if decoded is not None:
                restaurant_reviews.append(decoded)
    return restaurant_reviews

In [17]:
# Decode every line of the review data file into memory.
restaurant_reviews = read_reviews(REVIEW_DATA_FILE)

In [18]:
import nltk
# Model data used by nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')
# Model data used by nltk.word_tokenize; without it tokenization raises
# LookupError on a fresh NLTK installation.
nltk.download('punkt')

# Get positive chunk
# Chunk grammar over Penn Treebank tags: adjective + noun (singular or plural),
# or adverb / comparative adverb + adjective.
positiveChunkPOS_tag = "positive: {<JJ> <NN>|<JJ> <NNS>|<RB> <JJ>|<RBR> <JJ>}"
def get_positive_chunks(POStagged_reviews, min_polarity=0.2, min_subjectivity=0.5):
    """Extract positively scored phrases from a POS-tagged review.

    The tagged tokens are chunked with `positiveChunkPOS_tag`; every chunk is
    scored with `Pattern.sentiment` and kept when both its polarity and its
    subjectivity reach the given thresholds.

    Args:
        POStagged_reviews: Sequence of (word, tag) pairs for one review.
        min_polarity: Lowest polarity a phrase may have to be kept.
        min_subjectivity: Lowest subjectivity a phrase may have to be kept.

    Returns:
        A tuple (detected_positive, positive_score, results_positive_phrases):
        whether any phrase was kept, the sum of the kept phrases' polarities,
        and the kept phrases themselves (without surrounding whitespace).
    """
    positive_parser = nltk.RegexpParser(positiveChunkPOS_tag)
    chunk_reviews = positive_parser.parse(POStagged_reviews)
    positive_score = 0.0
    results_positive_phrases = []
    for each_subtree in chunk_reviews.subtrees():
        if each_subtree.label() != "positive":
            continue
        # Leaves of a chunk are (word, tag) pairs; join the words directly so
        # the stored phrase carries no leading space.
        noun_phrase = " ".join(term for term, _tag in each_subtree)
        # Score once and reuse the result for both the filter and the sum.
        polarity_score = Pattern.sentiment(noun_phrase)
        if polarity_score[0] >= min_polarity and polarity_score[1] >= min_subjectivity:
            results_positive_phrases.append(noun_phrase)
            positive_score += polarity_score[0]
    detected_positive = bool(results_positive_phrases)
    return detected_positive, positive_score, results_positive_phrases

In [19]:
# Chunk grammar over Penn Treebank tags: adjective + noun (singular or plural),
# or adverb / comparative adverb + adjective.
negativeChunkPOS_tag = "negative: {<JJ> <NN>|<JJ> <NNS>|<RB> <JJ>|<RBR> <JJ>}"

def get_negative_chunks(POStagged_reviews, max_polarity=-0.1, min_subjectivity=0.4):
    """Extract negatively scored phrases from a POS-tagged review.

    The tagged tokens are chunked with `negativeChunkPOS_tag`; every chunk is
    scored with `Pattern.sentiment` and kept when its polarity is at or below
    `max_polarity` and its subjectivity reaches `min_subjectivity`.

    Args:
        POStagged_reviews: Sequence of (word, tag) pairs for one review.
        max_polarity: Highest polarity a phrase may have to be kept.
        min_subjectivity: Lowest subjectivity a phrase may have to be kept.

    Returns:
        A tuple (detected_negative, negative_score, results_negative_phrases):
        whether any phrase was kept, the sum of the kept phrases' polarities,
        and the kept phrases themselves (without surrounding whitespace).
    """
    negative_parser = nltk.RegexpParser(negativeChunkPOS_tag)
    chunk_reviews = negative_parser.parse(POStagged_reviews)
    negative_score = 0.0
    results_negative_phrases = []
    for subtree in chunk_reviews.subtrees():
        if subtree.label() != 'negative':
            continue
        # Leaves of a chunk are (word, tag) pairs; join the words directly so
        # the stored phrase carries no leading space.
        noun_phrase = " ".join(term for term, _tag in subtree)
        # Score once and reuse the result for both the filter and the sum.
        polarity_score = Pattern.sentiment(noun_phrase)
        if polarity_score[0] <= max_polarity and polarity_score[1] >= min_subjectivity:
            results_negative_phrases.append(noun_phrase)
            negative_score += polarity_score[0]
    detected_negative = bool(results_negative_phrases)
    return detected_negative, negative_score, results_negative_phrases

In [20]:
import pattern.en as Pattern

def generate_results(restaurant_reviews):
    """Build one result row per review that contains sentiment phrases.

    Each review is tokenized and POS-tagged with NLTK, then passed to
    get_positive_chunks and get_negative_chunks. Reviews in which neither
    function detects a phrase are left out of the result.

    Args:
        restaurant_reviews: Iterable whose items are iterables of review
            dicts with the keys 'text', 'stars' and 'business_id'. Items
            that are None or empty are skipped.

    Returns:
        A list of dicts keyed by "Reviews", "Stars", "Business Id",
        "Positive_Phrases", "Negative_Phrases", "Positive_Polarity",
        "Negative_Polarity" and "Text sentiment".
    """
    results_csv = []
    for each_review_list in restaurant_reviews:
        # An entry is None when its source line could not be decoded
        # (decode_json returns None on failure); iterating it would raise.
        if not each_review_list:
            continue
        for each_review in each_review_list:
            review_text = each_review['text']
            tokenize_reviews = nltk.word_tokenize(review_text)
            POStagged_reviews = nltk.pos_tag(tokenize_reviews)
            detected_positive, positive_score, results_positive_phrases = get_positive_chunks(POStagged_reviews)
            detected_negative, negative_score, results_negative_phrases = get_negative_chunks(POStagged_reviews)
            if not (detected_positive or detected_negative):
                continue
            results_csv.append({
                "Reviews": review_text,
                "Stars": each_review['stars'],
                "Business Id": each_review['business_id'],
                "Positive_Phrases": results_positive_phrases,
                "Negative_Phrases": results_negative_phrases,
                "Positive_Polarity": positive_score,
                "Negative_Polarity": negative_score,
                # Sentiment of the whole review text, not just the phrases.
                "Text sentiment": Pattern.sentiment(review_text.strip()),
            })
    return results_csv

In [21]:
import csv
import codecs

def write_csv_output(results_csv, file_name, write_header=False):
    """Write result rows to '<file_name without .json>_Results.csv'.

    Args:
        results_csv: Iterable of dicts keyed by the column names listed in
            `fieldnames` below.
        file_name: Name of the input data file; a trailing ".json" suffix is
            removed before "_Results.csv" is appended.
        write_header: When True, emit the column names as the first row.
            Defaults to False so the file layout stays header-less.
    """
    # str.strip(".json") would remove any of the characters ".jsno" from both
    # ends (turning "reviews.json" into "review"); remove the suffix only.
    json_suffix = ".json"
    if file_name.endswith(json_suffix):
        base_name = file_name[:-len(json_suffix)]
    else:
        base_name = file_name
    fieldnames = ["Business Id", "Reviews", "Stars", "Positive_Phrases", "Positive_Polarity", "Negative_Phrases",
                  "Negative_Polarity", "Text sentiment"]
    # The context manager closes the file, so buffered rows are always flushed.
    with codecs.open(base_name + "_Results.csv", 'w', 'utf-8') as csvfile:
        results_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if write_header:
            results_writer.writeheader()
        for results_csv_row in results_csv:
            results_writer.writerow(results_csv_row)

In [None]:
# Tokenize, tag and score every loaded review; keeps only reviews with detected phrases.
results_csv = generate_results(restaurant_reviews)

In [15]:
# Write the result rows to a CSV file named after the input data file.
write_csv_output(results_csv, REVIEW_DATA_FILE)