In [1]:
import pyspark
from pyspark import SparkContext, SparkConf
from Classifier.Classifier import YelpClassifier
import sys

## FlatMap Functions

In [2]:
def input_line_to_review(input_line):
    """Convert one raw review record into a labeled review tuple.

    Intended for use with ``flatMap``: the result is a list so that a
    malformed record can be dropped by returning an empty list.

    Args:
        input_line: indexable record whose positions 0, 1 and 2 hold the
            review ID, the star rating and the review text.

    Returns:
        ``[(review_id, num_stars, review_text)]`` for a well-formed record,
        or ``[]`` when the record is ``None`` or has fewer than three fields.
    """
    # some reviews may be malformed/not contain all of the fields;
    # emit nothing for those instead of failing the whole job
    try:
        review_id, num_stars, review_text = input_line[0], input_line[1], input_line[2]
    except (IndexError, TypeError):
        return []
    return [(review_id, num_stars, review_text)]

def input_line_to_review_no_label(input_line):
    """Convert one raw review record into an unlabeled review tuple.

    Intended for use with ``flatMap``: the result is a list so that a
    malformed record can be dropped by returning an empty list.

    Args:
        input_line: indexable record whose position 0 holds the review ID
            and position 2 holds the review text (position 1, the star
            rating, is deliberately left out).

    Returns:
        ``[(review_id, review_text)]`` for a well-formed record, or ``[]``
        when the record is ``None`` or has fewer than three fields.
    """
    # some reviews may be malformed/not contain all of the fields;
    # emit nothing for those instead of failing the whole job
    try:
        review_id, review_text = input_line[0], input_line[2]
    except (IndexError, TypeError):
        return []
    return [(review_id, review_text)]

def get_review_stars(actual_rdd):
    """Reduce labeled reviews to (review_id, num_stars) pairs sorted by key.

    Pipeline:
        [(review_id, num_stars, review_text)] --> [(review_id, num_stars)]
        followed by ``sortByKey()``.

    Args:
        actual_rdd: collection of (review_id, num_stars, review_text)
            tuples exposing ``flatMap``.

    Returns:
        Whatever ``sortByKey()`` yields on the reduced pairs.
    """
    id_and_stars = actual_rdd.flatMap(filter_only_review_id_and_stars)
    return id_and_stars.sortByKey()

# Converts a review to a key-value pair of only review ID and its number of stars
def filter_only_review_id_and_stars(review):
    """Strip the text from a review, keeping only its ID and star rating.

    Args:
        review: exactly three items -- (review_id, num_stars, review_text).

    Returns:
        A one-element list ``[(review_id, num_stars)]``.
    """
    ident, stars, _text = review
    id_star_pair = (ident, stars)
    return [id_star_pair]


In [3]:
def initRDDs(training_data_file, test_data_file):
    """Derive the training, unlabeled-test and labeled-test collections.

    Args:
        training_data_file: collection of raw review records exposing
            ``flatMap``; used to build the training set.
        test_data_file: collection of raw review records exposing
            ``flatMap``; used to build both the unlabeled test set and the
            labeled ground-truth set.

    Returns:
        A ``(train_rdd, test_rdd, actual_rdd)`` tuple on success, or ``None``
        (after writing a message to stderr) when the transformations cannot
        be set up. Callers that unpack the result must check for ``None``.
    """
    # takes text file of reviews --> RDD
    try:
        train_rdd = training_data_file.flatMap(input_line_to_review)
        test_rdd = test_data_file.flatMap(input_line_to_review_no_label)
        actual_rdd = test_data_file.flatMap(input_line_to_review)
        return train_rdd, test_rdd, actual_rdd
    except Exception:
        # `except Exception` (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate. sys.stderr.write is used because the
        # Python 2 `print >> stream` form raises TypeError on Python 3.
        sys.stderr.write("Unable to load train and test data files\n")
        return None


## Read Review Data and Run Driver Functions

In [5]:
# Load the raw Yelp review dump through the ambient `sqlContext`.
# NOTE(review): the path is an absolute, machine-specific location; it must be
# adjusted before this cell can run anywhere else.
reviews = sqlContext.read.json(
    "/Users/rileycampbell/Desktop/GitHub/yelp-star-prediction/yelp_dataset/review.json"
)

# Keep only the three columns the rest of the notebook reads, in the
# positional order (id, stars, text) that the flatMap helpers index into.
review_rows = reviews.select("review_id", "stars", "text")

In [6]:
# Work on the first 10,000 selected review rows only.
# NOTE(review): the name is misspelled ("practice") and the value is sliced
# like a plain sequence below; the name is kept because other cells may use it.
pracitce_df = review_rows.head(10000)

# Positional 90/10 split: first 9,000 rows for training, the next 1,000 for testing.
training_file = sc.parallelize(pracitce_df[:9000])
test_file = sc.parallelize(pracitce_df[9000:10000])

# train_rdd  -> labeled training reviews
# test_rdd   -> test reviews with the star rating removed
# actual_rdd -> the same test reviews with their true star rating
train_rdd, test_rdd, actual_rdd = initRDDs(training_file, test_file)

# Run the driver functions:
#   classifier.train consumes the training RDD
#   classifier.classify consumes the testing RDD
classifier = YelpClassifier()
classifier.train(train_rdd)
predictedReviews = classifier.classify(test_rdd)

## Determine Accuracy of Model Predictions

In [7]:
# Ground truth: (review_id, num_stars) pairs, sorted by review_id, on the driver.
actualReviews = get_review_stars(actual_rdd).collect()

# Overall and per-star counters of correct predictions and of reviews seen.
hits = 0
star_hits_dict = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
total = 0
star_total_dict = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}

# NOTE(review): predictions and actuals are paired by position. actualReviews
# is sorted by review_id, so this presumes classifier.classify returns its
# results in that same order -- confirm against the classifier.
for predicted, actual in zip(predictedReviews, actualReviews):
    actualLabel = int(actual[1])
    # The predicted label is read from the first item of the prediction's second field.
    predictedLabel = int(predicted[1][0])
    if actualLabel == predictedLabel:
        star_hits_dict[actualLabel] += 1
        hits += 1
    star_total_dict[actualLabel] += 1
    total += 1


def _safe_accuracy(num_hits, num_total):
    """Return num_hits / num_total as a float, or 0.0 when num_total is zero."""
    if not num_total:
        # An empty bucket (e.g. no 1-star reviews in the test slice) would
        # otherwise raise ZeroDivisionError and abort the report.
        return 0.0
    return float(num_hits) / float(num_total)


accuracy = _safe_accuracy(hits, total)
one_star_acc = _safe_accuracy(star_hits_dict[1], star_total_dict[1])
three_star_acc = _safe_accuracy(star_hits_dict[3], star_total_dict[3])
five_star_acc = _safe_accuracy(star_hits_dict[5], star_total_dict[5])

In [8]:
# Accuracy report: a banner, the overall accuracy, then the 1-, 3- and
# 5-star accuracies, each shown as "ratio (hits/total)".
print("\n\n\n\t\t\t\t\t############################################################")
print("\t\t\t\t\t\t\t\t\t\t\t\t")
print("\t\t\t\t\t\tOVERALL ACCURACY: %f (%d/%d)\t\t" % (accuracy, hits, total))
print(
    "\t\t\t\t\t\t1 STAR REVIEW ACCURACY: %f (%d/%d)\t"
    % (one_star_acc, star_hits_dict[1], star_total_dict[1])
)
print(
    "\t\t\t\t\t\t3 STAR REVIEW ACCURACY: %f (%d/%d)\t"
    % (three_star_acc, star_hits_dict[3], star_total_dict[3])
)
print(
    "\t\t\t\t\t\t5 STAR REVIEW ACCURACY: %f (%d/%d)\t"
    % (five_star_acc, star_hits_dict[5], star_total_dict[5])
)
print("\t\t\t\t\t\t\t\t\t\t\t\t")
print("\t\t\t\t\t############################################################\n\n\n\n")




					############################################################
												
						OVERALL ACCURACY: 0.416000 (416/1000)		
						1 STAR REVIEW ACCURACY: 0.500000 (71/142)	
						3 STAR REVIEW ACCURACY: 0.500000 (56/112)	
						5 STAR REVIEW ACCURACY: 0.459091 (202/440)	
												
					############################################################




