# Part3 Superscore

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
## Importing Basic Packages
import matplotlib.pyplot as plt
import numpy as np

# Import NLP Packages
import re
import nltk
from wordcloud import WordCloud

# NLTK Stop words
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.corpus import stopwords
# Start from NLTK's English stop-word list (a plain Python list).
stop_words = stopwords.words('english')
# Add extra filler words to drop from review text; 'even' is listed twice,
# which is harmless for the membership test in get_cleaned_text.
# NOTE(review): 'not' is in this list, so negations are stripped from
# `cleaned_text` before TextBlob scores it — confirm this is intended.
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

from textblob import Word

In [3]:
# Create (or reuse) the Spark session used by every cell below.
# maxResultSize '0' removes the driver result-size cap and
# autoBroadcastJoinThreshold '-1' disables automatic broadcast joins.
spark = (
    SparkSession.builder
    .config('spark.driver.memory', '8g')
    .config('spark.executor.memory', '8g')
    .config('spark.driver.maxResultSize', '0')
    .config('spark.sql.autoBroadcastJoinThreshold', '-1')
    .config('spark.sql.broadcastTimeout', '1200')
    .config('spark.default.parallelism', '8')
    .appName("superscore")
    .getOrCreate()
)

# Part 1 Review

In [4]:
# Load the cleaned reviews written by Part 1. The path is relative to the
# kernel's working directory. multiLine lets quoted review text span lines.
# No schema or inferSchema is given, so every column (including `stars`)
# is read as a string.
path = 'part1_dataclean_review.csv'
review = spark.read.options(header=True, multiLine=True).csv(path)

AnalysisException: Path does not exist: file:/E:/courses/part1_dataclean_review.csv

In [None]:
# Number of review rows loaded (this triggers a Spark job over the CSV).
review.count()

In [None]:
# Show the column names and types of the loaded reviews.
review.printSchema()

## Sentiment Analysis

In [None]:
# Preview the first five review texts.
review.select(['text']).show(5)

### Negative Reviews Foresight

In [None]:
# Negative reviews are those rated 2.5 stars or lower.
neg_reviews = review.where(review["stars"] <= 2.5)
# Report the (rows, columns) shape of the negative subset.
neg_row_count = neg_reviews.count()
neg_col_count = len(neg_reviews.columns)
print(neg_row_count, neg_col_count)

In [None]:
# Preview the first five negative reviews.
neg_reviews.show(5)

In [None]:
# CountVectorizer is not used in this cell but is kept in the import because
# later cells rely on this import line.
from pyspark.ml.feature import CountVectorizer, StopWordsRemover, RegexTokenizer

# Only the review text is needed for word counting.
neg_reviews_text = neg_reviews.select(['text'])

# Split each review into tokens on runs of non-word characters.
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W")
wordsData = tokenizer.transform(neg_reviews_text)

# Drop Spark's default stop words.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filter_data = remover.transform(wordsData)

# Word frequencies come straight from the exploded token lists. No
# CountVectorizer fit is needed: its 'vectors' column never reached the
# grouped counts, so fitting it was an extra pass over the data with no effect.
neg_words_top10 = (
    filter_data
    .select(explode("filtered").alias("word"))
    .groupBy("word")
    .count()
    .sort("count", ascending=False)
)
neg_words_top10.show(10)  # ten most frequent words in negative reviews

In [None]:
# Remove non-informative words
# Tokenise and strip stop words again so this cell can run on its own.
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W")
wordsData = tokenizer.transform(neg_reviews_text)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filter_data = remover.transform(wordsData)

# Generic words that dominate every review and carry no signal.
non_informative_words = ['food', 'place', 'like', 'service','good','one','get','time','back','us']

# Count the remaining words directly from the exploded tokens. The previous
# CountVectorizer (minDF/vocabSize) fit never influenced these counts because
# its 'vectors' column was dropped by the groupBy, so it is omitted.
neg_words_top150 = (
    filter_data
    .select(explode("filtered").alias("word"))
    .where(~col("word").isin(non_informative_words))
    .groupBy("word")
    .count()
    .sort("count", ascending=False)
)
# Collect the 150 most frequent words to the driver as a list of Row(word=...).
neg_words_top150 = neg_words_top150.select(['word']).take(150)

In [None]:
# Keep the plain word strings, skipping tokens made up only of digits.
neg_words_top150_list = [
    row['word']
    for row in neg_words_top150
    if not row['word'].isnumeric()
]

In [None]:
# Inspect the words that will feed the negative word cloud.
print(neg_words_top150_list)

In [None]:
# Render the negative-review vocabulary as a word cloud.
# NOTE(review): every word appears exactly once in the joined text, so the
# cloud cannot reflect the frequencies computed upstream — consider keeping
# the counts and using WordCloud.generate_from_frequencies.
wordcloud_low = WordCloud(background_color="white").generate(' '.join(neg_words_top150_list))
plt.figure(figsize = (12,10))
plt.imshow(wordcloud_low, interpolation='bilinear')
plt.title('Word Cloud - Negative Yelp Restaurant Reviews', fontsize=16, y=1.01)
# Trailing ';' suppresses the axis-limits repr, matching the positive-cloud cell.
plt.axis("off");

### Positive Reviews Foresight

In [None]:
# Positive reviews are those rated 4.5 stars or higher.
pos_reviews = review.where(review["stars"] >= 4.5)
# Report the (rows, columns) shape of the positive subset.
pos_row_count = pos_reviews.count()
pos_col_count = len(pos_reviews.columns)
print(pos_row_count, pos_col_count)

In [None]:
# Preview the first five positive reviews.
pos_reviews.show(5)

In [None]:
# Only the review text is needed for word counting.
pos_reviews_text = pos_reviews.select(['text'])

# Split each review into tokens on runs of non-word characters.
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W")
wordsData = tokenizer.transform(pos_reviews_text)

# Drop Spark's default stop words.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filter_data = remover.transform(wordsData)

# Word frequencies come straight from the exploded token lists. No
# CountVectorizer fit is needed: its 'vectors' column never reached the
# grouped counts, so fitting it was an extra pass over the data with no effect.
pos_words_top10 = (
    filter_data
    .select(explode("filtered").alias("word"))
    .groupBy("word")
    .count()
    .sort("count", ascending=False)
)
pos_words_top10.show(10)  # ten most frequent words in positive reviews

In [None]:
# Remove non-informative words
# Tokenise and strip stop words again so this cell can run on its own.
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W")
wordsData = tokenizer.transform(pos_reviews_text)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filter_data = remover.transform(wordsData)

# Generic words that dominate every review and carry no signal.
non_informative_words = ['food', 'place', 'like', 'service','portland','one']

# Count the remaining words directly from the exploded tokens. The previous
# CountVectorizer (minDF/vocabSize) fit never influenced these counts because
# its 'vectors' column was dropped by the groupBy, so it is omitted.
pos_words_top150 = (
    filter_data
    .select(explode("filtered").alias("word"))
    .where(~col("word").isin(non_informative_words))
    .groupBy("word")
    .count()
    .sort("count", ascending=False)
)
# Collect the 150 most frequent words to the driver as a list of Row(word=...).
pos_words_top150 = pos_words_top150.select(['word']).take(150)

In [None]:
# Keep the plain word strings, skipping tokens made up only of digits.
pos_words_top150_list = [
    row['word']
    for row in pos_words_top150
    if not row['word'].isnumeric()
]

In [None]:
# Inspect the words that will feed the positive word cloud.
print(pos_words_top150_list)

In [None]:
# Render the positive-review vocabulary as a word cloud.
# NOTE(review): every word appears exactly once in the joined text, so the
# cloud cannot reflect the frequencies computed upstream. The variable is
# still called `wordcloud_low` although it now holds the positive cloud.
pos_cloud_text = ' '.join(pos_words_top150_list)
wordcloud_low = WordCloud(background_color="white").generate(pos_cloud_text)

fig, ax = plt.subplots(figsize=(12, 10))
ax.imshow(wordcloud_low, interpolation='bilinear')
ax.set_title('Word Cloud - Positive Yelp Restaurant Reviews', fontsize=16, y=1.01)
ax.axis("off");

### Rich Features for Review

In [None]:
def get_cleaned_text(value):
    """Lower-case, tokenise, drop stop words and lemmatise a review text.

    Tokens are maximal runs of ASCII letters and apostrophes. Tokens found in
    the module-level `stop_words` are removed; the rest are lemmatised with
    TextBlob's `Word` and joined with single spaces.

    Args:
        value: The raw review text, or None for a null column value.

    Returns:
        The cleaned text, "" when no tokens remain, or None when `value` is
        None (so a null review stays null instead of failing the UDF).
    """
    if value is None:
        return None
    # findall yields only non-empty tokens. Splitting on the separators
    # instead leaves '' at the edges, which ends up as stray spaces.
    tokens = re.findall(r"[a-zA-Z']+", value.lower())
    return " ".join(Word(tok).lemmatize() for tok in tokens if tok not in stop_words)

# Wrap the cleaner as a string-returning Spark UDF and attach its output
# to every review as `cleaned_text`.
udfget_cleaned_text = udf(get_cleaned_text, returnType=StringType())
cleaned_text_col = udfget_cleaned_text(review["text"])
review = review.withColumn("cleaned_text", cleaned_text_col)

# Show two full (untruncated) rows to check the result.
review.show(2, truncate=False)

In [None]:
# Confirm the `cleaned_text` column was added.
review.printSchema()

### Text Classification

In [None]:
# Add columns 'polarity' and 'subjectivity'
from textblob import TextBlob

def cal_polarity(value):
    """Return the TextBlob polarity score of a text.

    Args:
        value: The text to score, or None for a null column value.

    Returns:
        The polarity as a float, or None when `value` is None so that a
        null input yields a null score instead of failing the UDF.
    """
    if value is None:
        return None
    # float() keeps the result compatible with the DoubleType UDF wrapper.
    return float(TextBlob(value).sentiment.polarity)

# Wrap the scorer as a double-returning Spark UDF and store the score of
# the cleaned text in a new `polarity` column.
udfcal_polarity = udf(cal_polarity, returnType=DoubleType())
polarity_col = udfcal_polarity(review["cleaned_text"])
review = review.withColumn("polarity", polarity_col)

def cal_subjectivity(value):
    """Return the TextBlob subjectivity score of a text.

    Args:
        value: The text to score, or None for a null column value.

    Returns:
        The subjectivity as a float, or None when `value` is None so that a
        null input yields a null score instead of failing the UDF.
    """
    if value is None:
        return None
    # float() keeps the result compatible with the DoubleType UDF wrapper.
    return float(TextBlob(value).sentiment.subjectivity)

# Wrap the scorer as a double-returning Spark UDF and store the score of
# the cleaned text in a new `subjectivity` column.
udfcal_subjectivity = udf(cal_subjectivity, returnType=DoubleType())
subjectivity_col = udfcal_subjectivity(review["cleaned_text"])
review = review.withColumn("subjectivity", subjectivity_col)


### Vader Sentiment

In [None]:
# Calculate Vader Sentiment Analysis Scores

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# One shared VADER analyzer, created once and reused by cal_compound.
analyzer = SentimentIntensityAnalyzer()

def cal_compound(value):
    """Return the VADER compound sentiment score of a text.

    Uses the module-level `analyzer`.

    Args:
        value: The text to score, or None for a null column value.

    Returns:
        The 'compound' entry of the analyzer's polarity scores, or None when
        `value` is None so that a null input yields a null score instead of
        failing the UDF.
    """
    if value is None:
        return None
    return analyzer.polarity_scores(value)['compound']

# Wrap the scorer as a double-returning Spark UDF. It is applied to the raw
# `text` column (not `cleaned_text`) and stored as `compound`.
udfcal_compound = udf(cal_compound, returnType=DoubleType())
compound_col = udfcal_compound(review["text"])
review = review.withColumn("compound", compound_col)

### Superscore Calculation

In [None]:
# superscore = stars + polarity * subjectivity * compound
# NOTE(review): when polarity and compound are both negative their product is
# positive, so a clearly negative review gets its score raised — confirm
# this is the intended formula.
sentiment_adjustment = review['polarity'] * review['subjectivity'] * review['compound']
review = review.withColumn('superscore', review['stars'] + sentiment_adjustment)

In [None]:
# Show two full (untruncated) rows including the new score columns.
review.show(2, truncate=False)

In [None]:
# Confirm the sentiment and superscore columns are present.
review.printSchema()

In [None]:
# Final (rows, columns) shape; the count re-runs the UDF pipeline.
print(review.count(), len(review.columns))

# Save to CSV

In [None]:
# Write the scored reviews with a header row. coalesce(1) yields a single
# part file; Spark creates a directory named part3_superscore_review.csv
# that contains it.
# mode("overwrite") lets the notebook be re-run: the default save mode raises
# an error when the output path already exists.
review.coalesce(1).write.mode("overwrite").option("header", True).csv("part3_superscore_review.csv")