In [1]:
from pyspark.sql.functions import *
import string

# Load the wine reviews CSV (first row is the header, malformed rows are
# dropped), keep only the rating and the review text, and discard rows where
# either of those two columns is null.
wine_data = (
    spark.read
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("winemag-data-130k-v2.csv")
    .select('points', 'description')
    .dropna()
)

In [None]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet
from pyspark.sql.functions import udf

def get_wordnet_pos(treebank_tag):
    """
    Translate a POS tag into the WordNet POS constant used for lemmatization.

    The tag is matched on its first letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun, which is
    the lemmatizer's default part of speech.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # No known prefix matched: use the lemmatizer's default (noun).
    return wordnet.NOUN
    
import string
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize

# SentimentIntensityAnalyzer is not covered by the imports above; without this
# line the cell fails with NameError. NLTK ships VADER in nltk.sentiment.vader
# (it needs the `vader_lexicon` NLTK resource to be downloaded).
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared instances used by clean_text() and vadar_analyzer() below.
lemmatizer = WordNetLemmatizer()
analyzer = SentimentIntensityAnalyzer()


# Lemmatization converts a word to its base (dictionary) form.
# Unlike stemming, which just strips the last few characters and often
# produces incorrect meanings or misspelled stems, lemmatization takes the
# word's context (its part of speech) into account to return a meaningful base form.



def clean_text(text):
    """
    Normalize a raw review string into a list of lemmatized tokens.

    Steps: tokenize, lower-case, strip punctuation characters, drop English
    stop words and empty tokens, POS-tag the remainder, lemmatize each token
    with its WordNet POS, and finally drop lemmas of length one or less.

    Returns a list of strings (not a joined sentence).
    """
    raw_tokens = word_tokenize(text)

    # Translation table that deletes every character in string.punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)

    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))

    # First pass: normalize each token and keep the non-empty, non-stop-word ones.
    candidates = []
    for raw_token in raw_tokens:
        token = raw_token.lower().translate(punctuation_table)
        if token not in stop_words and len(token) > 0:
            candidates.append(token)

    # Second pass: lemmatize using the POS tag, discarding one-letter results.
    lemmas = []
    for word, tag in pos_tag(candidates):
        lemma = lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        if len(lemma) > 1:
            lemmas.append(lemma)

    return lemmas

def vadar_analyzer(text):
    """
    Score a sequence of tokens with the module-level sentiment `analyzer`.

    The tokens are joined with single spaces into one string, which is passed
    to `analyzer.polarity_scores`; the 'compound' entry of the result is
    returned.
    """
    sentence = " ".join(text)
    return analyzer.polarity_scores(sentence)['compound']

from pyspark.sql.types import ArrayType, DoubleType, StringType

# udf() returns StringType unless told otherwise. clean_text returns a list of
# tokens and vadar_analyzer returns a float, so declare those types explicitly;
# otherwise the token list reaches vadar_analyzer as one stringified value and
# " ".join() would iterate over its characters instead of its words.
clean_text_udf = udf(clean_text, ArrayType(StringType()))
vadar_analyzer_udf = udf(vadar_analyzer, DoubleType())

# review: cleaned tokens; customer_review_score: compound sentiment score.
wine = wine_data.withColumn("review", clean_text_udf(col("description")))
wine = wine.withColumn("customer_review_score", vadar_analyzer_udf(col("review")))

# label is 1.0 when the rating in `points` exceeds 80, else 0.0. The CSV is
# read without schema inference, so `points` is a string and is cast before
# the comparison. (Comparing `description` here would test the review text,
# not the rating.)
wine = wine.withColumn(
    "label",
    when(col("points").cast("int") > 80, 1.0).otherwise(0.0),
)
wine = wine.select(["description", "label", "customer_review_score"])

# Strongly positive / strongly negative reviews by sentiment score.
positive = wine.filter((col("customer_review_score") >= 0.75))
negetive = wine.filter((col("customer_review_score") <= -0.75))
# Strongly negative wording on a wine that nevertheless has label == 1.
ironicly = wine.filter((col("customer_review_score") <= -0.75) & (col("label") == 1))




In [None]:

# Rebind `ironicly` to its underlying RDD. getattr() with a fallback keeps the
# cell re-runnable: on a second execution the name already holds an RDD (which
# has no `.rdd` attribute) and is left as is instead of raising AttributeError.
ironicly = getattr(ironicly, "rdd", ironicly)
# NOTE: collect() brings every matching row back to the driver.
ironicly.collect()

In [None]:
# Rebind `negetive` to its underlying RDD. getattr() with a fallback keeps the
# cell re-runnable: on a second execution the name already holds an RDD (which
# has no `.rdd` attribute) and is left as is instead of raising AttributeError.
negetive = getattr(negetive, "rdd", negetive)
# NOTE: collect() brings every matching row back to the driver.
negetive.collect()

In [None]:
# Rebind `positive` to its underlying RDD. getattr() with a fallback keeps the
# cell re-runnable: on a second execution the name already holds an RDD (which
# has no `.rdd` attribute) and is left as is instead of raising AttributeError.
positive = getattr(positive, "rdd", positive)
# NOTE: collect() brings every matching row back to the driver.
positive.collect()