# Sentiment Analysis 

In [5]:
import os
import re

import numpy as np
import pandas as pd

from pyspark.sql.types import *
from pyspark.sql.functions import expr
from pyspark.sql import Row
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.clustering import LDA
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import functions as F
import sparknlp
from sparknlp import DocumentAssembler, Finisher
from sparknlp.annotator import *
from sparknlp.annotator import StopWordsCleaner
%matplotlib inline
from sparknlp.annotator import PerceptronModel
spark = sparknlp.start()
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover
from sparknlp.annotator import NGramGenerator
from sparknlp.pretrained import PretrainedPipeline

In [6]:
dataPath = "C:/Users/saika/Desktop/Pyspark/data/shopee_reviews.csv"
df = spark.read.csv(dataPath, header='true', inferSchema = 'true')

In [7]:
df.show()

+--------------------+--------------------+
|               label|                text|
+--------------------+--------------------+
|                   5|Looks ok. Not lik...|
|                   5|Tried, the curren...|
|                   5|Item received aft...|
|                   5|Thanks!!! Works a...|
|                   5|Fast delivery con...|
|                   5|Fast delivery goo...|
|                   5|Got my order and ...|
|                   5|Items received in...|
|                   5|Received in good ...|
|                   1|Item doesn’t work . |
|Asked me to send ...| show a non worki...|
|Don’t waste time ...|                null|
|                   5|         Fast. Great|
|                   5|I've tried it, an...|
|                   5|Hub uses it. Musc...|
|                   5|Well received. Fa...|
|                   5|Product received....|
|                   5|             Good.. |
|                   5|box was a little ...|
|                   4|Fast deliv

In [9]:
df.na.drop(subset=["text"])
from pyspark.sql.functions import trim
text1 = df.withColumn("text", trim(df.text))
text = text1.select('text').filter(F.col('text').isNotNull())

text.limit(5).show(truncate=90)

+------------------------------------------------------------------------------------------+
|                                                                                      text|
+------------------------------------------------------------------------------------------+
|   Looks ok. Not like so durable. Will hv to use a while to recommend others of its worth.|
|Tried, the current can be very powerful depending on the setting, i don't dare to go hi...|
|               Item received after a week. Looks smaller than expected, can’t wait to try!|
|Thanks!!! Works as describe no complaints. Not really expecting any life changing resul...|
|Fast delivery considering it’s from overseas and only tried once. Not sure about the re...|
+------------------------------------------------------------------------------------------+



In [11]:
pipeline = PretrainedPipeline("analyze_sentiment", lang="en")

sent_df = pipeline.transform(text)
sent_df = sent_df.select('text','sentiment.result' ,'sentiment.metadata' )
sent_df.show(truncate = True)

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]
+--------------------+--------------------+--------------------+
|                text|              result|            metadata|
+--------------------+--------------------+--------------------+
|Looks ok. Not lik...|[negative, negati...|[[confidence -> 0...|
|Tried, the curren...|          [positive]|[[confidence -> 0...|
|Item received aft...|[negative, positive]|[[confidence -> 0...|
|Thanks!!! Works a...|[positive, positi...|[[confidence -> 1...|
|Fast delivery con...|[negative, positive]|[[confidence -> 0...|
|Fast delivery goo...|          [positive]|[[confidence -> 0...|
|Got my order and ...|[positive, positi...|[[confidence -> 0...|
|Items received in...|[negative, negative]|[[confidence -> 0...|
|Received in good ...|[positive, negative]|[[confidence -> 0...|
| Item doesn’t work .|          [negative]|[[confidence -> 0...|
|show a non workin...|          [negative]|[[confidence ->

Results For Shopping Review Data

In [12]:


@udf(StringType())
def get_result(sent_result):
    result = sent_result[0]
    return str(result)

@udf(FloatType())
def get_conf(sent_conf):
    result = sent_conf[0]
    conf = result.get('confidence')
    return float(conf)

df_sent_result = sent_df.select('text',get_result(F.col('result')).alias('result'),get_conf('metadata').alias('confidence') ) #
df_sent_result.na.drop(subset=["confidence"])
df_sent_result.show()
df_sent_result.count()

+--------------------+--------+----------+
|                text|  result|confidence|
+--------------------+--------+----------+
|Looks ok. Not lik...|negative|    0.5262|
|Tried, the curren...|positive|    0.5033|
|Item received aft...|negative|    0.5102|
|Thanks!!! Works a...|positive|       1.0|
|Fast delivery con...|negative|    0.5473|
|Fast delivery goo...|positive|     0.849|
|Got my order and ...|positive|    0.4772|
|Items received in...|negative|    0.5068|
|Received in good ...|positive|     0.461|
| Item doesn’t work .|negative|    0.5143|
|show a non workin...|negative|    0.5178|
|         Fast. Great|negative|     0.514|
|I've tried it, an...|negative|    0.5701|
|Hub uses it. Musc...|negative|    0.5288|
|Well received. Fa...|positive|    0.4973|
|Product received....|negative|     0.514|
|              Good..|positive|    0.4922|
|box was a little ...|negative|    0.5261|
|Fast delivery, ho...|positive|     0.519|
|Fast delivery, pr...|negative|     0.559|
+----------

1519300