In [None]:
from pyspark.sql import SparkSession

# Get the active SparkSession, or build one with the builder's default settings.
# NOTE(review): later cells also call sparknlp.start(); confirm the session
# obtained here carries the Spark NLP configuration the annotators need.
spark = SparkSession.builder.getOrCreate()

### Example 1: Sentiment Analysis of English Opinions

In [11]:
from pyspark.ml import Pipeline
from sparknlp.annotator import DocumentAssembler, Tokenizer, DistilBertForSequenceClassification
import sparknlp

# Start (or attach to) the Spark NLP session and keep the handle. The cells
# below read data through `spark`, so bind it here instead of relying on a
# session object created by some other cell.
spark = sparknlp.start()

# Wrap the raw "opinion" text column into "document" annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("opinion")
    .setOutputCol("document")
)

# Split each document into tokens; the classifier consumes both columns.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained English ("en") DistilBERT sequence classifier; its predictions
# are written to the "class" column.
sequence_classifier = (
    DistilBertForSequenceClassification
    .pretrained("distilbert_base_sequence_classifier_food", "en")
    .setInputCols(["document", "token"])
    .setOutputCol("class")
)

# Assemble the stages in dependency order: document -> token -> class.
sentiment_pipeline = Pipeline().setStages([document_assembler, tokenizer, sequence_classifier])

distilbert_base_sequence_classifier_food download started this may take some time.
Approximate size to download 238.2 MB
[OK!]


In [90]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType, TimestampType
from pyspark.sql.functions import col

# (column name, Spark type) pairs, in the order the fields are declared for
# sales_data.csv. Every field keeps StructField's default nullability.
sales_columns = [
    ("Client ID", IntegerType()),
    ("POS ID", IntegerType()),
    ("pos_name", StringType()),
    ("Article", StringType()),
    ("Quantity", FloatType()),
    ("Unit Price", FloatType()),
    ("Total", DoubleType()),
    ("Sale Type", StringType()),
    ("Payment Mode", StringType()),
    ("opinion", StringType()),
    ("Sale Time", TimestampType()),
]
schema = StructType([StructField(name, dtype) for name, dtype in sales_columns])

# Read the CSV with the explicit schema; the first row is a header.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .option("sep", ",")
    .csv("sales_data.csv")
)

In [13]:
# First element of the classifier's "class.result" array, exposed as "sentiment".
sentiment_label = col("class.result").getItem(0).alias("sentiment")

# Fit the pipeline on df, annotate the same frame, then keep the original
# columns plus the sentiment label.
result = (
    sentiment_pipeline
    .fit(df)
    .transform(df)
    .select(*df.columns, sentiment_label)
)

In [15]:
# Print the first 10 opinions with their sentiment, without truncating text.
result.select(["opinion", "sentiment"]).show(10, truncate=False)

+-----------------------------------------------------------------------------------------------------+---------+
|opinion                                                                                              |sentiment|
+-----------------------------------------------------------------------------------------------------+---------+
|Highly recommend the restaurant for its tasty dishes and cozy atmosphere                             |Positive |
|We ordered takeout, and the packaging was eco-friendly. Appreciated the sustainability effort        |Positive |
|Great ambiance and friendly staff. The menu has a variety of options to choose from                  |Positive |
|The staff was attentive, and they ensured our dietary preferences were taken into account            |Positive |
|We tried the chef's specials, and they were a delightful surprise. Unique and delicious              |Positive |
|The cocktails were overpriced, considering the size and alcohol content                

### Example 2: Sentiment Analysis of Arabic Opinions

In [80]:
from pyspark.ml import Pipeline
from sparknlp.annotator import DocumentAssembler, Tokenizer, BertForSequenceClassification
import sparknlp

# Start (or attach to) the Spark NLP session and keep the handle. The cells
# below read data through `spark`, so bind it here so this example does not
# depend on a session object created by some other cell.
spark = sparknlp.start()

# Wrap the raw "opinion" text column into "document" annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("opinion")
    .setOutputCol("document")
)

# Split each document into tokens; the classifier consumes both columns.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained Arabic ("ar") BERT sequence classifier; its predictions are
# written to the "class" column.
sequence_classifier = (
    BertForSequenceClassification
    .pretrained("bert_classifier_arabic_marbert_sentiment", "ar")
    .setInputCols(["document", "token"])
    .setOutputCol("class")
)

# Assemble the stages in dependency order: document -> token -> class.
sentiment_pipeline = Pipeline().setStages([document_assembler, tokenizer, sequence_classifier])

bert_classifier_arabic_marbert_sentiment download started this may take some time.
Approximate size to download 583.2 MB
[OK!]


In [95]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType, TimestampType
from pyspark.sql.functions import col

# Explicit schema for sales_data.csv, built field by field. Each add() call
# appends one field and leaves it nullable (the default).
schema = (
    StructType()
    .add("Client ID", IntegerType())
    .add("POS ID", IntegerType())
    .add("pos_name", StringType())
    .add("Article", StringType())
    .add("Quantity", FloatType())
    .add("Unit Price", FloatType())
    .add("Total", DoubleType())
    .add("Sale Type", StringType())
    .add("Payment Mode", StringType())
    .add("opinion", StringType())
    .add("Sale Time", TimestampType())
)

# Load the comma-separated file; the first row is a header and the schema
# above is applied.
df = spark.read.csv(
    path="sales_data.csv",
    schema=schema,
    sep=",",
    header=True,
)

In [96]:
# Fit the pipeline on df, then annotate that same frame.
pipeline_model = sentiment_pipeline.fit(df)
annotated_df = pipeline_model.transform(df)

# Keep the original columns and expose the first element of "class.result"
# as "sentiment".
result = annotated_df.select(
    df.columns + [col("class.result").getItem(0).alias("sentiment")]
)

In [83]:
# Print the first 20 opinions with their sentiment, without truncating text.
opinions_with_sentiment = result.select("opinion", "sentiment")
opinions_with_sentiment.show(n=20, truncate=False)

+------------------------------------------------------------------------------------------------+---------+
|opinion                                                                                         |sentiment|
+------------------------------------------------------------------------------------------------+---------+
|لقد كانت تجربة طعام رائعة. الخدمة كانت استثنائية، والطعام تجاوز توقعاتنا                        |neutral  |
|كانت تجربة الطعام مخيبة للآمال بالكامل، حيث كانت الأطباق تفتقر إلى الطعم اللذيذ والنضارة اللازمة|negative |
|كان للمطعم ديكور فريد وساحر. مكان عظيم لمناسبة خاصة                                             |positive |
|كان للمطعم جو دافئ وترحاب. مثالية لعشاء مريح                                                    |positive |
|المطعم كان جيدًا للقاءات العادية، لكنه قد لا يكون الخيار الأمثل للمناسبات الخاصة                |neutral  |
|الطعام كان لذيذ حقا! مزيج مثالي من النكهات والقوام                                              |positive |
|كانت الأطعمة عادية