In [1]:
import os

In [2]:
# Export PYSPARK_SUBMIT_ARGS with the spark-nlp 1.2.3 package coordinate.
# This is done before the SparkContext is created in a later cell.
os.environ.update({"PYSPARK_SUBMIT_ARGS": "--packages JohnSnowLabs:spark-nlp:1.2.3 pyspark-shell"})

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel

In [4]:
# Obtain the SparkContext via getOrCreate() so this cell can be re-executed in
# a live kernel: it returns the already-running context instead of trying to
# start a second one.
sc = pyspark.SparkContext.getOrCreate()
# Session wrapper used by the rest of the notebook for DataFrame I/O.
spark = SparkSession(sc)

In [5]:
# Load the input data to be annotated, capped at 1000 rows.
data = (
    spark.read
    .parquet("sentiment.parquet")
    .limit(1000)
)

# Mark the frame for caching; count() is an action, presumably issued here so
# the cache is populated before the preview below.
data.cache()
data.count()

# Preview the loaded rows.
data.show()

+------+---------+--------------------+
|itemid|sentiment|                text|
+------+---------+--------------------+
|393940|        1|@Natasja_Cupcake ...|
|393941|        1|@Natasja_Cupcake ...|
|393942|        0|@Natasja_Cupcake ...|
|393943|        0|@Natasja_Cupcake ...|
|393944|        1|@Natasja_Cupcake ...|
|393945|        1|@renegade37918  I...|
|393946|        0|@renegadejk529 i ...|
|393947|        1|@RenegadeScribe O...|
|393948|        0|@RenegadeSOA513 ....|
|393949|        1|@RenegadeSOA513 J...|
|393950|        0|@RenegadeSOA513 L...|
|393951|        1|@RenegadEuphoriX ...|
|393952|        1|@RenegadeVyper DO...|
|393953|        1|@Renegal Nah, it ...|
|393954|        1|@Renegat Ñ?ÑƒÐ¿Ðµ...|
|393955|        1|@reneilim don't f...|
|393956|        1|@renelannte mouse...|
|393957|        0|@renemonney Jam W...|
|393958|        0|@renemonster i wa...|
|393959|        1|  @renems enviei rs |
+------+---------+--------------------+
only showing top 20 rows



In [6]:
from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher

In [19]:
### Define the annotation pipeline
# Stages are chained through column names: each stage reads the column(s)
# given to setInputCol(s) and writes the column given to setOutputCol.
#   text -> document -> sentence -> token -> normal -> spell -> sentiment
#
# NOTE(review): the input frame already has a `sentiment` column (see the
# data.show() output) and it no longer appears in the pipeline output;
# presumably it is replaced by the annotator column of the same name.
# Confirm whether the original label should be preserved.

# No output column is set here; the next stage reads "document", so the
# assembler's default output name is presumably "document".
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
)

sentence_detector = (
    SentenceDetectorModel()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

spell_checker = (
    NorvigSweetingApproach()
    .setInputCols(["normal"])
    .setOutputCol("spell")
)

sentiment_detector = (
    ViveknSentimentApproach()
    .setInputCols(["spell", "sentence"])
    .setOutputCol("sentiment")
    .setPositiveSource("vivekn/positive")
    .setNegativeSource("vivekn/negative")
    # when training on small data you may want to disable this to not cut off infrequent words
    .setPruneCorpus(False)
)

# Parenthesised chain instead of backslash continuations, so the optional
# setter below can stay commented out without a dangling line continuation.
finisher = (
    Finisher()
    .setInputCols(["sentiment"])
    .setIncludeKeys(True)
    # .setCleanAnnotations(False)  # optional, left disabled
)

pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    normalizer,
    spell_checker,
    sentiment_detector,
    finisher,
])

# Fit once and keep the fitted model so it can be reused (e.g. saved or applied
# to other frames) without training again.
sentiment_model = pipeline.fit(data)
sentiment_data = sentiment_model.transform(data)

In [20]:
# Preview the annotated frame; long cell values are truncated in this tabular view.
sentiment_data.show()

+------+--------------------+--------------------+
|itemid|                text|  finished_sentiment|
+------+--------------------+--------------------+
|393940|@Natasja_Cupcake ...|result->positive@...|
|393941|@Natasja_Cupcake ...|    result->positive|
|393942|@Natasja_Cupcake ...|result->positive@...|
|393943|@Natasja_Cupcake ...|result->positive@...|
|393944|@Natasja_Cupcake ...|    result->positive|
|393945|@renegade37918  I...|    result->positive|
|393946|@renegadejk529 i ...|    result->positive|
|393947|@RenegadeScribe O...|result->positive@...|
|393948|@RenegadeSOA513 ....|result->positive@...|
|393949|@RenegadeSOA513 J...|    result->positive|
|393950|@RenegadeSOA513 L...|    result->positive|
|393951|@RenegadEuphoriX ...|result->positive@...|
|393952|@RenegadeVyper DO...|result->positive@...|
|393953|@Renegal Nah, it ...|    result->positive|
|393954|@Renegat Ñ?ÑƒÐ¿Ðµ...|    result->positive|
|393955|@reneilim don't f...|result->positive@...|
|393956|@renelannte mouse...|re

In [22]:
# Print the first 20 annotated rows one per line; unlike show(), this displays
# the full text and finished_sentiment values.
for row in sentiment_data.take(20):
    print(row)

Row(itemid=393940, text="@Natasja_Cupcake But how sweet that your kids are spoiling you, so they should!  You've got a couple of great ones, hon!!!", finished_sentiment='result->positive@result->positive')
Row(itemid=393941, text='@Natasja_Cupcake home-made burgers are the best ', finished_sentiment='result->positive')
Row(itemid=393942, text="@Natasja_Cupcake I got yours, but not Cindy's.     Or, I got one but not the other.", finished_sentiment='result->positive@result->positive')
Row(itemid=393943, text='@Natasja_Cupcake I think only 23.  I get confused easy tho. LOL I hope so if so two stopped following me 2. I know one did already.  ', finished_sentiment='result->positive@result->positive@result->positive@result->positive')
Row(itemid=393944, text='@Natasja_Cupcake Just gave ya a shout out in the chat room ', finished_sentiment='result->positive')
Row(itemid=393945, text='@renegade37918  I love the rain, its so relaxing. ', finished_sentiment='result->positive')
Row(itemid=393946,

In [16]:
# Persist the unfitted pipeline definition, replacing any previous save.
(
    pipeline
    .write()
    .overwrite()
    .save("./ps")
)

# Fit the pipeline on `data` and persist the resulting model.
# Note: this calls fit() again, separately from the fit done for the transform.
(
    pipeline
    .fit(data)
    .write()
    .overwrite()
    .save("./ms")
)

In [17]:
# Reload both saved artifacts into names so the round trip can be inspected or
# reused afterwards; the trailing expression keeps the model's repr as the
# cell output.
loaded_pipeline = Pipeline.read().load("./ps")
loaded_model = PipelineModel.read().load("./ms")
loaded_model

PipelineModel_44c4a9d4974d48682f50

In [24]:
!pip install emot


Collecting emot
  Downloading emot-1.0-py3-none-any.whl
Installing collected packages: emot
Successfully installed emot-1.0


In [48]:
import emot

# Demo: strip emoji and text emoticons from a string using emot.
text = "I love python 👨 :-) :^)"

# Both helpers return a list of dicts with 'value' and 'location' keys, e.g.
#   [{'value': '👨', 'location': [10, 10]}]
emoji = emot.emoji(text)
#   [{'value': ':-)', 'location': [12, 15]}]
emoticons = emot.emoticons(text)

# The loop variable is deliberately not called `data`, so the Spark DataFrame
# `data` loaded earlier in the notebook is not overwritten by this cell.
for found in emoji:
    text = text.replace(found['value'], '')

# Emoticons are detected again on the emoji-free text before removal.
for found in emot.emoticons(text):
    text = text.replace(found['value'], '')

# Drop the trailing spaces left behind by the removed symbols.
text.rstrip()

'I love python'