In [13]:
%AddDeps com.johnsnowlabs.nlp spark-nlp_2.11 1.2.3
%AddDeps org.apache.bahir spark-streaming-twitter_2.11 2.2.0
%AddDeps com.vdurmont emoji-java 3.1.3

Marking com.johnsnowlabs.nlp:spark-nlp_2.11:1.2.3 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps4361659222086788520/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps4361659222086788520/https/repo1.maven.org/maven2/com/johnsnowlabs/nlp/spark-nlp_2.11/1.2.3/spark-nlp_2.11-1.2.3.jar
Marking org.apache.bahir:spark-streaming-twitter_2.11:2.2.0 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps4361659222086788520/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps4361659222086788520/https/repo1.maven.org/maven2/org/apache/bahir/spark-streaming-twitter_2.11/2.2.0/spark-streaming-twitter_2.11-2.2.0.jar
Marking com.vdurmont:emoji-java:3.1.3 for download
Preparing to fetch from:
-> file:/tmp/toree_add_deps4361659222086788520/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree_add_deps4361659222086788520/https/repo1.maven.org/maven2/com/vdurmont/emoji-java/3.1.3/emoji-java-3.1.3.jar


In [3]:
import org.apache.spark.sql.SparkSession
// Obtain the session handle via the builder's getOrCreate() (no builder options are set here).
val spark: SparkSession = SparkSession.builder().getOrCreate()

In [4]:
// Input to be annotated: at most 1000 rows read from sentiment.parquet, marked for caching.
val data = spark.read.parquet("sentiment.parquet").limit(1000).cache()
// Run an action on the cached frame, then print a preview of its rows.
data.count()
data.show()

+------+---------+--------------------+
|itemid|sentiment|                text|
+------+---------+--------------------+
|393940|        1|@Natasja_Cupcake ...|
|393941|        1|@Natasja_Cupcake ...|
|393942|        0|@Natasja_Cupcake ...|
|393943|        0|@Natasja_Cupcake ...|
|393944|        1|@Natasja_Cupcake ...|
|393945|        1|@renegade37918  I...|
|393946|        0|@renegadejk529 i ...|
|393947|        1|@RenegadeScribe O...|
|393948|        0|@RenegadeSOA513 ....|
|393949|        1|@RenegadeSOA513 J...|
|393950|        0|@RenegadeSOA513 L...|
|393951|        1|@RenegadEuphoriX ...|
|393952|        1|@RenegadeVyper DO...|
|393953|        1|@Renegal Nah, it ...|
|393954|        1|@Renegat Ñ?ÑƒÐ¿Ðµ...|
|393955|        1|@reneilim don't f...|
|393956|        1|@renelannte mouse...|
|393957|        0|@renemonney Jam W...|
|393958|        0|@renemonster i wa...|
|393959|        1|  @renems enviei rs |
+------+---------+--------------------+
only showing top 20 rows



In [5]:
import com.johnsnowlabs.nlp._
import com.johnsnowlabs.nlp.annotators._
import org.apache.spark.ml.{Pipeline,PipelineModel}
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach
import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach

// Annotation stages, declared in the order the pipeline runs them. Each multi-line builder
// chain is wrapped in parentheses so it is parsed as one expression.
//
// NOTE(review): the input frame already carries a `sentiment` column and the detector below
// also writes to "sentiment"; the printed result schema contains only `finished_sentiment`,
// so the original label appears to be lost — confirm before using this output for evaluation.

// text -> document
val documentAssembler = (new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document"))

// document -> sentence
val sentenceDetector = (new SentenceDetectorModel()
  .setInputCols(Array("document"))
  .setOutputCol("sentence"))

// sentence -> token
val tokenizer = (new RegexTokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("token"))

// token -> normal
val normalizer = (new Normalizer()
  .setInputCols(Array("token"))
  .setOutputCol("normal"))

// normal -> spell
val spellChecker = (new NorvigSweetingApproach()
  .setInputCols(Array("normal"))
  .setOutputCol("spell"))

// (spell, sentence) -> sentiment, trained from the vivekn/positive and vivekn/negative sources.
// Corpus pruning is switched off: when training on small data it can cut off infrequent words.
val sentimentDetector = (new ViveknSentimentApproach()
  .setInputCols(Array("spell", "sentence"))
  .setOutputCol("sentiment")
  .setPositiveSourcePath("vivekn/positive")
  .setNegativeSourcePath("vivekn/negative")
  .setCorpusPrune(false))

// Finishes the "sentiment" annotations, keeping their keys in the output.
val finisher = (new Finisher()
  .setInputCols(Array("sentiment"))
  .setIncludeKeys(true)) // .setCleanAnnotations(false)

val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  sentenceDetector,
  tokenizer,
  normalizer,
  spellChecker,
  sentimentDetector,
  finisher
))

// Fit on the loaded rows and annotate those same rows.
val sentimentData = (pipeline
  .fit(data)
  .transform(data))

In [6]:
// Preview the annotated rows; the arguments spell out show()'s defaults.
sentimentData.show(numRows = 20, truncate = true)

+------+--------------------+--------------------+
|itemid|                text|  finished_sentiment|
+------+--------------------+--------------------+
|393940|@Natasja_Cupcake ...|result->positive@...|
|393941|@Natasja_Cupcake ...|    result->positive|
|393942|@Natasja_Cupcake ...|result->positive@...|
|393943|@Natasja_Cupcake ...|result->positive@...|
|393944|@Natasja_Cupcake ...|    result->positive|
|393945|@renegade37918  I...|    result->positive|
|393946|@renegadejk529 i ...|    result->positive|
|393947|@RenegadeScribe O...|result->positive@...|
|393948|@RenegadeSOA513 ....|result->positive@...|
|393949|@RenegadeSOA513 J...|    result->positive|
|393950|@RenegadeSOA513 L...|    result->positive|
|393951|@RenegadEuphoriX ...|result->positive@...|
|393952|@RenegadeVyper DO...|result->positive@...|
|393953|@Renegal Nah, it ...|    result->positive|
|393954|@Renegat Ñ?ÑƒÐ¿Ðµ...|    result->positive|
|393955|@reneilim don't f...|result->positive@...|
|393956|@renelannte mouse...|re

In [24]:
// Print the column names and types of the annotated frame.
sentimentData.printSchema()

root
 |-- itemid: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- finished_sentiment: string (nullable = true)



In [43]:
/** Integer mean of `xs`, computed as `xs.sum / xs.size` with truncating `Int` division.
  *
  * For inputs made only of 0s and 1s the result is 1 when every element is 1 and 0
  * otherwise, because any fractional mean is truncated towards zero.
  *
  * NOTE(review): despite the name, this is not a logistic (sigmoid) function.
  *
  * @param xs values to average; may be empty
  * @return the truncated integer mean, or 0 for an empty collection (the unguarded
  *         division would throw `ArithmeticException`)
  */
def sigmoid(xs: Iterable[Int]): Int =
  if (xs.isEmpty) 0 else xs.sum / xs.size
// Score up to the first 120 annotated rows and print one score per row.
// Column 2 is finished_sentiment: '@'-separated entries, each mapped to 1 when it is
// exactly "result->positive" and to 0 otherwise, then reduced with sigmoid.
sentimentData.take(120).map { row =>
  val votes = row.getString(2).split('@').map(entry => if (entry == "result->positive") 1 else 0)
  sigmoid(votes)
}.foreach(println)

1
1
1
1
1
1
1
1
1
1
1
1
0
1
1
1
1
0
0
1
1
1
0
1
1
1
1
1
1
1
1
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
1
0
1
1
1
1
1
1
1
0
1
1
1
0
1
1
1
0
0
1
0
0
1
1
0
1
1
0
0
1
0
1
1
1
1
0
1
0
1
1
0
1
1
0
1
0
1
1
0
1
1
1
1
1
1
1
0
1
1
1
1
0
1
1
1
1
1
1
1
1
1
0


In [8]:
// Save the unfitted pipeline definition under ./ps, overwriting any earlier save.
pipeline.write.overwrite().save("./ps")
// Fit the pipeline on `data` and save the resulting model under ./ms, overwriting as well.
pipeline.fit(data).write.overwrite().save("./ms")

In [9]:
// Read back both artefacts saved under ./ps and ./ms.
Pipeline.load("./ps")
// Kept as the final expression so the cell's resulting value is still the loaded model.
PipelineModel.load("./ms")

pipeline_725777fdacdb