In [29]:
# Importing packages
import os
import sys
import string
import re

# Installing pre-reqs
# !pip install twython
# !pip install spark-nlp

# Importing nltk-related junk
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer

# Importing pyspark tools & init sparksession
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, when
from pyspark.sql.types import IntegerType, StringType, FloatType, NumericType
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer, HashingTF
from pyspark.ml.classification import NaiveBayes, LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Importing helper functions python script
import helpers

# Importing sparkNLP and model
import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline

# Point both the PySpark workers and the driver at the interpreter that is
# running this kernel, so UDFs execute against the same Python environment.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

# Start (or fetch) a Spark session with Spark NLP attached, GPU build requested.
spark = sparknlp.start(gpu=True)

# Checking spark session status
# (kept from the original cell; as a mid-cell bare expression it is only
# evaluated -- Jupyter renders the value of the *last* expression of a cell)
spark

# Importing training/testing set; first CSV row is used as the column names
train = spark.read.csv("train.csv", header=True)
test = spark.read.csv("test.csv", header=True)

# Subsetting from training data for quick dev testing: first 20 rows only
df = train.limit(20)
df.printSchema()

root
 |-- textID: string (nullable = true)
 |-- text: string (nullable = true)
 |-- selected_text: string (nullable = true)
 |-- sentiment: string (nullable = true)



In [30]:
############ CLEANING STEPS (train0 / test0) ############

def clean_text_column(frame, text_col):
    """Run the three helper UDF cleaning stages over one text column.

    The stages are applied in order: ``udf_ascii_strip`` -> ``udf_fix_abbrevs``
    -> ``udf_rm_features``. The intermediate columns (``stage01``, ``stage02``)
    are dropped by the final ``select``.

    NOTE(review): the UDFs are referenced through the ``helpers`` module
    because ``import helpers`` is the only visible import that could provide
    them; bare ``udf_*`` names are not bound on a fresh kernel. This assumes
    ``helpers.py`` defines them under exactly these names -- confirm.

    Parameters
    ----------
    frame : pyspark.sql.DataFrame
        Input frame; must contain ``textID``, ``sentiment`` and ``text_col``.
    text_col : str
        Name of the column holding the raw text to clean.

    Returns
    -------
    pyspark.sql.DataFrame
        Frame with exactly the columns ``textID``, ``sentiment``, ``cleaned``.
    """
    return (
        frame
        .withColumn('stage01', helpers.udf_ascii_strip(frame[text_col]))
        .withColumn('stage02', helpers.udf_fix_abbrevs(col('stage01')))
        .withColumn('cleaned', helpers.udf_rm_features(col('stage02')))
        .select('textID', 'sentiment', 'cleaned')
    )


# These run on the FULL train/test frames (not the 20-row dev subset `df`).
# Training rows are cleaned from `selected_text`; test rows from `text`.
train0 = clean_text_column(train, 'selected_text')
test0 = clean_text_column(test, 'text')

In [31]:
# Preview the cleaned frames: first 20 rows each, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out for the reader).
train0.show(n=20, truncate=True)
test0.show(n=20, truncate=True)

+----------+---------+--------------------+
|    textID|sentiment|             cleaned|
+----------+---------+--------------------+
|cb774db0d1|  neutral|have responded if...|
|549e992a42| negative|            sooo sad|
|088c60f138| negative|         bullying me|
|9642c003ef| negative|      leave me alone|
|358bd9e861| negative|             sons of|
|28b57f3990|  neutral|some shameless pl...|
|6e0c6d75b1| positive|                 fun|
|50e14c0bb8|  neutral|          soooo high|
|e050245fbd|  neutral|         both of you|
|fc2cbefa9d| positive|wow you just beca...|
|2339a9b08b|  neutral|as much as love t...|
|16fab9f95b| positive|                like|
|74a76f6e0a| negative|         dangerously|
|04dd1d2e34| negative|                lost|
|bbe3cbf620|  neutral|test test from th...|
|8a939bfb59| negative|  uh oh am sunburned|
|3440297f8b| negative|                sigh|
|919fa93391| negative|                sick|
|af3fed7fc3| negative|                onna|
|40e7becabf|  neutral|hes just n

In [32]:
# Sentiment scoring with NLTK's VADER analyzer.
# Fetch the 'vader_lexicon' resource first (the recorded output shows NLTK
# reporting it as already up-to-date when present), then build the analyzer
# that the scoring function in this cell reads as a module-level global.
nltk.download(info_or_id='vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Defining NLTK sentiment pyfunction
def get_nltk_sentiment(txt):
    """Return the VADER ``compound`` score for one piece of text.

    Parameters
    ----------
    txt : str or None
        Text to score. ``None`` is what a Python UDF receives for a SQL NULL,
        and the text columns in this notebook are nullable.

    Returns
    -------
    float or None
        The ``'compound'`` entry of ``sia.polarity_scores(txt)``, or ``None``
        when ``txt`` is ``None`` so the resulting column holds NULL instead of
        the whole Spark job failing inside the UDF.
    """
    # Guard first: the analyzer is never touched for missing text.
    if txt is None:
        return None
    return sia.polarity_scores(txt)['compound']

# Wrap the plain-Python scorer as a Spark UDF that yields a FloatType column
sentiment_analysis_udf = udf(f=get_nltk_sentiment, returnType=FloatType())

# Score the cleaned training text and print a preview of the result
(
    train0
    .select('textID', 'cleaned', 'sentiment')
    .withColumn("score_nltk", sentiment_analysis_udf(col('cleaned')))
    .show()
)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


+----------+--------------------+---------+----------+
|    textID|             cleaned|sentiment|score_nltk|
+----------+--------------------+---------+----------+
|cb774db0d1|have responded if...|  neutral|       0.0|
|549e992a42|            sooo sad| negative|   -0.4767|
|088c60f138|         bullying me| negative|   -0.5994|
|9642c003ef|      leave me alone| negative|    -0.296|
|358bd9e861|             sons of| negative|       0.0|
|28b57f3990|some shameless pl...|  neutral|    0.4215|
|6e0c6d75b1|                 fun| positive|    0.5106|
|50e14c0bb8|          soooo high|  neutral|       0.0|
|e050245fbd|         both of you|  neutral|       0.0|
|fc2cbefa9d|wow you just beca...| positive|    0.5859|
|2339a9b08b|as much as love t...|  neutral|    0.8519|
|16fab9f95b|                like| positive|    0.3612|
|74a76f6e0a|         dangerously| negative|   -0.4588|
|04dd1d2e34|                lost| negative|   -0.3182|
|bbe3cbf620|test test from th...|  neutral|       0.0|
|8a939bfb5

In [33]:
# Spark NLP pretrained sentiment pipeline:
#   raw text -> document -> sentence embeddings (USE) -> sentiment label
#
# NOTE(review): the recorded run of this cell failed while loading 'tfhub_use'
# with java.lang.UnsatisfiedLinkError in
# org.apache.hadoop.io.nativeio.NativeIO$Windows.access0. That looks like an
# environment problem on the Windows host (Hadoop native binaries), not a
# problem with the pipeline definition itself -- confirm the local Hadoop
# setup before re-running.

# Stage 1: turn the raw "text" column into Spark NLP "document" annotations
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Stage 2: pretrained Universal Sentence Encoder embeddings
use = UniversalSentenceEncoder.pretrained('tfhub_use', lang="en") \
    .setInputCols(["document"])\
    .setOutputCol("sentence_embeddings")

# Stage 3: pretrained sentiment classifier over the sentence embeddings.
# `pretrained` is called on the class itself; the original built a throwaway
# SentimentDLModel() instance only to reach the same loader.
classifier = SentimentDLModel.pretrained('sentimentdl_use_twitter', lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("sentiment")

nlp_pipeline = Pipeline(stages=[document_assembler,
                                use,
                                classifier
                                ])

# All stages are pretrained, so fitting on a one-row frame holding an empty
# string just materialises the PipelineModel; LightPipeline then annotates
# plain Python strings without building a DataFrame per call.
empty_text_frame = spark.createDataFrame([['']]).toDF("text")
l_model = LightPipeline(nlp_pipeline.fit(empty_text_frame))

# Two sample tweets used as a smoke test of the pipeline
sample_tweets = [
    "im meeting up with one of my besties tonight! Cant wait!!  - GIRL TALK!!",
    "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
]
annotations = l_model.fullAnnotate(sample_tweets)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ]
An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadModel.
: java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:645)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1230)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1435)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:493)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1868)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1910)
	at org.apache.hadoop.fs.FileSystem$4.<init>(FileSystem.java:2072)
	at org.apache.hadoop.fs.FileSystem.listLocatedStatus(FileSystem.java:2071)
	at org.apache.hadoop.fs.ChecksumFileSystem.listLo

Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadModel.
: java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:645)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1230)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1435)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:493)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1868)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1910)
	at org.apache.hadoop.fs.FileSystem$4.<init>(FileSystem.java:2072)
	at org.apache.hadoop.fs.FileSystem.listLocatedStatus(FileSystem.java:2071)
	at org.apache.hadoop.fs.ChecksumFileSystem.listLocatedStatus(ChecksumFileSystem.java:700)
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:239)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:325)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1428)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1422)
	at org.apache.spark.rdd.RDD.$anonfun$first$1(RDD.scala:1463)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.first(RDD.scala:1463)
	at org.apache.spark.ml.util.DefaultParamsReader$.loadMetadata(ReadWrite.scala:587)
	at org.apache.spark.ml.util.DefaultParamsReader.load(ReadWrite.scala:465)
	at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:31)
	at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:24)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:500)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:492)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.downloadModel(ResourceDownloader.scala:653)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadModel(ResourceDownloader.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
