## Initialize Spark

In [2]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

# Start Spark Session with Spark NLP
#spark = sparknlp.start()

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","4G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()

## Read the Spam Case Data (5k)

In [3]:

# File location and type
file_location = r'E:\Machine Learning\data\Spam_data.csv'
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","


df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)


df.count()

5574

## Split train and test data

In [5]:
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed = 100)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 3964
Test Dataset Count: 1610


In [9]:
df.columns

['Category', 'Message']

## Data Preprocessing and ML Pipeline using Spark-NLP

In [17]:
from pyspark.ml.feature import HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer,IndexToString
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

document_assembler = DocumentAssembler() \
    .setInputCol("Message") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

stemmer = Stemmer() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("stem")

finisher = Finisher() \
    .setInputCols(["stem"]) \
    .setOutputCols(["token_features"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)


hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures=1000)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.0)

label_to_stringIdx = IndexToString(inputCol="label", outputCol="sms_class")

nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            hashingTF,
            idf,
            label_stringIdx,
            lr,
            label_to_stringIdx])

nlp_model = nlp_pipeline.fit(trainingData)

processed = nlp_model.transform(testData)

processed.count()

1610

In [21]:
processed.select("Message","Category","label","prediction","sms_class").show()

+--------------------+--------+-----+----------+---------+
|             Message|Category|label|prediction|sms_class|
+--------------------+--------+-----+----------+---------+
|"A Boy loved a ga...|     ham|  0.0|       0.0|      ham|
|"A boy was late 2...|     ham|  0.0|       0.0|      ham|
|"A cute thought f...|     ham|  0.0|       0.0|      ham|
|"A swt thought: "...|     ham|  0.0|       0.0|      ham|
|"Beautiful Truth ...|     ham|  0.0|       0.0|      ham|
|"Beautiful Truth ...|     ham|  0.0|       0.0|      ham|
|"Best line said i...|     ham|  0.0|       0.0|      ham|
|"Do 1 thing! Chan...|     ham|  0.0|       0.0|      ham|
|"Edison has right...|     ham|  0.0|       0.0|      ham|
|"Ever green quote...|     ham|  0.0|       0.0|      ham|
|"Gumby's has a sp...|     ham|  0.0|       0.0|      ham|
|"Happy or sad , o...|     ham|  0.0|       0.0|      ham|
|"Height of ""Oh s...|     ham|  0.0|       0.0|      ham|
|"I just lov this ...|     ham|  0.0|       0.0|      ha

In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(processed)
print("Accuracy = %g" % (accuracy))
print("Test Error = %g " % (1.0 - accuracy))

Accuracy = 0.961491
Test Error = 0.0385093 
