In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('original_NB').getOrCreate()

In [3]:
# Import csv of spam and ham (not spam)
start_data = spark.read.format("csv").option("header", "true").load("spam.csv")
start_data.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if that��s t...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



In [4]:
# Create a length column to be used as a future feature 
from pyspark.sql.functions import length
data = start_data.withColumn('length', length(start_data['text']))
data.show()

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if that��s t...|    58|
| spam|England v Macedon...|   155|
+-----+--------------------+------+
only showing top 20 rows



### Feature Transformations


In [5]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

# Create all the features to the data set
ham_spam_to_num = StringIndexer(inputCol='class',outputCol='label')
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')


In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
# See https://spark.apache.org/docs/latest/ml-features.html#vectorassembler
# This just creates a new, single vector of features that is the concatenation
# of tf-idf data and the length of the email
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [11]:
# Create a and run a data processing Pipeline
# See https://spark.apache.org/docs/latest/ml-pipeline.html#pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [12]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data)
cleaned = cleaner.transform(data)

In [13]:
# Show label of ham spame and resulting features
cleaned.select(['label', 'features']).show(truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [14]:
# Break data down into a training set and a testing set
(training, testing) = cleaned.randomSplit([0.7, 0.3])

In [17]:
from pyspark.ml.classification import NaiveBayes

# Create a Naive Bayes model and fit training data
nb = NaiveBayes(modelType='multinomial')
spam_predictor = nb.fit(training)

In [18]:
# Tranform the model with the testing data
test_results = spam_predictor.transform(testing)
test_results.show(5)

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|class|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  ham| came to look at ...|   103|  0.0|[, came, to, look...|[, came, look, fl...|(262144,[9129,134...|(262144,[9129,134...|(262145,[9129,134...|[-1124.3808896814...|[1.0,8.3735836783...|       0.0|
|  ham| says that he's q...|   200|  0.0|[, says, that, he...|[, says, quitting...|(262144,[13957,22...|(262144,[13957,22...|(262145,[13957,22...|[-1354.0036417131...|[1.0,2.4559404169...|       0.0|


In [19]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print(f"Accuracy of model at predicting spam was: {acc}")

Accuracy of model at predicting spam was: 0.9695185638356549


In [14]:
spark.stop()