In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("SMSSpam").getOrCreate()

In [3]:
data = spark.read.csv("SMSSpamCollection", inferSchema = True, sep='\t')

In [4]:
data.show(n = 3, truncate = False)

+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0 |_c1                                                                                                                                                        |
+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            |
|ham |Ok lar... Joking wif u oni...                                                                                                                              |
|spam|Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's|
+----+----------------

In [5]:
data = data.selectExpr("_c0 as class","_c1 as text")

In [6]:
data.printSchema()

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)



In [7]:
data.show(n = 3, truncate = False)

+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|class|text                                                                                                                                                       |
+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham  |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            |
|ham  |Ok lar... Joking wif u oni...                                                                                                                              |
|spam |Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's|
+-----+---------

In [8]:
import pyspark.sql.functions as F

In [9]:
data = data.withColumn("length",F.length("text"))

In [10]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline

In [11]:
# Data Cleaning Pipeline
#tokenizer
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
#StopWordsRemover
stopremover = StopWordsRemover(inputCol="token_text", outputCol="stop_tokens")
#Counter
cv = CountVectorizer(inputCol="stop_tokens", outputCol="c_vec")
#IDF
idf = IDF(inputCol="c_vec", outputCol="idf_vec")
#assembler
assembler = VectorAssembler(inputCols = ["idf_vec","length"],outputCol = "features")
#stringIndexer
indexer = StringIndexer(inputCol="class", outputCol="label")

In [12]:
pipeline = Pipeline(stages = [tokenizer, stopremover, cv, idf,assembler, indexer])

In [13]:
model = pipeline.fit(data)

In [14]:
output = model.transform(data)

In [15]:
clean_data = output.select("features","label")

In [16]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()

In [17]:
(training,testing) = clean_data.randomSplit([0.7,0.3])

In [18]:
spam_predictor = nb.fit(training)

In [19]:
test_results = spam_predictor.transform(testing)

In [20]:
test_results.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(13424,[0,1,2,4,3...|  1.0|[-1208.3470100753...|[1.02700546664329...|       1.0|
|(13424,[0,1,2,5,5...|  1.0|[-943.31306797169...|[0.99999999999881...|       0.0|
|(13424,[0,1,2,12,...|  1.0|[-1133.4869786538...|[1.57085615985246...|       1.0|
|(13424,[0,1,2,15,...|  1.0|[-1161.9456463703...|[1.15413834143423...|       1.0|
|(13424,[0,1,2,15,...|  1.0|[-1146.4349936412...|[1.51775023353833...|       1.0|
|(13424,[0,1,2,20,...|  1.0|[-1173.5016202996...|[4.55528283289305...|       1.0|
|(13424,[0,1,2,20,...|  1.0|[-1122.6332397233...|[1.98590717458232...|       1.0|
|(13424,[0,1,2,20,...|  1.0|[-1304.8763529224...|[1.33544263615690...|       1.0|
|(13424,[0,1,2,47,...|  1.0|[-1511.4267604279...|[1.24796234035185...|       1.0|
|(13424,[0,1,5,1

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [22]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: {}".format(acc))

Accuracy of model at predicting spam was: 0.9328524927341234
