In [2]:
import pyspark.sql.functions as F

In [4]:
spark = SparkSession.builder.appName("SpamDetection Notebook").getOrCreate()

In [16]:
data = spark.read.option("delimiter","\t").csv('spam detection/SMSSpamCollection')

In [17]:
data.show()

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [18]:
data = data.toDF("status","message")

In [19]:
data.show()

+------+--------------------+
|status|             message|
+------+--------------------+
|   ham|Go until jurong p...|
|   ham|Ok lar... Joking ...|
|  spam|Free entry in 2 a...|
|   ham|U dun say so earl...|
|   ham|Nah I don't think...|
|  spam|FreeMsg Hey there...|
|   ham|Even my brother i...|
|   ham|As per your reque...|
|  spam|WINNER!! As a val...|
|  spam|Had your mobile 1...|
|   ham|I'm gonna be home...|
|  spam|SIX chances to wi...|
|  spam|URGENT! You have ...|
|   ham|I've been searchi...|
|   ham|I HAVE A DATE ON ...|
|  spam|XXXMobileMovieClu...|
|   ham|Oh k...i'm watchi...|
|   ham|Eh u remember how...|
|   ham|Fine if thats th...|
|  spam|England v Macedon...|
+------+--------------------+
only showing top 20 rows



In [21]:
from pyspark.ml.feature import Tokenizer
token = Tokenizer().setInputCol("message").setOutputCol("words")
transformed_data = token.transform(data)
transformed_data.show()

+------+--------------------+--------------------+
|status|             message|               words|
+------+--------------------+--------------------+
|   ham|Go until jurong p...|[go, until, juron...|
|   ham|Ok lar... Joking ...|[ok, lar..., joki...|
|  spam|Free entry in 2 a...|[free, entry, in,...|
|   ham|U dun say so earl...|[u, dun, say, so,...|
|   ham|Nah I don't think...|[nah, i, don't, t...|
|  spam|FreeMsg Hey there...|[freemsg, hey, th...|
|   ham|Even my brother i...|[even, my, brothe...|
|   ham|As per your reque...|[as, per, your, r...|
|  spam|WINNER!! As a val...|[winner!!, as, a,...|
|  spam|Had your mobile 1...|[had, your, mobil...|
|   ham|I'm gonna be home...|[i'm, gonna, be, ...|
|  spam|SIX chances to wi...|[six, chances, to...|
|  spam|URGENT! You have ...|[urgent!, you, ha...|
|   ham|I've been searchi...|[i've, been, sear...|
|   ham|I HAVE A DATE ON ...|[i, have, a, date...|
|  spam|XXXMobileMovieClu...|[xxxmobilemoviecl...|
|   ham|Oh k...i'm watchi...|[o

In [24]:
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover().setInputCol("words").setOutputCol("clean_data")
massaged_data = remover.transform(transformed_data)

In [25]:
massaged_data.show()

+------+--------------------+--------------------+--------------------+
|status|             message|               words|          clean_data|
+------+--------------------+--------------------+--------------------+
|   ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
|   ham|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|
|  spam|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|
|   ham|U dun say so earl...|[u, dun, say, so,...|[u, dun, say, ear...|
|   ham|Nah I don't think...|[nah, i, don't, t...|[nah, think, goes...|
|  spam|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg, hey, da...|
|   ham|Even my brother i...|[even, my, brothe...|[even, brother, l...|
|   ham|As per your reque...|[as, per, your, r...|[per, request, 'm...|
|  spam|WINNER!! As a val...|[winner!!, as, a,...|[winner!!, valued...|
|  spam|Had your mobile 1...|[had, your, mobil...|[mobile, 11, mont...|
|   ham|I'm gonna be home...|[i'm, gonna, be, ...|[gonna, home, 

In [26]:
from pyspark.ml.feature import CountVectorizer,CountVectorizerModel

In [30]:
cv = CountVectorizer().setInputCol("clean_data").setOutputCol("features").fit(massaged_data)

In [32]:
feature = cv.transform(massaged_data)

In [33]:
feature.show()

+------+--------------------+--------------------+--------------------+--------------------+
|status|             message|               words|          clean_data|            features|
+------+--------------------+--------------------+--------------------+--------------------+
|   ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|
|   ham|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,297,...|
|  spam|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|
|   ham|U dun say so earl...|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|
|   ham|Nah I don't think...|[nah, i, don't, t...|[nah, think, goes...|(13423,[36,134,31...|
|  spam|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg, hey, da...|(13423,[10,60,139...|
|   ham|Even my brother i...|[even, my, brothe...|[even, brother, l...|(13423,[10,53,103...|
|   ham|As per your reque...|[as, per, your, r...|[per, request, 'm...

In [36]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
indexer = StringIndexer().setInputCol("status").setOutputCol("label").fit(feature)
final_data = indexer.transform(feature)
final_data.show(3)

+------+--------------------+--------------------+--------------------+--------------------+-----+
|status|             message|               words|          clean_data|            features|label|
+------+--------------------+--------------------+--------------------+--------------------+-----+
|   ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|  0.0|
|   ham|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,297,...|  0.0|
|  spam|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|  1.0|
+------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 3 rows



In [37]:
#from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
train,test = final_data.randomSplit([0.7, 0.3], seed = 123)

In [38]:
train.show()

+------+--------------------+--------------------+--------------------+--------------------+-----+
|status|             message|               words|          clean_data|            features|label|
+------+--------------------+--------------------+--------------------+--------------------+-----+
|   ham| &lt;#&gt;  in mc...|[, &lt;#&gt;, , i...|[, &lt;#&gt;, , m...|(13423,[3,6,5140,...|  0.0|
|   ham| &lt;#&gt;  mins ...|[, &lt;#&gt;, , m...|[, &lt;#&gt;, , m...|(13423,[3,6,41,20...|  0.0|
|   ham| and  picking the...|[, and, , picking...|[, , picking, var...|(13423,[3,719,201...|  0.0|
|   ham| came to look at ...|[, came, to, look...|[, came, look, fl...|(13423,[3,11,165,...|  0.0|
|   ham| gonna let me kno...|[, gonna, let, me...|[, gonna, let, kn...|(13423,[3,12,79,8...|  0.0|
|   ham| says that he's q...|[, says, that, he...|[, says, quitting...|(13423,[0,3,14,18...|  0.0|
|   ham| what number do u...|[, what, number, ...|[, number, u, liv...|(13423,[0,3,86,19...|  0.0|
|   ham|"H

In [44]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
model = lr.fit(train)

In [45]:
predict= model.transform(test)

In [47]:
predict.show()

+------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|status|             message|               words|          clean_data|            features|label|       rawPrediction|         probability|prediction|
+------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|   ham| &lt;DECIMAL&gt; ...|[, &lt;decimal&gt...|[, &lt;decimal&gt...|(13423,[3,84,114,...|  0.0|[1.84255044036003...|[0.86325006438320...|       0.0|
|   ham| said kiss, kiss,...|[, said, kiss,, k...|[, said, kiss,, k...|(13423,[3,94,212,...|  0.0|[1.84255044036003...|[0.86325006438320...|       0.0|
|   ham|"Gimme a few" was...|["gimme, a, few",...|["gimme, few", , ...|(13423,[3,6,278,1...|  0.0|[1.84255044036003...|[0.86325006438320...|       0.0|
|   ham|"Life is nothing ...|["life, is, nothi...|["life, nothing, ...|(13423,[5,78,116,

In [49]:
predict.select("prediction","label").show(2)

+----------+-----+
|prediction|label|
+----------+-----+
|       0.0|  0.0|
|       0.0|  0.0|
+----------+-----+
only showing top 2 rows



In [57]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol('label').setRawPredictionCol('prediction').setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predict)

In [58]:
accuracy

0.5