In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, StringIndexer, IDF
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark= SparkSession.builder.getOrCreate()

In [3]:
df= spark.read.csv("SMSSpamCollection", inferSchema= True, sep='\t')

In [4]:
df.show(5)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [5]:
df= df.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')

In [6]:
df= df.withColumn('length', length(df['text']))

In [7]:
df.show(5)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
+-----+--------------------+------+
only showing top 5 rows



In [8]:
df.groupby('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [9]:
tokenizer= Tokenizer(inputCol= 'text', outputCol= 'token_text')
stop_word_remover= StopWordsRemover(inputCol= 'token_text', outputCol= "stop_tokens")
count_vec= CountVectorizer(inputCol= 'stop_tokens', outputCol= 'c_vec')
idf= IDF(inputCol= 'c_vec', outputCol= 'tf_idf')
labels= StringIndexer(inputCol= 'class', outputCol= "label")

In [10]:
cleanned= VectorAssembler(inputCols=['tf_idf', 'length'], outputCol= 'features')

In [11]:
nb= NaiveBayes()

In [12]:
pipeline= Pipeline(stages=[
    labels,
    tokenizer,
    stop_word_remover,
    count_vec,
    idf,
    cleanned
])

In [13]:
cleanner= pipeline.fit(df)

In [14]:
clean_df= cleanner.transform(df)

In [15]:
clean_df.show(5)

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|class|                text|length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|  0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|(13424,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|  0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,301,...|(13423,[0,24,301,...|(13424,[0,24,301,...|
| spam|Free entry in 2 a...|   155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|(13424,[2,13,19,3...|
|  ham|U dun say so earl...|    49|  0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|(13423,[0,70,8

In [16]:
clean_df= clean_df.select(['label', 'features'])

In [17]:
(train, test)= clean_df.randomSplit([0.7,0.3], seed= 42)

In [18]:
pred= nb.fit(train)

In [19]:
results= pred.transform(test)

In [20]:
results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,2,41,...|[-1066.4444494620...|[1.0,2.9801047802...|       0.0|
|  0.0|(13424,[0,1,5,20,...|[-808.28928301010...|[1.0,4.7601546641...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-1167.3054457630...|[1.0,2.6335649563...|       0.0|
|  0.0|(13424,[0,1,7,15,...|[-657.95991822322...|[1.0,2.6681187262...|       0.0|
|  0.0|(13424,[0,1,12,33...|[-444.48015999387...|[1.0,1.8748881516...|       0.0|
|  0.0|(13424,[0,1,14,18...|[-1362.7802170498...|[1.0,1.2529954030...|       0.0|
|  0.0|(13424,[0,1,14,31...|[-215.54775472262...|[1.0,4.8844219551...|       0.0|
|  0.0|(13424,[0,1,18,20...|[-830.46884103326...|[1.0,8.0551837369...|       0.0|
|  0.0|(13424,[0,1,21,27...|[-774.83487436693...|[1.0,2.5889764398...|       0.0|
|  0.0|(13424,[0

In [21]:
evl= MulticlassClassificationEvaluator()
acc= evl.evaluate(results)
print(f'Accuracy: {acc*100}%')

Accuracy: 92.97146121523177%
