In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import findspark
findspark.init("/home/rajdeep/spark-3.5.0-bin-hadoop3/")

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length, col
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, VectorAssembler, IDF, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [23]:
#initalizing the spark object
spark = SparkSession.builder.appName("spam").getOrCreate()

In [5]:
#reading the data into dataframe
df = spark.read.csv("data/SMSSpamCollection", inferSchema=True, sep = '\t')

                                                                                

In [6]:
# renaming the column headers
df = df.withColumnRenamed("_c0","class").withColumnRenamed("_c1","text")

In [7]:
# creatng the new column header
df = df.withColumn("length", length(col("text")))

In [8]:
# Cleaning text

In [9]:
# initalizing the tokenizer, stopwordsremover, countvectorizer, idf, idf  and StringIndexer on label column
tokenizer = Tokenizer(inputCol="text",outputCol="tokenize_text")
stopwordsremover = StopWordsRemover(inputCol="tokenize_text", outputCol="clean_tokenize")
countvectorizer = CountVectorizer(inputCol='clean_tokenize', outputCol='cvect')
idf = IDF(inputCol='cvect', outputCol='idf_vect')
assembler = VectorAssembler(inputCols=['idf_vect','length'], outputCol='feature')
spam_ham_to_numeric =  StringIndexer(inputCol='class',outputCol='label')

In [10]:
#initalizing the pipeline
pipeline = Pipeline(stages=[spam_ham_to_numeric,tokenizer,stopwordsremover,countvectorizer,idf,assembler])

In [11]:
#fitting the ppeline on df
cleaner = pipeline.fit(df)

                                                                                

In [12]:
# transforming the df
df = cleaner.transform(df)

In [13]:
# selecting th required columns
clean_df = df.select(['label','feature'])

In [14]:
clean_df.show()

+-----+--------------------+
|label|             feature|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,301,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
|  1.0|(13424,[10,60,140...|
|  0.0|(13424,[10,53,102...|
|  0.0|(13424,[127,185,4...|
|  1.0|(13424,[1,47,121,...|
|  1.0|(13424,[0,1,13,27...|
|  0.0|(13424,[18,43,117...|
|  1.0|(13424,[8,16,37,8...|
|  1.0|(13424,[13,30,47,...|
|  0.0|(13424,[39,95,221...|
|  0.0|(13424,[555,1797,...|
|  1.0|(13424,[30,109,11...|
|  0.0|(13424,[82,214,44...|
|  0.0|(13424,[0,2,49,13...|
|  0.0|(13424,[0,74,105,...|
|  1.0|(13424,[4,30,33,5...|
+-----+--------------------+
only showing top 20 rows



In [15]:
# splitting the data in train test
train_df, test_df = clean_df.randomSplit([0.7,0.3])

In [16]:
# initializing classification model
nb = NaiveBayes(featuresCol='feature')

In [17]:
# ftting the model using training data
model = nb.fit(train_df)

23/12/20 21:16:58 WARN DAGScheduler: Broadcasting large task binary with size 1192.9 KiB
23/12/20 21:17:00 WARN DAGScheduler: Broadcasting large task binary with size 1172.3 KiB
                                                                                

In [18]:
# predicting the results
predictions = model.transform(test_df)

In [19]:
predictions.show()

23/12/20 21:17:02 WARN DAGScheduler: Broadcasting large task binary with size 1397.9 KiB
[Stage 14:>                                                         (0 + 1) / 1]

+-----+--------------------+--------------------+--------------------+----------+
|label|             feature|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,2,13,...|[-605.32818178580...|[1.0,3.0568666344...|       0.0|
|  0.0|(13424,[0,1,3,9,1...|[-573.10690380877...|[0.99999999999987...|       0.0|
|  0.0|(13424,[0,1,4,50,...|[-841.29622298089...|[1.0,7.2523152791...|       0.0|
|  0.0|(13424,[0,1,5,20,...|[-807.66778320629...|[1.0,4.4406975432...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-876.91909620077...|[1.0,2.7211460697...|       0.0|
|  0.0|(13424,[0,1,9,14,...|[-542.50462158837...|[1.0,1.9961277531...|       0.0|
|  0.0|(13424,[0,1,12,33...|[-442.64272786844...|[1.0,2.1603724010...|       0.0|
|  0.0|(13424,[0,1,14,18...|[-1375.5441298791...|[1.0,1.9537974995...|       0.0|
|  0.0|(13424,[0,1,15,20...|[-668.67724817835...|[1.0,6.8491352653...|       0.0|
|  0.0|(13424,[0

23/12/20 21:17:03 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [20]:
# initialing the evaluator
evaluator = MulticlassClassificationEvaluator()

In [21]:
# calculating the rmse score
acc = evaluator.evaluate(predictions)

23/12/20 21:17:03 WARN DAGScheduler: Broadcasting large task binary with size 1402.9 KiB
                                                                                

In [22]:
print("Accuracy of model at predicting spam was: {}".format(acc))

Accuracy of model at predicting spam was: 0.9263661726402102
