In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlp').getOrCreate()
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [0]:
data = spark.read.table('smsspamcollection_5')
data.show(5)

+----+--------------------+
|   x|                   y|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [0]:
data = data.withColumnRenamed('x','class').withColumnRenamed('y','text')
data.show(5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import length
data = data.withColumn('length',length(data['text']))
data.show()

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thats th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+
only showing top 20 rows



In [0]:
data.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham| 71.4545266210897|
| spam|138.6706827309237|
+-----+-----------------+



In [0]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

In [0]:
#Create X
tokenizer = Tokenizer(inputCol = 'text', outputCol = 'token_text')
stop_remove = StopWordsRemover(inputCol = 'token_text', outputCol = 'stop_token')
count_vec = CountVectorizer(inputCol = 'stop_token', outputCol = 'c_vec')
idf = IDF(inputCol = 'c_vec', outputCol = 'tf_idf')

#Create y
ham_spam_to_numeric = StringIndexer(inputCol = 'class', outputCol = 'label')

In [0]:
from pyspark.ml.feature import VectorAssembler

clean_up = VectorAssembler(inputCols = ['tf_idf','length'], outputCol = 'features')

In [0]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()

In [0]:
from pyspark.ml import Pipeline

data_prep_pipe = Pipeline(
    stages = [
        ham_spam_to_numeric, 
        tokenizer,
        stop_remove,
        count_vec,
        idf,
        clean_up
    ]
)

In [0]:
cleaner = data_prep_pipe.fit(data)

In [0]:
clean_data = cleaner.transform(data)
clean_data = clean_data.select('label','features')
clean_data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
+-----+--------------------+
only showing top 5 rows



In [0]:
training, test = clean_data.randomSplit([0.7,0.3])

In [0]:
spam_detector = nb.fit(training)

In [0]:
data.printSchema()

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)
 |-- length: integer (nullable = true)



In [0]:
test_results = spam_detector.transform(test)

In [0]:
test_results.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,7,8,1...|[-1147.8970950088...|[1.0,2.7389236567...|       0.0|
|  0.0|(13424,[0,1,17,19...|[-803.66657498136...|[1.0,7.8834348293...|       0.0|
|  0.0|(13424,[0,1,21,27...|[-1008.3730764355...|[1.0,3.0076664690...|       0.0|
|  0.0|(13424,[0,1,23,63...|[-1295.7420961745...|[1.0,3.5778126630...|       0.0|
|  0.0|(13424,[0,1,30,12...|[-618.18994526012...|[1.0,1.0341885065...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print('Accuracy of Naive Bayes Model: ', acc)

Accuracy of Naive Bayes Model:  0.9307979508734597
