In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('spamNLP').getOrCreate()

In [3]:
import os

In [4]:
spam_data_file = os.path.join(os.path.curdir, 'data', 'SMSSpamCollection')

In [5]:
spam_data = spark.read.csv(spam_data_file,
                          inferSchema=True,
                          sep='\t')

In [6]:
spam_data.show()

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [8]:
spam_data = spam_data.withColumnRenamed('_c0', 'class').\
withColumnRenamed('_c1','text')

In [9]:
spam_data.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



In [11]:
from pyspark.sql.functions import length

In [12]:
spam_data = spam_data.withColumn('length', length(spam_data['text']))

In [14]:
spam_data.show()

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thats th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+
only showing top 20 rows



In [15]:
spam_data.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [16]:
# Tokenization
# Stop world removal
# Count vectorization - BOW
# TF-IDF

In [17]:
from pyspark.ml.feature import (Tokenizer, StopWordsRemover,
                               CountVectorizer, IDF, StringIndexer)

In [24]:
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stop_word_remover = StopWordsRemover(inputCol='token_text', 
                                     outputCol='stop_token')
count_vectorizer = CountVectorizer(inputCol='stop_token',
                                  outputCol='count_vector',
                                  )
idf = IDF(inputCol='count_vector', outputCol='tf_idf')
ham_spam = StringIndexer(inputCol='class', outputCol='label')

In [19]:
from pyspark.ml.feature import VectorAssembler

In [20]:
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'],
                          outputCol='features')

In [21]:
from pyspark.ml.classification import NaiveBayes

In [22]:
nb = NaiveBayes()

In [23]:
from pyspark.ml import Pipeline

In [25]:
pipeline = Pipeline(stages=[
    tokenizer,
    stop_word_remover,
    count_vectorizer,
    idf,
    ham_spam,
    clean_up])

In [26]:
cleaner = pipeline.fit(spam_data)

In [28]:
clean_data = cleaner.transform(spam_data)

In [29]:
clean_data.head().asDict()

{'class': 'ham',
 'text': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'length': 111,
 'token_text': ['go',
  'until',
  'jurong',
  'point,',
  'crazy..',
  'available',
  'only',
  'in',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet...',
  'cine',
  'there',
  'got',
  'amore',
  'wat...'],
 'stop_token': ['go',
  'jurong',
  'point,',
  'crazy..',
  'available',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet...',
  'cine',
  'got',
  'amore',
  'wat...'],
 'count_vector': SparseVector(13423, {7: 1.0, 11: 1.0, 31: 1.0, 61: 1.0, 72: 1.0, 344: 1.0, 625: 1.0, 731: 1.0, 1409: 1.0, 1598: 1.0, 4485: 1.0, 6440: 1.0, 8092: 1.0, 8838: 1.0, 11344: 1.0, 12979: 1.0}),
 'tf_idf': SparseVector(13423, {7: 3.1126, 11: 3.2055, 31: 3.822, 61: 4.2072, 72: 4.322, 344: 5.4072, 625: 5.918, 731: 6.1411, 1409: 6.6801, 1598: 6.8343, 4485: 7.5274, 6440: 7.9329, 8092: 7.9329, 8838: 7.9329, 11344: 7.9329,

In [30]:
final_data = clean_data.select('features', 'label')

In [31]:
final_data.head().asDict()

{'features': SparseVector(13424, {7: 3.1126, 11: 3.2055, 31: 3.822, 61: 4.2072, 72: 4.322, 344: 5.4072, 625: 5.918, 731: 6.1411, 1409: 6.6801, 1598: 6.8343, 4485: 7.5274, 6440: 7.9329, 8092: 7.9329, 8838: 7.9329, 11344: 7.9329, 12979: 7.9329, 13423: 111.0}),
 'label': 0.0}

In [32]:
training_data, testing_data = final_data.randomSplit([0.75, 0.25])

In [33]:
model = nb.fit(training_data)

In [34]:
predictions = model.transform(testing_data)

In [36]:
predictions.select('prediction', 'label').show()

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       1.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
+----------+-----+
only showing top 20 rows



In [38]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [39]:
acc_eval = MulticlassClassificationEvaluator()

In [40]:
accuracy = acc_eval.evaluate(predictions)

In [42]:
accuracy # good accuracy

0.9175010255548299