#### Building a spam vs ham filter using NLP with Spark

By: Matt Purvis

##### Importing and Creating Spark Session

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlp').getOrCreate()

##### Importing data and previewing data

In [0]:
data = spark.sql('select * from smsspamcollection')

In [0]:
data.printSchema()

In [0]:
data.show()

##### Renaming columns

In [0]:
data = data.withColumnRenamed('_c0','class').withColumnRenamed('_c1','text')

In [0]:
data.show()

##### Feature Engineering

In [0]:
# Import length to get length of text column
from pyspark.sql.functions import length

In [0]:
# Add length column
data = data.withColumn('length',length(data['text']))

##### Length Investigation

In [0]:
data.groupby('class').mean().show() # Spam texts are typically longer

##### More Feature Engineering

In [0]:
# Imports for more features
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

##### Documentation for following objects
Tokenizer - Will take the text and create a list of all the words <br>
Stop_remove - Will remove common meaningless words like 'the' or 'a' <br>
count_vec - Will create a vector of how many times each token appears in the text (bag of words approach) <br>
idf - Inverse Document Frequency - will create a vector for how important the term is in the entire dataset (how many rows have this term in them) <br>
ham_spam_to_numeric - Converts spam column to zeros and ones <br>

In [0]:
tokenizer = Tokenizer(inputCol = 'text',outputCol = 'token_text') 
stop_remove = StopWordsRemover(inputCol = 'token_text', outputCol = 'stop_token')
count_vec = CountVectorizer(inputCol = 'stop_token', outputCol = 'c_vec')
idf = IDF(inputCol = 'c_vec', outputCol = 'tf_idf')
ham_spam_to_numeric = StringIndexer(inputCol = 'class', outputCol = 'label')

##### Transforming features into features vector

In [0]:
from pyspark.ml.feature import VectorAssembler
clean_up = VectorAssembler(inputCols = ['tf_idf','length'], outputCol = 'features')

##### Importing and Creating Naive Bayes classifier to use with NLP

In [0]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()

##### Creating a pipeline to make all the transformations

In [0]:
from pyspark.ml import Pipeline
data_prep_pipe = Pipeline(stages= [ham_spam_to_numeric, tokenizer, stop_remove, count_vec, idf, clean_up])

##### Fitting the pipeline to the data and then transforming the data

In [0]:
cleaner = data_prep_pipe.fit(data)
clean_data = cleaner.transform(data)

##### Select the relevant columns (labels and features) to use the naive bayes classifier on

In [0]:
clean_data = clean_data.select('label', 'features')

In [0]:
clean_data.show()

##### Train/Test Split

In [0]:
training, test = clean_data.randomSplit([.7,.3])

##### Fit/train the naive bayes algorithm on training data

In [0]:
spam_detector = nb.fit(training)

##### Use the model to make predictions on test set

In [0]:
test_results = spam_detector.transform(test)

In [0]:
test_results.show()

##### Evaluate the spam filter using accuracy

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)

In [0]:
# Display the accuracy
print('Accuracy')
acc

Our spam filter was ~92% accurate! Not bad! Further tweaking could improve the spam filter even more!