In [0]:
from pyspark.sql import SparkSession

# Get (or reuse) a Spark session registered under the app name 'nlp'.
session_builder = SparkSession.builder.appName('nlp')
spark = session_builder.getOrCreate()

# Tab-delimited file read without a header option; the next cell renames
# the resulting default columns (_c0, _c1). Column types are inferred.
sms_path = "/FileStore/shared_uploads/sejal@ibm.com/SMSSpamCollection"
data = spark.read.csv(sms_path, sep='\t', inferSchema=True)

In [0]:
# Swap Spark's positional default column names for descriptive ones.
column_renames = [('_c0', 'class'), ('_c1', 'text')]
for old_name, new_name in column_renames:
    data = data.withColumnRenamed(old_name, new_name)
data.show()

In [0]:
from pyspark.sql.functions import length

# Derive a 'length' column from the 'text' column of each row.
text_length = length(data['text'])
data = data.withColumn('length', text_length)
data.show()

In [0]:
# Show the mean of the numeric columns for each value of 'class'.
rows_by_class = data.groupBy('class')
rows_by_class.mean().show()

In [0]:
from pyspark.ml.feature import (
    CountVectorizer,
    IDF,
    StopWordsRemover,
    StringIndexer,
    Tokenizer,
)

# Label side: 'class' (string) -> 'label' (numeric index).
ham_spam_to_numeric = StringIndexer(inputCol='class', outputCol='label')

# Text side, chained by column name:
#   'text' -> 'token_text' -> 'stop_token' -> 'c_vec' -> 'tf_idf'
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
count_vec = CountVectorizer(inputCol='stop_token', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')

In [0]:
from pyspark.ml.feature import VectorAssembler

# Combine the TF-IDF vector and the 'length' column into one 'features' vector.
feature_columns = ['tf_idf', 'length']
clean_up = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [0]:
from pyspark.ml.classification import NaiveBayes

# Same as NaiveBayes(): the column names below are the library defaults,
# spelled out so the dependency on the prepared columns is visible.
nb = NaiveBayes(featuresCol='features', labelCol='label')

In [0]:
from pyspark.ml import Pipeline

# Stage order matters: each stage consumes a column produced by an earlier one.
prep_stages = [
    ham_spam_to_numeric,
    tokenizer,
    stop_remove,
    count_vec,
    idf,
    clean_up,
]
data_prep_pipe = Pipeline(stages=prep_stages)

# Fit the preparation stages on the data set, then apply them to it.
cleaner = data_prep_pipe.fit(data)
clean_data = cleaner.transform(data)
clean_data.show()

In [0]:
# Keep only the two columns used for training and evaluation.
model_columns = ['label', 'features']
clean_data = clean_data.select(*model_columns)

In [0]:
# Seed the 70/30 split so the train/test partition -- and every score
# derived from it -- is the same on each Restart & Run All instead of
# changing from run to run.
SPLIT_SEED = 42
training, test = clean_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Fit on the training portion only, then score the held-out portion.
spam_detector = nb.fit(training)
test_results = spam_detector.transform(test)
test_results.show()

In [0]:
# evaluate by comparing true label with prediction
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# The evaluator's default metric is F1, not accuracy. Request accuracy
# explicitly so the value matches the 'ACC' label printed below.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
acc = acc_eval.evaluate(test_results)
print('ACC of NB Model')
print(acc)