In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook, registered under the app name 'spam'.
spark = (
    SparkSession.builder
    .appName('spam')
    .getOrCreate()
)

In [2]:
# Load the tab-separated SMS collection; no header option is set, and the
# columns are referred to as _c0 / _c1 further down.
data = spark.read.csv(
    '/FileStore/tables/SMSSpamCollection',
    sep='\t',
    inferSchema=True,
)

In [3]:
# Preview the raw frame (first 20 rows, long cells truncated).
data.show(n=20, truncate=True)

In [4]:
# Give the two auto-named columns meaningful names.
data = (
    data
    .withColumnRenamed('_c0', 'class')
    .withColumnRenamed('_c1', 'text')
)

In [5]:
# Preview the frame after renaming (first 20 rows, long cells truncated).
data.show(n=20, truncate=True)

In [6]:
from pyspark.sql.functions import length

In [7]:
# Add a 'length' column computed from the 'text' column.
data = data.withColumn(
    'length',
    length(data['text']),
)

In [8]:
# Preview the frame with the new 'length' column (first 20 rows, long cells truncated).
data.show(n=20, truncate=True)

In [9]:
# Per-class mean of the numeric columns.
(
    data
    .groupBy('class')
    .mean()
    .show()
)

In [10]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler

In [11]:
# Text featurization stages; each one reads the column written by the stage before it.
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='token_text',
)
stop_remove = StopWordsRemover(
    inputCol='token_text',
    outputCol='stop_token',
)
count_vec = CountVectorizer(
    inputCol='stop_token',
    outputCol='c_vec',
)
idf = IDF(
    inputCol='c_vec',
    outputCol='tf_idf',
)

# Index the string 'class' column into a numeric 'label' column.
ham_spam_to_numeric = StringIndexer(
    inputCol='class',
    outputCol='label',
)

# Combine the TF-IDF vector and the message length into a single 'features' vector.
clean_up = VectorAssembler(
    inputCols=['tf_idf', 'length'],
    outputCol='features',
)

In [12]:
from pyspark.ml.classification import NaiveBayes

In [13]:
# Naive Bayes classifier; the column names are the library defaults, spelled out
# to show they match the 'features' / 'label' columns built by the pipeline stages.
nb = NaiveBayes(
    featuresCol='features',
    labelCol='label',
)

In [14]:
from pyspark.ml import Pipeline

In [15]:
# Chain label indexing, text featurization and vector assembly into one pipeline.
data_prep_pipe = Pipeline(
    stages=[
        ham_spam_to_numeric,
        tokenizer,
        stop_remove,
        count_vec,
        idf,
        clean_up,
    ]
)

In [16]:
# Fit every preparation stage on the full frame.
# NOTE(review): this fit happens before the train/test split further down, so the
# fitted vocabulary / IDF weights also see rows that later land in the test set —
# confirm whether that leakage is acceptable for this analysis.
cleaner = data_prep_pipe.fit(dataset=data)

In [17]:
# Apply the fitted preparation pipeline to the frame.
clean_data = cleaner.transform(dataset=data)

In [18]:
# Keep only the two columns the classifier consumes.
clean_data = clean_data.select(
    'label',
    'features',
)

In [19]:
# Preview the model-ready frame (first 20 rows, long cells truncated).
clean_data.show(n=20, truncate=True)

In [20]:
# 70/30 train/test split. A fixed seed keeps the split — and therefore every
# downstream metric — reproducible across Restart & Run All; without it
# randomSplit draws a fresh random seed on each run.
training, test = clean_data.randomSplit([0.7, 0.3], seed=42)

In [21]:
# Train the Naive Bayes model on the training portion.
spam_detector = nb.fit(dataset=training)

In [22]:
# Run the trained model over the held-out portion.
test_results = spam_detector.transform(dataset=test)

In [23]:
# Preview the model output on the held-out rows (first 20 rows, long cells truncated).
test_results.show(n=20, truncate=True)

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [25]:
# MulticlassClassificationEvaluator defaults to metricName='f1'; the result is
# named and reported as accuracy, so request the accuracy metric explicitly.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [26]:
# Score the held-out predictions with the evaluator configured above.
acc = acc_eval.evaluate(dataset=test_results)

In [27]:
# Print the evaluation score.
print(f"{acc}")