In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql import SQLContext

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col
from pyspark.ml.evaluation import RegressionEvaluator

# Local Spark session for this notebook: two worker threads, driver bound to localhost.
spark = (
    SparkSession.builder
    .master("local[2]")
    .config("spark.driver.host", "localhost")
    .appName("NLP")
    .getOrCreate()
)
sc = spark.sparkContext

# SQLContext handle built on the same SparkContext.
# NOTE(review): the cells visible in this notebook load data through `spark`,
# not `sqlContext` -- confirm whether this handle is still needed.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Last expression of the cell: render the session summary.
spark

In [2]:
# Load the tab-separated message file. No header option is passed, so the
# columns come back with positional names (_c0, _c1), as printSchema() shows.
df = spark.read.csv(
    "spam_test.csv",
    sep="\t",
    inferSchema=True,
)
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)



In [3]:
# Replace the positional column names with meaningful ones and keep just
# those two columns, in (class, text) order.
df = (
    df.withColumnRenamed("_c0", "class")
    .withColumnRenamed("_c1", "text")
    .select("class", "text")
)

In [4]:
df.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



In [5]:
from pyspark.sql.functions import length

# Add the length of each message's text as a numeric column; it is later
# combined with the TF-IDF vector as an extra feature.
df = df.withColumn("length", length(col("text")))
df.show(3)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
+-----+--------------------+------+
only showing top 3 rows



In [6]:
# Per-class average of every numeric column; `length` is the only numeric
# column at this point, so this compares typical ham vs. spam message length.
df.groupBy("class").avg().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [7]:
from pyspark.ml.feature import (
    CountVectorizer,
    Tokenizer,
    StopWordsRemover,
    IDF,
    StringIndexer,
    VectorAssembler,
)

# First text-processing stage: turns the raw `text` column into a
# `token_text` column of tokens.
tokenizer = Tokenizer(outputCol="token_text", inputCol="text")

In [8]:
# Feature-engineering stages. Each one reads the column written by a
# previous stage, so the column names below form a chain:
#   class      -> label          (string class to numeric label)
#   token_text -> stop_token     (stop words removed)
#   stop_token -> count_vector   (term counts)
#   count_vector -> tf_idf       (IDF-weighted counts)
ham_spam_to_numeric = StringIndexer(outputCol="label", inputCol="class")
stop_remove = StopWordsRemover(outputCol="stop_token", inputCol="token_text")
count_vec = CountVectorizer(outputCol="count_vector", inputCol="stop_token")
idf = IDF(outputCol="tf_idf", inputCol="count_vector")

In [9]:
# Concatenate the TF-IDF vector and the message length into the single
# `features` column that the classifier consumes.
assembler = VectorAssembler(
    inputCols=["tf_idf", "length"],
    outputCol="features",
)

In [10]:
# Hold out roughly 30% of the rows for evaluation. The seed is fixed so that
# the split -- and with it the fitted model and the reported accuracy -- is
# the same on every Restart & Run All instead of changing from run to run.
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [11]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes classifier with library-default settings. The input columns are
# spelled out (they equal the defaults) to show how it connects to the
# `label` and `features` columns produced by the earlier stages.
nb = NaiveBayes(featuresCol="features", labelCol="label")

In [12]:
from pyspark.ml import Pipeline

# Stage order matters: every stage reads a column written by an earlier one.
pipeline = Pipeline(
    stages=[
        ham_spam_to_numeric,  # class -> label
        tokenizer,            # text -> token_text
        stop_remove,          # token_text -> stop_token
        count_vec,            # stop_token -> count_vector
        idf,                  # count_vector -> tf_idf
        assembler,            # tf_idf + length -> features
        nb,                   # classifier on features / label
    ]
)

In [13]:
# Fit every pipeline stage on the training split only, so nothing learned
# here has seen the held-out rows.
fit_model = pipeline.fit(train)

In [14]:
# Run the fitted pipeline over the held-out rows, then preview the true class
# next to the model's raw scores and predicted label index.
spam_detector = fit_model.transform(test)
spam_detector.select(["class", "rawPrediction", "prediction"]).show()

+-----+--------------------+----------+
|class|       rawPrediction|prediction|
+-----+--------------------+----------+
|  ham|[-297.45587634925...|       0.0|
|  ham|[-842.41883431870...|       0.0|
|  ham|[-554.83512401155...|       0.0|
|  ham|[-517.04317069414...|       0.0|
|  ham|[-836.58888358508...|       0.0|
|  ham|[-571.43789610723...|       0.0|
|  ham|[-517.66967191374...|       0.0|
|  ham|[-2082.8073645667...|       0.0|
|  ham|[-245.59046332496...|       0.0|
|  ham|[-712.29223627013...|       0.0|
|  ham|[-111.89973510411...|       0.0|
|  ham|[-81.741935983939...|       0.0|
|  ham|[-557.73448859723...|       0.0|
|  ham|[-202.79518397735...|       0.0|
|  ham|[-1309.4629041877...|       0.0|
|  ham|[-861.19711905600...|       0.0|
|  ham|[-259.40927507588...|       0.0|
|  ham|[-195.18392489014...|       0.0|
|  ham|[-222.15531620025...|       0.0|
|  ham|[-1264.9244872376...|       0.0|
+-----+--------------------+----------+
only showing top 20 rows



In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator()

# The metric is overridden for this call only via the param map; the
# evaluator object itself keeps its default configuration.
test_accuracy = evaluator.evaluate(
    spam_detector,
    {evaluator.metricName: "accuracy"},
)
print("Test Accuracy: " + str(test_accuracy))

Test Accuracy: 0.9775280898876404
