In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession, HiveContext
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover

In [2]:
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

In [4]:
#data = spark.read.format('csv') \
#                .option('delimiter', "\t") \
#                .load('SMSSpamCollection').toDF("Spam","Message")
#data.show()
#print(f'Record count is: {data.count()}')

data = spark.read.format('csv') \
                .option('delimiter', "\t") \
                .load("hdfs://nameservice1/user/edureka_1118556/SMSSpamCollection").toDF("Spam","Message")
data.show()
print(f'Record count is: {data.count()}')


IllegalArgumentException: java.net.UnknownHostException: nameservice1

In [65]:
tokenizer = Tokenizer(inputCol="Message", outputCol="words")

regexTokenizer = RegexTokenizer(inputCol="Message", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)

#countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(data)
tokenized.select("Message", "words")\
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(data)
regexTokenized.select("Message", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
# $example off$

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|Message                                                                                                                                                                                             |words                                                                                                                                                                                                                                     |tokens|
+---------------------------------------------------------------------------------------------------------------------

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|Message                                                                                                                                                                                             |words                                                                                                                                                                                                                                   |tokens|
+-------------------------------------------------------------------------------------------------------------------------

In [66]:
regexTokenized.show()

+----+--------------------+--------------------+
|Spam|             Message|               words|
+----+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|
| ham|Ok lar... Joking ...|[ok, lar, joking,...|
|spam|Free entry in 2 a...|[free, entry, in,...|
| ham|U dun say so earl...|[u, dun, say, so,...|
| ham|Nah I don't think...|[nah, i, don, t, ...|
|spam|FreeMsg Hey there...|[freemsg, hey, th...|
| ham|Even my brother i...|[even, my, brothe...|
| ham|As per your reque...|[as, per, your, r...|
|spam|WINNER!! As a val...|[winner, as, a, v...|
|spam|Had your mobile 1...|[had, your, mobil...|
| ham|I'm gonna be home...|[i, m, gonna, be,...|
|spam|SIX chances to wi...|[six, chances, to...|
|spam|URGENT! You have ...|[urgent, you, hav...|
| ham|I've been searchi...|[i, ve, been, sea...|
| ham|I HAVE A DATE ON ...|[i, have, a, date...|
|spam|XXXMobileMovieClu...|[xxxmobilemoviecl...|
| ham|Oh k...i'm watchi...|[oh, k, i, m, wat...|
| ham|Eh u remember 

In [123]:
stopwords = StopWordsRemover().getStopWords() + ["-"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(regexTokenized)

In [81]:
type(cleaned)

pyspark.sql.dataframe.DataFrame

In [82]:
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="words",outputCol="CountVector",vocabSize=3,minDF=2.0)
model = cv.fit(cleaned)
result = model.transform(cleaned)
result.show()

+----+--------------------+--------------------+--------------------+--------------------+
|Spam|             Message|               words|            filtered|         CountVector|
+----+--------------------+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|           (3,[],[])|
| ham|Ok lar... Joking ...|[ok, lar, joking,...|[ok, lar, joking,...|           (3,[],[])|
|spam|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|       (3,[1],[3.0])|
| ham|U dun say so earl...|[u, dun, say, so,...|[u, dun, say, ear...|           (3,[],[])|
| ham|Nah I don't think...|[nah, i, don, t, ...|[nah, think, goes...| (3,[0,1],[1.0,1.0])|
|spam|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg, hey, da...|(3,[0,1,2],[1.0,2...|
| ham|Even my brother i...|[even, my, brothe...|[even, brother, l...|       (3,[1],[1.0])|
| ham|As per your reque...|[as, per, your, r...|[per, request, me...|       (3,[1],[1.0])|

In [103]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)
spam_indexer = StringIndexer(inputCol='Spam',outputCol='SpamIndex')
spam_encoder = OneHotEncoder(inputCol='SpamIndex',outputCol='SpamVec')
assembler = VectorAssembler(inputCols=['CountVector'],outputCol='features')
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline
log_reg_titanic = LogisticRegression(featuresCol='features',labelCol='SpamIndex')
randomforest = RandomForestClassifier(featuresCol='features',labelCol='SpamIndex')
pipeline = Pipeline(stages=[spam_indexer,spam_encoder,assembler,log_reg_titanic])
pipeline_rf = Pipeline(stages=[spam_indexer,spam_encoder,assembler,randomforest])
train_data , test_data = result.randomSplit([0.7,0.3])
fit_model = pipeline.fit(train_data)
fit_model_rf = pipeline_rf.fit(train_data)

In [104]:
results = fit_model.transform(test_data)
results_rf = fit_model_rf.transform(test_data)

In [99]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [96]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='SpamIndex')
results.select('SpamIndex', 'prediction').show()

+---------+----------+
|SpamIndex|prediction|
+---------+----------+
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
+---------+----------+
only showing top 20 rows



In [98]:
AUC = my_eval.evaluate(results)
AUC

0.5847812510897028

In [106]:
my_eval_rf = MulticlassClassificationEvaluator(labelCol="SpamIndex", predictionCol="prediction", metricName="accuracy")
accuracy_rf = my_eval_rf.evaluate(results_rf)
print("Accuracy = %g" % accuracy_rf)

Accuracy = 0.882494


In [113]:
from pyspark.ml.feature import NGram
ngram = NGram(n=2, inputCol="words", outputCol="bigrams")
bigramdata = ngram.transform(regexTokenized)

In [121]:
bigramdata.show()

+----+--------------------+--------------------+--------------------+
|Spam|             Message|               words|             bigrams|
+----+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go until, until ...|
| ham|Ok lar... Joking ...|[ok, lar, joking,...|[ok lar, lar joki...|
|spam|Free entry in 2 a...|[free, entry, in,...|[free entry, entr...|
| ham|U dun say so earl...|[u, dun, say, so,...|[u dun, dun say, ...|
| ham|Nah I don't think...|[nah, i, don, t, ...|[nah i, i don, do...|
|spam|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg hey, hey...|
| ham|Even my brother i...|[even, my, brothe...|[even my, my brot...|
| ham|As per your reque...|[as, per, your, r...|[as per, per your...|
|spam|WINNER!! As a val...|[winner, as, a, v...|[winner as, as a,...|
|spam|Had your mobile 1...|[had, your, mobil...|[had your, your m...|
| ham|I'm gonna be home...|[i, m, gonna, be,...|[i m, m gonna, go...|
|spam|SIX chances to

In [124]:
stopwordsbigram = StopWordsRemover().getStopWords() + ["-"]
removerbigrams = StopWordsRemover().setStopWords(stopwordsbigram).setInputCol("words").setOutputCol("bigramsfiltered")
cleanedbigrams = removerbigrams.transform(bigramdata)

In [149]:
cv_bigram = CountVectorizer(inputCol="bigrams",outputCol="features",vocabSize=3,minDF=2.0)
model = cv_bigram.fit(cleanedbigrams)
resultbigram = model.transform(cleanedbigrams)
resultbigram.show()

+----+--------------------+--------------------+--------------------+--------------------+-------------+
|Spam|             Message|               words|             bigrams|     bigramsfiltered|     features|
+----+--------------------+--------------------+--------------------+--------------------+-------------+
| ham|Go until jurong p...|[go, until, juron...|[go until, until ...|[go, jurong, poin...|    (3,[],[])|
| ham|Ok lar... Joking ...|[ok, lar, joking,...|[ok lar, lar joki...|[ok, lar, joking,...|    (3,[],[])|
|spam|Free entry in 2 a...|[free, entry, in,...|[free entry, entr...|[free, entry, 2, ...|    (3,[],[])|
| ham|U dun say so earl...|[u, dun, say, so,...|[u dun, dun say, ...|[u, dun, say, ear...|    (3,[],[])|
| ham|Nah I don't think...|[nah, i, don, t, ...|[nah i, i don, do...|[nah, think, goes...|    (3,[],[])|
|spam|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg hey, hey...|[freemsg, hey, da...|    (3,[],[])|
| ham|Even my brother i...|[even, my, brothe...|[even m

In [153]:
ham_spam_encoded_data = StringIndexer(inputCol="Spam",outputCol="label").fit(resultbigram).transform(resultbigram)
ham_spam_encoded_data.show()

+----+--------------------+--------------------+--------------------+--------------------+-------------+-----+
|Spam|             Message|               words|             bigrams|     bigramsfiltered|     features|label|
+----+--------------------+--------------------+--------------------+--------------------+-------------+-----+
| ham|Go until jurong p...|[go, until, juron...|[go until, until ...|[go, jurong, poin...|    (3,[],[])|  0.0|
| ham|Ok lar... Joking ...|[ok, lar, joking,...|[ok lar, lar joki...|[ok, lar, joking,...|    (3,[],[])|  0.0|
|spam|Free entry in 2 a...|[free, entry, in,...|[free entry, entr...|[free, entry, 2, ...|    (3,[],[])|  1.0|
| ham|U dun say so earl...|[u, dun, say, so,...|[u dun, dun say, ...|[u, dun, say, ear...|    (3,[],[])|  0.0|
| ham|Nah I don't think...|[nah, i, don, t, ...|[nah i, i don, do...|[nah, think, goes...|    (3,[],[])|  0.0|
|spam|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg hey, hey...|[freemsg, hey, da...|    (3,[],[])|  1.0|
|

In [154]:
training, test = ham_spam_encoded_data.randomSplit([0.7, 0.3], seed = 12345)
training.show()

+----+--------------------+--------------------+--------------------+--------------------+-------------+-----+
|Spam|             Message|               words|             bigrams|     bigramsfiltered|     features|label|
+----+--------------------+--------------------+--------------------+--------------------+-------------+-----+
| ham| &lt;#&gt;  in mc...|[lt, gt, in, mca,...|[lt gt, gt in, in...|[lt, gt, mca, con...|(3,[1],[1.0])|  0.0|
| ham| &lt;#&gt;  mins ...|[lt, gt, mins, bu...|[lt gt, gt mins, ...|[lt, gt, mins, st...|(3,[1],[1.0])|  0.0|
| ham| &lt;DECIMAL&gt; ...|[lt, decimal, gt,...|[lt decimal, deci...|[lt, decimal, gt,...|(3,[2],[1.0])|  0.0|
| ham| and  picking the...|[and, picking, th...|[and picking, pic...|[picking, various...|    (3,[],[])|  0.0|
| ham| came to look at ...|[came, to, look, ...|[came to, to look...|[came, look, flat...|    (3,[],[])|  0.0|
| ham| gonna let me kno...|[gonna, let, me, ...|[gonna let, let m...|[gonna, let, know...|    (3,[],[])|  0.0|
|

In [156]:
lr_bigrams = LogisticRegression()
model_fit = lr_bigrams.fit(training)
predictions = model_fit.transform(test)

In [162]:
evaluate_bigrams = BinaryClassificationEvaluator()
accuracy = evaluate_bigrams.evaluate(predictions)
accuracy

0.5708676973250963