In [6]:
import findspark
findspark.init('/home/shashank/spark')

In [7]:
import pyspark

In [8]:
from pyspark.sql import SparkSession

In [9]:
spark = SparkSession.builder.appName('ca').getOrCreate()

In [10]:
from pyspark.ml.feature import (HashingTF, IDF, CountVectorizer, Tokenizer, RegexTokenizer,
                                    StopWordsRemover, NGram, StringIndexer)

# build a spam detection param

In [11]:
data = spark.read.csv('smsspamcollection/SMSSpamCollection', inferSchema=True, sep = '\t')

In [12]:
data.show()

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [13]:
data.createOrReplaceTempView('data')

In [14]:
data2 = spark.sql("FROM data SELECT _c0 AS class_spam, _c1 AS text")

In [15]:
data2.show()

+----------+--------------------+
|class_spam|                text|
+----------+--------------------+
|       ham|Go until jurong p...|
|       ham|Ok lar... Joking ...|
|      spam|Free entry in 2 a...|
|       ham|U dun say so earl...|
|       ham|Nah I don't think...|
|      spam|FreeMsg Hey there...|
|       ham|Even my brother i...|
|       ham|As per your reque...|
|      spam|WINNER!! As a val...|
|      spam|Had your mobile 1...|
|       ham|I'm gonna be home...|
|      spam|SIX chances to wi...|
|      spam|URGENT! You have ...|
|       ham|I've been searchi...|
|       ham|I HAVE A DATE ON ...|
|      spam|XXXMobileMovieClu...|
|       ham|Oh k...i'm watchi...|
|       ham|Eh u remember how...|
|       ham|Fine if thats th...|
|      spam|England v Macedon...|
+----------+--------------------+
only showing top 20 rows



In [16]:
from pyspark.sql.functions import length

In [17]:
data2.createOrReplaceTempView('data2')

In [18]:
data3 = spark.sql("FROM data2 SELECT class_spam, text, LENGTH(text) AS len")

In [19]:
data3.show()

+----------+--------------------+---+
|class_spam|                text|len|
+----------+--------------------+---+
|       ham|Go until jurong p...|111|
|       ham|Ok lar... Joking ...| 29|
|      spam|Free entry in 2 a...|155|
|       ham|U dun say so earl...| 49|
|       ham|Nah I don't think...| 61|
|      spam|FreeMsg Hey there...|147|
|       ham|Even my brother i...| 77|
|       ham|As per your reque...|160|
|      spam|WINNER!! As a val...|157|
|      spam|Had your mobile 1...|154|
|       ham|I'm gonna be home...|109|
|      spam|SIX chances to wi...|136|
|      spam|URGENT! You have ...|155|
|       ham|I've been searchi...|196|
|       ham|I HAVE A DATE ON ...| 35|
|      spam|XXXMobileMovieClu...|149|
|       ham|Oh k...i'm watchi...| 26|
|       ham|Eh u remember how...| 81|
|       ham|Fine if thats th...| 56|
|      spam|England v Macedon...|155|
+----------+--------------------+---+
only showing top 20 rows



In [20]:
data3.createOrReplaceTempView('data3')

In [21]:
spark.sql("FROM data3 SELECT class_spam, AVG(len) GROUP BY class_spam").show()

+----------+-----------------+
|class_spam|         avg(len)|
+----------+-----------------+
|       ham|71.45431945307645|
|      spam|138.6706827309237|
+----------+-----------------+



In [22]:
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')

In [23]:
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')

In [24]:
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')

In [25]:
idf = IDF(inputCol='c_vec', outputCol='tf_idf')

In [26]:
ham_spam_to_numeric = StringIndexer(inputCol='class_spam', outputCol='label')

In [27]:
from pyspark.ml.feature import VectorAssembler

In [28]:
clean_up = VectorAssembler(inputCols=['tf_idf', 'len'], outputCol='features')

In [29]:
from pyspark.ml.classification import NaiveBayes

In [30]:
nb = NaiveBayes() #use all defaults

In [31]:
from pyspark.ml import Pipeline

In [32]:
data_prep_pipe = Pipeline(stages=[ham_spam_to_numeric, tokenizer, stop_remove, count_vec, idf, clean_up])

In [33]:
clean_data = data_prep_pipe.fit(data3).transform(data3)

In [34]:
clean_data.show()

+----------+--------------------+---+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|class_spam|                text|len|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+----------+--------------------+---+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|       ham|Go until jurong p...|111|  0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|(13424,[7,11,31,6...|
|       ham|Ok lar... Joking ...| 29|  0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,297,...|(13423,[0,24,297,...|(13424,[0,24,297,...|
|      spam|Free entry in 2 a...|155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|(13424,[2,13,19,3...|
|       ham|U dun say so earl...| 49|  0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|

In [35]:
train, test = clean_data.randomSplit([0.7,0.3])

In [36]:
spam_detector = nb.fit(train)

In [37]:
test_results = spam_detector.transform(test)

In [38]:
test_results.show()

+----------+--------------------+---+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|class_spam|                text|len|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|       rawPrediction|         probability|prediction|
+----------+--------------------+---+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|       ham| &lt;#&gt;  mins ...| 51|  0.0|[, &lt;#&gt;, , m...|[, &lt;#&gt;, , m...|(13423,[3,6,41,20...|(13423,[3,6,41,20...|(13424,[3,6,41,20...|[-298.50954793164...|[1.0,4.2601326582...|       0.0|
|       ham| what number do u...| 36|  0.0|[, what, number, ...|[, number, u, liv...|(13423,[0,3,86,19...|(13423,[0,3,86,19...|(13424,[0,3,86,19...|[-309.71982965587...|[0.99999973791163...|  

In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [40]:
acc_eval = MulticlassClassificationEvaluator()

In [41]:
accuracy = acc_eval.evaluate(test_results)

In [42]:
print("Accuracy")
print(accuracy)

Accuracy
0.9275291002810677


# Random Forest

In [43]:
from pyspark.ml.classification import RandomForestClassifier

In [44]:
rf_model = RandomForestClassifier()

In [45]:
spam_detector_rf = rf_model.fit(train)

In [46]:
test_results_rf = spam_detector_rf.transform(test)

In [47]:
accuracy_rf = acc_eval.evaluate(test_results_rf)

In [48]:
print("Accuracy - Random Forest")
print(accuracy_rf)

Accuracy - Random Forest
0.8021539530555728


**Check out other classifiers in the classification module and play around with parameters to see what gives you the best prediction.**
