## Text Classifier in PySpark

- Use PySpark ML to solve a text classification problem: labelling SMS messages as spam or ham

In [1]:
import os

# Set JAVA_HOME before the SparkSession is created below; presumably this makes
# Spark use this JDK 8 install. NOTE(review): the path is machine-specific.
JAVA_HOME_DIR = '/usr/local/opt/openjdk@8'
os.environ.update({"JAVA_HOME": JAVA_HOME_DIR})

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one
# with default settings.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()



In [3]:
# Read the tab-separated SMS file. It has no header row, so Spark auto-names
# the columns _c0 and _c1 (visible in the preview below).
# An earlier variant read a comma-separated, latin1-encoded "spam.csv" with a header.
df = spark.read.csv(
    "SMSSpamCollection",
    sep="\t",
    header=False,
    inferSchema=True,
)
df.show(5)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [4]:
# Dropping extra columns (_c2.._c4) is not needed here: the loaded frame only
# has _c0 and _c1. Just preview the first rows again.
df.show(n=5)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [5]:
# Give the auto-named columns meaningful names: _c0 -> status, _c1 -> message.
# (The previous variant renamed v1/v2 instead, presumably for another input file.)
column_renames = {'_c0': 'status', '_c1': 'message'}
for old_name, new_name in column_renames.items():
    df = df.withColumnRenamed(old_name, new_name)
df.show(5)

+------+--------------------+
|status|             message|
+------+--------------------+
|   ham|Go until jurong p...|
|   ham|Ok lar... Joking ...|
|  spam|Free entry in 2 a...|
|   ham|U dun say so earl...|
|   ham|Nah I don't think...|
+------+--------------------+
only showing top 5 rows



In [6]:
# Encode the target with Spark SQL: status "ham" becomes 1.0, every other
# status (i.e. spam) becomes 0. The status column is replaced by `label`.
df.createOrReplaceTempView('temp')
label_query = 'select case status when "ham" then 1.0  else 0 end as label, message from temp'
df = spark.sql(label_query)
df.show(5)

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  1.0|Go until jurong p...|
|  1.0|Ok lar... Joking ...|
|  0.0|Free entry in 2 a...|
|  1.0|U dun say so earl...|
|  1.0|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



In [7]:
# Class balance: number of messages per label value.
label_counts = df.groupBy('label').count()
label_counts.show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|  747|
|  1.0| 4827|
+-----+-----+



In [8]:
from pyspark.ml.feature import  RegexTokenizer

# Split every message on non-word characters; the resulting token list goes
# into a new "words" column (tokens come out lowercased, see the preview below).
tokenizer = RegexTokenizer(
    pattern='[^\\w]',
    inputCol="message",
    outputCol="words",
)
wordsData = tokenizer.transform(df)
wordsData.show()

+-----+--------------------+--------------------+
|label|             message|               words|
+-----+--------------------+--------------------+
|  1.0|Go until jurong p...|[go, until, juron...|
|  1.0|Ok lar... Joking ...|[ok, lar, joking,...|
|  0.0|Free entry in 2 a...|[free, entry, in,...|
|  1.0|U dun say so earl...|[u, dun, say, so,...|
|  1.0|Nah I don't think...|[nah, i, don, t, ...|
|  0.0|FreeMsg Hey there...|[freemsg, hey, th...|
|  1.0|Even my brother i...|[even, my, brothe...|
|  1.0|As per your reque...|[as, per, your, r...|
|  0.0|WINNER!! As a val...|[winner, as, a, v...|
|  0.0|Had your mobile 1...|[had, your, mobil...|
|  1.0|I'm gonna be home...|[i, m, gonna, be,...|
|  0.0|SIX chances to wi...|[six, chances, to...|
|  0.0|URGENT! You have ...|[urgent, you, hav...|
|  1.0|I've been searchi...|[i, ve, been, sea...|
|  1.0|I HAVE A DATE ON ...|[i, have, a, date...|
|  0.0|XXXMobileMovieClu...|[xxxmobilemoviecl...|
|  1.0|Oh k...i'm watchi...|[oh, k, i, m, wat...|


In [9]:
# Number of tokenized messages. The full dataset is used; the 100-row limit
# (wordsData.limit(100)) for quick experiments stays disabled.
n_messages = wordsData.count()
n_messages

5574

## Apply CountVectorizer

- CountVectorizer converts the list of tokens above to vectors of token counts
- https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.CountVectorizer.html

In [11]:
from pyspark.ml.feature import CountVectorizer

# Fit a vocabulary on the tokenized messages, then encode each message as a
# sparse vector of token counts in the "features" column.
# NOTE(review): the vocabulary is fit on the full dataset, i.e. before the
# train/test split performed later in the notebook.
count_vec = CountVectorizer(outputCol="features", inputCol="words")
model = count_vec.fit(wordsData)
featurizedData = model.transform(wordsData)
featurizedData.show()

+-----+--------------------+--------------------+--------------------+
|label|             message|               words|            features|
+-----+--------------------+--------------------+--------------------+
|  1.0|Go until jurong p...|[go, until, juron...|(8748,[7,52,60,61...|
|  1.0|Ok lar... Joking ...|[ok, lar, joking,...|(8748,[5,48,338,4...|
|  0.0|Free entry in 2 a...|[free, entry, in,...|(8748,[1,3,7,17,2...|
|  1.0|U dun say so earl...|[u, dun, say, so,...|(8748,[5,25,59,11...|
|  1.0|Nah I don't think...|[nah, i, don, t, ...|(8748,[0,1,23,64,...|
|  0.0|FreeMsg Hey there...|[freemsg, hey, th...|(8748,[0,1,2,6,11...|
|  1.0|Even my brother i...|[even, my, brothe...|(8748,[1,8,9,10,2...|
|  1.0|As per your reque...|[as, per, your, r...|(8748,[1,12,13,53...|
|  0.0|WINNER!! As a val...|[winner, as, a, v...|(8748,[1,2,3,15,1...|
|  0.0|Had your mobile 1...|[had, your, mobil...|(8748,[1,4,5,12,1...|
|  1.0|I'm gonna be home...|[i, m, gonna, be,...|(8748,[0,1,6,23,2...|
|  0.0

In [16]:
# Print the first 8 feature vectors in full (no column truncation).
feature_column = featurizedData.select('features')
feature_column.show(8, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                                                                                        |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(8748,[7,52,60,61,69,92,127,138,150,343,472,654,742,877,1373,1413,1427,3000,7571,8201],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                                                     

In [15]:
# First ten vocabulary entries; a term's position in this list is its index
# in the feature vectors.
first_terms = model.vocabulary[:10]
print(first_terms)

['i', 'to', 'you', 'a', 'the', 'u', 'and', 'in', 'is', 'me']


#### As an example, 'to' has vocabulary index 1; it appears 3 times in sentence 3 and 1 time in sentence 7 (both shown below)
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


Even my brother is not like to speak with me. They treat me like aids patent.

In [61]:
# Keep only the two columns the classifier needs.
model_columns = ('label', 'features')
featurizedData = featurizedData.select(*model_columns)
featurizedData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(8748,[7,52,60,61...|
|  1.0|(8748,[5,48,338,4...|
|  0.0|(8748,[1,3,7,17,2...|
|  1.0|(8748,[5,25,59,11...|
|  1.0|(8748,[0,1,23,64,...|
+-----+--------------------+
only showing top 5 rows



In [81]:
# Fixed seed so the 80/20 train/test split is repeatable.
seed = 10
split_weights = [0.8, 0.2]
trainDF, testDF = featurizedData.randomSplit(split_weights, seed=seed)

In [82]:
# Peek at the first rows of the training split.
trainDF.show(n=5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(8748,[0,1,2,3,6,...|
|  0.0|(8748,[0,1,2,3,13...|
|  0.0|(8748,[0,1,2,3,13...|
|  0.0|(8748,[0,1,2,3,14...|
|  0.0|(8748,[0,1,2,5,6,...|
+-----+--------------------+
only showing top 5 rows



In [83]:
# Number of rows in the training split.
n_train = trainDF.count()
n_train

4451

In [84]:
# Number of rows in the held-out test split.
n_test = testDF.count()
n_test

1123

In [85]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np

# Base estimator; regParam and elasticNetParam are tuned by cross-validation.
lr = LogisticRegression(maxIter = 10)

# 10 x 6 = 60 candidate settings. `.tolist()` converts the NumPy scalars to
# plain Python floats before they are handed to Spark params.
paramGrid_lr = ParamGridBuilder() \
    .addGrid(lr.regParam, np.linspace(0.3, 0.01, 10).tolist()) \
    .addGrid(lr.elasticNetParam, np.linspace(0.3, 0.8, 6).tolist()) \
    .build()

# 5-fold cross-validation scored with BinaryClassificationEvaluator's default
# metric. Passing the same `seed` used for the train/test split pins the fold
# assignment; without it the folds (and so the selected model) are not
# reproducible.
crossval_lr = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid_lr,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds= 5,
                          seed=seed)
cvModel_lr = crossval_lr.fit(trainDF)

# NOTE: despite its name, this holds the best model's training *summary*
# (later cells read its `.predictions`), not the model object itself.
best_model_lr = cvModel_lr.bestModel.summary

In [87]:
# Predictions stored on the best model's training summary.
training_predictions = best_model_lr.predictions
training_predictions.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(8748,[0,1,2,3,6,...|[-0.4863393904709...|[0.38075629205915...|       1.0|
|  0.0|(8748,[0,1,2,3,13...|[1.83750308503984...|[0.86265313398684...|       0.0|
|  0.0|(8748,[0,1,2,3,13...|[1.83750308503984...|[0.86265313398684...|       0.0|
|  0.0|(8748,[0,1,2,3,14...|[1.05195635570505...|[0.74115039618105...|       0.0|
|  0.0|(8748,[0,1,2,5,6,...|[2.61536902487082...|[0.93184417792660...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [88]:
# Score the held-out split with the best model found by cross-validation
# (CrossValidatorModel.transform delegates to this same best model).
predictions_lr = cvModel_lr.bestModel.transform(testDF)

In [89]:
# Peek at the first test-set predictions.
predictions_lr.show(n=5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(8748,[0,1,2,3,6,...|[3.00633005917547...|[0.95285927988722...|       0.0|
|  0.0|(8748,[0,1,2,3,13...|[1.83750308503984...|[0.86265313398684...|       0.0|
|  0.0|(8748,[0,1,2,4,7,...|[1.13082750118948...|[0.75599157917591...|       0.0|
|  0.0|(8748,[0,1,2,9,10...|[-0.0607815723814...|[0.48480928333143...|       1.0|
|  0.0|(8748,[0,1,3,6,10...|[-0.2273395500029...|[0.44340863836306...|       1.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



#### The `probability` column contains the pair $(P_{spam}, P_{ham})$, i.e. the predicted probabilities of label 0 (spam) and label 1 (ham)

In [90]:
# Confusion-matrix counts: one row per (true label, predicted label) pair.
confusion_counts = predictions_lr.groupBy('label', 'prediction').count()
confusion_counts.show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       1.0|   23|
|  1.0|       1.0|  960|
|  0.0|       0.0|  138|
|  1.0|       0.0|    2|
+-----+----------+-----+



In [91]:
# Sanity check: the four confusion-matrix counts add up to the test-set size.
sum((23, 960, 138, 2))

1123