In [1]:
from zipfile import ZipFile
from io import BytesIO
import urllib.request

import ssl

# SSL context handed to urlopen() inside download().
# NOTE(review): the two assignments below switch off hostname checking and
# certificate validation, so the HTTPS download is not authenticated. Confirm
# this is intentional; drop both lines if the server certificate is valid.
ctx = ssl.create_default_context()
# Order matters: the ssl module rejects verify_mode = CERT_NONE while
# check_hostname is still enabled.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE


def download(url, path=None):
    """Download the zip archive at ``url`` and extract all of its members.

    Args:
        url: Address of the zip archive; passed straight to
            ``urllib.request.urlopen`` together with the module-level ``ctx``.
        path: Directory to extract into. ``None`` (the default) extracts
            into the current working directory, which is what this function
            always did before the parameter existed.

    Returns:
        None. The files written to disk are the only result.

    Raises:
        urllib.error.URLError: if the URL cannot be opened.
        zipfile.BadZipFile: if the downloaded bytes are not a zip archive.
    """
    # Close the response as soon as the body has been read instead of
    # leaving the connection to the garbage collector.
    with urllib.request.urlopen(url, context=ctx) as response:
        payload = response.read()

    # ZipFile needs a seekable file object, hence the in-memory BytesIO
    # wrapper; the `with` block releases the archive handle afterwards.
    with ZipFile(BytesIO(payload)) as archive:
        archive.extractall(path)


In [2]:
# Fetch the SMS Spam Collection archive from the UCI repository; download()
# unpacks it into the current working directory.
download(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
)

In [3]:
# Print the `readme` file (presumably unpacked from the archive downloaded
# above) to see how the corpus is described.
!cat readme

SMS Spam Collection v.1
-------------------------

1. DESCRIPTION
--------------

The SMS Spam Collection v.1 (hereafter the corpus) is a set of SMS tagged messages that have been collected for SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam. 

1.1. Compilation
----------------

This corpus has been collected from free or free for research sources at the Web:

- A collection of between 425 SMS spam messages extracted manually from the Grumbletext Web site. This is a UK forum in which cell phone users make public claims about SMS spam messages, most of them without reporting the very spam message received. The identification of the text of spam messages in the claims is a very hard and time-consuming task, and it involved carefully scanning hundreds of web pages. The Grumbletext Web site is: http://www.grumbletext.co.uk/
- A list of 450 SMS ham messages collected from Caroline

In [4]:
import findspark

# Called before `import pyspark` on purpose: findspark.init() presumably
# locates the local Spark installation and makes pyspark importable —
# confirm against the findspark documentation before reordering.
findspark.init()
import pyspark

In [5]:
# Session used by every Spark cell below; getOrCreate() hands back an
# already-running session when there is one instead of starting another.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [6]:
# Load the tab-separated corpus. No header or schema is supplied, so the
# columns get Spark's automatic names (`_c0`, `_c1`), renamed in the next cell.
# NOTE(review): the CSV reader's default quote handling may alter messages
# that contain double quotes — TODO confirm against the raw file.
sms = spark.read.csv("SMSSpamCollection", sep="\t")

In [7]:
# Give the two auto-named columns meaningful names: the class label
# ("ham" / "spam") and the message text.
src = (
    sms
    .withColumnRenamed("_c0", "label")
    .withColumnRenamed("_c1", "text")
)
src.show()

+-----+--------------------+
|label|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



In [8]:
# Class balance: number of messages per label.
(
    src
    .groupBy("label")
    .count()
    .show()
)

+-----+-----+
|label|count|
+-----+-----+
|  ham| 4827|
| spam|  747|
+-----+-----+



In [9]:
from pyspark.ml import feature

In [10]:
# Preview the tokens the plain Tokenizer produces for each message
# (exploratory only; nothing is assigned).
(
    feature
    .Tokenizer(inputCol="text", outputCol="tokens")
    .transform(src)
    .show()
)

+-----+--------------------+--------------------+
|label|                text|              tokens|
+-----+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|
|  ham|Ok lar... Joking ...|[ok, lar..., joki...|
| spam|Free entry in 2 a...|[free, entry, in,...|
|  ham|U dun say so earl...|[u, dun, say, so,...|
|  ham|Nah I don't think...|[nah, i, don't, t...|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|
|  ham|Even my brother i...|[even, my, brothe...|
|  ham|As per your reque...|[as, per, your, r...|
| spam|WINNER!! As a val...|[winner!!, as, a,...|
| spam|Had your mobile 1...|[had, your, mobil...|
|  ham|I'm gonna be home...|[i'm, gonna, be, ...|
| spam|SIX chances to wi...|[six, chances, to...|
| spam|URGENT! You have ...|[urgent!, you, ha...|
|  ham|I've been searchi...|[i've, been, sear...|
|  ham|I HAVE A DATE ON ...|[i, have, a, date...|
| spam|XXXMobileMovieClu...|[xxxmobilemoviecl...|
|  ham|Oh k...i'm watchi...|[oh, k...i'm, wat...|


In [11]:
from pyspark.ml import classification

In [12]:
from pyspark.ml import pipeline

# Spam classifier pipeline: tokenise the text, turn tokens into count
# vectors, index the string label, then fit a random forest.
main = pipeline.Pipeline(
    stages=(
        # Tokenise with a whitespace pattern and drop tokens shorter than
        # three characters.
        feature.RegexTokenizer(
            minTokenLength=3,
            inputCol="text",
            # Raw string: "\s" is not a valid escape in a normal Python
            # string literal and triggers an invalid-escape warning.
            # The value passed to Spark is unchanged.
            pattern=r"\s+",
            outputCol="tokens",
        ),
        # Bag-of-words counts; minDF / maxDF bound the number of documents a
        # term may appear in for it to be kept in the vocabulary.
        feature.CountVectorizer(
            inputCol="tokens",
            outputCol="v",
            minDF=5,
            maxDF=900
        ),
        # Map the string label to a numeric column "y" (presumably the more
        # frequent label, ham, becomes 0.0 — verify against the output).
        feature.StringIndexer(inputCol="label", outputCol="y"),
        # Fixed seed so repeated fits build the same forest.
        classification.RandomForestClassifier(
            seed=123,
            labelCol="y",
            featuresCol="v",
        )
    )
)




In [13]:
# 70/30 train/test split with a fixed seed, then fit the whole pipeline on
# the training part only.
train, test = src.randomSplit(seed=123, weights=(70., 30.))
main_model = main.fit(train)

# Score the held-out part and keep only the label plus the model outputs.
# Cached because several later cells query this frame.
results = main_model.transform(test).select(
    "y", "rawPrediction", "probability", "prediction"
).cache()

results.show()

+---+--------------------+--------------------+----------+
|  y|       rawPrediction|         probability|prediction|
+---+--------------------+--------------------+----------+
|0.0|[17.4577708451969...|[0.87288854225984...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.6961900509745...|[0.88480950254872...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.

In [14]:
from pyspark.sql import functions

# Predictions sorted by the probability column in ascending order
# (presumably this surfaces the rows with the lowest class-0 probability
# first — the output is consistent with that).
results.orderBy(functions.asc("probability")).show()

+---+--------------------+--------------------+----------+
|  y|       rawPrediction|         probability|prediction|
+---+--------------------+--------------------+----------+
|1.0|[6.02774777924674...|[0.30138738896233...|       1.0|
|1.0|[7.05938187799983...|[0.35296909389999...|       1.0|
|1.0|[7.06069250555023...|[0.35303462527751...|       1.0|
|1.0|[7.28622884676247...|[0.36431144233812...|       1.0|
|1.0|[7.54685095248554...|[0.37734254762427...|       1.0|
|1.0|[8.41448111855782...|[0.42072405592789...|       1.0|
|1.0|[8.46720221661500...|[0.42336011083075...|       1.0|
|1.0|[8.63567962692463...|[0.43178398134623...|       1.0|
|1.0|[8.67324089890456...|[0.43366204494522...|       1.0|
|1.0|[9.30519987922656...|[0.46525999396132...|       1.0|
|1.0|[9.36938491669584...|[0.46846924583479...|       1.0|
|1.0|[9.36938491669584...|[0.46846924583479...|       1.0|
|1.0|[9.36938491669584...|[0.46846924583479...|       1.0|
|1.0|[9.54773915954144...|[0.47738695797707...|       1.

In [15]:
# Same frame in descending order of the probability column, i.e. the
# opposite end of the ranking shown by the ascending sort.
results.orderBy("probability", ascending=False).show()

+---+--------------------+--------------------+----------+
|  y|       rawPrediction|         probability|prediction|
+---+--------------------+--------------------+----------+
|1.0|[18.0893416907218...|[0.90446708453609...|       0.0|
|0.0|[18.0893416907218...|[0.90446708453609...|       0.0|
|1.0|[18.0893416907218...|[0.90446708453609...|       0.0|
|0.0|[18.0893416907218...|[0.90446708453609...|       0.0|
|1.0|[18.0893416907218...|[0.90446708453609...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.0|
|0.0|[17.9915243908026...|[0.89957621954013...|       0.

In [16]:
from pyspark.ml import evaluation

# Evaluate the held-out predictions against label column "y". No metricName
# is given, so the evaluator's default metric is used (areaUnderROC per the
# PySpark documentation).
(
    evaluation
    .BinaryClassificationEvaluator(labelCol="y")
    .evaluate(results)
)

0.9314047828132821

In [68]:
from pyspark.ml import pipeline

# Feature pipeline for the near-duplicate search: tokenise each message and
# encode it as a binary term vector in column "v".
lsh_pipe = pipeline.Pipeline(
    stages=(
        feature.Tokenizer(inputCol="text", outputCol="tokens"),
        # binary=True records presence/absence of a term, not its count.
        feature.CountVectorizer(inputCol="tokens", outputCol="v", binary=True),
    )
)


In [71]:
# Fit the feature pipeline on the full corpus (no train/test split here) ...
lsh_prep_model = lsh_pipe.fit(src)

# ... and add the "tokens" and "v" columns to every message.
lsh_src = lsh_prep_model.transform(src)
lsh_src.show()

+-----+--------------------+--------------------+--------------------+
|label|                text|              tokens|                   v|
+-----+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|(13587,[8,42,52,6...|
|  ham|Ok lar... Joking ...|[ok, lar..., joki...|(13587,[5,75,411,...|
| spam|Free entry in 2 a...|[free, entry, in,...|(13587,[0,3,8,20,...|
|  ham|U dun say so earl...|[u, dun, say, so,...|(13587,[5,22,60,1...|
|  ham|Nah I don't think...|[nah, i, don't, t...|(13587,[0,1,66,87...|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|(13587,[0,2,6,10,...|
|  ham|Even my brother i...|[even, my, brothe...|(13587,[0,7,9,13,...|
|  ham|As per your reque...|[as, per, your, r...|(13587,[0,10,11,4...|
| spam|WINNER!! As a val...|[winner!!, as, a,...|(13587,[0,2,3,14,...|
| spam|Had your mobile 1...|[had, your, mobil...|(13587,[0,4,5,10,...|
|  ham|I'm gonna be home...|[i'm, gonna, be, ...|(13587,[0,1,6,32,...|
| spam

In [74]:
# MinHash LSH over the binary term vectors in "v"; the hashes go to "hash".
# The seed is pinned (same value as the classifier and the train/test split)
# so the randomly drawn hash functions, and with them the approximate join
# results, are repeatable across kernel restarts.
mh = feature.MinHashLSH(inputCol="v", outputCol="hash", seed=123)
mh_model = mh.fit(lsh_src)

In [75]:
# Approximate self-join of the corpus: keep message pairs whose distance
# (Jaccard distance for MinHash, reported in "distCol") is below 0.7.
similar = mh_model.approxSimilarityJoin(
    datasetA=lsh_src,
    datasetB=lsh_src,
    threshold=0.7,
)

In [76]:
# Each result row carries the two matched rows as structs (datasetA,
# datasetB) plus their distance. Rows with distCol 0.0 are presumably
# messages matched with themselves or with exact duplicates.
similar.show()

+--------------------+--------------------+-------------------+
|            datasetA|            datasetB|            distCol|
+--------------------+--------------------+-------------------+
|[spam, PRIVATE! Y...|[spam, PRIVATE! Y...| 0.6206896551724138|
|[ham, Of cos can ...|[ham, Of cos can ...|                0.0|
|[ham, I luv u soo...|[ham, I luv u soo...|                0.0|
|[ham, Dude ive be...|[ham, Dude ive be...|                0.0|
|[ham, Better. Mad...|[ham, Better. Mad...|                0.0|
|[ham, Hey i booke...|[ham, Hey i booke...|                0.0|
|[ham, Just buy a ...|[ham, Just buy a ...|                0.0|
|[ham, I don know ...|[ham, I don know ...|                0.0|
|[ham, You call hi...|[ham, I asked you...|0.33333333333333337|
|[ham, Come to me,...|[ham, Come to me,...|                0.0|
|[ham, My planning...|[ham, My planning...|                0.0|
|[ham, How did you...|[ham, How did you...|                0.0|
|[ham, We are both...|[ham, We are both.

Выведем найденные похожие (но не одинаковые) тексты.

In [91]:
# Print up to 100 matched pairs whose texts differ, one pair per entry with
# a separator line between entries.
print(
    *(
        f"{left} <= похож на => {right}"
        for left, right in (
            similar
            .where("datasetA.text != datasetB.text")
            .rdd
            .map(lambda row: (row["datasetA"]["text"], row["datasetB"]["text"]))
            .take(100)
        )
    ),
    sep="\n===========\n",
)

PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 Expires <= похож на => PRIVATE! Your 2003 Account Statement for shows 800 un-redeemed S. I. M. points. Call 08715203694 Identifier Code: 40533 Expires 31/10/04
You call him now ok i said call him <= похож на => I asked you to call him now ok
Goodmorning, today i am late for 1hr. <= похож на => Goodmorning today i am late for  &lt;DECIMAL&gt; min.
Todays Voda numbers ending 5226 are selected to receive a ?350 award. If you hava a match please call 08712300220 quoting claim code 1131 standard rates app  <= похож на => Todays Voda numbers ending 1225 are selected to receive a £50award. If you have a match please call 08712300220 quoting claim code 3100 standard rates app 
Ok can... <= похож на => Ok lor...
Sir, Waiting for your mail. <= похож на => Sir, waiting for your letter.
December only! Had your mobile 11mths+? You are entitled to update to the lat