<a href="https://colab.research.google.com/github/rklepov/hse-cs-ml-2018-2019/blob/master/08-spark/03-ml/MinHash.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[***Lecture 3 slides***](https://hackmd.io/@J_qqq0PjTGK1be0341GpYA/S1-yiOj-X#/ "Spark MLLib - HackMD"): https://hackmd.io/@J_qqq0PjTGK1be0341GpYA/S1-yiOj-X#/

___

[MinHash - Wikipedia](https://en.wikipedia.org/wiki/MinHash "MinHash - Wikipedia")

In [1]:
!pip search spark | grep INSTALLED || pip install pyspark==2.4.0 findspark

  INSTALLED: 2.4.0


In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre/"

import findspark
findspark.init('/usr/local/lib/python3.6/dist-packages/pyspark/')

import pyspark

In [0]:
from zipfile import ZipFile
from io import BytesIO
import urllib.request

import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def download(url):
    ZipFile.extractall(
        ZipFile(
            BytesIO(
                urllib
                .request
                .urlopen(url,context=ctx)
                .read()
            )
        ),
    )

[SMS Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection "UCI Machine Learning Repository: SMS Spam Collection Data Set")

In [0]:
download('https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip')

In [0]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [0]:
sms = spark.read.option('sep', '\t').csv('SMSSpamCollection')

In [0]:
src = sms.withColumnRenamed('_c0', 'label').withColumnRenamed('_c1', 'text')

In [8]:
src.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  ham| 4827|
| spam|  747|
+-----+-----+



In [0]:
from pyspark.ml import feature
from pyspark.ml import pipeline

In [0]:
lsh_pipe = pipeline.Pipeline(
    stages =(
        feature.RegexTokenizer(
            inputCol='text',
            outputCol='tokens',
        ),
        feature.CountVectorizer(
            binary=True,
            inputCol='tokens',
            outputCol='v',
        ),
    )
)

In [11]:
lsh_prep_model = lsh_pipe.fit(src)

lsh_src = lsh_prep_model.transform(src)

lsh_src.show(5)

+-----+--------------------+--------------------+--------------------+
|label|                text|              tokens|                   v|
+-----+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|(13586,[8,41,51,6...|
|  ham|Ok lar... Joking ...|[ok, lar..., joki...|(13586,[5,74,407,...|
| spam|Free entry in 2 a...|[free, entry, in,...|(13586,[0,3,8,20,...|
|  ham|U dun say so earl...|[u, dun, say, so,...|(13586,[5,21,59,1...|
|  ham|Nah I don't think...|[nah, i, don't, t...|(13586,[0,1,65,86...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [12]:
mh = feature.MinHashLSH(inputCol='v', outputCol='hash', numHashTables=10)

mh_model = mh.fit(lsh_src)

mh_model.transform(lsh_src).show(5)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|                text|              tokens|                   v|                hash|
+-----+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|(13586,[8,41,51,6...|[[3.1678372E7], [...|
|  ham|Ok lar... Joking ...|[ok, lar..., joki...|(13586,[5,74,407,...|[[1.27324576E8], ...|
| spam|Free entry in 2 a...|[free, entry, in,...|(13586,[0,3,8,20,...|[[6.657488E7], [1...|
|  ham|U dun say so earl...|[u, dun, say, so,...|(13586,[5,21,59,1...|[[1.27324576E8], ...|
|  ham|Nah I don't think...|[nah, i, don't, t...|(13586,[0,1,65,86...|[[1.2288481E7], [...|
+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [0]:
similar = mh_model.approxSimilarityJoin(lsh_src, lsh_src, threshold=0.7)

In [14]:
similar.show()

+--------------------+--------------------+------------------+
|            datasetA|            datasetB|           distCol|
+--------------------+--------------------+------------------+
|[ham, Cool. So ho...|[ham, Cool. So ho...|               0.0|
|[ham, Mm that tim...|[ham, Mm that tim...|               0.0|
|[ham, En chikku n...|[ham, En chikku n...|               0.0|
|[ham, Yeah there'...|[ham, Yeah there'...|               0.0|
|[ham, Ok.ok ok..t...|[ham, Ok.ok ok..t...|               0.0|
|[ham, Yo, the gam...|[ham, Yo, the gam...|               0.0|
|[ham, Aiyo a bit ...|[ham, Aiyo a bit ...|               0.0|
|[ham, I love work...|[ham, I love work...|               0.0|
|[ham, Its hard to...|[ham, Its hard to...|               0.0|
|[ham, Waaaat?? Lo...|[ham, Waaaat?? Lo...|               0.0|
|[spam, Spook up y...|[spam, Sppok up u...|0.4137931034482759|
|[ham, * Am on a t...|[ham, * Am on a t...|               0.0|
|[ham, Sorry de i ...|[ham, Sorry de i ...|            