In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('word2vec').getOrCreate()

In [2]:
import os.path
import zipfile

if not os.path.exists("unlabeledTrainData.tsv"):
    with zipfile.ZipFile("unlabeledTrainData.tsv.zip", "r") as zip_ref:
        zip_ref.extractall(".")

if not os.path.exists("labeledTrainData.tsv"):
    with zipfile.ZipFile("labeledTrainData.tsv.zip", "r") as zip_ref:
        zip_ref.extractall(".")

if not os.path.exists("testData.tsv"):
    with zipfile.ZipFile("testData.tsv.zip", "r") as zip_ref:
        zip_ref.extractall(".")

In [3]:
unsupervisedTrain = spark.read.csv('./unlabeledTrainData.tsv', sep='\t', header='true', inferSchema='true')
unsupervisedTrain.printSchema()
unsupervisedTrain.show()

root
 |-- id: string (nullable = true)
 |-- review: string (nullable = true)

+-------+--------------------+
|     id|              review|
+-------+--------------------+
| 9999_0|Watching Time Cha...|
|45057_0|I saw this film a...|
|15561_0|Minor Spoilers<br...|
| 7161_0|I went to see thi...|
|43971_0|Yes, I agree with...|
|36495_0|Jennifer Ehle was...|
|49472_0|Amy Poehler is a ...|
|36693_0|A plane carrying ...|
|  316_0|A well made, grit...|
|32454_0|Incredibly dumb a...|
|37128_0|After reading the...|
|19439_0|It's hard to desc...|
|10760_0|Of all the bile-i...|
|15073_0|This is quite an ...|
|33119_0|Being a huge Gary...|
|38735_0|For the most part...|
|12041_0|Ram Gopal Varma d...|
|41565_0|I gave it 2 for s...|
|48612_0|I wanted to watch...|
|17525_0|Che is a good fil...|
+-------+--------------------+
only showing top 20 rows



In [4]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    
    def handle_data(self, d):
        self.fed.append(d)
    
    def get_data(self):
        return ' '.join(self.fed)

In [5]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

class StripHtmlTags(Transformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, stopwords=None):
        super().__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, stopwords=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        def strip_tags(html):
            s = MLStripper()
            s.feed(html)
            return s.get_data()

        t = StringType()
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(strip_tags, t)(in_col))

In [6]:
# Copied from https://stackoverflow.com/a/32337101/512251
import nltk

from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

class NLTKWordPunctTokenizer(Transformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, stopwords=None):
        super().__init__()
        self.stopwords = Param(self, "stopwords", "")
        self._setDefault(stopwords=set())
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, stopwords=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setStopwords(self, value):
        self._paramMap[self.stopwords] = value
        return self

    def getStopwords(self):
        return self.getOrDefault(self.stopwords)

    def _transform(self, dataset):
        stopwords = self.getStopwords()

        def f(s):
            tokens = nltk.tokenize.wordpunct_tokenize(s)
            return [t for t in tokens if t.lower() not in stopwords]

        t = ArrayType(StringType())
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))

In [7]:
from pyspark.ml import Pipeline

colName = 'review'

stripper = StripHtmlTags(inputCol=colName, outputCol='strippedReview')
tokenizer = NLTKWordPunctTokenizer(inputCol='strippedReview', outputCol='tokens')
tokenizationPipeline = Pipeline(stages=[stripper, tokenizer])
unsupervisedTrain = tokenizationPipeline.fit(unsupervisedTrain).transform(unsupervisedTrain)
unsupervisedTrain = unsupervisedTrain.drop('review', 'strippedReview')
unsupervisedTrain.show()

+-------+--------------------+
|     id|              tokens|
+-------+--------------------+
| 9999_0|[Watching, Time, ...|
|45057_0|[I, saw, this, fi...|
|15561_0|[Minor, Spoilers,...|
| 7161_0|[I, went, to, see...|
|43971_0|[Yes, ,, I, agree...|
|36495_0|[Jennifer, Ehle, ...|
|49472_0|[Amy, Poehler, is...|
|36693_0|[A, plane, carryi...|
|  316_0|[A, well, made, ,...|
|32454_0|[Incredibly, dumb...|
|37128_0|[After, reading, ...|
|19439_0|[It, ', s, hard, ...|
|10760_0|[Of, all, the, bi...|
|15073_0|[This, is, quite,...|
|33119_0|[Being, a, huge, ...|
|38735_0|[For, the, most, ...|
|12041_0|[Ram, Gopal, Varm...|
|41565_0|[I, gave, it, 2, ...|
|48612_0|[I, wanted, to, w...|
|17525_0|[Che, is, a, good...|
+-------+--------------------+
only showing top 20 rows



In [8]:
from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=200, seed=42, inputCol="tokens", outputCol="wordVectors")
word2VecModel = word2Vec.fit(unsupervisedTrain)
word2VecModel.getVectors().show()


+------------+--------------------+
|        word|              vector|
+------------+--------------------+
|      Talent|[0.05043964087963...|
|       1910s|[0.00766492448747...|
|   professed|[0.00649873120710...|
|     Priests|[0.00435002613812...|
|          CV|[-0.0500916466116...|
|          Bu|[0.00334370625205...|
|   Mikkelsen|[-0.0234730765223...|
|     Gégauff|[0.10754672437906...|
|    quotient|[0.02462394163012...|
|      Sadler|[0.10671693086624...|
|    incident|[0.15025775134563...|
|     misfire|[0.06377870589494...|
|        buns|[0.04085354879498...|
|precognition|[0.01148595195263...|
|     serious|[0.23976345360279...|
|       brink|[0.14629855751991...|
|   showdowns|[0.03588028252124...|
|       Milch|[0.06159516051411...|
| ferociously|[0.01441349834203...|
|     acronym|[0.07183517515659...|
+------------+--------------------+
only showing top 20 rows



In [9]:
supervisedTrain = spark.read.csv('./labeledTrainData.tsv', sep='\t', header='true', inferSchema='true')
supervisedTrain.printSchema()
supervisedTrain.show()
supervisedTrain.groupby('sentiment').count().show()

root
 |-- id: string (nullable = true)
 |-- sentiment: integer (nullable = true)
 |-- review: string (nullable = true)

+-------+---------+--------------------+
|     id|sentiment|              review|
+-------+---------+--------------------+
| 5814_8|        1|With all this stu...|
| 2381_9|        1|"The Classic War ...|
| 7759_3|        0|The film starts w...|
| 3630_4|        0|It must be assume...|
| 9495_8|        1|Superbly trashy a...|
| 8196_8|        1|I dont know why p...|
| 7166_2|        0|This movie could ...|
|10633_1|        0|I watched this vi...|
|  319_1|        0|A friend of mine ...|
|8713_10|        1|<br /><br />This ...|
| 2486_3|        0|What happens when...|
|6811_10|        1|Although I genera...|
|11744_9|        1|"Mr. Harvey Light...|
| 7369_1|        0|I had a feeling t...|
|12081_1|        0|note to George Li...|
| 3561_4|        0|Stephen King adap...|
| 4489_1|        0|`The Matrix' was ...|
| 3951_2|        0|Ulli Lommel's 198...|
|3304_10|        1|

In [10]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

rf = RandomForestClassifier(labelCol='sentiment', featuresCol='wordVectors')
classificationPipeline = Pipeline(stages=[tokenizationPipeline, word2VecModel, rf])
grid = ParamGridBuilder().addGrid(rf.numTrees, [15, 20, 25, 30]).addGrid(rf.maxDepth, [3, 5]).build()
cv = CrossValidator(estimator=classificationPipeline, 
                    estimatorParamMaps=grid, 
                    evaluator=BinaryClassificationEvaluator(labelCol='sentiment'),
                    numFolds=10)

model = cv.fit(supervisedTrain)
supervisedTrain = model.transform(supervisedTrain)

evaluator = BinaryClassificationEvaluator(labelCol='sentiment')
r = evaluator.evaluate(supervisedTrain)
r

0.8502578879999996

In [11]:
test = spark.read.csv('./testData.tsv', sep='\t', header='true', inferSchema='true')
test.printSchema()
test.show()

root
 |-- id: string (nullable = true)
 |-- review: string (nullable = true)

+--------+--------------------+
|      id|              review|
+--------+--------------------+
|12311_10|Naturally in a fi...|
|  8348_2|This movie is a d...|
|  5828_4|All in all, this ...|
|  7186_2|Afraid of the Dar...|
| 12128_7|A very accurate d...|
|  2913_8|...as valuable as...|
|  4396_1|This has to be on...|
|   395_2|This is one of th...|
| 10616_1|The worst movie i...|
|  9074_9|Five medical stud...|
|  9252_3|'The Mill on the ...|
|  9896_9|I just saw this f...|
|   574_4|"The Love Letter"...|
| 11182_8|Another fantastic...|
| 11656_4|This was included...|
|  2322_4|I'm not really mu...|
|  8703_1|This movie was dr...|
|  7483_1|I don't think I'v...|
| 6007_10|Excellent story-t...|
| 12424_4|I completely forg...|
+--------+--------------------+
only showing top 20 rows



In [12]:
test = model.transform(test)

In [13]:
test.show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|      id|              review|      strippedReview|              tokens|         wordVectors|       rawPrediction|         probability|prediction|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|12311_10|Naturally in a fi...|Naturally in a fi...|[Naturally, in, a...|[0.03897609903848...|[5.74055357343557...|[0.22962214293742...|       1.0|
|  8348_2|This movie is a d...|This movie is a d...|[This, movie, is,...|[0.03354414005419...|[17.6688367243038...|[0.70675346897215...|       0.0|
|  5828_4|All in all, this ...|All in all, this ...|[All, in, all, ,,...|[0.03919219366508...|[12.9978412503189...|[0.51991365001275...|       0.0|
|  7186_2|Afraid of the Dar...|Afraid of the Dar...|[Afraid, of, the,...|[0.02362638613847...|[12.2917913812105.

In [14]:
from pyspark.sql.types import IntegerType

test.select('id', 'prediction')\
    .coalesce(1)\
    .withColumn('sentiment', test['prediction'].cast(IntegerType()))\
    .drop('prediction')\
    .write.csv('prediction.csv', header='true')

AnalysisException: 'path file:/home/jovyan/work/word2vec-nlp-tutorial/prediction.csv already exists.;'