In [None]:
##  Quiz
* TrainValidationSplit를 이용하여 영화리뷰 긍/부정 예측  
  Estimator pipeline을 테스트(trainRatio = 0.8)
* ParamGridBuilder를 사용하여 Word2Vec의 파라미터 vectorSize를 5,10,20,40으로 바꾸어 정확도를 측정하여 출력
* 정확도는 BinaryClassificationEvaluator를 사용할 것

In [1]:
import findspark

# Keep this ahead of the pyspark import below: findspark is what makes the
# local Spark installation importable in this kernel.
findspark.init()

# create spark session
from pyspark.sql import SparkSession

# "local[*]" runs Spark in-process with one worker thread per available core.
# Plain "local" uses a single worker thread, which makes the Word2Vec /
# LinearSVC grid search later in this notebook needlessly slow.
# If a session already exists in this kernel, getOrCreate() returns it as-is.
spark = SparkSession.builder.appName("my app").master("local[*]").getOrCreate()

# get context from the session
sc = spark.sparkContext

In [12]:
# Load the IMDB review CSV into a DataFrame with columns `text` and `label`.
#   header      -> the first line holds the column names
#   inferSchema -> `label` is typed as integer; without it (e.g. the plain
#                  format('csv')...load(...) form) both columns come back
#                  as strings
#   escape      -> '"' is the escape character inside quoted review text
df = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .option("escape", '"')
    .csv("imdb-review-sentiment.csv")
)

# Preview the first five rows.
df.show(5)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|I grew up (b. 196...|    0|
|When I put this m...|    0|
|Why do people who...|    0|
|Even though I hav...|    0|
|Im a die hard Dad...|    1|
+--------------------+-----+
only showing top 5 rows



In [13]:
# Number of rows parsed from the CSV (sanity check on the load above).
df.count()

40000

In [14]:
# Confirm the inferred column types: `text` as string, `label` as integer.
df.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)



# Define the stopword list

In [15]:
# Lowercase English stopwords passed to RemoveStopWordsAndSpecialCharacters
# further down. The list contains full contractions ("don't") as well as
# their fragments ('don', 't').
# NOTE(review): looks like the NLTK English stopword list — confirm the source.
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# With Pipeline

In [16]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

class RemoveStopWordsAndSpecialCharacters(Transformer, HasInputCol, HasOutputCol):
    """Transformer that drops stopwords and strips non-alphanumeric characters.

    Expects ``inputCol`` to hold a sequence of string tokens and writes an
    array-of-strings column named ``outputCol``. For every token:

    * tokens found in ``stopwords`` are dropped;
    * every character for which ``str.isalnum()`` is false is removed;
    * tokens that end up empty, or that only match a stopword once the
      punctuation is gone (e.g. ``"the,"``), are dropped as well.

    Parameters
    ----------
    inputCol : str, optional
        Name of the column holding the token sequences.
    outputCol : str, optional
        Name of the column to create with the cleaned tokens.
    stopwords : collection of str, optional
        Tokens to remove. Defaults to an empty set.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, stopwords=None):
        super(RemoveStopWordsAndSpecialCharacters, self).__init__()
        # The custom param is declared per instance, with an empty description.
        self.stopwords = Param(self, "stopwords", "")
        self._setDefault(stopwords=set())
        # @keyword_only stashes the keyword arguments of this call on the
        # instance; forward them to the param map.
        kwargs = self._input_kwargs
        self._set(**kwargs)

    def setStopwords(self, value):
        """Set the stopword collection and return ``self`` for chaining."""
        # Go through _set() like __init__ does instead of poking _paramMap.
        self._set(stopwords=value)
        return self

    def getStopwords(self):
        """Return the configured stopwords, or the default if none were set."""
        return self.getOrDefault(self.stopwords)

    def _transform(self, dataset):
        """Return ``dataset`` with the cleaned token column added."""
        # A frozenset gives constant-time membership tests inside the UDF
        # (the param is typically supplied as a list). `or ()` tolerates an
        # explicit stopwords=None.
        stopwords = frozenset(self.getStopwords() or ())

        def f(s):
            # Null rows yield an empty token list instead of failing the job.
            if s is None:
                return []
            cleaned = []
            for token in s:
                # Check the raw token first so contractions such as "don't"
                # are matched before their apostrophe is stripped.
                if token in stopwords:
                    continue
                word = ''.join(e for e in token if e.isalnum())
                # Skip tokens that were pure punctuation, and stopwords that
                # were hidden behind punctuation ("the," -> "the").
                if word and word not in stopwords:
                    cleaned.append(word)
            return cleaned

        t = ArrayType(StringType())
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, Word2Vec, VectorAssembler
from pyspark.ml.classification import LinearSVC

# Stage 1: split the raw review text into tokens.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# Stage 2: drop stopwords and strip special characters (custom transformer).
cleaning = RemoveStopWordsAndSpecialCharacters(inputCol="words", outputCol="clean_words",stopwords=stopwords)
# Stage 3: hashed term-frequency vector of the cleaned tokens.
hashingTF = HashingTF(inputCol="clean_words", outputCol="tf")

preprocessing = Pipeline(stages=[
    tokenizer,
    cleaning,
    hashingTF,
])

# Cache the preprocessed frame. The cleaning stage is a Python UDF, and
# without caching it would be re-evaluated on every pass the tuning run makes
# over the data. Caching also means both splits below are drawn from the same
# materialized rows.
preprocessed_df = preprocessing.fit(df).transform(df).cache()

# 60/40 train/test split with a fixed seed.
train_set, test_set = preprocessed_df.randomSplit([0.6, 0.4], seed=7)

In [24]:
# Word embeddings learned from the cleaned tokens; vectorSize is left at its
# default here because it is the parameter varied by the grid search.
w2v = Word2Vec(
    inputCol="clean_words",
    outputCol="w2v",
    minCount=1,
    maxIter=2,
)

# Concatenate the hashed term frequencies and the Word2Vec output into a
# single "features" column.
asm = VectorAssembler(
    inputCols=[hashingTF.getOutputCol(), w2v.getOutputCol()],
    outputCol="features",
)

# Linear SVM classifier trained against the `label` column.
svm = LinearSVC(labelCol="label")

# Only these three stages are tuned; tokenizing, cleaning and hashing were
# already applied to the data in the preprocessing step.
estimator = Pipeline(stages=[w2v, asm, svm])

# mypipeline = Pipeline(stages=[tokenizer, cleaning, hashingTF, w2v, asm, svm])

In [25]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [27]:
# Evaluator used by the tuner, left on its default settings.
# NOTE(review): the default metric is presumably areaUnderROC rather than
# plain accuracy — confirm against the PySpark docs.
evaluator = BinaryClassificationEvaluator()

# One candidate per Word2Vec vector size.
paramgrid = (
    ParamGridBuilder()
    .addGrid(w2v.vectorSize, [5, 10, 20, 40])
    .build()
)

# trainRatio=0.8: 80% of the data passed to fit() is used for training and
# the rest for validation. collectSubModels=True keeps every fitted candidate
# so each one can be inspected after fitting.
tvs = TrainValidationSplit(
    estimator=estimator,
    estimatorParamMaps=paramgrid,
    evaluator=evaluator,
    trainRatio=0.8,
    seed=46,
    collectSubModels=True,
)

In [28]:
# Fit the TrainValidationSplit on the training split; the fitted result is
# kept in `tvsmodel`. This is the expensive cell of the notebook — the
# recorded run was interrupted (KeyboardInterrupt) before it finished.
tvsmodel=tvs.fit(train_set)

KeyboardInterrupt: 

In [None]:
# Test-set accuracy of every candidate model, in sub-model order.
acc = []

# The fitted TrainValidationSplit model is `tvsmodel`; no variable named
# `model` is defined anywhere in this notebook.
for i, subModel in enumerate(tvsmodel.subModels):
    print(i)
    test_pred = subModel.transform(test_set)
    N = test_pred.count()

    # Accuracy = share of rows whose predicted class equals the label.
    correct = test_pred.filter(test_pred["prediction"] == test_pred["label"]).count()

    # Guard against an empty test split instead of raising ZeroDivisionError.
    acc.append(correct / N if N else float("nan"))

In [None]:
# Take the vector sizes from the param grid itself rather than repeating the
# literal list, so the labels stay correct if the grid is changed.
vector_sizes = [param_map[w2v.vectorSize] for param_map in paramgrid]

for _acc, param in zip(acc, vector_sizes):
    print(f"Accuracy when vector size is {param}: {_acc}")