In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer,VectorAssembler,Normalizer,StandardScaler,CountVectorizer,IDF \
,StringIndexer, MinMaxScaler
from pyspark.sql.functions import udf,concat,lit
from pyspark.sql.types import IntegerType
from pyspark.ml.pipeline import Pipeline

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import re

In [2]:
# Creating a Spark Session 
spark=SparkSession.builder.master('local').appName('Hyperparameter Tuning').getOrCreate()

## Reading the data

In [3]:
# Printing the columns and the number of rows
df=spark.read.json('../data/Train_onetag_small.json')
df.persist()

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string]

In [4]:
# Splitting the data into training and testing into 70% training and 30% testing
train,test=df.randomSplit([0.9,0.1],seed=42)

# For larger datasets this can be used to split data into 3 sets i.e. train/dev/test
#train,rest=df.randomSplit([0.7,0.3],seed=42)
#test,val=df.randomSplit([0.5,0,5],seed=42)

## Creating a Pipeline

In [5]:
regexTokenizer=RegexTokenizer(inputCol='Body',outputCol='words',pattern="\\W")
cv=CountVectorizer(inputCol='words',outputCol='TF',vocabSize=1000)
idf=IDF(inputCol='TF',outputCol='features')
indexer=StringIndexer(inputCol='oneTag',outputCol='label')

lr=LogisticRegression(maxIter=10,regParam=0.0,elasticNetParam=0)

pipeline=Pipeline(stages=[regexTokenizer,cv,idf,indexer,lr])

In [6]:
# fitting the underlining model
pmodel=pipeline.fit(train)

In [7]:
# Checking for the result
result=pmodel.transform(test)
result.filter(result.prediction==result.label).count()/result.count()

0.34690997076318175

Hence, we get a accuracy of 34.7% i.e. very less

## Performing Hyperparameter Tuning

In [8]:
paramGrid=ParamGridBuilder() \
            .addGrid(cv.vocabSize,[1000,5000])\
            .addGrid(lr.regParam,[0.0,0.1]).build()

crossval=CrossValidator(estimator=pipeline,
                       estimatorParamMaps=paramGrid,
                       evaluator=MulticlassClassificationEvaluator(),
                       numFolds=3)

In [9]:
cvModel=crossval.fit(train)

In [10]:
cvModel.avgMetrics

[0.30460039219970986,
 0.23267537467422655,
 0.36525309195440986,
 0.28284587842893544]

In [16]:
result=cvModel.transform(test)
result.filter(result.prediction==result.label).count()/result.count()

0.392378263937897

Thus, we can see that hyperparameter tuning improves the performance of our model. Even now the Accuracy is pretty low but is better than the prev case.