In [1]:
# Read data - text classification data
data = spark.read.csv("./TextClassificationDataset.csv", header=True, inferSchema=True)
data = data.withColumnRenamed("Label", "Classification")

In [2]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="Text", outputCol="words", pattern="\\W")
# stop words
add_stopwords = [",","-",".",";",":"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "Classification", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+--------------------+--------------+--------------------+--------------------+--------------------+-----+
|                Text|Classification|               words|            filtered|            features|label|
+--------------------+--------------+--------------------+--------------------+--------------------+-----+
|what expression w...|     translate|[what, expression...|[what, expression...|(1842,[0,2,4,5,41...|102.0|
|can you tell me h...|     translate|[can, you, tell, ...|[can, you, tell, ...|(1842,[0,2,5,8,10...|102.0|
|what is the equiv...|     translate|[what, is, the, e...|[what, is, the, e...|(1842,[3,4,9,14,1...|102.0|
|tell me how to sa...|     translate|[tell, me, how, t...|[tell, me, how, t...|(1842,[2,6,8,9,10...|102.0|
|if i were mongoli...|     translate|[if, i, were, mon...|[if, i, were, mon...|(1842,[0,6,10,30,...|102.0|
+--------------------+--------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [4]:
# set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 13733
Test Dataset Count: 5767


In [5]:
lr = LogisticRegression(maxIter=40, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 1) \
    .select("Text","Classification","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 100, truncate = 40)

+----------------------------------------+--------------+----------------------------------------+-----+----------+
|                                    Text|Classification|                             probability|label|prediction|
+----------------------------------------+--------------+----------------------------------------+-----+----------+
|whats the status of my south west flight| flight_status|[0.00717505357902084,0.05515959991180...|  1.0|       1.0|
|    whats the status of my united flight| flight_status|[0.007065416424952154,0.0821106614627...|  1.0|       1.0|
|     update me on my delta flight please| flight_status|[0.006686975868151713,0.0381456358099...|  1.0|       1.0|
|              is my delta flight on time| flight_status|[0.006626949886437967,0.0392403961407...|  1.0|       1.0|
|whats the most recent status for my v...| flight_status|[0.0064641092483681125,0.051690768302...|  1.0|       1.0|
|            what is the eta of my flight| flight_status|[0.006326603411

In [6]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.8534873189767498

In [7]:
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=40, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 1) \
    .select("Text","Classification","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 100, truncate = 40)

+----------------------------------------+--------------+----------------------------------------+-----+----------+
|                                    Text|Classification|                             probability|label|prediction|
+----------------------------------------+--------------+----------------------------------------+-----+----------+
|whats the status of my south west flight| flight_status|[0.007073738834939095,0.0545944934405...|  1.0|       1.0|
|    whats the status of my united flight| flight_status|[0.006963401679874309,0.0812816918545...|  1.0|       1.0|
|     update me on my delta flight please| flight_status|[0.006723841409649237,0.0336838989898...|  1.0|       1.0|
|              is my delta flight on time| flight_status|[0.006534512775481058,0.0388507520476...|  1.0|       1.0|
|whats the most recent status for my v...| flight_status|[0.006454225424673015,0.0516583560124...|  1.0|       1.0|
|            what is the eta of my flight| flight_status|[0.006328723942

In [8]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.8447271389358195