In [1]:
# Read data - text classification data
data = spark.read.csv("./TextClassificationDataset.csv", header=True, inferSchema=True)
data = data.withColumnRenamed("Label", "Classification")

In [2]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="Text", outputCol="words", pattern="\\W")
# stop words
add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "Classification", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+--------------------+--------------+--------------------+--------------------+--------------------+-----+
|                Text|Classification|               words|            filtered|            features|label|
+--------------------+--------------+--------------------+--------------------+--------------------+-----+
|what expression w...|     translate|[what, expression...|[what, expression...|(1840,[0,2,3,4,40...|102.0|
|can you tell me h...|     translate|[can, you, tell, ...|[can, you, tell, ...|(1840,[0,2,4,7,9,...|102.0|
|what is the equiv...|     translate|[what, is, the, e...|[what, is, equiva...|(1840,[3,8,13,14,...|102.0|
|tell me how to sa...|     translate|[tell, me, how, t...|[tell, me, how, t...|(1840,[2,5,7,8,9,...|102.0|
|if i were mongoli...|     translate|[if, i, were, mon...|[if, i, were, mon...|(1840,[0,5,9,29,4...|102.0|
+--------------------+--------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [4]:
# set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 13733
Test Dataset Count: 5767


In [5]:
lr = LogisticRegression(maxIter=40, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 1) \
    .select("Text","Classification","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 100, truncate = 40)

+----------------------------------------+--------------+----------------------------------------+-----+----------+
|                                    Text|Classification|                             probability|label|prediction|
+----------------------------------------+--------------+----------------------------------------+-----+----------+
|whats the status of my south west flight| flight_status|[0.007197387319391339,0.0549051554374...|  1.0|       1.0|
|    whats the status of my united flight| flight_status|[0.007088975605568832,0.0817409749374...|  1.0|       1.0|
|     update me on my delta flight please| flight_status|[0.006686828385876014,0.0381956623770...|  1.0|       1.0|
|              is my delta flight on time| flight_status|[0.006626312730341516,0.0392783278078...|  1.0|       1.0|
|whats the most recent status for my v...| flight_status|[0.006487724311273607,0.0514797655034...|  1.0|       1.0|
|            what is the eta of my flight| flight_status|[0.006349686992

In [6]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.8531565907866342

In [7]:
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=40, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 1) \
    .select("Text","Classification","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 100, truncate = 40)

+----------------------------------------+--------------+----------------------------------------+-----+----------+
|                                    Text|Classification|                             probability|label|prediction|
+----------------------------------------+--------------+----------------------------------------+-----+----------+
|whats the status of my south west flight| flight_status|[0.007086461446404349,0.0543508884472...|  1.0|       1.0|
|    whats the status of my united flight| flight_status|[0.0069771676409775985,0.080934074919...|  1.0|       1.0|
|     update me on my delta flight please| flight_status|[0.006715135328207017,0.0337231000025...|  1.0|       1.0|
|              is my delta flight on time| flight_status|[0.006525126786736954,0.0388905249869...|  1.0|       1.0|
|whats the most recent status for my v...| flight_status|[0.006469844877149016,0.0514574823905...|  1.0|       1.0|
|            what is the eta of my flight| flight_status|[0.006341687562

In [8]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.8439374691058514