In [1]:
import json
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.feature import HashingTF, Tokenizer, StringIndexer, StopWordsRemover, NGram, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
def read(hdfspath):
    """Load a JSON-lines text file into a two-column DataFrame.

    Every line of the text file at ``hdfspath`` is parsed with
    ``json.loads``; the parsed object's ``'label'`` and ``'text'`` entries
    become the ``label_text`` and ``text`` columns of the result.

    Uses the notebook-global SparkContext ``sc``.
    """
    records = sc.textFile(hdfspath).map(json.loads)
    pairs = records.map(lambda record: (record['label'], record['text']))
    return pairs.toDF(['label_text', 'text'])

In [3]:
# Load the training and development splits through the same reader;
# the development split serves as the held-out test set below.
train_paired, test_paired = map(read, (
    "hdfs://hdfs-mesos/user/simonj/data/train.json",
    "hdfs://hdfs-mesos/user/simonj/data/devel.json",
))

In [15]:
# --- Feature stages -------------------------------------------------------
# String class names -> numeric indices in the "label" column.
labelIndexer = StringIndexer(inputCol="label_text", outputCol="label")
# Raw text -> tokens -> tokens without stop words -> hashed term frequencies.
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
hashingTF = HashingTF(inputCol="filtered", outputCol="features")

# --- Classifier -----------------------------------------------------------
# One-vs-rest reduction over a regularised logistic regression.
lr = LogisticRegression(regParam=0.01, maxIter=20)
ovr = OneVsRest(classifier=lr)

# --- Fit on the training split, score the held-out split ------------------
pipeline = Pipeline(stages=[
    labelIndexer,
    tokenizer,
    remover,
    hashingTF,
    ovr,
])
model = pipeline.fit(train_paired)
result = model.transform(test_paired)

# --- Evaluation -----------------------------------------------------------
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictionAndLabels)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.843155239624
