The following code performs multi-class text classification using PySpark.
Article text is preprocessed by removing every character that is not alphanumeric or whitespace.
Tokenizing, stop-word removal and count-vector operations are done with a pipeline.
Training is done on the complete dataset, without any train/test split.
Unknown articles from different topics are collected and used as the test set.
The trained models are used to predict the class of each unknown article.

Predictions are made in the following ways:
1. Logistic Regression using Count Vector Features
2. Logistic Regression using TF-IDF Features
3. Logistic Regression tuned with 5-fold Cross-Validation (Count Vector Features)
4. Naive Bayes
5. Random Forest

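Sample output: evaluation score (MulticlassClassificationEvaluator is used without a metricName, so this is its default metric, F1, rather than plain accuracy)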
1. 0.73
2. 0.75
3. 0.87
4. 0.80
5. 0.93

In [3]:
import os, csv, re

def preProcessData(input_root="/Users/sajidkhan/Desktop/DIC/Lab3/Input/unknown/",
                   output_path='UnknownArticles.csv',
                   keyWords=('blockchain', 'business', 'sports', 'politics'),
                   indices=range(50, 60)):
    """Write the labelled CSV of "unknown" articles.

    For every category in ``keyWords`` and every number in ``indices`` the
    file ``<input_root>/<category>/<category><number>.txt`` is read. Every
    line of that file has all characters other than ASCII letters, digits and
    whitespace removed, and is written as one CSV row whose ``category`` is
    the category (folder) name.

    Called with no arguments it reads the same files and writes the same
    ``UnknownArticles.csv`` (relative to the current working directory) as
    before; the parameters only exist so other locations can be used.

    Args:
        input_root: Directory holding one sub-directory per category.
        output_path: CSV file to create (an existing file is overwritten).
        keyWords: Category names; each is both the sub-directory name and the
            file-name prefix.
        indices: Article numbers appended to the category name to form the
            file name.

    Raises:
        OSError: If an input file cannot be opened or the output file cannot
            be written (a missing article raises ``FileNotFoundError``).
    """
    # Compiled once instead of being re-parsed for every line. The raw string
    # avoids the invalid-escape warning the plain '\s' literal triggers.
    # NOTE(review): whitespace is kept, so a line's trailing newline ends up
    # inside the CSV field -- confirm the downstream CSV reader copes with
    # multi-line fields if the articles ever span several lines.
    non_alnum = re.compile(r'[^A-Za-z0-9\s]+')

    # newline='' lets the csv module control line endings itself, as its
    # documentation requires (otherwise rows gain an extra '\r' on Windows).
    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['text', 'category'])
        writer.writeheader()

        for keyWord in keyWords:
            # The category directory does not depend on the article number.
            total_path = os.path.join(input_root, keyWord)

            for i in indices:
                article_path = os.path.join(total_path, keyWord + str(i) + ".txt")
                # 'with' closes the article even if reading or writing fails.
                with open(article_path, "r") as text_file:
                    for line in text_file:
                        writer.writerow({'text': non_alnum.sub('', line),
                                         'category': keyWord})

In [4]:
# Model Training and Evaluation
# Logistic Regression using Count Vector Features

from pyspark.sql import SQLContext
from pyspark import SparkContext
import pyspark
import pandas as pd

# Attach to the SparkContext that is already running (or start one) and wrap
# it in the SQLContext used for the DataFrame reads below.
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Regenerate UnknownArticles.csv from the raw "unknown" article text files.
# NOTE(review): preProcessData() writes relative to the current working
# directory while testFile below is an absolute path -- confirm the notebook
# is started from the DataAnalysisCode directory so both name the same file.
preProcessData()

trainingFile = "/Users/sajidkhan/Desktop/DIC/Lab3/DataAnalysisCode/ArticleData.csv"
testFile = "/Users/sajidkhan/Desktop/DIC/Lab3/DataAnalysisCode/UnknownArticles.csv"

# Both CSV files are read the same way: first row is the header and the
# column types are inferred from the data.
csvOptions = {'header': 'true', 'inferschema': 'true'}
trainingData = sqlContext.read.format('com.databricks.spark.csv').options(**csvOptions).load(trainingFile)
testData = sqlContext.read.format('com.databricks.spark.csv').options(**csvOptions).load(testFile)

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# Regular-expression tokenizer: splits the "text" column on non-word
# characters (\W) and stores the resulting tokens in "words".
regexTokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="text",
    outputCol="words",
)

# stop words
add_stopwords = ["a", "an", "as", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "aint", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "arent", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by", "cmon", "cs", "came", "can", "cant", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldnt", "course", "currently", "definitely", "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing", "dont", "done", "down", "downwards", "during", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "far", "few", "ff", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent", "having", "he", "hes", "hello", "help", "hence", "her", "here", "heres", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", 
"however", "i", "id", "ill", "im", "ive", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isnt", "it", "itd", "itll", "its", "its", "itself", "just", "keep", "keeps", "kept", "know", "knows", "known", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldnt", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "ts", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", "their", "theirs", "them", "themselves", "then", 
"thence", "there", "theres", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "theyd", "theyll", "theyre", "theyve", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz", "vs", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve", "welcome", "well", "went", "were", "werent", "what", "whats", "whatever", "when", "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "wont", "wonder", "would", "would", "wouldnt", "yes", "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself", "yourselves", "zero"]
# Remove every token listed in add_stopwords from "words"; what is left is
# written to "filtered".
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=add_stopwords,
)

# Bag-of-words counts over the filtered tokens. vocabSize caps the vocabulary
# at 10000 terms and minDF=5 is the document-frequency threshold a term has to
# reach to be kept.
countVectors = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Turn the category strings into the numeric "label" column the classifiers
# are trained on.
label_stringIdx = StringIndexer(inputCol="category", outputCol="label")

# Feature pipeline: tokenize -> drop stop words -> count vectors -> label index.
pipeline = Pipeline(stages=[
    regexTokenizer,
    stopwordsRemover,
    countVectors,
    label_stringIdx,
])

# The pipeline is fitted on the training articles only and then applied to
# both the training and the unknown (test) articles.
pipelineFit = pipeline.fit(trainingData)
trainingDataSet = pipelineFit.transform(trainingData)
testDataSet = pipelineFit.transform(testData)

# No random hold-out split is taken: the whole training file is used for
# fitting and the separately collected unknown articles are used for testing.
print("Training Dataset Count: " + str(trainingDataSet.count()))
print("Test Dataset Count: " + str(testDataSet.count()))

# Logistic regression with elasticNetParam=0 (ridge penalty only).
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingDataSet)
predictions = lrModel.transform(testDataSet)

# Display up to ten rows that were predicted as class 0.0, ordered by the
# probability column in descending order, with cells truncated to 30 chars.
predictedAsZero = predictions.filter(predictions['prediction'] == 0)
predictedAsZero.select(
    "text", "category", "probability", "label", "prediction"
).orderBy("probability", ascending=False).show(n=10, truncate=30)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# No metricName is passed, so the score below is the evaluator's default
# metric (F1 according to the Spark documentation), not plain accuracy.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

Training Dataset Count: 200
Test Dataset Count: 40
+------------------------------+----------+------------------------------+-----+----------+
|                          text|  category|                   probability|label|prediction|
+------------------------------+----------+------------------------------+-----+----------+
|The distance between the Wh...|blockchain|[0.9125202701036145,0.01689...|  0.0|       0.0|
|Instead of perfectly polish...|blockchain|[0.9125202701036145,0.01689...|  0.0|       0.0|
|There are now more ways to ...|blockchain|[0.9125202701036145,0.01689...|  0.0|       0.0|
|The pioneers of social medi...|blockchain|[0.9125202701036145,0.01689...|  0.0|       0.0|
|The internet was born in te...|blockchain|[0.9125202701036145,0.01689...|  0.0|       0.0|
|Major companies have been f...|blockchain|[0.911839091049913,0.016619...|  0.0|       0.0|
|AdvertisementSupported byTh...|blockchain|[0.8901288681191811,0.03274...|  0.0|       0.0|
|AdvertisementSupported byTO.

0.728374480696462

In [5]:
# Logistic Regression using TF-IDF Features

from pyspark.ml.feature import HashingTF, IDF
# Term frequencies are produced by hashing the filtered tokens into 10000
# features, then re-weighted by IDF.
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms

# Same tokenizer / stop-word / label stages as before, with the count
# vectorizer replaced by the two TF-IDF stages.
tfidfStages = [regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx]
pipeline = Pipeline(stages=tfidfStages)
pipelineFit = pipeline.fit(trainingData)

# Overwrites the data sets of the previous cell with TF-IDF features.
trainingDataSet = pipelineFit.transform(trainingData)
testDataSet = pipelineFit.transform(testData)

# Same logistic-regression settings as the count-vector run.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingDataSet)
predictions = lrModel.transform(testDataSet)

# Up to ten rows predicted as class 0.0, ordered by the probability column.
predictedAsZero = predictions.filter(predictions['prediction'] == 0)
predictedAsZero.select(
    "text", "category", "probability", "label", "prediction"
).orderBy("probability", ascending=False).show(n=10, truncate=30)

# Default metric of the evaluator (no metricName is set).
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)


+------------------------------+----------+------------------------------+-----+----------+
|                          text|  category|                   probability|label|prediction|
+------------------------------+----------+------------------------------+-----+----------+
|There are now more ways to ...|blockchain|[0.9327160135051216,0.01505...|  0.0|       0.0|
|The internet was born in te...|blockchain|[0.9327160135051216,0.01505...|  0.0|       0.0|
|Instead of perfectly polish...|blockchain|[0.9325748691113527,0.01514...|  0.0|       0.0|
|The pioneers of social medi...|blockchain|[0.9287856437823444,0.01562...|  0.0|       0.0|
|The distance between the Wh...|blockchain|[0.9286906722930692,0.01688...|  0.0|       0.0|
|Major companies have been f...|blockchain|[0.9273480740926981,0.01699...|  0.0|       0.0|
|AdvertisementSupported byTh...|blockchain|[0.8003762420295648,0.04049...|  0.0|       0.0|
|AdvertisementSupported byTO...|  business|[0.6228966542210546,0.22361...|  1.0|

0.7450501253132833

In [6]:
# Cross-Validation

# Rebuild the count-vector features: the TF-IDF cell reassigned
# trainingDataSet / testDataSet, so they are recomputed here.
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(trainingData)
trainingDataSet = pipelineFit.transform(trainingData)
testDataSet = pipelineFit.transform(testData)

# Base estimator. regParam and elasticNetParam are replaced by the grid
# values during the search; maxIter stays at 20.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Create the evaluator BEFORE handing it to CrossValidator, so this cell no
# longer relies on an `evaluator` variable left behind by an earlier cell.
# No metricName is set, so model selection and the final score both use the
# evaluator's default metric.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Parameter grid for cross-validation: 3 x 3 = 9 candidate settings.
# (maxIter could be searched the same way with
#  .addGrid(lr.maxIter, [10, 20, 50]); feature-stage parameters cannot,
#  because only `lr` is the estimator being tuned here.)
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5])         # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])  # Elastic Net Parameter (Ridge = 0)
             .build())

# 5-fold CrossValidator over the grid.
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cvModel = cv.fit(trainingDataSet)

# Score the best model found by the search on the unknown articles.
predictions = cvModel.transform(testDataSet)
evaluator.evaluate(predictions)

0.8748120300751882

In [7]:
# Naive Bayes
from pyspark.ml.classification import NaiveBayes
# Naive Bayes with smoothing set to 1, trained on whatever trainingDataSet
# currently holds.
# NOTE(review): presumably the count-vector features rebuilt in the
# Cross-Validation cell -- this depends on the cells being run in order.
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingDataSet)
predictions = model.transform(testDataSet)

# Up to ten rows predicted as class 0.0, ordered by the probability column.
predictedAsZero = predictions.filter(predictions['prediction'] == 0)
predictedAsZero.select(
    "text", "category", "probability", "label", "prediction"
).orderBy("probability", ascending=False).show(n=10, truncate=30)

# Default metric of the evaluator (no metricName is set).
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+------------------------------+----------+------------------------------+-----+----------+
|                          text|  category|                   probability|label|prediction|
+------------------------------+----------+------------------------------+-----+----------+
|AdvertisementSupported byTO...|  business|[1.0,1.0810095835387586E-16...|  1.0|       0.0|
|AdvertisementWith Andrew Ro...|blockchain|[1.0,1.6549201130872253E-21...|  0.0|       0.0|
|There are now more ways to ...|blockchain|[1.0,7.13836643613608E-33,1...|  0.0|       0.0|
|Instead of perfectly polish...|blockchain|[1.0,7.13836643613608E-33,1...|  0.0|       0.0|
|The internet was born in te...|blockchain|[1.0,7.13836643613608E-33,1...|  0.0|       0.0|
|The pioneers of social medi...|blockchain|[1.0,7.13836643613608E-33,1...|  0.0|       0.0|
|The distance between the Wh...|blockchain|[1.0,7.13836643613608E-33,1...|  0.0|       0.0|
|Major companies have been f...|blockchain|[1.0,6.823462574461964E-33,...|  0.0|

0.800598086124402

In [8]:
# Random Forest

from pyspark.ml.classification import RandomForestClassifier
# Random forest of 100 trees, each limited to depth 4, with 32 bins.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
    maxBins=32,
)

# Train on the training articles, then predict the unknown articles.
rfModel = rf.fit(trainingDataSet)
predictions = rfModel.transform(testDataSet)

# Up to ten rows predicted as class 0.0, ordered by the probability column.
predictedAsZero = predictions.filter(predictions['prediction'] == 0)
predictedAsZero.select(
    "text", "category", "probability", "label", "prediction"
).orderBy("probability", ascending=False).show(n=10, truncate=30)

# Default metric of the evaluator (no metricName is set).
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+------------------------------+----------+------------------------------+-----+----------+
|                          text|  category|                   probability|label|prediction|
+------------------------------+----------+------------------------------+-----+----------+
|AdvertisementWith Andrew Ro...|blockchain|[0.36567863609579904,0.2930...|  0.0|       0.0|
|Major companies have been f...|blockchain|[0.33481133575670435,0.2348...|  0.0|       0.0|
|Instead of perfectly polish...|blockchain|[0.33481133575670435,0.2348...|  0.0|       0.0|
|The distance between the Wh...|blockchain|[0.33481133575670435,0.2348...|  0.0|       0.0|
|There are now more ways to ...|blockchain|[0.33481133575670435,0.2348...|  0.0|       0.0|
|The internet was born in te...|blockchain|[0.33481133575670435,0.2348...|  0.0|       0.0|
|The pioneers of social medi...|blockchain|[0.33481133575670435,0.2348...|  0.0|       0.0|
|AdvertisementSupported byTh...|blockchain|[0.30819655659471196,0.2625...|  0.0|

0.9261278195488722