In [5]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc =SparkContext()
sqlContext = SQLContext(sc)
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('articles.csv')

In [6]:
data.columns

['article', 'category']

In [7]:
data.show(5)

+--------------------+--------+
|             article|category|
+--------------------+--------+
|AdvertisementSupp...|   Music|
|AdvertisementSupp...|   Music|
|AdvertisementSupp...|   Music|
|AdvertisementSupp...|   Music|
|AdvertisementSupp...|   Music|
+--------------------+--------+
only showing top 5 rows



In [8]:
data.printSchema()

root
 |-- article: string (nullable = true)
 |-- category: string (nullable = true)



In [11]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="article", outputCol="words", pattern="\\W")

# stop words
add_stopwords = ["http","https","amp","rt","t","c","the"] # standard stop words

stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [13]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "article", outputCol = "label")

In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "article", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+--------------------+--------+--------------------+--------------------+--------------------+-----+
|             article|category|               words|            filtered|            features|label|
+--------------------+--------+--------------------+--------------------+--------------------+-----+
|AdvertisementSupp...|   Music|[advertisementsup...|[advertisementsup...|(4084,[0,1,2,3,4,...| 23.0|
|AdvertisementSupp...|   Music|[advertisementsup...|[advertisementsup...|(4084,[0,1,2,3,4,...| 18.0|
|AdvertisementSupp...|   Music|[advertisementsup...|[advertisementsup...|(4084,[0,1,2,3,4,...|  5.0|
|AdvertisementSupp...|   Music|[advertisementsup...|[advertisementsup...|(4084,[0,1,2,3,4,...| 13.0|
|AdvertisementSupp...|   Music|[advertisementsup...|[advertisementsup...|(4084,[0,1,2,3,4,...| 19.0|
+--------------------+--------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [18]:
# set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 141
Test Dataset Count: 59


In [20]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("article", "category") \
    .orderBy("category", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+
|                       article|category|
+------------------------------+--------+
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
+------------------------------+--------+



In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.8587570621468926

In [26]:
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("article", "category") \
    .orderBy("category", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+
|                       article|category|
+------------------------------+--------+
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
+------------------------------+--------+



In [27]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9197740112994349

In [31]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9661016949152541

In [33]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("article", "category") \
    .orderBy("category", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+------------------------------+--------+
|                       article|category|
+------------------------------+--------+
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
+------------------------------+--------+



0.7909604519774012

In [34]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("article", "category") \
    .orderBy("category", ascending=False) \
    .show(n = 10, truncate = 30)
    
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+------------------------------+--------+
|                       article|category|
+------------------------------+--------+
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
|AdvertisementThey had the b...|  Sports|
+------------------------------+--------+



0.6036871840618496