## Satvinder Singh Panesar

In [1]:
# Code in Python 3
# Note: Please follow the execution sequence to avoid errors

### Titanic Data Analysis

In [1]:
# Titanic Data Code taken from 
# https://creativedata.atlassian.net/wiki/spaces/SAP/pages/83237142/Pyspark+-+Tutorial+based+on+Titanic+Dataset
# Import packages
import findspark
findspark.init()
import time
import pyspark
import os
import csv
from numpy import array
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf

# Creating Spark environment
os.environ["HADOOP_USER_NAME"] = "hdfs"
os.environ["PYTHON_VERSION"] = "3.5.2"
conf = pyspark.SparkConf()
sc = pyspark.SparkContext(conf=conf)
conf.getAll()

dict_items([])

### Titanic using mllib module

In [3]:
# Reading from the hdfs, removing the header
trainTitanic = sc.textFile("train.csv")
trainHeader = trainTitanic.first()
trainTitanic = trainTitanic.filter(lambda line: line != trainHeader).mapPartitions(lambda x: csv.reader(x))
trainTitanic.first()
 
# Data preprocessing
def sexTransformMapper(elem):
    '''Function which transform "male" into 1 and else things into 0
    - elem : string
    - return : vector
    '''
     
    if elem == 'male' :
        return [0]
    else :
        return [1]
    
# Data Transformations and filter lines with empty strings
trainTitanic=trainTitanic.map(lambda line: line[1:3]+sexTransformMapper(line[4])+line[5:11])
trainTitanic=trainTitanic.filter(lambda line: line[3] != '' ).filter(lambda line: line[4] != '' )
trainTitanic.take(10)
 
# creating "labeled point" rdds specific to MLlib "(label (v1, v2...vp])"
trainTitanicLP=trainTitanic.map(lambda line: LabeledPoint(line[0],[line[1:5]]))
trainTitanicLP.first()
 
# splitting dataset into train and test set
(trainData, testData) = trainTitanicLP.randomSplit([0.7, 0.3])
 
# Random forest : same parameters as sklearn (?)
from pyspark.mllib.tree import RandomForest
 
time_start=time.time()
model_rf = RandomForest.trainClassifier(trainData, numClasses = 2,
        categoricalFeaturesInfo = {}, numTrees = 100,
        featureSubsetStrategy='auto', impurity='gini', maxDepth=12,
        maxBins=32, seed=None) 
  
model_rf.numTrees()
model_rf.totalNumNodes()
time_end=time.time()
time_rf=(time_end - time_start)
print("RF takes %d s" %(time_rf))
 
# Predictions on test set
predictions = model_rf.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
 
# first metrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics
metrics = BinaryClassificationMetrics(labelsAndPredictions)
 
# Area under precision-recall curve
print("Area under PR = %s" % metrics.areaUnderPR)
 
# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)

RF takes 7 s
Area under PR = 0.5888035357255603
Area under ROC = 0.7889872717210846


### Titanic using sql dataframe and ml module

In [4]:
# Import packages
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoder, VectorAssembler, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import *
 
# Creatingt Spark SQL environment
from pyspark.sql import SparkSession, HiveContext
SparkContext.setSystemProperty("hive.metastore.uris", "thrift://nn1:9083")
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
 
# spark is an existing SparkSession
train = spark.read.csv("train.csv", header = True)
# Displays the content of the DataFrame to stdout
train.show(10)
 
# String to float on some columns of the dataset : creates a new dataset
train = train.select(col("Survived"),col("Sex"),col("Embarked"),col("Pclass").cast("float"),col("Age").cast("float"),col("SibSp").cast("float"),col("Fare").cast("float"))
 
# dropping null values
train = train.dropna()
 
# Spliting in train and test set. Beware : It sorts the dataset
(traindf, testdf) = train.randomSplit([0.7,0.3])

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|  38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|  26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|  35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|  35|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

### Without pipeline

In [5]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
train = StringIndexer(inputCol="Sex", outputCol="indexedSex").fit(train).transform(train)
train = StringIndexer(inputCol="Embarked", outputCol="indexedEmbarked").fit(train).transform(train)
 
train = StringIndexer(inputCol="Survived", outputCol="indexedSurvived").fit(train).transform(train)
 
# One Hot Encoder on indexed features
train = OneHotEncoder(inputCol="indexedSex", outputCol="sexVec").transform(train)
train = OneHotEncoder(inputCol="indexedEmbarked", outputCol="embarkedVec").transform(train)
 
# Feature assembler as a vector
train = VectorAssembler(inputCols=["Pclass","sexVec","embarkedVec", "Age","SibSp","Fare"],outputCol="features").transform(train)
 
rf = RandomForestClassifier(labelCol="indexedSurvived", featuresCol="features")
 
model = rf.fit(train)
 
predictions = model.transform(train)
 
# Select example rows to display.
predictions.select(col("prediction"),col("probability"),).show(5)

+----------+--------------------+
|prediction|         probability|
+----------+--------------------+
|       0.0|[0.89184996904177...|
|       1.0|[0.04310468712642...|
|       1.0|[0.46799611398021...|
|       1.0|[0.04279690462562...|
|       0.0|[0.88765386496603...|
+----------+--------------------+
only showing top 5 rows



### With Pipeline

In [6]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
genderIndexer = StringIndexer(inputCol="Sex", outputCol="indexedSex")
embarkIndexer = StringIndexer(inputCol="Embarked", outputCol="indexedEmbarked")
 
surviveIndexer = StringIndexer(inputCol="Survived", outputCol="indexedSurvived")
 
# One Hot Encoder on indexed features
genderEncoder = OneHotEncoder(inputCol="indexedSex", outputCol="sexVec")
embarkEncoder = OneHotEncoder(inputCol="indexedEmbarked", outputCol="embarkedVec")
 
# Create the vector structured data (label,features(vector))
assembler = VectorAssembler(inputCols=["Pclass","sexVec","Age","SibSp","Fare","embarkedVec"],outputCol="features")
 
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedSurvived", featuresCol="features")
 
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[surviveIndexer, genderIndexer, embarkIndexer, genderEncoder,embarkEncoder, assembler, rf]) # genderIndexer,embarkIndexer,genderEncoder,embarkEncoder,
 
# Train model.  This also runs the indexers.
model = pipeline.fit(traindf)
 
# Predictions
predictions = model.transform(testdf)
 
# Select example rows to display.
predictions.columns 
 
# Select example rows to display.
predictions.select("prediction", "Survived", "features").show(5)
 
# Select (prediction, true label) and compute test error
predictions = predictions.select(col("Survived").cast("Float"),col("prediction"))
evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
 
rfModel = model.stages[6]
print(rfModel)  # summary only
 
evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)
 
evaluatorf1 = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="f1")
f1 = evaluatorf1.evaluate(predictions)
print("f1 = %g" % f1)
 
evaluatorwp = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="weightedPrecision")
wp = evaluatorwp.evaluate(predictions)
print("weightedPrecision = %g" % wp)
 
evaluatorwr = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="weightedRecall")
wr = evaluatorwr.evaluate(predictions)
print("weightedRecall = %g" % wr)
 
# close sparkcontext
sc.stop()

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|[1.0,0.0,50.0,0.0...|
|       0.0|       0|[3.0,0.0,17.0,0.0...|
|       1.0|       0|(7,[0,2,4],[3.0,3...|
|       0.0|       0|(7,[0,2,4],[3.0,3...|
|       1.0|       0|[2.0,0.0,26.0,1.0...|
+----------+--------+--------------------+
only showing top 5 rows

Test Error = 0.179724
RandomForestClassificationModel (uid=RandomForestClassifier_4a078bae21c90f530455) with 20 trees
Accuracy = 0.820276
f1 = 0.819051
weightedPrecision = 0.819595
weightedRecall = 0.820276


### Collecting News Articles

In [1]:
collect_now=False

In [2]:
from newsapi import NewsApiClient
import newspaper
import pandas
from newspaper import Article
import time
import os

In [3]:
newsapi = NewsApiClient(api_key='c8b2f3f94b474cadb0b08f0bae0b1b07')

### Collecting articles for training

In [10]:
articles_folder = "news_articles"
if collect_now == True:
    # any new category to be used, is to be added here
    categories = ['business','sports','technology','entertainment']
    date = time.strftime("%Y-%m-%d")
    if not os.path.exists(articles_folder):
        os.makedirs(articles_folder)
    for category in categories:
        if not os.path.exists(articles_folder+"/"+category):
            os.makedirs(articles_folder+"/"+category)
        article_no = 0    
        top_headlines = newsapi.get_top_headlines(q='', category=category, language='en', country='us', page_size=50)
        top_headlines = pandas.DataFrame(top_headlines)
        for ele in top_headlines['articles'].values:
            url = ele['url']
            article=Article(url)
            article.download()
            article.parse()
            article_no = article_no + 1
            file = open(articles_folder+"/"+category+"/"+category+"_"+str(date)+"_"+str(article_no)+".txt","w",encoding="utf-8")
            file.write(article.text)
            file.close()

### Collecting articles for testing i.e. unknown set

In [4]:
articles_folder = "news_articles_unknown"
if collect_now == True:
    date = time.strftime("%Y-%m-%d")
    if not os.path.exists(articles_folder):
        os.makedirs(articles_folder)
    for category in categories:
        if not os.path.exists(articles_folder+"/"+category):
            os.makedirs(articles_folder+"/"+category)
        article_no = 0    
        top_headlines = newsapi.get_top_headlines(q='', category=category, language='en', country='us', page_size=50)
        top_headlines = pandas.DataFrame(top_headlines)
        for ele in top_headlines['articles'].values:
            url = ele['url']
            article=Article(url)
            article.download()
            article.parse()
            article_no = article_no + 1
            file = open(articles_folder+"/"+category+"/"+category+"_"+str(date)+"_"+str(article_no)+".txt","w",encoding="utf-8")
            file.write(article.text)
            file.close()

### Classification of News Articles

In [3]:
import findspark
findspark.init()
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import os

In [4]:
sc = SparkContext('local')
spark = SparkSession(sc)

### Preparing training data

In [5]:
#read files and generate spark data frame
articles_folder = "news_articles"
data = []
for subdir, dirs, files in os.walk(articles_folder):
    for folder_name in dirs:
        # any label associated with a new category, has to be added here
        if folder_name == "business":
            label = 0
        elif folder_name == "entertainment":
            label = 1
        elif folder_name == "sports":
            label = 2
        elif folder_name == "technology":
            label = 3
        for filename in os.listdir(articles_folder+"/"+folder_name):
            filecontent = open(articles_folder+"/"+folder_name+"/"+filename,"r",encoding='utf-8').read()
            temp = (label,filecontent)
            data.append(temp)
            
sentenceData = spark.createDataFrame(data,["label","sentence"])     
sentenceData.show(10)

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|    0|With big jaws to ...|
|    0|CLOSE Amid a city...|
|    0|CLOSE Twitter say...|
|    0|PHILADELPHIA -- P...|
|    0|The fan blade tha...|
|    0|The Volkswagen em...|
|    0|Activision Blizza...|
|    0|Orlando-based Dar...|
|    0|Copyright 2018 Th...|
|    0|Southwest CEO: Ou...|
+-----+--------------------+
only showing top 10 rows



### Cleansing training data

In [6]:
# base code for feature extraction taken from official site
# https://spark.apache.org/docs/2.3.0/ml-features.html
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filteredWordsData = remover.transform(wordsData)
filteredWordsData.show(10)

+-----+--------------------+--------------------+--------------------+
|label|            sentence|               words|            filtered|
+-----+--------------------+--------------------+--------------------+
|    0|With big jaws to ...|[with, big, jaws,...|[big, jaws, feed,...|
|    0|CLOSE Amid a city...|[close, amid, a, ...|[close, amid, cit...|
|    0|CLOSE Twitter say...|[close, twitter, ...|[close, twitter, ...|
|    0|PHILADELPHIA -- P...|[philadelphia, --...|[philadelphia, --...|
|    0|The fan blade tha...|[the, fan, blade,...|[fan, blade, fail...|
|    0|The Volkswagen em...|[the, volkswagen,...|[volkswagen, emis...|
|    0|Activision Blizza...|[activision, bliz...|[activision, bliz...|
|    0|Orlando-based Dar...|[orlando-based, d...|[orlando-based, d...|
|    0|Copyright 2018 Th...|[copyright, 2018,...|[copyright, 2018,...|
|    0|Southwest CEO: Ou...|[southwest, ceo:,...|[southwest, ceo:,...|
+-----+--------------------+--------------------+--------------------+
only s

### Preparing testing data i.e. unknown set

In [17]:
#read files and generate spark data frame
articles_folder = "news_articles_unknown"
data = []
for subdir, dirs, files in os.walk(articles_folder):
    for folder_name in dirs:
        # any label associated with a new category, has to be added here
        if folder_name == "business":
            label = 0
        elif folder_name == "entertainment":
            label = 1
        elif folder_name == "sports":
            label = 2
        elif folder_name == "technology":
            label = 3
        for filename in os.listdir(articles_folder+"/"+folder_name):
            filecontent = open(articles_folder+"/"+folder_name+"/"+filename,"r",encoding='utf-8').read()
            temp = (label,filecontent)
            data.append(temp)
            
sentenceDataUnknown = spark.createDataFrame(data,["label","sentence"])     
sentenceDataUnknown.show(10)

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|    0|Footage of a prot...|
|    0|GameStop Corp. Ch...|
|    0|The Nomorobo robo...|
|    0|BEIJING—Ride-hail...|
|    0|Starbucks: No Nee...|
|    0|We’ve seen Boston...|
|    0|Barclays CEO Jes ...|
|    0|Oil prices early ...|
|    0|Banks have been w...|
|    0|One of America's ...|
+-----+--------------------+
only showing top 10 rows



### Cleansing testing data i.e. unknown set

In [18]:
# base code for feature extraction taken from official site
# https://spark.apache.org/docs/2.3.0/ml-features.html
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsDataUnknown = tokenizer.transform(sentenceDataUnknown)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filteredWordsDataUnknown = remover.transform(wordsDataUnknown)
filteredWordsDataUnknown.show(10)

+-----+--------------------+--------------------+--------------------+
|label|            sentence|               words|            filtered|
+-----+--------------------+--------------------+--------------------+
|    0|Footage of a prot...|[footage, of, a, ...|[footage, prototy...|
|    0|GameStop Corp. Ch...|[gamestop, corp.,...|[gamestop, corp.,...|
|    0|The Nomorobo robo...|[the, nomorobo, r...|[nomorobo, roboca...|
|    0|BEIJING—Ride-hail...|[beijing—ride-hai...|[beijing—ride-hai...|
|    0|Starbucks: No Nee...|[starbucks:, no, ...|[starbucks:, need...|
|    0|We’ve seen Boston...|[we’ve, seen, bos...|[we’ve, seen, bos...|
|    0|Barclays CEO Jes ...|[barclays, ceo, j...|[barclays, ceo, j...|
|    0|Oil prices early ...|[oil, prices, ear...|[oil, prices, ear...|
|    0|Banks have been w...|[banks, have, bee...|[banks, walking, ...|
|    0|One of America's ...|[one, of, america...|[one, america's, ...|
+-----+--------------------+--------------------+--------------------+
only s

### Feature Extraction using Hasing TF and IDF

### Random Forest Classifier

In [32]:
# base code for classifiers taken from official site
# https://spark.apache.org/docs/2.3.0/ml-classification-regression.html
show_predictions=True
for i in range(5000,10001,500):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=i)
    featurizedData = hashingTF.transform(filteredWordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    
    (trainingData, testData) = rescaledData.randomSplit([0.8, 0.2])  
    
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=i)
    featurizedDataUnknown = hashingTF.transform(filteredWordsDataUnknown)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedDataUnknown)
    rescaledDataUnknown = idfModel.transform(featurizedDataUnknown)
    
    unknownData = rescaledDataUnknown

    # Train a RandomForest model.
    rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=200)

    pipeline = Pipeline(stages=[rf])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions for known dataset.
    predictions = model.transform(testData)
    if show_predictions == True:
        predictions.select("prediction","label").show(10)
        show_predictions=False

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy with "+str(i)+" features for known articles: "+str(accuracy))
    
     # Make predictions for unknown dataset.
    predictions = model.transform(unknownData)
    if show_predictions == True:
        predictions.select("prediction","label").show(10)
        show_predictions=False

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy with "+str(i)+" features for unknown articles: "+str(accuracy))

+----------+-----+
|prediction|label|
+----------+-----+
|       0.0|    0|
|       0.0|    0|
|       0.0|    0|
|       0.0|    0|
|       0.0|    0|
|       0.0|    0|
|       0.0|    0|
|       0.0|    0|
|       0.0|    0|
|       0.0|    0|
+----------+-----+
only showing top 10 rows

Accuracy with 5000 features for known articles: 0.7068965517241379
Accuracy with 5000 features for unknown articles: 0.8
Accuracy with 5500 features for known articles: 0.5625
Accuracy with 5500 features for unknown articles: 0.775
Accuracy with 6000 features for known articles: 0.7435897435897436
Accuracy with 6000 features for unknown articles: 0.825
Accuracy with 6500 features for known articles: 0.6595744680851063
Accuracy with 6500 features for unknown articles: 0.7875
Accuracy with 7000 features for known articles: 0.6730769230769231
Accuracy with 7000 features for unknown articles: 0.8
Accuracy with 7500 features for known articles: 0.6122448979591837
Accuracy with 7500 features for unknown a

### Feature matrix

In [8]:
rescaledData.select("label","filtered","features").show(10)

+-----+--------------------+--------------------+
|label|            filtered|            features|
+-----+--------------------+--------------------+
|    0|[big, jaws, feed,...|(9500,[24,34,37,4...|
|    0|[close, amid, cit...|(9500,[30,37,69,2...|
|    0|[close, twitter, ...|(9500,[37,208,256...|
|    0|[philadelphia, --...|(9500,[125,135,19...|
|    0|[fan, blade, fail...|(9500,[37,46,94,2...|
|    0|[volkswagen, emis...|(9500,[33,80,543,...|
|    0|[activision, bliz...|(9500,[24,66,161,...|
|    0|[orlando-based, d...|(9500,[8,109,116,...|
|    0|[copyright, 2018,...|(9500,[1,58,92,16...|
|    0|[southwest, ceo:,...|(9500,[47,72,104,...|
+-----+--------------------+--------------------+
only showing top 10 rows



### Naives Bayes Classifier

In [33]:
show_predictions=True
for i in range(10000,20001,500):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=i)
    featurizedData = hashingTF.transform(filteredWordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    
    splits = rescaledData.randomSplit([0.8, 0.2])
    train = splits[0]
    test = splits[1]
    
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=i)
    featurizedDataUnknown = hashingTF.transform(filteredWordsDataUnknown)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedDataUnknown)
    rescaledDataUnknown = idfModel.transform(featurizedDataUnknown)
    
    unknown = rescaledDataUnknown

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(train)

    # select example rows to display.
    predictions = model.transform(test)
    if show_predictions == True:
        predictions.select("prediction","label").show(10)
        show_predictions=False    

    # compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy with "+str(i)+" features for known articles: "+str(accuracy))
    
    # select example rows to display.
    predictions = model.transform(unknown)
    if show_predictions == True:
        predictions.select("prediction","label").show(10)
        show_predictions=False    

    # compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy with "+str(i)+" features for unknown articles: "+str(accuracy))

+----------+-----+
|prediction|label|
+----------+-----+
|       3.0|    0|
|       0.0|    0|
|       0.0|    0|
|       3.0|    0|
|       0.0|    0|
|       0.0|    0|
|       0.0|    0|
|       0.0|    0|
|       1.0|    0|
|       0.0|    0|
+----------+-----+
only showing top 10 rows

Accuracy with 10000 features for known articles: 0.8378378378378378
Accuracy with 10000 features for unknown articles: 0.875
Accuracy with 10500 features for known articles: 0.8958333333333334
Accuracy with 10500 features for unknown articles: 0.8125
Accuracy with 11000 features for known articles: 0.9387755102040817
Accuracy with 11000 features for unknown articles: 0.9125
Accuracy with 11500 features for known articles: 0.8863636363636364
Accuracy with 11500 features for unknown articles: 0.85
Accuracy with 12000 features for known articles: 0.9245283018867925
Accuracy with 12000 features for unknown articles: 0.8375
Accuracy with 12500 features for known articles: 0.8723404255319149
Accuracy with

In [13]:
spark.stop()

In [2]:
#Uncomment below line, change kernel to R and execute to regenerate html report
#rmarkdown::render("ReportLab3.md")

"C:/Users/SATVIN~1/ANACON~1/Scripts/pandoc" +RTS -K512m -RTS ReportLab3.utf8.md --to html4 --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash --output ReportLab3.html --smart --email-obfuscation none --self-contained --standalone --section-divs --template "C:\Users\Satvinder\Documents\R\win-library\3.4\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable "theme:bootstrap" --include-in-header "C:\Users\SATVIN~1\AppData\Local\Temp\RtmpEpvbq4\rmarkdown-str414c42c0423f.html" --mathjax --variable "mathjax-url:https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" 



Output created: ReportLab3.html
