In [1]:
import findspark
findspark.init('C:/extras/spark')

In [2]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
                          .appName('classifier')\
                          .getOrCreate()
sc=spark.sparkContext

In [3]:
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import nltk
from nltk.corpus import stopwords

In [4]:
sqlContext = SQLContext(sc)

In [5]:
fas_df = spark.read.text('Data/Fashion/*')
fas_df = fas_df.withColumn("category",lit("Fashion"))

tech_df = spark.read.text('Data/Technology/*')
tech_df = tech_df.withColumn("category",lit("Technology"))

sci_df = spark.read.text('Data/Science/*')
sci_df = sci_df.withColumn("category",lit("science"))

mov_df = spark.read.text('Data/Movie/*')
mov_df = mov_df.withColumn("category",lit("Movie"))


merge_df1 = fas_df.union(tech_df)
merge_df2 = merge_df1.union(sci_df)
merge_df3 = merge_df2.union(mov_df)

In [6]:
data = merge_df3.select([column for column in merge_df3.columns])
data.show(5)

+--------------------+--------+
|               value|category|
+--------------------+--------+
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
+--------------------+--------+
only showing top 5 rows



In [7]:
Fas_udf = spark.read.text('Data/unknown/Fashion/*')
Fas_udf = Fas_udf.withColumn("category",lit("Fashion"))

science_udf = spark.read.text('Data/unknown/science/*')
science_udf = science_udf.withColumn("category",lit("science"))

tech_udf = spark.read.text('Data/unknown/technology/*')
tech_udf = tech_udf.withColumn("category",lit("technology"))

movie_udf = spark.read.text('Data/unknown/Movie/*')
movie_udf = movie_udf.withColumn("category",lit("Movie"))

merge_udf1 = Fas_udf.union(science_udf)
merge_udf2 = merge_udf1.union(tech_udf)
merge_udf3 = merge_udf2.union(movie_udf)

unknown_data = merge_udf3.select([column for column in merge_udf3.columns])
unknown_data.show(5)

+--------------------+--------+
|               value|category|
+--------------------+--------+
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
+--------------------+--------+
only showing top 5 rows



In [8]:
regexTokenizer = RegexTokenizer(inputCol="value", outputCol="words", pattern="\\W")

In [9]:
nltk.download('stopwords')

add_stopwords=nltk.corpus.stopwords.words('english')
add_stopwords_1 = ["nytimes","com","sense","day","common","business","todays","said","food","review","sunday","letters","politics","events","terms","services","years","contributors","companies","listings","applications","tax","trump","president","contributing","make","think","woman","federal","called","system","found","american","sale","headline","arts","times","subscriptions","choices","privacy","take","jobs","books","account","accounts","television","nyc","writers","multimedia","journeys","editorials","photography","automobiles","paper","city","tool","sports","weddings","columnists","contribution","even","nyt","obituary","state","travel","advertise","pm","street","go","corrections","saturday","company","dance","states","real","movies","estate","percent","music","tech","living","science","fashion","please","opinion","art","new","york","time","u","wa","reading","ha","video","image","photo","credit","edition","magazine","oped","could","crossword","mr","term","feedback","index","get","also","b","help","year","health","united","education","week","think","guide","event","two","first","subscription","service","cut","is","nytimescom","section","sections","Sections","Home","home","Search","search","Skip","skip","content","navigation","View","view","mobile","version","Subscribe","subscribe","Now","now","Log","log","In","in","setting","settings","Site","site","Loading","loading","article","next","previous","Advertisement","ad","advertisement","Supported","supported","by","Share","share","Page","page","Continue","continue","main","story","newsletter","Sign","Up","Manage","email","preferences","Not","you","opt","out","contact","us","anytime","thank","subscribing","see","more","email"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered1").setStopWords(add_stopwords)
stopwordsRemover1 = StopWordsRemover(inputCol="filtered1", outputCol="filtered").setStopWords(add_stopwords_1)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prajw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover,stopwordsRemover1, hashingTF, idf, label_stringIdx])

In [11]:
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

In [12]:
predictions_train = lrModel.transform(trainingData)
predictions_train.filter(predictions_train['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.9830929888325927,0.00384...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9747092140405847,0.00447...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9686060739262444,0.01916...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9681088802503716,0.00823...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.964292088183739,0.013876...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9606086171105256,0.00778...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9564970809495726,0.00653...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9523739507319724,0.01119...|  0.0|       0.0|
|   Sectio

In [13]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of train data using logistic_regression-----: " + str(evaluator.evaluate(predictions_train)*100)+"%")

-------Accuracy of train data using logistic_regression-----: 99.45378632158226%


In [14]:
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+----------+------------------------------+-----+----------+
|                         value|  category|                   probability|label|prediction|
+------------------------------+----------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|     Movie|[0.9908482339340072,0.00182...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[0.9802917286793247,0.00202...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|Technology|[0.9639835577417218,0.00295...|  2.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[0.9497979743473339,0.00516...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[0.9436546734364557,0.01587...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[0.9357835995319901,0.04857...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[0.9096375772642806,0.01534...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[0.8789536502476271,0.06299...|  0.0|

In [15]:
pipelineFit2 = pipeline.fit(unknown_data)
unknown_dataset = pipelineFit2.transform(unknown_data)

In [16]:
predictions2 = lrModel.transform(unknown_dataset)
predictions2.filter(predictions2['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
predictions2.show(3)

+-----+--------+-----------+-----+----------+
|value|category|probability|label|prediction|
+-----+--------+-----------+-----+----------+
+-----+--------+-----------+-----+----------+

+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|               value|category|               words|           filtered1|            filtered|         rawFeatures|            features|label|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|   Sections SEARC...| Fashion|[sections, search...|[sections, search...|[today, industry,...|(1000,[10,12,43,5...|(1000,[10,12,43,5...|  3.0|[-0.0950705146411...|[0.18761885123337...|       1.0|
|   Sections SEARC...| Fashion|

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using logistic_regression-----: " + str(evaluator.evaluate(predictions)*100)+"%")

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using logistic_regression-----: " + str(evaluator.evaluate(predictions2)*100)+"%")


-------Accuracy of test data using logistic_regression-----: 62.19060109615789%
-------Accuracy of unknown data using logistic_regression-----: 16.618357487922705%


In [18]:
#training the data -- Naive Bayes
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)

In [19]:
predictions_train = model.transform(trainingData)
predictions_train.filter(predictions_train['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of train data using logistic_regression-----: " + str(evaluator.evaluate(predictions_train)*100)+"%")

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[1.0,9.95898746962288E-17,4...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,8.666383808393523E-17,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,5.83249218608599E-17,1...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,1.4102774227108478E-17...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,6.840979668391668E-19,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,4.410386468458676E-19,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,1.7012505618103597E-19...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,5.723041467306191E-20,...|  0.0|       0.0|
|   Sectio

In [20]:
predictions3 = model.transform(testData)
predictions3.filter(predictions3['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
predictions3.show(10)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using naive_bayes-----: " + str(evaluator.evaluate(predictions3)*100)+"%")

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[1.0,1.3684245806422722E-26...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,8.711591311028575E-30,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,8.190088701806704E-30,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...| Fashion|[1.0,3.7862126309323696E-34...|  1.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,3.143591723372227E-40,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,6.691698128707452E-42,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,1.6843244883576324E-43...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,9.08148115365893E-44,6...|  0.0|       0.0|
|   Sectio

In [21]:
predictions4 = model.transform(unknown_dataset)
predictions4.filter(predictions4['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using naive_bayes-----: " + str(evaluator.evaluate(predictions4)*100)+"%")

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.9800785908270826,0.01379...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9054009410715822,0.09337...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.7384371853234606,0.12491...|  0.0|       0.0|
|   Sections SEARCH Skip to ...| science|[0.5682558978134179,0.02578...|  1.0|       0.0|
+------------------------------+--------+------------------------------+-----+----------+

-------Accuracy of unknown data using naive_bayes-----: 39.309603440038224%


In [22]:
#training the data using Decision Tree Classifier
from pyspark.ml.classification import DecisionTreeClassifier
pipelineFit_dt = pipeline.fit(data)
dataset = pipelineFit_dt.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
dt = DecisionTreeClassifier(impurity="gini")
dtModel = dt.fit(trainingData)

predictions_dt = dtModel.transform(trainingData)
predictions_dt.filter(predictions_dt['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of train data using Decision Tree-----: " + str(evaluator.evaluate(predictions_dt)*100)+"%")


predictions = dtModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using Decision Tree-----: " + str(evaluator.evaluate(predictions)*100)+"%")


pipelineFit2 = pipeline.fit(unknown_data)
unknown_dataset = pipelineFit2.transform(unknown_data)
predictions2 = dtModel.transform(unknown_dataset)
predictions2.filter(predictions2['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using Decision Tree-----: " + str(evaluator.evaluate(predictions2)*100)+"%")

+------------------------------+--------+-----------------+-----+----------+
|                         value|category|      probability|label|prediction|
+------------------------------+--------+-----------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|

In [23]:
#training the data using Random Forest Classifier
from pyspark.ml.classification import RandomForestClassifier
pipelineFit_rf = pipeline.fit(data)
dataset = pipelineFit_rf.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
rf = RandomForestClassifier(numTrees=50)
rfModel = rf.fit(trainingData)

predictions_rf = rfModel.transform(trainingData)
predictions_rf.filter(predictions_rf['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of train data using Random Forest-----: " + str(evaluator.evaluate(predictions_rf)*100)+"%")



predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using Random Forest-----: " + str(evaluator.evaluate(predictions)*100)+"%")



pipelineFit2 = pipeline.fit(unknown_data)
unknown_dataset = pipelineFit2.transform(unknown_data)
predictions2 = rfModel.transform(unknown_dataset)
predictions2.filter(predictions2['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using Decision Tree-----: " + str(evaluator.evaluate(predictions2)*100)+"%")

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.6840458090651829,0.13882...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6683217342576837,0.11471...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6343212396708592,0.14489...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6341318613026632,0.15792...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6098285186402571,0.14259...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6018263398945013,0.16046...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.5964096512402315,0.16659...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.5924842959358594,0.15568...|  0.0|       0.0|
|   Sectio

In [24]:
#training the data using Support Vector Machines
from pyspark.ml.classification import LinearSVC
pipelineFit_svc = pipeline.fit(data)
dataset = pipelineFit_svc.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
svc = RandomForestClassifier(numTrees=50)
svcModel = svc.fit(trainingData)

predictions_svc = svcModel.transform(trainingData)
predictions_svc.filter(predictions_svc['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of train data using Support Vector Machines-----: " + str(evaluator.evaluate(predictions_svc)*100)+"%")



predictions = svcModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using Support Vector Machines-----: " + str(evaluator.evaluate(predictions)*100)+"%")



pipelineFit2 = pipeline.fit(unknown_data)
unknown_dataset = pipelineFit2.transform(unknown_data)
predictions2 = svcModel.transform(unknown_dataset)
predictions2.filter(predictions2['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using Decision Tree-----: " + str(evaluator.evaluate(predictions2)*100)+"%")

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.6840458090651829,0.13882...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6683217342576837,0.11471...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6343212396708592,0.14489...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6341318613026632,0.15792...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6098285186402571,0.14259...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6018263398945013,0.16046...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.5964096512402315,0.16659...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.5924842959358594,0.15568...|  0.0|       0.0|
|   Sectio