In [1]:
import findspark
findspark.init('C:/extras/spark')

In [2]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
                          .appName('classifier')\
                          .getOrCreate()
sc=spark.sparkContext

In [3]:
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import nltk
from nltk.corpus import stopwords

In [4]:
sqlContext = SQLContext(sc)

In [5]:
fas_df = spark.read.text('Data/Fashion/*')
fas_df = fas_df.withColumn("category",lit("Fashion"))

tech_df = spark.read.text('Data/Technology/*')
tech_df = tech_df.withColumn("category",lit("Technology"))

sci_df = spark.read.text('Data/Science/*')
sci_df = sci_df.withColumn("category",lit("science"))

mov_df = spark.read.text('Data/Movie/*')
mov_df = mov_df.withColumn("category",lit("Movie"))


merge_df1 = fas_df.union(tech_df)
merge_df2 = merge_df1.union(sci_df)
merge_df3 = merge_df2.union(mov_df)

In [6]:
data = merge_df3.select([column for column in merge_df3.columns])
data.show(5)

+--------------------+--------+
|               value|category|
+--------------------+--------+
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
+--------------------+--------+
only showing top 5 rows



In [7]:
Fas_udf = spark.read.text('Data/unknown/Fashion/*')
Fas_udf = Fas_udf.withColumn("category",lit("Fashion"))

science_udf = spark.read.text('Data/unknown/science/*')
science_udf = science_udf.withColumn("category",lit("science"))

tech_udf = spark.read.text('Data/unknown/technology/*')
tech_udf = tech_udf.withColumn("category",lit("technology"))

movie_udf = spark.read.text('Data/unknown/Movie/*')
movie_udf = movie_udf.withColumn("category",lit("Movie"))

merge_udf1 = Fas_udf.union(science_udf)
merge_udf2 = merge_udf1.union(tech_udf)
merge_udf3 = merge_udf2.union(movie_udf)

unknown_data = merge_udf3.select([column for column in merge_udf3.columns])
unknown_data.show(5)

+--------------------+--------+
|               value|category|
+--------------------+--------+
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
+--------------------+--------+
only showing top 5 rows



In [8]:
regexTokenizer = RegexTokenizer(inputCol="value", outputCol="words", pattern="\\W")

In [10]:
nltk.download('stopwords')

add_stopwords=nltk.corpus.stopwords.words('english')
add_stopwords_1 = ["nytimes","com","sense","day","common","business","todays","said","food","review","sunday","letters","politics","events","terms","services","years","contributors","companies","listings","applications","tax","trump","president","contributing","make","think","woman","federal","called","system","found","american","sale","headline","arts","times","subscriptions","choices","privacy","take","jobs","books","account","accounts","television","nyc","writers","multimedia","journeys","editorials","photography","automobiles","paper","city","tool","sports","weddings","columnists","contribution","even","nyt","obituary","state","travel","advertise","pm","street","go","corrections","saturday","company","dance","states","real","movies","estate","percent","music","tech","living","science","fashion","please","opinion","art","new","york","time","u","wa","reading","ha","video","image","photo","credit","edition","magazine","oped","could","crossword","mr","term","feedback","index","get","also","b","help","year","health","united","education","week","think","guide","event","two","first","subscription","service","cut","is","nytimescom","section","sections","Sections","Home","home","Search","search","Skip","skip","content","navigation","View","view","mobile","version","Subscribe","subscribe","Now","now","Log","log","In","in","setting","settings","Site","site","Loading","loading","article","next","previous","Advertisement","ad","advertisement","Supported","supported","by","Share","share","Page","page","Continue","continue","main","story","newsletter","Sign","Up","Manage","email","preferences","Not","you","opt","out","contact","us","anytime","thank","subscribing","see","more","email"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered1").setStopWords(add_stopwords)
stopwordsRemover1 = StopWordsRemover(inputCol="filtered1", outputCol="filtered").setStopWords(add_stopwords_1)


In [11]:
label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover,stopwordsRemover1, hashingTF, idf, label_stringIdx])

In [12]:
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

In [13]:
predictions_train = lrModel.transform(trainingData)
predictions_train.filter(predictions_train['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.9822394802109057,0.00235...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9811567547470802,0.00302...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9770537920390813,0.00631...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9763162809849949,0.01115...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9762569248827576,0.00562...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.970554468408183,0.009335...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9679333738863134,0.00870...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9669405158721934,0.00738...|  0.0|       0.0|
|   Sectio

In [14]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of train data using logistic_regression-----: " + str(evaluator.evaluate(predictions_train)*100)+"%")

-------Accuracy of train data using logistic_regression-----: 98.76222962433778%


In [15]:
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.9754632623041882,0.01057...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9431747402417925,0.02006...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9172397458630736,0.02471...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9053576601908154,0.02005...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.8962780254409616,0.02195...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.8507710146790355,0.01021...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.8454293725021643,0.05217...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.8197670154180591,0.01462...|  0.0|       0.0|
|   Sectio

In [16]:
pipelineFit2 = pipeline.fit(unknown_data)
unknown_dataset = pipelineFit2.transform(unknown_data)

In [17]:
predictions2 = lrModel.transform(unknown_dataset)
predictions2.filter(predictions2['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
predictions2.show(3)

+-----+--------+-----------+-----+----------+
|value|category|probability|label|prediction|
+-----+--------+-----------+-----+----------+
+-----+--------+-----------+-----+----------+



In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using logistic_regression-----: " + str(evaluator.evaluate(predictions)*100)+"%")

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using logistic_regression-----: " + str(evaluator.evaluate(predictions2)*100)+"%")


-------Accuracy of test data using logistic_regression-----: 64.6386141204794%
-------Accuracy of unknown data using logistic_regression-----: 13.2128740824393%


In [19]:
#training the data -- Naive Bayes
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)

In [20]:
predictions_train = model.transform(trainingData)
predictions_train.filter(predictions_train['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of train data using logistic_regression-----: " + str(evaluator.evaluate(predictions_train)*100)+"%")

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[1.0,1.0348354707598595E-18...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,7.365373813070062E-21,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,2.0192994705203615E-23...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,9.314255596200951E-24,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,4.686775487333476E-24,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,6.131923885718499E-26,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,3.8235683097586036E-28...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,5.44902143952754E-29,1...|  0.0|       0.0|
|   Sectio

In [21]:
predictions3 = model.transform(testData)
predictions3.filter(predictions3['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
predictions3.show(10)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using naive_bayes-----: " + str(evaluator.evaluate(predictions3)*100)+"%")

+------------------------------+----------+------------------------------+-----+----------+
|                         value|  category|                   probability|label|prediction|
+------------------------------+----------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Fashion|[1.0,1.7849695897712397E-17...|  1.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,5.286020787858083E-18,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,4.703611074610709E-22,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,6.492341445916501E-27,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,4.248773392621757E-29,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,7.00353830988496E-31,2...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,8.202852029206588E-34,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,2.7184097459394804E-39...|  0.0|

In [22]:
predictions4 = model.transform(unknown_dataset)
predictions4.filter(predictions4['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using naive_bayes-----: " + str(evaluator.evaluate(predictions4)*100)+"%")

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.9572707190900996,0.03799...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9200992619781946,0.03093...|  0.0|       0.0|
|   Sections SEARCH Skip to ...| science|[0.8060914009544183,0.00356...|  1.0|       0.0|
+------------------------------+--------+------------------------------+-----+----------+

-------Accuracy of unknown data using naive_bayes-----: 31.242077171133104%


In [23]:
#training the data using Decision Tree Classifier
from pyspark.ml.classification import DecisionTreeClassifier
pipelineFit_dt = pipeline.fit(data)
dataset = pipelineFit_dt.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
dt = DecisionTreeClassifier(impurity="gini")
dtModel = dt.fit(trainingData)

predictions_dt = dtModel.transform(trainingData)
predictions_dt.filter(predictions_dt['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of train data using Decision Tree-----: " + str(evaluator.evaluate(predictions_dt)*100)+"%")


predictions = dtModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using Decision Tree-----: " + str(evaluator.evaluate(predictions)*100)+"%")


pipelineFit2 = pipeline.fit(unknown_data)
unknown_dataset = pipelineFit2.transform(unknown_data)
predictions2 = dtModel.transform(unknown_dataset)
predictions2.filter(predictions2['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using Decision Tree-----: " + str(evaluator.evaluate(predictions2)*100)+"%")

+------------------------------+--------+-----------------+-----+----------+
|                         value|category|      probability|label|prediction|
+------------------------------+--------+-----------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[1.0,0.0,0.0,0.0]|  0.0|       0.0|

In [24]:
#training the data using Random Forest Classifier
from pyspark.ml.classification import RandomForestClassifier
pipelineFit_rf = pipeline.fit(data)
dataset = pipelineFit_rf.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
rf = RandomForestClassifier(numTrees=50)
rfModel = rf.fit(trainingData)

predictions_rf = rfModel.transform(trainingData)
predictions_rf.filter(predictions_rf['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of train data using Random Forest-----: " + str(evaluator.evaluate(predictions_rf)*100)+"%")



predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using Random Forest-----: " + str(evaluator.evaluate(predictions)*100)+"%")



pipelineFit2 = pipeline.fit(unknown_data)
unknown_dataset = pipelineFit2.transform(unknown_data)
predictions2 = rfModel.transform(unknown_dataset)
predictions2.filter(predictions2['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using Decision Tree-----: " + str(evaluator.evaluate(predictions2)*100)+"%")

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.6398705677894322,0.13560...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6227305732576371,0.12748...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6227158003340711,0.15309...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6154696010362707,0.13968...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.5852354309301614,0.15176...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.5812419180480376,0.14512...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.577175668620227,0.164575...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.5765341565550058,0.16046...|  0.0|       0.0|
|   Sectio

In [25]:
#training the data using Support Vector Machines
from pyspark.ml.classification import LinearSVC
pipelineFit_svc = pipeline.fit(data)
dataset = pipelineFit_svc.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
svc = RandomForestClassifier(numTrees=50)
svcModel = svc.fit(trainingData)

predictions_svc = svcModel.transform(trainingData)
predictions_svc.filter(predictions_svc['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of train data using Support Vector Machines-----: " + str(evaluator.evaluate(predictions_svc)*100)+"%")



predictions = svcModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using Support Vector Machines-----: " + str(evaluator.evaluate(predictions)*100)+"%")



pipelineFit2 = pipeline.fit(unknown_data)
unknown_dataset = pipelineFit2.transform(unknown_data)
predictions2 = svcModel.transform(unknown_dataset)
predictions2.filter(predictions2['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using Decision Tree-----: " + str(evaluator.evaluate(predictions2)*100)+"%")

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.6398705677894322,0.13560...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6227305732576371,0.12748...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6227158003340711,0.15309...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.6154696010362707,0.13968...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.5852354309301614,0.15176...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.5812419180480376,0.14512...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.577175668620227,0.164575...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.5765341565550058,0.16046...|  0.0|       0.0|
|   Sectio