In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Team Members
1. Cao Quyết Chiến - 18520526
2. Nguyễn Hữu Toàn - 18521507
3. Đặng Quang Hưng - 18520790

In [None]:
!pip install pandas

In [None]:
!pip install spark-nlp==3.1.2 pyspark

In [None]:
# Spark NLP and PySpark imports (order kept as-is: later imports may shadow wildcard names).
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

# Create (or reuse) the Spark session for this notebook.
# NOTE(review): the session comes from the plain SparkSession builder, not from the sparknlp
# package — confirm nothing downstream needs a Spark NLP-configured session.
session_builder = SparkSession.builder.appName("BBC Text Categorization")
spark = session_builder.getOrCreate()

In [None]:
# Read the BBC articles CSV into a Spark DataFrame.
file_location = r'/kaggle/input/bbc-fulltext-and-category/bbc-text.csv'
file_type = "csv"

# CSV reader settings: infer column types, treat the first row as a header, comma-separated.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

reader = spark.read.format(file_type).options(
    inferSchema=infer_schema,
    header=first_row_is_header,
    sep=delimiter,
)
df = reader.load(file_location)

In [None]:
# Collect the Spark DataFrame to the driver as a pandas DataFrame for quick inspection.
dataPd = df.toPandas()

In [None]:
# Analyze the data: column dtypes, non-null counts and memory usage
dataPd.info()

In [None]:
# Number of rows in each category
dataPd.groupby(['category']).size()

In [None]:
# Drop rows whose "text" repeats an earlier row, then re-inspect the de-duplicated frame
data = dataPd.drop_duplicates(subset ="text")
data.info()

In [None]:
from pyspark.sql.types import *
mySchema = StructType([StructField("category", StringType(), True), StructField("text", StringType(), True)])

In [None]:
pddata = spark.createDataFrame(data,schema=mySchema)

In [None]:
# Randomly split the articles with 0.7 / 0.3 weights; the fixed seed is passed so the split can be repeated.
split_weights = [0.7, 0.3]
trainingData, testData = pddata.randomSplit(split_weights, seed=100)

# Preview both partitions.
for partition in (trainingData, testData):
    partition.show()

In [None]:
#NLP Pipeline using Spark NLP
from pyspark.ml.feature import (HashingTF, IDF, StringIndexer,IndexToString, Tokenizer, StopWordsRemover, Normalizer)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
tokenizer = Tokenizer(inputCol = "text", outputCol="token")

In [None]:
stopWordsCleaner = StopWordsRemover(
                    inputCol = "token", outputCol="wordsCleaner", 
                    stopWords=StopWordsRemover.loadDefaultStopWords("english"))

In [None]:
hashingTf = HashingTF(inputCol = "wordsCleaner", outputCol="hashingTF")

In [None]:
idf = IDF(inputCol = "hashingTF", outputCol="features", minDocFreq=5)

In [None]:
label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

In [None]:
lr = LogisticRegression(labelCol="label",maxIter=10, regParam=0.3, elasticNetParam=0.0)

In [None]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

In [None]:
from pyspark.ml.classification import LogisticRegression, OneVsRest

ovr = OneVsRest(classifier=lr)


In [None]:
# Convert the classifier's numeric predictions back into category names.
# The input column is "prediction": reading "label" would only echo each row's true category
# into "article_class" instead of the predicted one.
# The index-to-name mapping is supplied explicitly from the label indexer fitted on the
# training split, so it does not depend on metadata attached to the prediction column.
label_to_stringIdx = IndexToString(
    inputCol="prediction",
    outputCol="article_class",
    labels=label_stringIdx.fit(trainingData).labels,
)

In [None]:
# Chain the stages in execution order: feature extraction, label indexing,
# the Naive Bayes classifier, then the index-to-string conversion.
pipeline_stages = [
    tokenizer,
    stopWordsCleaner,
    hashingTf,
    idf,
    label_stringIdx,
    nb,
    label_to_stringIdx,
]
nlp_pipeline = Pipeline(stages=pipeline_stages)

In [None]:
df1 = tokenizer.transform(trainingData)
df1.show()

In [None]:
df2 = stopWordsCleaner.transform(df1)
df2.show()

In [None]:
df3 = hashingTf.transform(df2)
df4 = idf.fit(df3).transform(df3)
df4.select(['category', 'text', 'hashingTF', 'features']).show()

In [None]:
df5 = label_stringIdx.fit(df4).transform(df4)
df5.select(['category', 'text', 'features','label']).show()

In [None]:
df6 = lr.fit(df5).transform(df5)
df6.select(["category", "text", "rawPrediction", "probability", "prediction"]).show()

In [None]:
# fit the pipeline on training data
pipeline_model = nlp_pipeline.fit(trainingData)


In [None]:
# perform predictions on test data
predictions =  pipeline_model.transform(testData)
predictions.select(["category", "text", "rawPrediction", "probability", "prediction", "article_class"]).show()

In [None]:
# Accuracy of the test-set predictions, and the matching error rate (1 - accuracy).
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Accuracy = %g" % accuracy)
print("Test Error = %g " % test_error)

In [None]:
# Weighted precision of the test-set predictions.
# The value is stored and printed under its own name: it is not accuracy, and
# 1 - precision is not a test error, so no error line is printed for it.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
weighted_precision = evaluator.evaluate(predictions)
print("Weighted Precision = %g" % weighted_precision)

In [None]:
# Weighted recall of the test-set predictions.
# The value is stored and printed under its own name rather than being reported as
# "Accuracy" / "Test Error", which would mislabel the metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
weighted_recall = evaluator.evaluate(predictions)
print("Weighted Recall = %g" % weighted_recall)