In [0]:
import sparknlp
# Report the versions of both libraries. A cell only echoes its LAST bare
# expression, so a standalone `sparknlp.version()` in the middle of the cell is
# evaluated and thrown away; each version is therefore passed to print().
# NOTE(review): `spark` is only assigned in a later cell, so this cell relies on
# the runtime (e.g. a Databricks notebook) predefining it — confirm.
print("Spark NLP version:", sparknlp.version())
print("Apache Spark version:", spark.version)

In [0]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
from sparknlp.common import *

In [0]:
# Input data (CSV files with a header row):
#   training set: /FileStore/tables/news_category_train.csv
#   test set:     /FileStore/tables/news_category_test.csv

In [0]:
spark=sparknlp.start()

In [0]:
train_df=spark.read.option('header','true').csv('dbfs:/FileStore/tables/news_category_train.csv')
train_df.show(5)

In [0]:
train_df.count()

In [0]:
from pyspark.sql import functions as func

In [0]:
train_df.groupBy('category').count().show()

In [0]:
test_df=spark.read.option('header','true').csv('dbfs:/FileStore/tables/news_category_test.csv')
test_df.show(5)

In [0]:
test_df.groupBy('category').count().show()

In [0]:
training_data,test_data=train_df.randomSplit([0.7,0.3],seed=101)

In [0]:
training_data.count()

In [0]:
test_data.count()

In [0]:
document_assembler=DocumentAssembler().setInputCol("description").setOutputCol("document")

In [0]:
tokenizer=Tokenizer().setInputCols("document").setOutputCol("token")

In [0]:
normalizer=Normalizer().setInputCols("token").setOutputCol("normalized")

In [0]:
stopwords_cleaner=StopWordsCleaner().setInputCols("normalized").setOutputCol("cleanToken")

In [0]:
lemma=LemmatizerModel.pretrained().setInputCols("cleanToken").setOutputCol("lemma")

In [0]:
# Stage: default pretrained word embeddings, reading "document" and "lemma" and
# writing "embeddings". `pretrained` is called on the class itself (as the
# lemmatizer and sentence-encoder cells do) instead of on a throwaway
# `WordEmbeddingsModel()` instance that would be constructed only to be discarded.
embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(['document', 'lemma'])
    .setOutputCol("embeddings")
)

In [0]:
embeddingsSentence=SentenceEmbeddings().setInputCols(['document','embeddings']).setOutputCol('sentence_embeddings')

In [0]:
classifier_dl=ClassifierDLApproach().setInputCols("sentence_embeddings").setOutputCol("class").setLabelColumn("category").setMaxEpochs(1).setEnableOutputLogs(True)

In [0]:
pipeline=Pipeline(stages=[document_assembler,tokenizer,normalizer,stopwords_cleaner,lemma,embeddings,embeddingsSentence,classifier_dl])

In [0]:
model=pipeline.fit(training_data)

In [0]:
preds=model.transform(test_data)

In [0]:
preds.show(5)

In [0]:
preds.select('category','description',"class.result").show(5)

In [0]:
pred_df=preds.select('category','description',"class.result").toPandas()

In [0]:
pred_df.head()

Unnamed: 0,category,description,result
0,Business,A federal judge has scheduled jury selection...,[Business]
1,Business,A group led by privately held Colony Capital...,[Business]
2,Business,Although published reports yesterday claimed...,[Business]
3,Business,Americans paid their credit card bills on ti...,[Business]
4,Business,Applied Materials beat Wall Street #39;s thi...,[Business]


In [0]:
# Each prediction arrives wrapped in a one-element sequence (e.g. [Business]);
# unwrap it to the bare label. Values that are already plain strings are left
# untouched so that re-running this cell does not reduce "Business" to "B".
pred_df["result"] = pred_df["result"].apply(
    lambda labels: labels if isinstance(labels, str) else labels[0]
)
pred_df["result"].head(5)

In [0]:
from sklearn.metrics import classification_report

In [0]:
# classification_report takes (y_true, y_pred): the gold "category" first, then
# the model's "result". Passing them the other way round exchanges the precision
# and recall columns, so only the correctly ordered report is printed.
print(
    classification_report(
        y_true=pred_df["category"],
        y_pred=pred_df["result"],
    )
)

In [0]:
document = DocumentAssembler().setInputCol("description").setOutputCol("document")
use = UniversalSentenceEncoder.pretrained().setInputCols(["document"]).setOutputCol("sentence_embeddings")

In [0]:
classsifierdl = ClassifierDLApproach().setInputCols(["sentence_embeddings"]).setOutputCol("class").setLabelColumn("category").setMaxEpochs(1)

In [0]:
pipeline = Pipeline(stages=[document,use,classsifierdl])

In [0]:
new_model=pipeline.fit(training_data)

In [0]:
new_predict=new_model.transform(test_data)

In [0]:
new_predict.select('category','description',"class.result").show(10)

In [0]:
new_pred=new_predict.select('category','description',"class.result").toPandas()

In [0]:
new_pred.head(10)

Unnamed: 0,category,description,result
0,Business,A federal judge has scheduled jury selection...,[Business]
1,Business,A group led by privately held Colony Capital...,[Business]
2,Business,Although published reports yesterday claimed...,[Business]
3,Business,Americans paid their credit card bills on ti...,[Business]
4,Business,Applied Materials beat Wall Street #39;s thi...,[Business]
5,Business,"Asian stocks closed mainly higher Tuesday, l...",[Business]
6,Business,"BHP Billiton, the world #39;s biggest miner,...",[Business]
7,Business,"BHP Billiton, the world #39;s biggest mining...",[Business]
8,Business,"Cantor Fitzgerald LP, one of the two largest...",[Business]
9,Business,"Citigroup Inc., the world #39;s biggest bank...",[Business]


In [0]:
# Each prediction arrives wrapped in a one-element sequence (e.g. [Business]);
# unwrap it to the bare label. Values that are already plain strings are left
# untouched so that re-running this cell does not reduce "Business" to "B".
new_pred["result"] = new_pred["result"].apply(
    lambda labels: labels if isinstance(labels, str) else labels[0]
)
new_pred["result"].head(10)

In [0]:
# classification_report takes (y_true, y_pred): the gold "category" first, then
# the model's "result". Passing them the other way round exchanges the precision
# and recall columns, so only the correctly ordered report is printed.
print(
    classification_report(
        y_true=new_pred["category"],
        y_pred=new_pred["result"],
    )
)

In [0]:
from sparknlp.base import LightPipeline

In [0]:
light_model=LightPipeline(new_model)

In [0]:
test_data.select("description").take(2)

In [0]:
from pyspark.sql.types import StringType

In [0]:
dfTest = spark.createDataFrame([
    "Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.",
    "Scientists have discovered irregular lumps beneath the icy surface of Jupiter's largest moon, Ganymede. These irregular masses may be rock formations, supported by Ganymede's icy shell for billions of years..."
], StringType()).toDF("description")

In [0]:
dfTest.show()

In [0]:
light_prediction = light_model.transform(dfTest)

In [0]:
print(light_prediction)

In [0]:
light_prediction.select("class.result","description").show()

In [0]:
light_df=light_prediction.select("class.result","description").toPandas()
light_df

Unnamed: 0,result,description
0,[Business],Unions representing workers at Turner Newall s...
1,[Sci/Tech],Scientists have discovered irregular lumps ben...


In [0]:
light_prediction.select("class.metadata").show(truncate=False)

In [0]:
light_preds=light_model.transform(test_df)

In [0]:
light_preds.select('category','description',"class.result").show(20)

In [0]:
new_df=light_preds.select('category','description',"class.result").toPandas()

In [0]:
new_df.head(10)

Unnamed: 0,category,description,result
0,Business,Unions representing workers at Turner Newall...,[Business]
1,Sci/Tech,"TORONTO, Canada A second team of rocketeer...",[Sci/Tech]
2,Sci/Tech,A company founded by a chemistry researcher a...,[Sci/Tech]
3,Sci/Tech,It's barely dawn when Mike Fitzpatrick starts...,[Sci/Tech]
4,Sci/Tech,Southern California's smog fighting agency we...,[Business]
5,Sci/Tech,"""The British Department for Education and Skil...",[Sci/Tech]
6,Sci/Tech,"""confessed author of the Netsky and Sasser vir...",[Sci/Tech]
7,Sci/Tech,\\FOAF/LOAF and bloom filters have a lot of i...,[Sci/Tech]
8,Sci/Tech,"""Wiltshire Police warns about """"phishing"""" aft...",[Sci/Tech]
9,Sci/Tech,"In its first two years, the UK's dedicated car...",[Sci/Tech]


In [0]:
new_df['result'] = new_df['result'].apply(lambda x : x[0])

In [0]:
new_df['result'].head()

In [0]:
# classification_report takes (y_true, y_pred): the gold "category" must come
# first and the model's "result" second. With the two swapped, the precision and
# recall columns of the report are exchanged.
print(
    classification_report(
        y_true=new_df['category'],
        y_pred=new_df['result'],
    )
)