In [30]:
import sparknlp
import pandas as pd
from pyspark.ml import Pipeline
# sparknlp.base supplies DocumentAssembler and LightPipeline; sparknlp.annotator
# supplies UniversalSentenceEncoder and the ClassifierDL annotators used below.
# (The sparknlp.base import had been fused into a comment and was never executed.)
from sparknlp.base import *
from sparknlp.annotator import *

# Start the Spark session with GPU support enabled (used for training on GPU).
# Call sparknlp.start() without arguments for a CPU-only session.
spark = sparknlp.start(gpu=True)

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Spark NLP version 4.2.2
Apache Spark version: 3.3.1


In [31]:
#! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv

In [32]:
#! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv

In [33]:
# Load the training CSV; header=True makes the first row the column names.
trainDataset = spark.read.csv("text_train.csv", header=True)

# Preview the first 10 rows, cutting cell values longer than 50 characters.
trainDataset.show(10, truncate=50)

+--------------------------------+--------+
|                     description|category|
+--------------------------------+--------+
|           i want to make coffee|      DA|
|                      where am i|      LR|
|              what can i do here|      AR|
|                    prepare meal|      DA|
|                use refrigerator|      DA|
|                         use fan|      DA|
|                        use oven|      DA|
|                       use stove|      DA|
|     i would like to wash sheets|      DA|
|i would like to watch television|      DA|
+--------------------------------+--------+
only showing top 10 rows



In [34]:
# Load the test CSV; header=True makes the first row the column names.
testDataset = spark.read.csv("text_test.csv", header=True)

# Preview the first 10 rows, cutting cell values longer than 50 characters.
testDataset.show(10, truncate=50)

+--------------------------------+--------+
|                     description|category|
+--------------------------------+--------+
|           i want to make coffee|      DA|
|                      where am i|      LR|
|              what can i do here|      AR|
|                    prepare meal|      DA|
|                use refrigerator|      DA|
|                         use fan|      DA|
|                        use oven|      DA|
|                       use stove|      DA|
|     i would like to wash sheets|      DA|
|i would like to watch television|      DA|
+--------------------------------+--------+
only showing top 10 rows



In [35]:
from pyspark.sql.functions import col

# Number of training rows per category, most frequent category first.
(
    trainDataset
    .groupBy("category")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------+-----+
|category|count|
+--------+-----+
|      VC|   46|
|      DA|   29|
|      CC|   26|
|      SD|   23|
|      LR|   21|
|      AR|   20|
|      SN|   20|
|     NAV|   18|
|      NA|    1|
+--------+-----+



In [36]:
# Number of test rows per category, most frequent category first.
# testDataset is already loaded from "text_test.csv" by the cell that previews
# it, so it is reused here instead of reading the same file a second time.
(
    testDataset
    .groupBy("category")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------+-----+
|category|count|
+--------+-----+
|      VC|   46|
|      DA|   29|
|      CC|   26|
|      SD|   23|
|      LR|   21|
|      AR|   20|
|      SN|   20|
|     NAV|   18|
|      NA|    1|
+--------+-----+



In [37]:
# Stage 1: the text to classify lives in the "description" column.
document = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

# A sentence detector could be added here to train on, and get predictions
# for, each sentence instead of each whole description.

# Stage 2: download the pretrained Universal Sentence Encoder embeddings.
use = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: trainable classifier; the classes/labels are in the "category" column.
# NOTE(review): the variable name and the log path keep their original spelling.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(500)
    .setLr(0.01)
    .setBatchSize(32)
    .setRandomSeed(957)
    .setVerbose(1)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setOutputLogsPath('classifer_logs')
    # .setValidationSplit(0.2)   # currently disabled
)

use_clf_pipeline = Pipeline(stages=[document, use, classsifierdl])

# Fit the three-stage pipeline on the training data.
use_pipelineModel = use_clf_pipeline.fit(trainDataset)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [38]:
# Persist only the third pipeline stage (index 2: the fitted classifier),
# overwriting anything already saved at this path.
trained_classifier = use_pipelineModel.stages[2]
trained_classifier.write().overwrite().save('Text_Classification')

In [39]:
#predictions =  use_pipelineModel.transform(testDataset)
#predictions.select("category", "description", "class.result").show(5, truncate=30)

In [40]:
# Rebuild the preprocessing stages: the text to classify is read from the
# "description" column.
document = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

# A sentence detector could be added here to get predictions for each
# sentence instead of each whole description.

# Pretrained Universal Sentence Encoder embeddings (downloaded by pretrained()).
use = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Load the classifier saved under "Text_Classification" and wire it to the
# sentence embeddings.
# NOTE(review): despite its name this variable holds a ClassifierDLModel,
# not an NER model.
loaded_ner_model = (
    ClassifierDLModel.load("Text_Classification")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
)

use_clf_pipeline = Pipeline(stages=[document, use, loaded_ner_model])

#use_pipelineModel = use_clf_pipeline.fit(trainDataset)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [41]:
from sparknlp.base import LightPipeline

text = "navigate to kitchen"

# The DocumentAssembler stage reads the "description" column, so the placeholder
# frame uses that column name as well. It was previously "text", a column the
# pipeline could not transform.
prediction_data = spark.createDataFrame([[text]]).toDF("description")

# fit() turns the Pipeline into a PipelineModel, which LightPipeline wraps for
# fast single-string inference.
prediction_model = use_clf_pipeline.fit(prediction_data)
light = LightPipeline(prediction_model)

# annotate() returns a dict keyed by each stage's output column.
results = light.annotate(text)
print(results)
print(results['class'])

{'document': ['navigate to kitchen'], 'sentence_embeddings': ['navigate to kitchen'], 'class': ['NAV']}
['NAV']


In [42]:
# Interactive check: classify each typed line until the user enters "exit".
prompt = "Enter Testing Text\n"
while True:
    try:
        text = input(prompt)
    except EOFError:
        # stdin was closed: end the session instead of raising a traceback
        break
    command = text.strip()
    # accept "exit" regardless of case or surrounding whitespace
    if command.casefold() == "exit":
        break
    # blank line: nothing to classify, just prompt again
    if command:
        results = light.annotate(text)
        print(results['class'])
    prompt = "Enter New Text\n"

['NAV']
['NAV']
['VC']
['SD']
['SD']
['VC']
['SD']
['DA']
['DA']
['DA']
['NAV']
['DA']
['LR']
['AR']
['AR']
['LR']
['LR']
