In [1]:
import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline


In [2]:

# Build the local Spark session used by every later cell.
#
# The Spark NLP jar is resolved from Maven via `spark.jars.packages`. Its
# version is taken from the installed Python package instead of being
# hard-coded, so the JVM side and the Python side cannot drift apart
# (a pinned "5.0.2" jar next to a 4.4.4 Python package is a mismatch).
spark_nlp_jar = f"com.johnsnowlabs.nlp:spark-nlp_2.12:{sparknlp.version()}"

spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", spark_nlp_jar)
    .getOrCreate()
)

In [3]:

# Fetch the pretrained GloVe word-embeddings model by name.
glove_model_name = "glove_100d"
word_embeddings = WordEmbeddingsModel.pretrained(glove_model_name)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [4]:
# Report the library versions in use: Spark NLP first, then Apache Spark.
version_report = (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
)
for label, version in version_report:
    print(label, version)

Spark NLP version:  4.4.4
Apache Spark version:  3.2.3


In [5]:
# Load the training file as text; the resulting DataFrame has a single
# string column named `value`.
training_data_path = "resources/training_data.txt"
training_data = spark.read.text(training_data_path)

# Inspect the schema, then preview the first rows.
training_data.printSchema()
training_data.show()

root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
|Factory Buys 32cm...|
|Beadlight Cirrus ...|
|Hamar Plant Stand...|
|Arrange Console T...|
|Artemis Oval Mirr...|
|Trim Sideboard 18...|
|Aster Door Mat - ...|
|Hamar Plant Stand...|
|Linear Wood table...|
|  Aqua Table PRODUCT|
|Taylor Dining Tab...|
|A- Joint Tabla PR...|
|A- Joint Round Ta...|
|Helborn Table PRO...|
|Wrongwoods Table ...|
|Cork Dining Table...|
|Crystal Brook 11 ...|
|Hampton's 11 Piec...|
|Eastport 11 Piece...|
|Republic 13 Piece...|
+--------------------+
only showing top 20 rows



In [6]:
# Name of the pretrained NER model to download. "ner_dl" was the alternative
# considered here; swap the value below to try it.
pretrained_model_name = "ner_dl_bert"

pretrained_ner_model = NerDLModel.pretrained(name=pretrained_model_name, lang="en")

ner_dl_bert download started this may take some time.
Approximate size to download 15.4 MB
[OK!]


In [7]:
# Download the pretrained English sentiment-analysis pipeline.
sentiment_pipeline_name = 'analyze_sentiment'
pipeline = PretrainedPipeline(sentiment_pipeline_name, 'en')

analyze_sentiment download started this may take some time.
Approx size to download 4.8 MB
[OK!]


In [8]:
# Annotate a short review and print the predicted sentiment labels.
# The misspelling "awoid" is kept verbatim: it appears to exercise the
# pipeline's spell-check stage (see the 'checked' output printed later).
review_text = 'This is a very boring movie. I recommend others to awoid this movie is not good..'
result = pipeline.annotate(review_text)

print(result['sentiment'])

['negative', 'negative', 'negative']


In [9]:

# Show the token list stored under the 'checked' key of the last annotation.
checked_tokens = result['checked']
print(checked_tokens)

['This', 'is', 'a', 'very', 'boring', 'movie', '.', 'I', 'recommend', 'others', 'to', 'avoid', 'this', 'movie', 'is', 'not', 'good', '.', '.']


In [10]:
# Download the pretrained English entity-recognition pipeline.
# NOTE: this rebinds `pipeline`, which held the 'analyze_sentiment' pipeline
# until now; re-running the earlier sentiment cells after this one will
# use the wrong pipeline.
ner_pipeline_name = 'recognize_entities_dl'
pipeline = PretrainedPipeline(ner_pipeline_name, 'en')

recognize_entities_dl download started this may take some time.
Approx size to download 159 MB
[OK!]


In [11]:
# Annotate one sentence, then print the per-token NER tags followed by the
# extracted entity strings.
announcement = 'Google has announced the release of a beta version of the popular TensorFlow machine learning library.'
result = pipeline.annotate(announcement)

for output_key in ('ner', 'entities'):
    print(result[output_key])

['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O']
['Google', 'TensorFlow']
