In [12]:
import sparknlp

import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as fun
from pyspark.sql.types import *

import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from sparknlp.pretrained import PretrainedPipeline
from sparknlp import Finisher
# The Spark version and the Spark NLP version should be aligned (compatible releases).

In [4]:
# Build (or reuse) a local Spark session named "nlp" with the
# Spark NLP 3.4.4 / Scala 2.12 artifact requested via spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("nlp")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.4")
    .getOrCreate()
)

In [5]:
# Location of the training sentences file of the Wikipedia biography dataset.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path = (
    "/home/rjac/workspace/wikipedia-biography-dataset/"
    "wikipedia-biography-dataset/train/train.sent"
)

In [6]:
# Load the sentences file as a Spark DataFrame; the text reader exposes the
# content in a single string column named "value" (see the output of show()).
df = spark.read.text(path)

In [7]:
# Peek at the first 5 records, one field per line, cutting cells at 350 chars.
df.show(n=5, truncate=350, vertical=True)

[Stage 0:>                                                          (0 + 1) / 1]

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 value | walter extra is a german award-winning aerobatic pilot , chief aircraft designer and founder of extra flugzeugbau -lrb- extra aircraft construction -rrb- , a manufacturer of aerobatic aircraft . 
-RECORD 1---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 value | extra was trained as a mechanical engineer .                                                                                                                                                       
-RECORD 2---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [8]:
# Fetch (download on first use) the English "explain_document_ml" pretrained pipeline.
pipeline = PretrainedPipeline(name="explain_document_ml", lang="en")

explain_document_ml download started this may take some time.
Approx size to download 9.1 MB
[ | ]explain_document_ml download started this may take some time.
Approximate size to download 9.1 MB
[ \ ]Download done! Loading the resource.
[ | ]

[Stage 13:>                                                         (0 + 8) / 8]

[ / ]

                                                                                

[ \ ]



[ | ]

                                                                                

[OK!]


In [9]:
# Run the pretrained pipeline over the DataFrame, using "value" as the input text column.
docs = pipeline.annotate(df, column="value")

In [11]:
# Inspect the annotated DataFrame's columns (e.g. text, document, sentence, ...)
# and the struct layout of each annotation.
docs.printSchema()

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true

In [18]:
# Create a Spark NLP Finisher; its input columns are configured before it is
# applied to the annotated DataFrame `docs`.
extractor = Finisher()

In [24]:
# Configure the Finisher in one chained expression: read the "spell" and "pos"
# annotation columns and emit the results as arrays.
extractor = (
    extractor
    .setInputCols(["spell", "pos"])
    .setOutputAsArray(True)
)

In [27]:
# Apply the Finisher to the annotated DataFrame. The displayed result carries
# `text` plus the `finished_spell` and `finished_pos` array columns.
preproceed_df = extractor.transform(docs)

In [29]:
# Preview 5 finished records vertically, truncating each cell to 150 characters.
preproceed_df.show(n=5, truncate=150, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------
 text           | walter extra is a german award-winning aerobatic pilot , chief aircraft designer and founder of extra flugzeugbau -lrb- extra aircraft construction... 
 finished_spell | [waiter, extra, is, a, germane, awardwinning, aerobatic, pilot, ,, chief, aircraft, designer, and, founder, of, extra, flugzeugbau, -, lrb, -, extr... 
 finished_pos   | [NN, NN, VBZ, DT, NN, VBG, JJ, NN, ,, JJ, NN, NN, CC, NN, IN, JJ, NN, -, NN, -, JJ, NN, NN, -, NN, -, ,, DT, NN, IN, JJ, NN, .]                        
-RECORD 1----------------------------------------------------------------------------------------------------------------------------------------------------------------
 text           | extra was trained as a mechanical engineer .                                                                                        

                                                                                

In [30]:
# Materialise the finished Spark DataFrame as a pandas DataFrame on the driver.
# NOTE(review): no sampling/limit is applied, so the whole training set is
# collected into driver memory — confirm this fits, or sample first.
pandas_df = preproceed_df.toPandas()

[Stage 31:>                                                       (0 + 12) / 12]