### Libraries

In [10]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# The Spark version and the Spark NLP version must be aligned (compatible with each other).

In [84]:
from sparknlp.base import DocumentAssembler, Finisher, Pipeline
from sparknlp.annotator import Lemmatizer, Stemmer, Tokenizer, Normalizer, StopWordsCleaner, PerceptronModel,LemmatizerModel
from sparknlp.pretrained import PretrainedPipeline

### Spark Session Builder

In [7]:
# Build (or reuse) a local Spark session named "nlp" that uses every local
# core and pulls the Spark NLP 3.4.4 (Scala 2.12) package.
# NOTE(review): spark.kryoserializer.buffer.max presumably only matters when
# spark.serializer is set to Kryo, which this builder does not do - confirm.
spark = (
    SparkSession.builder
    .appName("nlp")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.4")
    .getOrCreate()
)

# Only log warnings and errors from here on.
spark.sparkContext.setLogLevel("WARN")

### Data Extraction

In [8]:
# Location of the training sentences file, read as plain text with one row
# per line (single string column "value").
# The default is the original author's local path; set the
# WIKI_BIO_TRAIN_PATH environment variable to run the notebook elsewhere
# without editing this cell.
path = os.environ.get(
    "WIKI_BIO_TRAIN_PATH",
    "/home/rjac/workspace/wikipedia-biography-dataset/wikipedia-biography-dataset/train/train.sent",
)
df = spark.read.text(path)

In [9]:
# Preview the first five raw sentences, one field per line, cut at 250 chars.
df.show(n=5, truncate=250, vertical=True)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 value | walter extra is a german award-winning aerobatic pilot , chief aircraft designer and founder of extra flugzeugbau -lrb- extra aircraft construction -rrb- , a manufacturer of aerobatic aircraft . 
-RECORD 1---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 value | extra was trained as a mechanical engineer .                                                                                                                                                       
-RECORD 2---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [52]:
from pyspark.ml.feature import StopWordsRemover

In [58]:
# Start from Spark ML's built-in English stop word list (the same list a
# freshly constructed StopWordsRemover uses as its default).
stopwords = StopWordsRemover.loadDefaultStopWords("english")

In [64]:
# Pronouns that must NOT be treated as stop words; they are removed from the
# stop word list in the next cell so they survive the cleaning stage.
exclude = [
    "i",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "they", "them", "their", "theirs", "themselves",
    "she's", "he's",
]

In [65]:
# Drop the excluded pronouns from the stop word list, keeping the original order.
stopwords = [
    word
    for word in stopwords
    if word not in exclude
]

### Data Processing

In [139]:
# Entry stages: turn the raw "value" text column into Spark NLP documents,
# then split each document into tokens.
document_assambler = (
    DocumentAssembler()
    .setInputCol("value")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# Output 1 - lemmas: normalise and lowercase the tokens, drop the customised
# stop words (pronouns were removed from that list earlier), then lemmatise.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
)
stopword_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
    .setStopWords(stopwords)
)
# Model name pinned to the one the download log of this cell reports
# (lemma_antbnc), so the notebook does not depend on the library default.
lemmatizer = (
    LemmatizerModel.pretrained("lemma_antbnc", "en")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# Output 2 - part-of-speech tags. The tagger reads the same "cleanTokens"
# column as the lemmatizer; a later cell zips the two finished arrays together.
# `pretrained` is called on the class: instantiating PerceptronModel() first
# only creates an annotator that is thrown away immediately.
pos_tagger = (
    PerceptronModel.pretrained("pos_anc", "en")
    .setInputCols(["document", "cleanTokens"])
    .setOutputCol("pos")
)

# Result: expose lemmas and tags as plain string arrays ("tokens", "pos")
# while keeping the intermediate annotation columns in the DataFrame.
finisher = (
    Finisher()
    .setInputCols(["lemma", "pos"])
    .setOutputCols(["tokens", "pos"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]


In [140]:
# Chain the stages in the order each one consumes the previous one's output;
# the finisher goes last because it reads the "lemma" and "pos" columns.
nlp_pipeline = Pipeline(
    stages=[
        document_assambler,
        tokenizer,
        normalizer,
        stopword_cleaner,
        lemmatizer,
        pos_tagger,
        finisher,
    ]
)

In [141]:
# Fit the pipeline on the raw sentences to obtain a reusable pipeline model.
nlp_model = nlp_pipeline.fit(dataset=df)

In [142]:
# Run every sentence through the fitted pipeline; the result keeps the
# original "value" column and adds the annotation and finished columns.
processed_df = nlp_model.transform(dataset=df)

In [143]:
# Inspect the columns (and nested annotation structs) produced by the pipeline.
processed_df.printSchema()

root
 |-- value: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)


In [144]:
#docs = pipeline.annotate(df,"value")

In [145]:
#docs.printSchema()

In [146]:
#extractor = Finisher()

In [147]:
#extractor = extractor.setInputCols(["spell","pos"])
#extractor = extractor.setOutputAsArray(True)

In [148]:
#preproceed_df = extractor.transform(docs)

In [149]:
# Keep only the raw sentence plus the two finished array columns.
tokens = processed_df.select(["value", "tokens", "pos"])

In [150]:
# Preview five processed rows, one field per line, cut at 150 chars.
tokens.show(n=5, truncate=150, vertical=True)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------
 value  | walter extra is a german award-winning aerobatic pilot , chief aircraft designer and founder of extra flugzeugbau -lrb- extra aircraft construction... 
 tokens | [walter, extra, german, awardwinning, aerobatic, pilot, chief, aircraft, designer, founder, extra, flugzeugbau, lrb, extra, aircraft, construction,... 
 pos    | [NN, JJ, NN, VBG, JJ, NN, JJ, NN, NN, NN, JJ, NN, NN, NN, NN, NN, NN, NN, JJ, NN]                                                                      
-RECORD 1--------------------------------------------------------------------------------------------------------------------------------------------------------
 value  | extra was trained as a mechanical engineer .                                                                                                           
 tokens | [extra, train, mec

In [156]:
# Add "he": the number of finished tokens equal to 'he'. The sample output
# shows "his" lemmatised to "he", so possessive forms are counted as well.
df1 = tokens.selectExpr(
    "*",
    "size(filter(tokens, x -> x='he')) as he",
)

# Add "nouns": the tokens whose position-aligned POS tag is VBG or JJ.
# NOTE(review): the column is called "nouns" but the filter keeps VBG/JJ,
# which look like verb-gerund / adjective tags rather than noun tags (NN*) -
# confirm whether the name or the tag list is the intended one.
df2 = df1.selectExpr(
    "*",
    "filter(arrays_zip(tokens,pos), x -> x.pos in ('VBG','JJ') ).tokens as nouns",
)

df2.show()

+--------------------+--------------------+--------------------+---+--------------------+
|               value|              tokens|                 pos| he|               nouns|
+--------------------+--------------------+--------------------+---+--------------------+
|walter extra is a...|[walter, extra, g...|[NN, JJ, NN, VBG,...|  0|[extra, awardwinn...|
|extra was trained...|[extra, train, me...|   [NN, VBN, JJ, NN]|  0|        [mechanical]|
|he began his flig...|[he, begin, he, f...|[PRP, VBD, PRP$, ...|  2|        [transition]|
|he built and flew...|[he, build, fly, ...|[PRP, VBD, JJ, NN...|  2|[fly, special, ex...|
|extra began desig...|[extra, begin, de...|[NN, VBD, VBG, NN...|  0|[design, compete,...|
|his aircraft cons...|[he, aircraft, co...|[PRP$, NN, NNS, J...|  1|[revolutionize, f...|
|the german pilot ...|[german, pilot, k...|[JJ, NN, NN, NN, ...|  1|[german, fly, extra]|
|walter extra has ...|[walter, extra, d...|[NN, NNS, VBN, NN...|  0|[unlimited, aerob...|
|aaron hoh