### Libraries

In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

# The Spark version and the Spark NLP version should be aligned

In [47]:
from sparknlp.base import DocumentAssembler, Finisher, Pipeline
from sparknlp.annotator import Tokenizer, Normalizer, StopWordsCleaner, PerceptronModel,LemmatizerModel,BertEmbeddings,NerDLModel,WordEmbeddingsModel
#from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml.feature import StopWordsRemover

### Spark Session Builder

In [10]:
# Build (or reuse) a local Spark session that uses every available core and
# pulls the Spark NLP jar through `spark.jars.packages`.
spark = (
    SparkSession.builder
    .appName("nlp")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.4")
    .getOrCreate()
)

# Only warnings and errors are logged, which keeps the cell outputs readable.
spark.sparkContext.setLogLevel("WARN")

### Data Extraction

In [245]:
# Location of the training sentences. The original local path is kept as the
# default; set the WIKI_BIO_TRAIN_SENT environment variable to read the file
# from somewhere else without editing the notebook.
path = os.environ.get(
    "WIKI_BIO_TRAIN_SENT",
    "/home/rjac/workspace/wikipedia-biography-dataset/wikipedia-biography-dataset/train/train.sent",
)
# Reading as text produces one row per line in a single string column "value".
df = spark.read.text(path)

In [246]:
# Preview the first rows, one field per line, cutting values at 250 characters.
df.show(n=5, truncate=250, vertical=True)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 value | walter extra is a german award-winning aerobatic pilot , chief aircraft designer and founder of extra flugzeugbau -lrb- extra aircraft construction -rrb- , a manufacturer of aerobatic aircraft . 
-RECORD 1---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 value | extra was trained as a mechanical engineer .                                                                                                                                                       
-RECORD 2---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [247]:
#df = df.selectExpr('*','initCap(value) as cap')

In [248]:
# Pronouns that are taken out of the default stop-word list, so the stop-word
# cleaner leaves them in the text.
exclude = [
    "i",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "they", "them", "their", "theirs", "themselves",
    "she's", "he's",
]
# Default Spark ML stop words minus the pronouns above (original order kept).
stopwords = list(
    filter(lambda word: word not in exclude, StopWordsRemover().getStopWords())
)

### Data Processing

In [249]:
# Wrap the raw "value" text into a Spark NLP document annotation
# (cleanup mode "shrink").
document_assambler = (
    DocumentAssembler()
    .setInputCol("value")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)
# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

In [250]:
#output1
# Stage order: stop words are removed from the raw tokens first, the surviving
# tokens are then lower-cased by the Normalizer, and both the POS tagger and
# the lemmatizer read that same "normalized" column.
# (An earlier variant normalized the raw "token" column directly.)
stopword_cleaner = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
    .setStopWords(stopwords)
)
normalizer = (
    Normalizer()
    .setInputCols(["cleanTokens"])
    .setOutputCol("normalized")
    .setLowercase(True)
)
# pretrained() is a factory exposed on the class, so it is called directly
# instead of on a throwaway PerceptronModel() instance.
pos_tagger = (
    PerceptronModel.pretrained("pos_ud_ewt", "en")
    .setInputCols(["document", "normalized"])
    .setOutputCol("pos")
)
# The model name is pinned explicitly ("lemma_antbnc" is the model the default
# call downloads) so the notebook does not depend on the library's default.
lemmatizer = (
    LemmatizerModel.pretrained("lemma_antbnc", "en")
    .setInputCols(["normalized"])
    .setOutputCol("lemma")
)

pos_ud_ewt download started this may take some time.
Approximate size to download 2.2 MB
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [251]:
#output2
#pos_tagger = PerceptronModel().pretrained("pos_ud_ewt", 'en').setInputCols(["document","token"]).setOutputCol("pos")

In [252]:

#output3
#embeddings = WordEmbeddingsModel().pretrained().setInputCols("document", "cleanTokens").setOutputCol("embeddings")
#ner_model = NerDLModel.pretrained().setInputCols(["document", "cleanTokens", "embeddings"]).setOutputCol("ner")

In [253]:
#Result
# Turn the "lemma" and "pos" annotations into plain arrays named "tokens" and
# "pos"; the annotation columns are kept (cleanAnnotations is False).
# NOTE(review): the finished "pos" output has the same name as the POS tagger's
# annotation column - confirm that replacing it is intended.
finisher = (
    Finisher()
    .setInputCols(["lemma", "pos"])
    .setOutputCols(["tokens", "pos"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

In [254]:
# Assemble the stages in execution order: each stage reads columns written by
# an earlier one.
nlp_pipeline = Pipeline(
    stages=[
        document_assambler,
        tokenizer,
        stopword_cleaner,
        normalizer,
        pos_tagger,
        lemmatizer,
        finisher,
    ]
)

In [255]:
# Fit the pipeline on the raw sentence DataFrame to obtain a pipeline model.
nlp_model = nlp_pipeline.fit(df)

In [256]:
# Apply the fitted pipeline to the same sentences; adds the annotation columns
# and the finished "tokens" / "pos" arrays.
processed_df  = nlp_model.transform(df)

In [257]:
# Inspect the columns produced by the pipeline stages.
processed_df.printSchema()

root
 |-- value: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)


In [258]:
# Keep only the raw sentence and the two finished arrays.
tokens = processed_df.select(["value", "tokens", "pos"])

In [259]:
# Preview sentence, lemmatized tokens and POS tags side by side.
tokens.show(n=5, truncate=150, vertical=True)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------
 value  | walter extra is a german award-winning aerobatic pilot , chief aircraft designer and founder of extra flugzeugbau -lrb- extra aircraft construction... 
 tokens | [walter, extra, german, awardwinning, aerobatic, pilot, chief, aircraft, designer, founder, extra, flugzeugbau, lrb, extra, aircraft, construction,... 
 pos    | [NOUN, ADJ, ADJ, VERB, ADJ, NOUN, ADJ, NOUN, NOUN, NOUN, ADJ, NOUN, NOUN, ADJ, NOUN, NOUN, NOUN, NOUN, ADJ, NOUN]                                      
-RECORD 1--------------------------------------------------------------------------------------------------------------------------------------------------------
 value  | extra was trained as a mechanical engineer .                                                                                                           
 tokens | [extra, train, mec

In [266]:
# Derive per-sentence features, one column per step (later steps reference
# columns added by earlier ones, so they cannot be merged into one selectExpr):
#   male_pronoun   - number of tokens equal to 'he'
#   female_pronoun - number of tokens equal to 'she'
#   nouns          - tokens whose POS tag is VERB, NOUN or ADJ (despite the name)
#   gender         - (male - female) / (male + female + 0.001); the 0.001 avoids
#                    a zero denominator when no pronoun is present
preprocess_ds = (
    tokens
    .selectExpr("*", "size(filter(tokens, x -> x='he')) as male_pronoun")
    .selectExpr("*", "size(filter(tokens, x -> x='she')) as female_pronoun")
    .selectExpr("*", "filter(arrays_zip(tokens,pos), x -> x.pos in ('VERB','NOUN','ADJ')).tokens as nouns")
    .selectExpr("*", "(male_pronoun-female_pronoun)/((male_pronoun+female_pronoun)+0.001) as gender")
)


In [267]:
# Preview the derived pronoun counts and gender score before filtering.
preprocess_ds.show(n=5, truncate=150, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------
 value          | walter extra is a german award-winning aerobatic pilot , chief aircraft designer and founder of extra flugzeugbau -lrb- extra aircraft construction... 
 tokens         | [walter, extra, german, awardwinning, aerobatic, pilot, chief, aircraft, designer, founder, extra, flugzeugbau, lrb, extra, aircraft, construction,... 
 pos            | [NOUN, ADJ, ADJ, VERB, ADJ, NOUN, ADJ, NOUN, NOUN, NOUN, ADJ, NOUN, NOUN, ADJ, NOUN, NOUN, NOUN, NOUN, ADJ, NOUN]                                      
 male_pronoun   | 0                                                                                                                                                      
 female_pronoun | 0                                                                                                                                   

In [268]:
# Keep only sentences whose gender score is clearly skewed one way or the
# other (absolute value above 0.25).
preprocess_ds = preprocess_ds.where(F.abs(F.col("gender")) > 0.25)

In [269]:
# Preview the sentences that passed the gender-score filter.
preprocess_ds.show(n=5, truncate=150, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------
 value          | he began his flight training in gliders , transitioning to powered aircraft to perform aerobatics .                                                    
 tokens         | [he, begin, he, flight, training, glider, transition, power, aircraft, perform, aerobatic]                                                             
 pos            | [PRON, VERB, PRON, NOUN, NOUN, NOUN, VERB, VERB, NOUN, NOUN, NOUN]                                                                                     
 male_pronoun   | 2                                                                                                                                                      
 female_pronoun | 0                                                                                                                                   

                                                                                

In [270]:
# Number of sentences left after the gender-score filter. count() is a Spark
# action, so this cell runs the pipeline over the data.
preprocess_ds.count()

[Stage 64:>                                                       (0 + 12) / 12]