In [2]:
# Imports first, then the session bootstrap.
import pandas as pd
import sparknlp

# Start the Spark session through Spark NLP with the spark32 flag enabled.
spark = sparknlp.start(spark32=True)

In [3]:
# Report the Spark NLP and Apache Spark versions of the running session.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  3.4.4
Apache Spark version:  3.2.1


In [4]:
from sparknlp.annotator import *
from sparknlp.base import *

In [5]:
# Entry stage: reads the raw "text" column and writes "document" annotations,
# using the "shrink_full" cleanup mode.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink_full")
)

In [7]:
# Pretrained sentence detector ("sentence_detector_dl", language code "xx"):
# consumes "document" and produces "sentence".
sentencerDL = (
    SentenceDetectorDLModel
    .pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]


In [9]:
# RoBERTa embeddings loaded from a saved Spark NLP model directory
# (relative path, so it must exist under the working directory).
# Consumes "sentence" and "token", produces "embeddings"; case-sensitive.
embeddings = (
    RoBertaEmbeddings
    .load("PlanTL-GOB-ES/roberta-base-ca_spark_nlp")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
    .setCaseSensitive(True)
)

In [10]:
# Pools the "embeddings" column into "sentence_embeddings" with the
# "AVERAGE" pooling strategy.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentence", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

In [11]:
# Exposes "sentence_embeddings" as "finished_embeddings" (vector output on),
# while keeping the original annotation columns (clean-annotations off).
embeddingsFinisher = (
    EmbeddingsFinisher()
    .setInputCols(["sentence_embeddings"])
    .setOutputCols("finished_embeddings")
    .setOutputAsVector(True)
    .setCleanAnnotations(False)
)

In [12]:
# Abbreviations (lower-case base forms) that are later handed to the
# tokenizer as exceptions.
ex_list = [
    "aprox.", "pàg.", "p.ex.", "gen.", "feb.", "abr.", "jul.", "set.",
    "oct.", "nov.", "des.", "dr.", "dra.", "sr.", "sra.", "srta.",
    "núm.", "st.", "sta.", "pl.", "etc.", "ex.",
]
# Disabled candidates, kept for reference:
#,"’", '”', "(", "[", "l'","l’","s'","s’","d’","d'","m’","m'","L'","L’","S’","S'","N’","N'","M’","M'"]

# Full exception list, in order: the base forms, then the same forms with
# only the first character upper-cased, then the fully upper-cased forms.
ex_list_all = (
    list(ex_list)
    + [abbr[0].upper() + abbr[1:] for abbr in ex_list]
    + [abbr.upper() for abbr in ex_list]
)

In [258]:
# Tokenizer configuration: splits "sentence" annotations into "token".
#
# The regexes below are written as raw strings so that `\?`, `\)`, `\(` and
# `\z` are passed through verbatim. In ordinary string literals these are
# invalid Python escape sequences, which emit a DeprecationWarning /
# SyntaxWarning. The pattern text handed to the tokenizer is unchanged.
#
# NOTE(review): the patterns are kept exactly as they were, but a few
# constructs look unintended and should be confirmed:
#   * `[A-zÀ-ú]` - the ASCII range `A-z` also covers the punctuation
#     characters that sit between `Z` and `a`.
#   * the last suffix group starts with an unescaped `.` (any character).
#   * `", »` in the last suffix group is one four-character alternative
#     (quote, comma, space, guillemet), not two separate alternatives.

# Characters treated as context characters around a token.
tokenizer_context_chars = ['.', ',', ';', ':', '!', '?', '*', '-', '(', ')', '"', "'", "«", "»"]

# Suffix rule: word body, optional trailing clitic, optional trailing character.
tokenizer_suffix_pattern = r"""([A-zÀ-ú]*)(-la|-lo|-les|-los|-hi|-en|-ho|'n|'l|'ls|'m|'t|hi|ho|-LA|-LO|-LES|-LOS|-HI|-EN|-HO|'N|'L|'LS|'M|'T|HI|HO|)(.|,|;|:|!|\?|\)|", »|)\z"""

tokenizer_infix_patterns = [
    # Elided d'/l' in front of a word, optionally after an opening mark.
    r"""("|«|¿|\(|^)(d'|l'|D'|L')([A-zÀ-ú]*)""",
    # Contractions del/dels/pel/pels split into (d|p) + (el|els).
    r"""("|«|¿|\(|^)(d|p|D|P)(el|els|EL|ELS)$""",
    # Contractions al/als split into a + (l|ls).
    r"""("|«|¿|\(|^)(a|A)(l|ls|L|LS)$""",
    # Word body followed by an optional hyphenated/apostrophised clitic.
    "([A-zÀ-ú]*)(-la|-lo|-les|-los|-nos|-vos|-te|-hi|-en|-ho|-n'|-l'|'ls|-m'|-t'|-hi|-ho|-LA|-LO|-LES|-LOS|-NOS|-VOS|-TE|-HI|-EN|-HO|-N'|-L'|'LS|-M'|-T'|-HI|-HO|)",
]

tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
    .setContextChars(tokenizer_context_chars)
    .setSuffixPattern(tokenizer_suffix_pattern)
    .setInfixPatterns(tokenizer_infix_patterns)
    # Abbreviations from ex_list_all are registered as tokenizer exceptions.
    .setExceptions(ex_list_all)
)

In [72]:
# Lower-cases "token" into "form". The only cleanup pattern supplied is a
# newline followed by a space.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("form")
    .setLowercase(True)
    .setCleanupPatterns(["\n "])
)

In [15]:
# Pretrained stop-word cleaner ("stopwords_iso", language "ca"):
# reads "token" and writes "cleanTokens".
stop_words = (
    StopWordsCleaner
    .pretrained("stopwords_iso", "ca")
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
)

stopwords_iso download started this may take some time.
Approximate size to download 2 KB
[ | ]stopwords_iso download started this may take some time.
Approximate size to download 2 KB
[ / ]Download done! Loading the resource.
[OK!]


In [16]:
# Dictionary-based lemmatizer over the normalized "form" column.
# The dictionary file is read from a relative path, with a tab and a single
# space passed as the two delimiter arguments.
lemmatizer = (
    Lemmatizer()
    .setInputCols(["form"])
    .setOutputCol("lemma")
    .setDictionary("ca_lemma_dict.tsv", "\t", " ")
)

In [17]:
# Pretrained part-of-speech tagger ("pos_ud_ancora", language "ca"):
# reads "document" and "token", writes "pos".
pos = (
    PerceptronModel
    .pretrained("pos_ud_ancora", "ca")
    .setInputCols(["document", "token"])
    .setOutputCol("pos")
)

pos_ud_ancora download started this may take some time.
Approximate size to download 2 MB
[ | ]pos_ud_ancora download started this may take some time.
Approximate size to download 2 MB
[ \ ]Download done! Loading the resource.




[OK!]


In [19]:
# Token-classification NER model loaded from a saved Spark NLP model
# directory (relative path). Only the output column is set here.
# NOTE(review): input columns are presumably those stored with the saved
# model - confirm.
ner_model_path = "projecte-aina/roberta-base-ca-cased-ner_spark_nlp"
ner = RoBertaForTokenClassification.load(ner_model_path)
# Left as the final bare expression so the cell still displays the model.
ner.setOutputCol("ner")

RoBertaForTokenClassification_e660c813981d

In [20]:
# Converts the "ner" tags (together with "document" and "token") into the
# "entities" output column.
nerconverter = (
    NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("entities")
)

In [353]:
# POS-pattern chunker: reads "sentence" and "pos", writes "chunk".
# Each parser is a pattern over POS tags.
chunker = (
    Chunker()
    .setInputCols(["sentence", "pos"])
    .setOutputCol("chunk")
    .setRegexParsers([
        "<DET>*<ADV>*<ADJ>*<NOUN>+<ADV>*<ADJ>*",
        "<DET>*<PROPN>+",
        "<DET>+<ADV>*<ADJ>+<ADV>*",
        "<PRON>",
    ])
)

In [354]:
# Assemble every stage into one pipeline; the order is significant because
# each stage reads columns written by earlier ones.
nlpPipeline = Pipeline(
    stages=[
        # text -> document -> sentence -> token
        documentAssembler,
        sentencerDL,
        tokenizer,
        # token -> form (lower-cased)
        normalizer,
        # token -> cleanTokens (no later stage in this list reads cleanTokens)
        stop_words,
        # token embeddings -> sentence embeddings -> finished vectors
        embeddings,
        embeddingsSentence,
        embeddingsFinisher,
        # form -> lemma
        lemmatizer,
        # tagging, NER and chunking
        pos,
        ner,
        nerconverter,
        chunker,
    ]
)

In [355]:
text = "Veig a l'home dels Estats Units amb el telescopi."

# The pipeline is fitted on a one-row frame holding an empty string; the
# sample sentence is only used for the transform step.
empty_df = spark.createDataFrame([[""]]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

spark_df = spark.createDataFrame([[text]]).toDF("text")
result = pipelineModel.transform(spark_df)

In [356]:
# Wrap the fitted pipeline in a LightPipeline; it is used further down to
# annotate a plain Python string via `annotate`.
# NOTE(review): LightPipeline is presumably already in scope through the
# earlier `from sparknlp.base import *` - confirm before removing this import.
from sparknlp.base import LightPipeline
light_model = LightPipeline(pipelineModel)

In [357]:
# Sample input made of three short sentences; it exercises contractions
# ("del"), elision ("l'aigua") and hyphenated clitics ("anem-nos-en").
text = (
    "venien (del delta) a buscar l'aigua. "
    "anem-nos-en de la casa. "
    "ella."
)

In [358]:
# Annotate the sample string and tabulate the per-token outputs side by side.
light_result = light_model.annotate(text)

# zip stops at the shortest column, so rows are aligned by position.
token_columns = ["token", "lemma", "pos", "ner"]
token_rows = zip(*(light_result[column] for column in token_columns))
result = pd.DataFrame(token_rows, columns=token_columns)
print(result)

     token   lemma    pos ner
0   venien   venir   VERB   O
1        (       (  PUNCT   O
2        d      de   NOUN   O
3       el      el    DET   O
4    delta   delta   NOUN   O
5        )       )  PUNCT   O
6        a       a    ADP   O
7   buscar  buscar   VERB   O
8       l'      el    DET   O
9    aigua   aigua   NOUN   O
10       .       .  PUNCT   O
11    anem    anar   VERB   O
12    -nos     nos  PROPN   O
13     -en     -en  PROPN   O
14      de      de    ADP   O
15      la      la    DET   O
16    casa    casa   NOUN   O
17       .       .  PUNCT   O
18    ella     ell   PRON   O
19       .       .  PUNCT   O


In [339]:
# Show the "entities" column, i.e. the output column of the NerConverter stage.
# NOTE(review): the printed label is spelled "entites"; left unchanged here so
# it keeps matching the recorded cell output.
print("entites:", light_result['entities'])

entites: []


In [359]:
# Show the "chunk" column, i.e. the output column of the Chunker stage.
print("chunk:", light_result['chunk'])

chunk: ['d', 'el delta', "l'aigua", 'la casa', '-nos-en', 'ella']
