In [58]:
##https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp.annotator.Tokenizer.html
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
spark = sparknlp.start()

In [59]:
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

In [60]:
ex_list = ["aprox.","pàg.","p.ex.","gen.","feb.","abr.","jul.","set.","oct.","nov.","des.","dr.","dra.","sr.","sra.","srta.","núm.","st.","sta.","pl.","etc.", "ex."]
#,"’", '”', "(", "[", "l'","l’","s'","s’","d’","d'","m’","m'","L'","L’","S’","S'","N’","N'","M’","M'"]
ex_list_all = []
ex_list_all.extend(ex_list)
ex_list_all.extend([x[0].upper() + x[1:] for x in ex_list])
ex_list_all.extend([x.upper() for x in ex_list])


In [433]:
text = "Anem-nos-en, dones d'aigua del delta. ALS adéus."
data = spark.createDataFrame([[text]]).toDF("text")

In [434]:
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token") \
    .setSuffixPattern("(\w*)(-la|-lo|-les|-los|-hi|-en|-ho|'n|'l|'ls|'m|'t|hi|ho|-LA|-LO|-LES|-LOS|-HI|-EN|-HO|'N|'L|'LS|'M|'T|HI|HO|)(.|,|;|:|!|\?|\)|\"|)\z") \
    .setInfixPatterns(["^(d'|l'|D'|L')(\w*)", "^(d|p|D|P)(el|els|EL|ELS)?$", "^(a|A)(l|ls|L|LS)?$", "(\w*)(-la|-lo|-les|-los|-nos|-vos|-te|-hi|-en|-ho|-n'|-l'|'ls|-m'|-t'|-hi|-ho|-LA|-LO|-LES|-LOS|-NOS|-VOS|-TE|-HI|-EN|-HO|-N'|-L'|'LS|-M'|-T'|-HI|-HO|)"]) \
    .setContextChars(['.', ',', ';', ':', '!', '?', '*', '-', '(', ')', '"', "'"]) \
    .setExceptions(ex_list_all).fit(data)

In [435]:
pipeline = Pipeline().setStages([documentAssembler, tokenizer]).fit(data)

In [436]:
result = pipeline.transform(data)

In [437]:
result.selectExpr("token.result").show(truncate=False)

+----------------------------------------------------------------------+
|result                                                                |
+----------------------------------------------------------------------+
|[Anem, -nos, -en, ,, dones, d', aigua, d, el, delta, ., A, LS, adéus.]|
+----------------------------------------------------------------------+



In [231]:
text = "Baixant de la font del gat pel gat als gats al delta."
data = spark.createDataFrame([[text]]).toDF("text")

In [232]:
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token") \
    .setInfixPatterns(["^(d|p)(el|els)?$", "^(a)(l|ls)?$"]) \
    .setExceptions(ex_list_all)

In [283]:
pipeline = Pipeline().setStages([documentAssembler, tokenizer]).fit(data)
result = pipeline.transform(data)
result.selectExpr("token.result").show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                  |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Dona-n'hi, tres, ., Dona'n, tres, ., Canta-les, Canta-les, ., Cana'ls, cançons, ., Anem-nos-en, d'aquí, ., Baixant, dels, camins, pels, gats, de, la, sra., Rovirda, .]|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [19]:
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token") \
    .setSuffixPattern("([^\s\w]?)('hi|ls|'l|'ns|'t|'m|'n|-n|-en|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)\z") \
    .setExceptions(ex_list_all)#.fit(data)
#tokenizer.addInfixPattern("([^\s\w]?)(-nos)")

In [20]:
pipeline = Pipeline().setStages([documentAssembler, tokenizer]).fit(data)
result = pipeline.transform(data)
result.selectExpr("token.result").show(truncate=False)

+------------------------------------------------------------------------------------------------------------+
|result                                                                                                      |
+------------------------------------------------------------------------------------------------------------+
|[Dona-n, 'hi, tres., Dona, 'n, tres., Canta, -les, Canta-les., Cana, ', ls, cançons., Anem-nos, -en, d'aquí]|
+------------------------------------------------------------------------------------------------------------+



In [112]:
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token") \
    .setSuffixPattern("([^\s\w]?)('hi|ls|'l|'ns|'t|'m|'n|-n|-en|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)\z")\
    .setInfixPatterns(["(^[\w]*)(-nos)"]) \
    .setExceptions(ex_list_all)#.fit(data)

In [113]:
pipeline = Pipeline().setStages([documentAssembler, tokenizer]).fit(data)
result = pipeline.transform(data)
result.selectExpr("token.result").show(truncate=False)

+----------------------------------------------------------------------------------------------------+
|result                                                                                              |
+----------------------------------------------------------------------------------------------------+
|[Dona-n'hi, tres., Dona'n, tres., Canta-les, Canta-les., Cana'ls, cançons., Anem, -nos, -en, d'aquí]|
+----------------------------------------------------------------------------------------------------+



In [49]:
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token") \
    .setSuffixPattern("([^\w]?)('hi|-en|-nos)\z").setExceptions(ex_list_all).setInfixPatterns(["(\w*)(-nos)"]).fit(data)

In [50]:
pipeline = Pipeline().setStages([documentAssembler, tokenizer]).fit(data)
result = pipeline.transform(data)
result.selectExpr("token.result").show(truncate=False)

+----------------------------------------------------------------------------------------------------+
|result                                                                                              |
+----------------------------------------------------------------------------------------------------+
|[Dona-n'hi, tres., Dona'n, tres., Canta-les, Canta-les., Cana'ls, cançons., Anem, -nos, -en, d'aquí]|
+----------------------------------------------------------------------------------------------------+



In [49]:
import re

In [82]:
hores = re.compile(r"([^\s\w]?)('m|'t|'l|'n|'-me|-te|-en|-la|-lo|-les|-los|-li|-ho|-m'|-t'|-n'|-l')(-en|'n|hi|ho|-ho)?(\Z|\W)")

In [84]:
re.search(hores, "canta-li-ho,")

<re.Match object; span=(5, 12), match='-li-ho,'>