In [None]:
from spacy.matcher import Matcher


def extract_spanish_complex_keywords(nlp, text):
    """Return the text of every multi-word phrase matched in ``text``.

    A fresh ``Matcher`` is built on ``nlp.vocab`` and four token patterns
    are registered on it:

    * ``NOUN-de-NOUN``:    NOUN, "de", NOUN
    * ``NOUN-ADJ``:        NOUN, ADJ
    * ``NOUN-del-NOUN``:   NOUN, "del", NOUN
    * ``NOUN-de-la-NOUN``: NOUN, "de", "la", NOUN

    Args:
        nlp: Pipeline object exposing ``vocab``; calling ``nlp(text)`` must
            return a document that can be sliced by token position.
        text: Text to analyse.

    Returns:
        list[str]: ``span.text`` of each match, in the order the matcher
        reports them. Every reported match is kept; no de-duplication is
        done and the pattern name of a match is not returned.
    """
    # (pattern name, token pattern) pairs, registered in this order.
    patterns = (
        ("NOUN-de-NOUN",
         [{"POS": "NOUN"}, {"LOWER": "de"}, {"POS": "NOUN"}]),
        ("NOUN-ADJ",
         [{"POS": "NOUN"}, {"POS": "ADJ"}]),
        ("NOUN-del-NOUN",
         [{"POS": "NOUN"}, {"LOWER": "del"}, {"POS": "NOUN"}]),
        ("NOUN-de-la-NOUN",
         [{"POS": "NOUN"}, {"LOWER": "de"}, {"LOWER": "la"}, {"POS": "NOUN"}]),
    )

    matcher = Matcher(nlp.vocab)
    for name, pattern in patterns:
        matcher.add(name, [pattern])

    # Run the pipeline exactly once; the same document is used both for
    # matching and for slicing out the matched spans.
    doc = nlp(text)
    return [doc[start:end].text for _match_id, start, end in matcher(doc)]


def concat_text_cols(row, text_columns):
    """Join the string-valued cells of ``row`` into a single text.

    Args:
        row: Mapping-like object indexable by column name (``row[name]``).
        text_columns: Iterable of column names, visited in order.

    Returns:
        str: Each cell that is a ``str`` followed by ``'. '``, concatenated
        in ``text_columns`` order. Cells that are not ``str`` instances are
        skipped; the result is ``''`` when nothing qualifies.
    """
    cells = (row[name] for name in text_columns)
    return ''.join(cell + '. ' for cell in cells if isinstance(cell, str))


def extract_tokens_from_text(text, nlp, dframcy, lowercase=True):
    """Extract complex keywords, verbs and nouns from one piece of text.

    Args:
        text: Text to analyse.
        nlp: Pipeline handed to ``extract_spanish_complex_keywords``.
        dframcy: Object providing ``nlp(text)`` and
            ``to_dataframe(doc, separate_entity_dframe=True)``; the latter
            must return an (annotation frame, entity frame) pair.
        lowercase: When true, ``text`` is lower-cased before any processing.

    Returns:
        tuple: ``(complex_keywords, verbs, nouns)`` where ``verbs`` and
        ``nouns`` are lists of ``token_text`` values taken from the rows of
        the annotation frame whose ``token_tag_`` equals ``"VERB"`` and
        ``"NOUN"`` respectively.
    """
    source = text.lower() if lowercase else text

    keyword_phrases = extract_spanish_complex_keywords(nlp, source)

    parsed = dframcy.nlp(source)
    # The entity frame is requested alongside the annotations but not used.
    token_frame, _entity_frame = dframcy.to_dataframe(
        parsed, separate_entity_dframe=True)

    # NOTE(review): selection is on `token_tag_`; whether that column holds
    # coarse labels such as "VERB"/"NOUN" depends on the loaded model --
    # confirm (`token_pos_` may be the intended column).
    def texts_tagged(tag):
        selected = token_frame.loc[token_frame['token_tag_'] == tag]
        return selected['token_text'].tolist()

    verb_texts = texts_tagged("VERB")
    noun_texts = texts_tagged("NOUN")
    return (keyword_phrases, verb_texts, noun_texts)


def concat_text_cols(row, text_columns):
    """Concatenate the string cells of ``row`` named in ``text_columns``.

    Args:
        row: Mapping-like object indexable by column name.
        text_columns: Iterable of column names, processed in order.

    Returns:
        str: Every ``str`` cell followed by ``'. '``, joined in order;
        non-``str`` cells contribute nothing.
    """
    # NOTE(review): this is a second definition of `concat_text_cols`; an
    # identical one appears earlier in this cell and is shadowed by this
    # one. One of the two can be removed.
    parts = []
    for name in text_columns:
        cell = row[name]
        if not isinstance(cell, str):
            continue
        parts.append(cell + '. ')
    return ''.join(parts)


def extract_df_tokens_inplace(df,
                              nlp,
                              dframcy,
                              text_columns=['subject',
                                            'mail_text', 'pdf_text'],
                              lowercase=True):
    """Add ``complex_keywords``, ``nouns`` and ``verbs`` columns to ``df``.

    For every row, the string cells listed in ``text_columns`` are joined
    with ``concat_text_cols`` and analysed with ``extract_tokens_from_text``.
    The three resulting lists are stored in the row's new columns.

    Args:
        df: DataFrame to annotate; it is modified in place.
        nlp: Pipeline forwarded to ``extract_tokens_from_text``.
        dframcy: DframCy-like object forwarded to ``extract_tokens_from_text``.
        text_columns: Names of the columns whose text is analysed. The
            default list is only read, never mutated.
        lowercase: When true, the joined text is lower-cased before analysis.

    Returns:
        None. ``df`` gains (or has overwritten) the columns
        ``complex_keywords``, ``nouns`` and ``verbs``, each holding one list
        per row.

    Notes:
        * No scratch column is written to ``df``: an existing column named
          ``content`` is left untouched instead of being overwritten and
          dropped.
        * Results are collected positionally and assigned as whole columns,
          so ``df`` is only modified once every row has been processed, and
          repeated index labels are handled.
        * ``tqdm`` is looked up in the notebook's globals when the function
          is called; it is imported in a later cell.
    """
    print("EXTRACTING TOKENS")

    keyword_lists, noun_lists, verb_lists = [], [], []
    for _, row in tqdm(df.iterrows(), desc='df rows - Keywords', total=df.shape[0]):
        content = concat_text_cols(row, text_columns)
        # Shared helper returns (complex_keywords, verbs, nouns).
        complex_keywords, verbs, nouns = extract_tokens_from_text(
            content, nlp, dframcy, lowercase=lowercase)
        keyword_lists.append(complex_keywords)
        noun_lists.append(nouns)
        verb_lists.append(verbs)

    # Same column order as before: complex_keywords, nouns, verbs.
    df["complex_keywords"] = keyword_lists
    df["nouns"] = noun_lists
    df["verbs"] = verb_lists

In [None]:
from dframcy import DframCy
import spacy
from tqdm import tqdm

# Load the 'es_core_news_sm' spaCy pipeline once; it is stored in the
# notebook-level name `nlp`.
nlp = spacy.load('es_core_news_sm')
# Wrap the pipeline in DframCy. The functions defined in this notebook take
# such an object as their `dframcy` argument and call its `.nlp(...)` and
# `.to_dataframe(...)` methods.
dframcy = DframCy(nlp)

In [None]:
def extract_keywords(row, nlp, dframcy, text_columns=['subject', 'mail_text', 'pdf_text']):
    """Annotate a single row with its complex keywords, verbs and nouns.

    The string cells named in ``text_columns`` are joined with
    ``concat_text_cols`` (the shared helper, instead of an inline copy of
    the same loop) and analysed with ``extract_tokens_from_text`` using its
    default ``lowercase`` setting.

    Args:
        row: Mapping-like object supporting ``row[name]`` reads and writes.
        nlp: Pipeline forwarded to ``extract_tokens_from_text``.
        dframcy: DframCy-like object forwarded to ``extract_tokens_from_text``.
        text_columns: Names of the cells whose text is analysed. The default
            list is only read, never mutated.

    Returns:
        The same ``row`` object, after ``row['complex_keywords']``,
        ``row['verbs']`` and ``row['nouns']`` have been set (in that order).
    """
    content = concat_text_cols(row, text_columns)
    complex_kws, verbs, nouns = extract_tokens_from_text(content, nlp, dframcy)
    row['complex_keywords'] = complex_kws
    row['verbs'] = verbs
    row['nouns'] = nouns
    return row

https://github.com/TeamSophia2/FinTree/blob/a474a948e6d4c050358e71ae963b3acd821a8553/notebooks/demos/normalize.ipynb