In [6]:
import nltk
from nltk.stem.lancaster import LancasterStemmer
import string

stemmer = LancasterStemmer()
tbl = str.maketrans({key:None for key in string.punctuation})

def remove_punctuation(txt:str) -> str:
    return txt.translate(tbl)

def tokenize(txt:str):
    return nltk.word_tokenize(txt)

def stem(token):
    return [stemmer.stem(w.lower()) for w in token]

ex = "Hello world!"
print(ex)
ex = remove_punctuation(ex)
print(ex)
ex = tokenize(ex)
print(ex)
ex = stem(ex)
print(ex)

Hello world!
Hello world
['Hello', 'world']
['hello', 'world']


In [7]:
s = LancasterStemmer()

print(s.stem('seeing'))

see


In [14]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

doc = nlp("Hello dear Tom, how are you my friend ?")
for ent in doc.ents:
    print(ent.sentiment)

displacy.render(doc, style = "ent")

0.0


In [7]:
from nltk.stem import WordNetLemmatizer
import nltk
import spacy
import string
nlp = spacy.load("en_core_web_sm")
t = nlp("I live in the usa! We are living in the US")
lemmatizer = WordNetLemmatizer()

for token in t:
    #print(token.text," -> ", lemmatizer.lemmatize(token.text))
    if token.lemma_ not in string.punctuation:
        print(token.text," -> ", token.lemma_, " :: ", token.is_stop)
for ent in t.ents:
    print(ent.label_)


I  ->  -PRON-  ::  True
live  ->  live  ::  False
in  ->  in  ::  True
the  ->  the  ::  True
usa  ->  usa  ::  False
We  ->  -PRON-  ::  True
are  ->  be  ::  True
living  ->  live  ::  False
in  ->  in  ::  True
the  ->  the  ::  True
US  ->  US  ::  True
GPE


In [58]:
print(type(nlp))

<class 'spacy.lang.en.English'>


In [30]:
from spacy.tokens.token import Token
from spacy.tokens.doc import Doc
from typing import List, Iterable, Sequence, Any
from functools import partial
import pandas as pd

def tokenize(sentence : str, spacy_model, to_merge_entities:Sequence[str]=["GPE", "LOC", "PERSON"]) -> Doc:
    doc = spacy_model(sentence)

    # Retrieve entities to merge
    ent_to_split = {ent.text:ent.text.split(' ') for ent in doc.ents if ent.label_ in to_merge_entities}

    # Set all entities lemma to the merged name
    for complete_entity, splitted_entity in ent_to_split.items():
        merged_entity = "_".join(splitted_entity)
        for token in doc:
            if token.text in splitted_entity:
                token.lemma_ = merged_entity

    return doc

def filter(tokens:Doc, to_avoid:Sequence[str]) -> List[str]:
    returned_tokens = set()
    for token in tokens:
        if (token.text not in to_avoid) and (token.lemma_ not in to_avoid) and (not token.is_stop):
            returned_tokens.add(token.lemma_)
    return list(returned_tokens)

def sent_preprocess(sentence:str, spacy_model, to_avoid:Sequence[str]) -> List[str]:
    pipe = (partial(tokenize, spacy_model=spacy_model),
            partial(filter, to_avoid=to_avoid))
    x = sentence
    for f in pipe:
        x = f(x)
    return x


def df_preprocess(df:pd.DataFrame, column:str, inplace=False, spacy_model_name="en_core_web_sm"):
    import string
    import warnings
    from functools import partial
    try:
        import spacy
    except ImportError:
        warnings.warn("Please install spacy package", ImportWarning)

    try:
        nlp = spacy.load(spacy_model_name)
    except OSError as os_error:
        import sys
        import subprocess
        warnings.warn(f"spacy model {spacy_model_name} was not yet installed. Install it now.", ResourceWarning)
        subprocess.check_call([sys.executable, "-m", "spacy", "download", spacy_model_name])
        df_preprocess(df=df, column=column, inplace=inplace, spacy_model_name=spacy_model_name)

    to_avoid = list(string.punctuation+' ')

    try: # Speedup preprocess with parallelization
        from pandarallel import pandarallel
        pandarallel.initialize(progress_bar=False, verbose=0)

        preprocessed_column = df[column].parallel_apply(partial(sent_preprocess, spacy_model=nlp, to_avoid=to_avoid))
    except ImportError:
        preprocessed_column = df[column].apply(partial(sent_preprocess, spacy_model=nlp, to_avoid=to_avoid))


    preprocessed_df = df if inplace else df.copy(deep=True)

    preprocessed_df[column] = preprocessed_column

    return preprocessed_df





In [5]:
txt = "Book written by Dr. Mickael Jackson and Doctor Dylan Bryan in United States"

import spacy
nlp = spacy.load("en_core_web_md")

In [31]:
import string
to_avoid = list(string.punctuation+' ')
doc = sent_preprocess(txt, nlp, to_avoid=to_avoid)
print(doc)


['write', 'Doctor', 'Dr.', 'Dylan_Bryan', 'United_States', 'book', 'Mickael_Jackson']


In [1]:
print("prout")


prout
