# NLP processing with NLTK
- Load and manage a corpus
- Tokenize
- POS
- Lemmatization and stemming
- NER
- Stanford NLP engine
- Pipelines for EN and ES

In [None]:
import nltk
import os

# Bare expression: evaluates to the installed NLTK version string
nltk.__version__


In [None]:
# The linguistic resources must be installed in the nltk_data dir.

# Check the nltk_data search path (a list of directories)
print(nltk.data.path)


In [None]:
# If you plan to use another dir, add it to the NLTK search path.

new_data_path = '/tmp'
# Guard the append so that re-running this cell does not add the same
# directory to the search path more than once.
if new_data_path not in nltk.data.path:
    nltk.data.path.append(new_data_path)

print(nltk.data.path)

In [None]:
# Download NLTK data resources (one call per resource identifier)
nltk.download('punkt') # Punkt Tokenizer Models

# The list of available resources is here: http://www.nltk.org/nltk_data/

# Load and manage a corpus

In [None]:
# Download the Brown corpus (needed before it can be imported and read below)
nltk.download('brown') 


In [None]:
# Once downloaded, the corpus can be imported
from nltk.corpus import brown

# Total number of word tokens in the corpus
print('Corpus len:', len(brown.words()))

# Plain tokens
print('The first 10 words:', brown.words()[0:10])

# Same tokens, each paired with its tag
print('The first 10 tagged words:', brown.tagged_words()[0:10])

# Tokenization
- Tokenize sentences
- Tokenize words

In [None]:
# Use the predefined sentence tokenizer.
from nltk.tokenize import sent_tokenize

# Sample text whose sentences end in '.', '?' and '!'
text = "this’s a sent tokenize test. tis is sennt two. is this sent three? sent 4 is cool! Now it’s your turn."
# Split the text into sentences
sent_tokenize_list = sent_tokenize(text)

print(sent_tokenize_list)

In [None]:
# Explicitly load and use the Punkt English sentence tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# `text` is the sample string assigned in the sent_tokenize cell
tokenizer.tokenize(text)

In [None]:
# Explicitly load and use the Punkt Spanish sentence tokenizer
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
# The sample uses the Spanish opening marks '¡' and '¿'
spanish_tokenizer.tokenize('¡Buenos días! ¿Estas bien?')


In [None]:
# List of sentence tokenizers available in the punkt module.
# nltk.data.find() searches every directory in nltk.data.path, so this works
# wherever punkt was installed instead of assuming it lives in the first
# entry; it also avoids building the path by string concatenation.
os.listdir(str(nltk.data.find('tokenizers/punkt')))




In [None]:
# Word tokenizer
from nltk.tokenize import word_tokenize

# Punctuation example
print(word_tokenize('Hello World!'))
# Contraction example
print(word_tokenize("Can't is a contraction."))


In [None]:
# Other word tokenizers
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer

# Run the same sentence through each tokenizer so their outputs can be
# compared line by line (whitespace-based first, then word/punctuation-based).
for tokenizer in (WhitespaceTokenizer(), WordPunctTokenizer()):
    print(tokenizer.tokenize("Can't is a contraction."))

In [None]:
# Word tokenization must be applied to each sentence of the text,
# not to the raw text as a whole.

text = "El Dpto de RR.HH. ha lanzado 1.000 ofertas de trabajo en Buenos Aires. " \
       "3,25€ perdidos en Madrid el 2/11/2017. "\
       "Las herramientas [h1 y h2] son compatibles."

# Load the Spanish Punkt model with nltk.data.load, the same loader the other
# cells use, rather than reaching for `load` through the nltk.tokenize package.
sent_tok = nltk.data.load('tokenizers/punkt/spanish.pickle')
word_tok = nltk.tokenize.TreebankWordTokenizer()

# Step 1: split the text into sentences.
sents = sent_tok.tokenize(text)

# Step 2: tokenize every sentence and collect all tokens in one flat list.
tokens = []
for s in sents:
    tokens.extend(word_tok.tokenize(s))

print(tokens)




In [None]:
# Stopwords
nltk.download('stopwords')

from nltk.corpus import stopwords

# A set gives fast membership tests for the filter below
english_stops = set(stopwords.words('english'))

words = ["Can't", 'is', 'a', 'contraction']
# Keep only the words that are not in the stopword set (exact, case-sensitive match)
words_clean = [word for word in words if word not in english_stops]
print(words_clean)

In [None]:
# File ids of the stopword lists shipped with the resource
print('Available stopwords lists:', stopwords.fileids())

# Full Spanish stopword list
print('Stop words spanish:', stopwords.words('spanish'))

## Regular expressions
- https://www.regular-expressions.info/ 
- https://www.regextester.com/ 

In [None]:
# Regular expressions to clean text
import re

def clean_text(text):
    """Normalize raw text with a few regular-expression substitutions.

    Steps, applied in this order:

    1. Lowercase the whole text.
    2. Replace every number made of two or more characters (optional leading
       "-", digits, optional single ".", digits) with the placeholder
       "DIGIT". Single-digit numbers are intentionally left untouched.
    3. Replace each run of the characters ``[ ] / { } ⋅ −`` with one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. The "DIGIT" placeholder stays uppercase because
        it is inserted after the lowercasing step.
    """
    # Lowercase
    text_clean = text.lower()

    # Replace integer/float numbers, including negatives. The pattern needs at
    # least two digits, so 1-digit numbers are not replaced.
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    text_clean = re.sub(r"-?\d+\.?\d+", "DIGIT", text_clean)

    # Replace runs of [ ] / { } ⋅ − with a single space (raw string for the
    # same reason: "\[" and "\]" are not valid string escapes).
    text_clean = re.sub(r"[\[\]/{}⋅−]+", " ", text_clean)

    # Other cleaning options

    return text_clean

# Sample with a positive float, a negative float and square brackets
text = "Los datos son 23.5 y -12.8 [Medidos en unidades]."
# Print the text before and after cleaning for comparison
print(text)
print(clean_text(text))




# POS

In [None]:
# Download the POS model
nltk.download('averaged_perceptron_tagger')

# The POS model must be applied to word-tokenized text
# NOTE: `text` is rebound here to the list of tokens, not the raw string
text = nltk.word_tokenize("dive into NLTK: Part-of-speech tagging and POS Tagger")
print(text)
# Use the recommended part-of-speech tagger
print(nltk.pos_tag(text))

In [None]:
# Understand the tagset of the POS model.
# The tag documentation is a separate resource; download it like the other
# resources used in this notebook so the lookup works on a fresh install.
nltk.download('tagsets')

# upenn_tagset() prints the tag description itself and returns None, so it is
# called directly: wrapping it in print() would emit an extra "None" line.
nltk.help.upenn_tagset('JJ')

## Stemming

In [None]:
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

# One instance per stemming algorithm; the names are kept as notebook globals.
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer("english")

# The same sample words are run through every stemmer for comparison.
stem_samples = ['maximum', 'presumably', 'multiply', 'provision', 'saying']
stemmer_demos = [
    ('Porter stemmer:', porter_stemmer),
    ('Lancaster stemmer:', lancaster_stemmer),
    ('Snowball stemmer:', snowball_stemmer),
]

for demo_index, (title, demo_stemmer) in enumerate(stemmer_demos):
    print(title)
    print('---------------')
    for sample in stem_samples[:-1]:
        print(demo_stemmer.stem(sample))
    # Every section except the last ends with an extra '\n' argument, which
    # visually separates it from the next section.
    if demo_index < len(stemmer_demos) - 1:
        print(demo_stemmer.stem(stem_samples[-1]), '\n')
    else:
        print(demo_stemmer.stem(stem_samples[-1]))



In [None]:
from nltk.stem import SnowballStemmer
# SnowballStemmer.languages holds the supported language names; join them on one line
print('Available languages in snowball stemmer:', " ".join(SnowballStemmer.languages))

In [None]:
# Test for the Spanish language
from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer()
# Bare expression: evaluates to the stem of the Spanish word
stemmer.stem("Semanalmente")

## Lemmatizer

In [None]:
# The lemmatizer needs the WordNet resource; download it like the other
# resources in this notebook so the cell also works on a fresh install.
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()


# Without a POS argument the lemmatizer uses its default part of speech.
print('Sample of lemmatizations:')
print(wordnet_lemmatizer.lemmatize('dogs'))
print(wordnet_lemmatizer.lemmatize('churches'))
print(wordnet_lemmatizer.lemmatize('abaci'))
print(wordnet_lemmatizer.lemmatize('are'), '\n')

# Lemmatization with POS: compare the result for a verb form with and
# without telling the lemmatizer that the word is a verb (pos='v').

print('Lemma of "is", no POS:', wordnet_lemmatizer.lemmatize('is'))

print('Lemma of "is", whit POS:', wordnet_lemmatizer.lemmatize('is', pos='v'))
print('Lemma of "are", whit POS:', wordnet_lemmatizer.lemmatize('are', pos='v'))

## Integrated process

In [None]:
# Map the Treebank POS tags to WordNet-compatible POS tags
# - The recommended POS tagger uses different codes for the POS labels than the ones the WordNet lemmatizer needs
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Only the first letter of the tag is inspected:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV.

    Args:
        treebank_tag: POS tag string as produced by the POS tagger.

    Returns:
        The corresponding ``wordnet`` constant, or None when the tag has no
        mapping, so callers can test the result with a simple ``is None``.
    """
    # Attribute names are stored instead of the constants themselves so that
    # `wordnet` is only touched when a mapping actually exists.
    attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = attr_by_initial.get(treebank_tag[:1])
    if attr_name is None:
        return None
    return getattr(wordnet, attr_name)

In [None]:
text = "You are good friends. We had two houses that are bigger!"

# Tokenize into sentences with the English Punkt model: the text is English,
# whereas `sent_tok` from the earlier cell holds the Spanish model.
english_sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')
sents = english_sent_tok.tokenize(text)

# For each sentence:
# - tokenize into words
# - tag the tokens with the POS model
# - lemmatize each token, passing the POS when WordNet has an equivalent
tokens = []
tokens_stem = []  # NOTE: holds lemmas, not stems; name kept for compatibility
for s in sents:
    t = word_tok.tokenize(s)
    tagged = nltk.pos_tag(t)
    tokens.append(t)
    lemma_list = []
    for word, tag in tagged:
        wntag = get_wordnet_pos(tag)
        if wntag is None:  # no WordNet equivalent: use the lemmatizer default
            lemma = wordnet_lemmatizer.lemmatize(word)
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=wntag)
        lemma_list.append(lemma)
    tokens_stem.append(lemma_list)
print(tokens)
print(tokens_stem)
 