# NLP processing with NLTK
- Load and manage a corpus
- Tokenize
- POS
- Lemmatization and stemming
- NER
- Stanford NLP engine
- Pipelines for EN and ES

In [1]:
import nltk
import os

nltk.__version__


'3.2.5'

In [2]:
# The linguistic resources must be installed in one of the nltk_data directories.

# Show the list of directories where NLTK searches for its resources
print(nltk.data.path)


['/Users/jorge/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/Users/jorge/anaconda3/nltk_data', '/Users/jorge/anaconda3/lib/nltk_data']


In [3]:
# If you plan to keep NLTK resources in another directory, register it.
# The membership check keeps this cell idempotent: re-running it does not
# append the same directory a second time.
new_data_path = '/tmp'
if new_data_path not in nltk.data.path:
    nltk.data.path.append(new_data_path)

print(nltk.data.path)

['/Users/jorge/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/Users/jorge/anaconda3/nltk_data', '/Users/jorge/anaconda3/lib/nltk_data', '/tmp']


In [4]:
# Download the NLTK resources required by the tokenizers used in this notebook
nltk.download('punkt') # Punkt tokenizer models (loaded later from tokenizers/punkt/<language>.pickle)

# The list of available resources is published at: http://www.nltk.org/nltk_data/ 

[nltk_data] Downloading package punkt to /Users/jorge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load and manage a corpus

In [5]:
# Download the Brown corpus (it is loaded through nltk.corpus.brown in the next cell)
nltk.download('brown') 


[nltk_data] Downloading package brown to /Users/jorge/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [6]:
# Once downloaded, the corpus is available through its corpus reader
from nltk.corpus import brown

# Total number of word tokens in the corpus
print('Corpus len:', len(brown.words()))

# words() gives the plain tokens
print('The first 10 words:', brown.words()[0:10])

# tagged_words() gives (token, POS tag) pairs
print('The first 10 tagged words:', brown.tagged_words()[0:10])

Corpus len: 1161192
The first 10 words: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']
The first 10 tagged words: [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN')]


# Tokenization
- Tokenize sentences
- Tokenize words

In [7]:
# Use the predefined sentence tokenizer.
from nltk.tokenize import sent_tokenize

# Sample text mixing the sentence terminators '.', '?' and '!'
text = "this’s a sent tokenize test. tis is sennt two. is this sent three? sent 4 is cool! Now it’s your turn."
sent_tokenize_list = sent_tokenize(text)

# One list item per detected sentence
print(sent_tokenize_list)

['this’s a sent tokenize test.', 'tis is sennt two.', 'is this sent three?', 'sent 4 is cool!', 'Now it’s your turn.']


In [8]:
# Explicitly load the Punkt English sentence tokenizer model
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Last expression of the cell: the list of sentences is shown as the cell output
tokenizer.tokenize(text)

['this’s a sent tokenize test.',
 'tis is sennt two.',
 'is this sent three?',
 'sent 4 is cool!',
 'Now it’s your turn.']

In [8]:
# Explicitly load the Punkt Spanish sentence tokenizer model
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
# Last expression of the cell: the list of sentences is shown as the cell output
spanish_tokenizer.tokenize('¡Buenos días! ¿Estas bien?')


['¡Buenos días!', '¿Estas bien?']

In [9]:
# List the sentence tokenizer models available in the punkt package.
# nltk.data.find() searches every directory in nltk.data.path, so the listing
# works wherever the package was installed, not only when it sits in the first
# search directory.
os.listdir(nltk.data.find('tokenizers/punkt').path)




['greek.pickle',
 'estonian.pickle',
 'turkish.pickle',
 'polish.pickle',
 'PY3',
 'czech.pickle',
 'portuguese.pickle',
 'README',
 'dutch.pickle',
 'norwegian.pickle',
 'slovene.pickle',
 'english.pickle',
 'danish.pickle',
 'finnish.pickle',
 'swedish.pickle',
 'spanish.pickle',
 'german.pickle',
 'italian.pickle',
 'french.pickle']

In [11]:
# Word tokenizer
from nltk.tokenize import word_tokenize

# Punctuation becomes a separate token
print(word_tokenize('Hello World!'))
# The contraction is split into two tokens (see the output: 'Ca' + "n't")
print(word_tokenize("Can't is a contraction."))


['Hello', 'World', '!']
['Ca', "n't", 'is', 'a', 'contraction', '.']


In [12]:
# Other word tokenizers: each one splits the same sentence in a different way
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer

# First the whitespace-based tokenizer, then the word/punctuation one;
# `tokenizer` ends up bound to the WordPunctTokenizer instance
for tokenizer in (WhitespaceTokenizer(), WordPunctTokenizer()):
    print(tokenizer.tokenize("Can't is a contraction."))

["Can't", 'is', 'a', 'contraction.']
['Can', "'", 't', 'is', 'a', 'contraction', '.']


In [13]:
# Word tokenization must be applied to each sentence of the text, not to the whole text

text = "El Dpto de RR.HH. ha lanzado 1.000 ofertas de trabajo en Buenos Aires. " \
       "3,25€ perdidos en Madrid el 2/11/2017. "\
       "Las herramientas [h1 y h2] son compatibles."

# Load the Punkt model through nltk.data.load, the same entry point used by the
# other cells of this notebook (nltk.tokenize.load is not the documented loader)
sent_tok = nltk.data.load('tokenizers/punkt/spanish.pickle')
word_tok = nltk.tokenize.TreebankWordTokenizer()

# Split into sentences first ...
sents = sent_tok.tokenize(text)

# ... then tokenize every sentence and collect all the tokens in one flat list
tokens = []
for s in sents:
    tokens.extend(word_tok.tokenize(s))

print(tokens)




['El', 'Dpto', 'de', 'RR.HH', '.', 'ha', 'lanzado', '1.000', 'ofertas', 'de', 'trabajo', 'en', 'Buenos', 'Aires', '.', '3,25€', 'perdidos', 'en', 'Madrid', 'el', '2/11/2017', '.', 'Las', 'herramientas', '[', 'h1', 'y', 'h2', ']', 'son', 'compatibles', '.']


In [14]:
# Stopwords
nltk.download('stopwords')

from nltk.corpus import stopwords

# A set gives constant-time membership tests
english_stops = set(stopwords.words('english'))

words = ["Can't", 'is', 'a', 'contraction']
# Compare in lowercase: the stopword lists are lowercase (see the Spanish list
# printed in the following cell), so a capitalized stopword such as 'The'
# would otherwise be kept
words_clean = [word for word in words if word.lower() not in english_stops]
print(words_clean)

[nltk_data] Downloading package stopwords to /Users/jorge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
["Can't", 'contraction']


In [15]:
# Each file id names a language for which a stopword list is available
print('Available stopwords lists:', stopwords.fileids())

# Full Spanish stopword list (long output)
print('Stop words spanish:', stopwords.words('spanish'))

Available stopwords lists: ['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish', 'turkish']
Stop words spanish: ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por', 'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'más', 'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí', 'porque', 'esta', 'entre', 'cuando', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'hay', 'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosostros

## Regular expressions
- https://www.regular-expressions.info/ 
- https://www.regextester.com/ 

In [16]:
# Regular expressions to clean text
import re

def clean_text(text):
    """Normalize a text string with regular expressions.

    Steps, applied in this order:
      1. Lowercase the whole text.
      2. Replace every number made of two or more digits (optionally preceded
         by ``-`` and optionally holding one ``.`` between digits) with the
         placeholder ``DIGIT``. One-digit numbers are left untouched.
      3. Replace each run of the characters ``[ ] / { } ⋅ −`` with one space.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text.
    """
    # Lowercase
    text_clean = text.lower()

    # Replace integer and float numbers, negatives included. The pattern needs
    # at least two digits, so 1-digit numbers are not replaced.
    # Raw strings keep the regex escapes (\d, \[, \]) from being parsed as
    # invalid Python string escapes.
    text_clean = re.sub(r"-?\d+\.?\d+", "DIGIT", text_clean)

    # Replace each run of the characters [ ] / { } ⋅ − with a single space
    text_clean = re.sub(r"[\[\]/{}⋅−]+", " ", text_clean)

    # Other cleaning options can be added here

    return text_clean

# Try the cleaner on a sample with a float, a negative number and brackets:
# print the original text first, then the cleaned version
text = "Los datos son 23.5 y -12.8 [Medidos en unidades]."
print(text)
print(clean_text(text))




Los datos son 23.5 y -12.8 [Medidos en unidades].
los datos son DIGIT y DIGIT  medidos en unidades .


# POS

In [10]:
# Download the POS model
nltk.download('averaged_perceptron_tagger')

# The POS model must be applied to word-tokenized text
# NOTE(review): `text` is rebound here to a list of tokens (it held a string before)
text = nltk.word_tokenize("dive into NLTK: Part-of-speech tagging and POS Tagger")
print(text)
# Use the recommended part-of-speech tagger; the output is a list of (token, tag) pairs
print(nltk.pos_tag(text))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jorge/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
['dive', 'into', 'NLTK', ':', 'Part-of-speech', 'tagging', 'and', 'POS', 'Tagger']
[('dive', 'NN'), ('into', 'IN'), ('NLTK', 'NNP'), (':', ':'), ('Part-of-speech', 'JJ'), ('tagging', 'NN'), ('and', 'CC'), ('POS', 'NNP'), ('Tagger', 'NNP')]


In [18]:
# Understand the tagset of the POS model
# The tag descriptions are shipped in the 'tagsets' resource
nltk.download('tagsets')

# upenn_tagset() prints the description itself and returns None, so it is not
# wrapped in print() (that would add a spurious "None" line to the output)
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
None


## Stemming

In [19]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer


def _show_stems(title, stemmer, words):
    """Print `title`, a separator line and the stem `stemmer` gives for each word.

    Args:
        title (str): Header line printed before the stems.
        stemmer: Object exposing a ``stem(word)`` method.
        words (iterable of str): Words to stem, printed one stem per line.
    """
    print(title)
    print('---------------')
    for sample in words:
        print(stemmer.stem(sample))


# The same words go through every stemmer so the results can be compared
stem_samples = ['maximum', 'presumably', 'multiply', 'provision', 'saying']

porter_stemmer = PorterStemmer()
_show_stems('Porter stemmer:', porter_stemmer, stem_samples)
print()  # blank line between sections

lancaster_stemmer = LancasterStemmer()
_show_stems('Lancaster stemmer:', lancaster_stemmer, stem_samples)
print()  # blank line between sections

snowball_stemmer = SnowballStemmer("english")
_show_stems('Snowball stemmer:', snowball_stemmer, stem_samples)



Porter stemmer:
---------------
maximum
presum
multipli
provis
say 

Lancaster stemmer:
---------------
maxim
presum
multiply
provid
say 

Snowball stemmer:
---------------
maximum
presum
multipli
provis
say


In [20]:
from nltk.stem import SnowballStemmer
# SnowballStemmer.languages holds the names of the supported languages;
# they are joined with spaces to print them on a single line
print('Available languages in snowball stemmer:', " ".join(SnowballStemmer.languages))

Available languages in snowball stemmer: arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish


In [21]:
# Test for the Spanish language
from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer()
# Last expression of the cell: the stem is shown as the cell output (lowercased, see below)
stemmer.stem("Semanalmente")

'semanal'

## Lemmatizer

In [22]:
# The lemmatizer relies on the WordNet corpus; download it like the other
# resources so the cell also runs on a fresh environment
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()


print('Sample of lemmatizations:')
print(wordnet_lemmatizer.lemmatize('dogs'))
print(wordnet_lemmatizer.lemmatize('churches'))
print(wordnet_lemmatizer.lemmatize('abaci'))
# No POS given: the verb form 'are' comes back unchanged (see the output)
print(wordnet_lemmatizer.lemmatize('are'), '\n')

# Lemmatization with POS

print('Lemma of "is", no POS:', wordnet_lemmatizer.lemmatize('is'))

# With pos='v' the word is handled as a verb and both forms reduce to 'be'
print('Lemma of "is", whit POS:', wordnet_lemmatizer.lemmatize('is', pos='v'))
print('Lemma of "are", whit POS:', wordnet_lemmatizer.lemmatize('are', pos='v'))

Sample of lemmatizations:
dog
church
abacus
are 

Lemma of "is", no POS: is
Lemma of "is", whit POS: be
Lemma of "are", whit POS: be


## Integrated process

In [23]:
# Map the Treebank POS tags to WordNet-compatible POS tags
# - The recommended POS tagger uses different codes for its POS labels than the ones the WordNet lemmatizer needs
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    The decision is taken on the first letter of the tag:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Args:
        treebank_tag (str): POS tag as produced by the POS tagger.

    Returns:
        The WordNet POS constant, or None when the tag has no equivalent
        (callers can then simply test the result against None).
    """
    # (tag prefix, name of the constant on the `wordnet` corpus reader)
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_table:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None

In [24]:
text = "You are good friends. We had two houses that are bigger!"

# Tokenize into sentences. The text is English, so use NLTK's default
# sentence tokenizer instead of `sent_tok`, which holds the Spanish Punkt model.
sents = nltk.sent_tokenize(text)

# For each sentence:
# - Tokenize words
# - Tag the tokens with the POS model
# - Lemmatize every token, passing its POS when WordNet has an equivalent
tokens = []
tokens_stem = []  # NOTE: despite its name, this list holds lemmas, not stems
for s in sents:
    t = word_tok.tokenize(s)
    tagged = nltk.pos_tag(t)
    tokens.append(t)
    lemma_list = []
    for word, tag in tagged:
        wntag = get_wordnet_pos(tag)
        if wntag is None:  # no WordNet equivalent: lemmatize without a POS
            lemma = wordnet_lemmatizer.lemmatize(word)
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=wntag)
        lemma_list.append(lemma)
    tokens_stem.append(lemma_list)
print(tokens)
print(tokens_stem)
 

[['You', 'are', 'good', 'friends', '.'], ['We', 'had', 'two', 'houses', 'that', 'are', 'bigger', '!']]
[['You', 'be', 'good', 'friend', '.'], ['We', 'have', 'two', 'house', 'that', 'be', 'big', '!']]
