<a href="https://www.inove.com.ar"><img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center"></a>


# Procesamiento de lenguaje natural
## Preprocesamiento con NLTK y Spacy


In [None]:
import json
import string
import random 

import numpy as np

### Datos

In [None]:
simple_text = "if she leaves now she might miss something importart!"

In [None]:
large_text = "Patients who in late middle age have smoked 20 cigarettes a day since their teens constitute an at-risk group. One thing they’re clearly at risk for is the acute sense of guilt that a clinician can incite, which immediately makes a consultation tense."

### 1 - Preprocesamiento con NLTK
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [None]:
import nltk
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords

# Descargar el diccionario
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
simple_text

'if she leaves now she might miss something importart!'

In [None]:
# Crear el derivador
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *
p_stemmer = PorterStemmer()

In [None]:
# Crear el lematizador
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# Extraer los tokens de un doc
tokens = word_tokenize(simple_text)
print("Tokens:", tokens)

Tokens: ['if', 'she', 'leaves', 'now', 'she', 'might', 'miss', 'something', 'importart', '!']


In [None]:
# Transformar los tokens a sus respectivas palabras derivadas
# Stemming
nltk_stemedList = []
for word in tokens:
    nltk_stemedList.append(p_stemmer.stem(word))
print("Stemming:", nltk_stemedList)

Stemming: ['if', 'she', 'leav', 'now', 'she', 'might', 'miss', 'someth', 'importart', '!']


In [None]:
# Transformar los tokens a sus respectivas palabras raiz
# Lemmatization
nltk_lemmaList = []
for word in tokens:
    nltk_lemmaList.append(lemmatizer.lemmatize(word))
print("Lemmatization:", nltk_lemmaList)

Lemmatization: ['if', 'she', 'leaf', 'now', 'she', 'might', 'miss', 'something', 'importart', '!']


In [None]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# Quitar los signos de puntuacion
nltk_punctuation = [w for w in nltk_lemmaList if w not in string.punctuation]
print("Punctuation filter:", nltk_punctuation)

Punctuation filter: ['if', 'she', 'leaf', 'now', 'she', 'might', 'miss', 'something', 'importart']


In [None]:
nltk_stop_words = set(stopwords.words("english"))
len(nltk_stop_words)

179

In [None]:
# Stop words
nltk_stop_words = set(stopwords.words("english"))
filtered_sentence = [w for w in nltk_punctuation if w not in nltk_stop_words]
print("Stop words filter:", filtered_sentence)

Stop words filter: ['leaf', 'might', 'miss', 'something', 'importart']


### 2 - Proceso completo con NLTK
Tokenization → Lemmatization → Remove stopwords → Remove punctuation

In [None]:
def nltk_process(text):
    # Tokenization
    nltk_tokenList = word_tokenize(text)
      
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    nltk_lemmaList = []
    for word in nltk_tokenList:
        nltk_lemmaList.append(lemmatizer.lemmatize(word))
    
    print("Lemmatization")
    print(nltk_lemmaList)

    # Stop words
    nltk_stop_words = set(stopwords.words("english"))
    filtered_sentence = [w for w in nltk_lemmaList if w not in nltk_stop_words]

    # Filter Punctuation
    filtered_sentence = [w for w in filtered_sentence if w not in string.punctuation]

    print(" ")
    print("Remove stopword & Punctuation")
    print(filtered_sentence)
    return filtered_sentence

In [None]:
nltk_text = nltk_process(large_text)
print("Text len:", len(nltk_text))

Lemmatization
['Patients', 'who', 'in', 'late', 'middle', 'age', 'have', 'smoked', '20', 'cigarette', 'a', 'day', 'since', 'their', 'teen', 'constitute', 'an', 'at-risk', 'group', '.', 'One', 'thing', 'they', '’', 're', 'clearly', 'at', 'risk', 'for', 'is', 'the', 'acute', 'sense', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incite', ',', 'which', 'immediately', 'make', 'a', 'consultation', 'tense', '.']
 
Remove stopword & Punctuation
['Patients', 'late', 'middle', 'age', 'smoked', '20', 'cigarette', 'day', 'since', 'teen', 'constitute', 'at-risk', 'group', 'One', 'thing', '’', 'clearly', 'risk', 'acute', 'sense', 'guilt', 'clinician', 'incite', 'immediately', 'make', 'consultation', 'tense']
Text len: 27


### 3 - Proceso completo con spaCy
Tokenization → Lemmatization → Remove stopwords → Remove punctuation

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
print(simple_text)
doc = nlp(simple_text)
for token in doc:
    print([token.lemma_, token.is_stop])

    

if she leaves now she might miss something importart!
['if', True]
['-PRON-', True]
['leave', False]
['now', True]
['-PRON-', True]
['may', True]
['miss', False]
['something', True]
['importart', False]
['!', False]


In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

def spacy_process(text):
    doc = nlp(text)
    
    # Tokenization & lemmatization
    lemma_list = []
    for token in doc:
        lemma_list.append(token.lemma_)
    print("Tokenize+Lemmatize:")
    print(lemma_list)
    
    # Stop words
    filtered_sentence =[]
    for word in lemma_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 
    
    # Filter punctuation
    filtered_sentence = [w for w in filtered_sentence if w not in string.punctuation]

    print(" ")
    print("Remove stopword & punctuation: ")
    print(filtered_sentence)
    return filtered_sentence

In [None]:
spacy_text = spacy_process(large_text)
print("Text len:", len(nltk_text))

Tokenize+Lemmatize:
['patient', 'who', 'in', 'late', 'middle', 'age', 'have', 'smoke', '20', 'cigarette', 'a', 'day', 'since', '-PRON-', 'teen', 'constitute', 'an', 'at', '-', 'risk', 'group', '.', 'one', 'thing', '-PRON-', 'be', 'clearly', 'at', 'risk', 'for', 'be', 'the', 'acute', 'sense', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incite', ',', 'which', 'immediately', 'make', 'a', 'consultation', 'tense', '.']
 
Remove stopword & punctuation: 
['patient', 'late', 'middle', 'age', 'smoke', '20', 'cigarette', 'day', '-PRON-', 'teen', 'constitute', 'risk', 'group', 'thing', '-PRON-', 'clearly', 'risk', 'acute', 'sense', 'guilt', 'clinician', 'incite', 'immediately', 'consultation', 'tense']
Text len: 27


### 4 - Conclusiones
- NLTK no pasa a minúsculas el texto por su cuenta
- spacy algunas palabras las reemplaza por su Tag (como "'")
- spacy descompone palabras

In [None]:
from prettytable import PrettyTable
table = PrettyTable(['NLTK', 'spaCy'])
for nltk_word, spacy_word in zip(nltk_text, spacy_text):
    table.add_row([nltk_word, spacy_word])
print(table)

+-------------+--------------+
|     NLTK    |    spaCy     |
+-------------+--------------+
|   Patients  |   patient    |
|     late    |     late     |
|    middle   |    middle    |
|     age     |     age      |
|    smoked   |    smoke     |
|      20     |      20      |
|  cigarette  |  cigarette   |
|     day     |     day      |
|    since    |    -PRON-    |
|     teen    |     teen     |
|  constitute |  constitute  |
|   at-risk   |     risk     |
|    group    |    group     |
|     One     |    thing     |
|    thing    |    -PRON-    |
|      ’      |   clearly    |
|   clearly   |     risk     |
|     risk    |    acute     |
|    acute    |    sense     |
|    sense    |    guilt     |
|    guilt    |  clinician   |
|  clinician  |    incite    |
|    incite   | immediately  |
| immediately | consultation |
|     make    |    tense     |
+-------------+--------------+
