# Procesamiento del Lenguaje Natural

Rodrigo S. Cortez Madrigal

<img src="https://pcic.posgrado.unam.mx/wp-content/uploads/Ciencia-e-Ingenieria-de-la-Computacion_color.png" alt="Logo PCIC" width="128" />  

In [22]:
import numpy as np
import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import plotly
from plotly import graph_objs as go
from plotly import express as px
from plotly.subplots import make_subplots

import spacy

In [23]:
# !pip install -e git+https://github.com/roicort/PyPoetryDB.git#egg=PyPoetryDB

In [24]:
from PyPoetryDB import PoetryDB

poetrydb = PoetryDB.API()

poem = poetrydb.get_poem('The Raven')

the_raven = poem[0]['lines']

In [25]:
the_raven

['Once upon a midnight dreary, while I pondered, weak and weary,',
 'Over many a quaint and curious volume of forgotten lore--',
 'While I nodded, nearly napping, suddenly there came a tapping,',
 'As of some one gently rapping--rapping at my chamber door.',
 '"\'Tis some visitor," I muttered, "tapping at my chamber door--',
 '        Only this and nothing more."',
 '',
 'Ah, distinctly I remember, it was in the bleak December,',
 'And each separate dying ember wrought its ghost upon the floor.',
 'Eagerly I wished the morrow;--vainly I had sought to borrow',
 'From my books surcease of sorrow--sorrow for the lost Lenore--',
 'For the rare and radiant maiden whom the angels name Lenore--',
 '        Nameless here for evermore.',
 '',
 'And the silken sad uncertain rustling of each purple curtain',
 'Thrilled me--filled me with fantastic terrors never felt before;',
 'So that now, to still the beating of my heart, I stood repeating',
 '"\'Tis some visitor entreating entrance at my chamb

In [26]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [27]:
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_trf')

corpus = " ".join(the_raven)

doc = nlp(corpus)

tokens = [token.text.lower() for token in doc if token.is_alpha and not token.is_stop]

In [28]:
the_raven_tags = [(token.text, token.tag_) for token in doc]

In [29]:
the_raven_tags

[('Once', 'RB'),
 ('upon', 'IN'),
 ('a', 'DT'),
 ('midnight', 'NN'),
 ('dreary', 'JJ'),
 (',', ','),
 ('while', 'IN'),
 ('I', 'PRP'),
 ('pondered', 'VBD'),
 (',', ','),
 ('weak', 'JJ'),
 ('and', 'CC'),
 ('weary', 'JJ'),
 (',', ','),
 ('Over', 'IN'),
 ('many', 'PDT'),
 ('a', 'DT'),
 ('quaint', 'JJ'),
 ('and', 'CC'),
 ('curious', 'JJ'),
 ('volume', 'NN'),
 ('of', 'IN'),
 ('forgotten', 'VBN'),
 ('lore--', 'NN'),
 ('While', 'IN'),
 ('I', 'PRP'),
 ('nodded', 'VBD'),
 (',', ','),
 ('nearly', 'RB'),
 ('napping', 'VBG'),
 (',', ','),
 ('suddenly', 'RB'),
 ('there', 'EX'),
 ('came', 'VBD'),
 ('a', 'DT'),
 ('tapping', 'NN'),
 (',', ','),
 ('As', 'IN'),
 ('of', 'IN'),
 ('some', 'DT'),
 ('one', 'CD'),
 ('gently', 'RB'),
 ('rapping', 'VBG'),
 ('--', ':'),
 ('rapping', 'VBG'),
 ('at', 'IN'),
 ('my', 'PRP$'),
 ('chamber', 'NN'),
 ('door', 'NN'),
 ('.', '.'),
 ('"', '``'),
 ("'", 'PRP'),
 ('Tis', 'VBZ'),
 ('some', 'DT'),
 ('visitor', 'NN'),
 (',', ','),
 ('"', "''"),
 ('I', 'PRP'),
 ('muttered', 'VBD'

In [30]:

def generate_report(doc, tokens):

    num_tokens = len(tokens)
    num_types = len(set(tokens))
    num_lemmas = len(set(token.lemma_ for token in doc))
    lexical_richness = num_types / num_tokens

    report = {
        "num_tokens": num_tokens,
        "num_types": num_types,
        "num_lemmas": num_lemmas,
        "lexical_richness": lexical_richness,
    }
    
    return report

report = generate_report(doc, tokens)
print("Número de tokens:", report["num_tokens"])
print("Número de types:", report["num_types"])
print("Número de lemas:", report["num_lemmas"])
print("Riqueza léxica:", report["lexical_richness"])

Número de tokens: 518
Número de types: 330
Número de lemas: 452
Riqueza léxica: 0.637065637065637


In [31]:
import nltk

nltk.Text(tokens).concordance("night")

Displaying 2 of 2 matches:
en wandering nightly tell thy lordly night plutonian shore quoth raven nevermor
ing bird fiend shrieked thee tempest night plutonian shore leave black plume to


In [32]:
doc.ents

(midnight,
 the bleak,
 December,
 the morrow;--vainly,
 Lenore--,
 Lenore--,
 Lenore,
 Lenore,
 the saintly days of yore,
 Pallas,
 Raven,
 Plutonian,
 Quoth the Raven,
 Nevermore,
 Nevermore,
 Raven,
 one,
 one,
 the morrow,
 one,
 Nevermore,
 Seraphim,
 Lenore,
 Lenore,
 Tempter,
 Quoth the Raven,
 Nevermore,
 Lenore--,
 Lenore,
 Quoth the Raven,
 Nevermore,
 Quoth the Raven,
 Nevermore,
 Raven,
 Pallas)

In [33]:
from pysentimiento import create_analyzer

emotion_analyzer = create_analyzer(task="emotion", lang="en")

emotions = emotion_analyzer.predict(corpus)

In [34]:
# Plot the emotions dict

emotions.probas

import plotly.express as px
import pandas as pd

emotions_df = pd.DataFrame(emotions.probas, index=[0])
emotions_df = emotions_df.melt(var_name='emotion', value_name='probability')
fig = px.bar(emotions_df, x='emotion', y='probability', title='Emotion Probabilities')
fig.update_layout(xaxis_title='Emotion', yaxis_title='Probability')
fig.show()