In [142]:
import numpy as np
import pandas as pd
import csv
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import requests
from bs4 import BeautifulSoup
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter
import re
import matplotlib.pyplot as plt
import plotly.express as px

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 1. Load National Research Council

Se realiza la carga del Lexicón a utilizar, en este caso se ha escogido *NRC-Emotion-Lexicon-Wordlevel-v0.92.txt* ya que contiene las emociones de las palabras en inglés.

In [143]:
file_path = "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
nrc_lexicon = {}
with open(file_path, newline='') as csvfile:
    text = csv.reader(csvfile, delimiter='\t', quotechar='|')
    for row in text:
        # print(row)
        if int(row[2]) == 1:
            if row[0] not in nrc_lexicon.keys():
                nrc_lexicon[row[0]] = []
            nrc_lexicon[row[0]].append(row[1])

In [144]:
# Vista primeros 5 elementos del diccionario
print(len(nrc_lexicon.keys()))
for k,v in list(nrc_lexicon.items())[:5]:
    print(k, v)

6453
abacus ['trust']
abandon ['fear', 'negative', 'sadness']
abandoned ['anger', 'fear', 'negative', 'sadness']
abandonment ['anger', 'fear', 'negative', 'sadness', 'surprise']
abba ['positive']


### 2. Extender el léxico NRC utilizando WordNet

A continuación, se cargan los los diccionarios que sirven para pasar del POS-tag de wordnet a penn y viceversa.

In [145]:
wordnet_to_penn = {
 'n': 'NN', # sustantivo
 'v': 'VB', # verbo
 'a': 'JJ', # adjetivo
 's': 'JJ', # adjetivo superlativo
 'r': 'RB', # adverbio
 'c': 'CC' # conjunción
}

penn_to_wordnet = {
 'CC': 'c', # Coordinating conjunction
 'CD': 'c', # Cardinal number
 'DT': 'c', # Determiner
 'EX': 'c', # Existential there
 'FW': 'x', # Foreign word
 'IN': 'c', # Preposition or subordinating conjunction
 'JJ': 'a', # Adjective
 'JJR': 'a', # Adjective, comparative
 'JJS': 'a', # Adjective, superlative
 'LS': 'c', # List item marker
 'MD': 'v', # Modal
 'NN': 'n', # Noun, singular or mass
 'NNS': 'n', # Noun, plural
 'NNP': 'n', # Proper noun, singular
 'NNPS': 'n', # Proper noun, plural
  'PDT': 'c', # Predeterminer
 'POS': 'c', # Possessive ending
 'PRP': 'n', # Personal pronoun
 'PRP$': 'n', # Possessive pronoun
 'RB': 'r', # Adverb
 'RBR': 'r', # Adverb, comparative
 'RBS': 'r', # Adverb, superlative
 'RP': 'r', # Particle
 'SYM': 'x', # Symbol
 'TO': 'c', # to
 'UH': 'x', # Interjection
 'VB': 'v', # Verb, base form
 'VBD': 'v', # Verb, past tense
 'VBG': 'v', # Verb, gerund or present participle
 'VBN': 'v', # Verb, past participle
 'VBP': 'v', # Verb, non-3rd person singular present
 'VBZ': 'v', # Verb, 3rd person singular present
 'WDT': 'c', # Wh-determiner
 'WP': 'n', # Wh-pronoun
 'WP$': 'n', # Possessive wh-pronoun
 'WRB': 'r', # Wh-adverb
 'X': 'x' # Any word not categorized by the other tags
 }

Implementamos cirtas funciones que serás útiles para extender el lexicón.

In [146]:
def getSynonyms(word):
    """
    Esta función recibe una palabra y retorna los sinonimos con su respectivo
    POS-tag
    """
    synonyms = wn.synonyms(word)
    synonyms = [sub_array[0] for sub_array in synonyms if sub_array]
    sims = []
    for s in synonyms:
        for s1 in wn.synsets(s):
            sims.append((s1.name().split('.')[0], s1.pos()))
    return list(set(sims))

# Ejemplo de la función getSynonyms
getSynonyms('abandon')

[('release', 'v'),
 ('abandon', 'v'),
 ('evacuate', 'v'),
 ('empty', 'n'),
 ('give_up', 'v'),
 ('ferocity', 'n'),
 ('empty', 's'),
 ('defect', 'v'),
 ('drop_out', 'v'),
 ('empty', 'v'),
 ('surrender', 'v'),
 ('desert', 'n'),
 ('forfeit', 'v'),
 ('vacate', 'v'),
 ('desert', 'v'),
 ('wildness', 'n'),
 ('abandon', 'n'),
 ('discontinue', 'v'),
 ('empty', 'a'),
 ('kick', 'v'),
 ('spare', 'v')]

In [147]:
def getHypernyms(word):
    """
    Esta función recibe una palabra y retorna los hiperónimos con su respectivo
    POS-tag
    """
    synsets = wn.synsets(word)
    hypernyms = []
    for synset in synsets:
        for hypernym in synset.hypernyms():
            hypernyms.append((hypernym.name().split('.')[0], hypernym.pos()))
    return list(set(hypernyms))

# Ejemplo de la función getHypernyms
getHypernyms('abandon')

[('passion', 'n'), ('discard', 'v'), ('unrestraint', 'n'), ('leave', 'v')]

In [148]:
def getHyponyms(word):
    """
    Esta función recibe una palabra y retorna los hipónimos con su respectivo
    POS-tag
    """
    synsets = wn.synsets(word)
    hyponyms = []
    for synset in synsets:
        for hyponym in synset.hyponyms():
            hyponyms.append((hyponym.name().split('.')[0], hyponym.pos()))
    return list(set(hyponyms))

# Ejemplo de la función getHyponyms
getHyponyms('abandon')

[('forfeit', 'v'),
 ('walk_out', 'v'),
 ('foreswear', 'v'),
 ('ditch', 'v'),
 ('expose', 'v'),
 ('dispense_with', 'v'),
 ('consign', 'v'),
 ('chuck', 'v'),
 ('maroon', 'v')]

In [149]:
def getDerivedWords(word):
  """
  Esta función recibe una palabra y retorna las palabras derivadas con su
  respectivo POS-tag
  """
  synsets = wn.synsets(word)
  derived_words = []
  for synset in synsets:
      derived_words.append((word, synset.pos()))
      for lemma in synset.lemmas():
          for related_lemma in lemma.derivationally_related_forms():
              derived_words.append((related_lemma.name(), related_lemma.synset().pos()))

  return list(set(derived_words))

getDerivedWords('abandon')


[('desertion', 'n'),
 ('abandonment', 'n'),
 ('abandon', 'v'),
 ('deserter', 'n'),
 ('abandon', 'n'),
 ('vacant', 's'),
 ('wild', 's'),
 ('forsaking', 'n'),
 ('wanton', 's'),
 ('desolation', 'n')]

A continiación, procedemos a extender el lexicón original utilizando las funciones creadas anteriormente (getSynonyms, getHypernyms, getHyponyms y getDerivedWords). De esta manera, creamos un diccionario en el cual utilizamos como clave la tupla <lemma, POS-tag> con sus respectivas emociones asociadas.

In [150]:
extended_lexicon = {}

for word in list(nrc_lexicon.keys()):
    emotions = nrc_lexicon[word]
    for symnonym in getSynonyms(word):
        if symnonym not in extended_lexicon.keys():
            extended_lexicon[symnonym] = emotions
    for hypernym in getHypernyms(word):
        if hypernym not in extended_lexicon.keys():
            extended_lexicon[hypernym] = emotions

    for hyponym in getHyponyms(word):
        if hyponym not in extended_lexicon.keys():
            extended_lexicon[hyponym] = emotions

    for derived_words in getDerivedWords(word):
        if derived_words not in extended_lexicon.keys():
            extended_lexicon[derived_words] = emotions

print('Primeros 5 elementos del lexicon extendido:')
for key, value in list(extended_lexicon.items())[:5]:
    print(key, value)

Primeros 5 elementos del lexicon extendido:
('calculator', 'n') ['trust']
('tablet', 'n') ['trust']
('abacus', 'n') ['trust']
('release', 'v') ['fear', 'negative', 'sadness']
('abandon', 'v') ['fear', 'negative', 'sadness']


### 3. Cargar el texto de novelas clásicas disponibles en Project Gutenberg

Realizamos la carga de los textos a utilizar.

In [151]:
books = {
 'Moby Dick': 'https://www.gutenberg.org/cache/epub/15/pg15.txt',
 'War and Peace': 'http://www.gutenberg.org/files/2600/2600-0.txt',
 'Pride and Prejudice': 'http://www.gutenberg.org/files/1342/1342-0.txt',
 'Crime and Punishment': 'https://www.gutenberg.org/cache/epub/2554/pg2554.txt',
 'The Adventures of Sherlock Holmes': 'http://www.gutenberg.org/files/1661/1661-0.txt',
 'Ulysses': 'http://www.gutenberg.org/files/4300/4300-0.txt',
 'The Odyssey': 'https://www.gutenberg.org/cache/epub/1727/pg1727.txt',
 'The Divine Comedy': 'https://www.gutenberg.org/cache/epub/8800/pg8800.txt',
 'Irish Fairy Tales': 'https://www.gutenberg.org/cache/epub/2892/pg2892.txt',
 'Critias': 'https://www.gutenberg.org/files/1571/1571-h/1571-h.htm'
}

In [152]:
def download_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text().lower()
    return text

In [153]:
downloaded_books = {}
for book in books:
    downloaded_books[book] = download_text(books[book])

### 4. Implementar una función para analizar el texto y contar las ocurrencias de palabras vinculadas con emociones en el texto

Con el objetivo de realizar un trabajo más limpio y liberar almacenamiento removemos las stop words de los textos.

In [154]:
stoplist = ["also", "could", "p", "pp", "th", "however", "one", "two", "many", "i", "de", "la", "me", "my", "myself",
            "the", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him",
            "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
            "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is",
            "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing",
            "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
            "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below",
            "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once",
            "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most",
            "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
            "t", "can", "will", "just", "don", "should", "now"]

Procedemos a realizar la toquenización de los textos.

In [155]:
tokenized_tagget_books = {}
for book in downloaded_books:
    tokenized_tagget_books[book] = pos_tag([word for word in re.findall(r"[^\W\d_]+|\d+", downloaded_books[book]) if word not in stoplist])

In [156]:
print('Primeros 10 a 20 tokens del libro Moby Dick:')
tokenized_tagget_books['Moby Dick'][10:20]

Primeros 10 a 20 tokens del libro Moby Dick:


[('united', 'JJ'),
 ('states', 'NNS'),
 ('parts', 'NNS'),
 ('world', 'NN'),
 ('cost', 'NN'),
 ('almost', 'RB'),
 ('restrictions', 'NNS'),
 ('whatsoever', 'VBP'),
 ('may', 'MD'),
 ('copy', 'VB')]

Obtenemos el POS-tag de las palbras de los textos y su correspondiente lemma.

In [157]:
lemmatizer = WordNetLemmatizer()

for book in tokenized_tagget_books:
    new_words_tag = []
    for word, tag in tokenized_tagget_books[book]:
        if tag in list(penn_to_wordnet.keys())[:-1]:
            new_words_tag.append((lemmatizer.lemmatize(word), penn_to_wordnet[tag]))
        else:
            new_words_tag.append((lemmatizer.lemmatize(word), penn_to_wordnet['X']))

    tokenized_tagget_books[book] = new_words_tag

In [158]:
print('Primeros 10 a 20 tokens del libro Moby Dick:')
tokenized_tagget_books['Moby Dick'][10:20]

Primeros 10 a 20 tokens del libro Moby Dick:


[('united', 'a'),
 ('state', 'n'),
 ('part', 'n'),
 ('world', 'n'),
 ('cost', 'n'),
 ('almost', 'r'),
 ('restriction', 'n'),
 ('whatsoever', 'v'),
 ('may', 'v'),
 ('copy', 'v')]

Realizamos la comparación de las tuplas con nuestro lexicón y asociamos las emociones correspondientes a cada una de las tuplas.

In [159]:
books_emotions={}
for book in tokenized_tagget_books:
    books_emotions[book] = []
    for wordpos in tokenized_tagget_books[book]:
        if wordpos in extended_lexicon.keys():
            books_emotions[book]+=extended_lexicon[wordpos]

### 5. Presentar los resultados del análisis de sentimientos en las novelas clásicas.

Realizamos el conteo de las emociones y obtenemos los resultados.

In [160]:
for book in books_emotions.keys():
    books_emotions[book] = Counter(books_emotions[book])

In [161]:
books_emotions

{'Moby Dick': Counter({'anticipation': 9399,
          'joy': 7084,
          'positive': 25544,
          'surprise': 4621,
          'trust': 12558,
          'negative': 19644,
          'anger': 6627,
          'disgust': 5389,
          'fear': 10415,
          'sadness': 7776}),
 'War and Peace': Counter({'anticipation': 24742,
          'joy': 18961,
          'positive': 68521,
          'surprise': 12098,
          'trust': 34200,
          'fear': 22249,
          'anger': 12979,
          'disgust': 9931,
          'negative': 38317,
          'sadness': 17424}),
 'Pride and Prejudice': Counter({'anticipation': 6336,
          'joy': 4986,
          'positive': 15485,
          'surprise': 2910,
          'trust': 8721,
          'negative': 7627,
          'anger': 2571,
          'disgust': 2115,
          'fear': 3934,
          'sadness': 3484}),
 'Crime and Punishment': Counter({'anticipation': 8630,
          'joy': 5946,
          'positive': 21470,
          'surpris

Se puede apreciar que en todos los textos la emoción principal corresponte a emociones positivasa, segido de emociones negativas.

In [166]:
for book in books_emotions.keys():
    emotions = books_emotions[book].keys()
    freq = books_emotions[book].values()
    d = pd.DataFrame({'emotions':emotions, 'freq':freq})
    fig = px.pie(d, values = 'freq', names ='emotions', title = book,
                 color_discrete_sequence=px.colors.colorbrewer.Accent
                 )

    fig.show()