In [43]:
import numpy as np
import pandas as pd
import csv
import nltk
from nltk.corpus import wordnet as wn

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marlo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 1. Load National Research Council

In [44]:
file_path = "NRC-Emotion-Lexicon/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
nrc_lexicon = {}
with open(file_path, newline='') as csvfile:
    text = csv.reader(csvfile, delimiter='\t', quotechar='|')
    for row in text:
        if int(row[2]) == 1:
            if row[0] not in nrc_lexicon.keys():
                nrc_lexicon[row[0]] = []
            nrc_lexicon[row[0]].append(row[1])

In [45]:
# Vista primors 5 elementos del diccionario
for k,v in list(nrc_lexicon.items())[:5]:
    print(k, v)

abacus ['trust']
abandon ['fear', 'negative', 'sadness']
abandoned ['anger', 'fear', 'negative', 'sadness']
abandonment ['anger', 'fear', 'negative', 'sadness', 'surprise']
abba ['positive']


### 2. Extender el léxico NRC utilizando WordNet

In [46]:
wordnet_to_penn = {
 'n': 'NN', # sustantivo
 'v': 'VB', # verbo
 'a': 'JJ', # adjetivo
 's': 'JJ', # adjetivo superlativo
 'r': 'RB', # adverbio
 'c': 'CC' # conjunción
}

penn_to_wordnet = {
 'CC': 'c', # Coordinating conjunction
 'CD': 'c', # Cardinal number
 'DT': 'c', # Determiner
 'EX': 'c', # Existential there
 'FW': 'x', # Foreign word
 'IN': 'c', # Preposition or subordinating conjunction
 'JJ': 'a', # Adjective
 'JJR': 'a', # Adjective, comparative
 'JJS': 'a', # Adjective, superlative
 'LS': 'c', # List item marker
 'MD': 'v', # Modal
 'NN': 'n', # Noun, singular or mass
 'NNS': 'n', # Noun, plural
 'NNP': 'n', # Proper noun, singular
 'NNPS': 'n', # Proper noun, plural
  'PDT': 'c', # Predeterminer
 'POS': 'c', # Possessive ending
 'PRP': 'n', # Personal pronoun
 'PRP$': 'n', # Possessive pronoun
 'RB': 'r', # Adverb
 'RBR': 'r', # Adverb, comparative
 'RBS': 'r', # Adverb, superlative
 'RP': 'r', # Particle
 'SYM': 'x', # Symbol
 'TO': 'c', # to
 'UH': 'x', # Interjection
 'VB': 'v', # Verb, base form
 'VBD': 'v', # Verb, past tense
 'VBG': 'v', # Verb, gerund or present participle
 'VBN': 'v', # Verb, past participle
 'VBP': 'v', # Verb, non-3rd person singular present
 'VBZ': 'v', # Verb, 3rd person singular present
 'WDT': 'c', # Wh-determiner
 'WP': 'n', # Wh-pronoun
 'WP$': 'n', # Possessive wh-pronoun
 'WRB': 'r', # Wh-adverb
 'X': 'x' # Any word not categorized by the other tags
 }

In [79]:
def penn_to_wordnet_pos(penn_pos):
    return penn_to_wordnet.get(penn_pos, None)

def get_wordnet_relations(word, pos):
    pos = penn_to_wordnet_pos(pos)
    if pos:
        synsets = wn.synsets(word, pos=pos)
        print(synsets)
    # if pos:
    #     synsets = wn.synsets(word, pos=pos)
    #     related_words = set()
    #     for synset in synsets:
    #         related_words.update(synset.lemma_names())
    #         related_words.update([lemma.name() for lemma in synset.lemmas()])
    #         related_words.update([hyper.name().split('.')[0] for hyper in synset.hypernyms()])
    #         related_words.update([hypo.name().split('.')[0] for hypo in synset.hyponyms()])
    #         related_words.update([lemma.name() for lemma in synset.lemmas() if lemma.derivationally_related_forms()])
    #     return related_words
    # else:
    #     return set()

In [90]:
get_wordnet_relations('joy', 'WP')

[Synset('joy.n.01'), Synset('joy.n.02')]


In [73]:
extended_lexicon = {}

for word, emotions in nrc_lexicon.items():
    for pos in ['n', 'v', 'a', 'r']:  # Consideramos solo sustantivos, verbos, adjetivos y adverbios
        related_words = get_wordnet_relations(word, pos)
        for related_word in related_words:
            extended_lexicon.setdefault((related_word, pos), []).extend(emotions)

In [72]:
wn.synset('joy.n.01').lemmas()[0].derivationally_related_forms()

[Lemma('joyous.a.01.joyous'),
 Lemma('gladden.v.01.joy'),
 Lemma('rejoice.v.01.joy')]

In [65]:
[str(lemma.name()) for lemma in wn.synset('joy.n.01').lemmas()]

['joy', 'joyousness', 'joyfulness']