***nltk 패키지 다운로드***

In [18]:
!pip install --user -U nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 6.8 MB/s eta 0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.8.1


**wordnet 사용방법**:
Natural Language Toolkit (NLTK) 라이브러리를 사용한 예제

In [1]:
import nltk
# Fetch the WordNet corpus data used by the `nltk.corpus.wordnet` examples
# below. As the last expression of the cell, the call's return value is shown
# as the cell output.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from nltk.corpus import wordnet

# Look up every synset for the word 'dog'.
# NOTE: `synsets` is reused by the hypernym/hyponym cell further down.

synsets = wordnet.synsets('dog')

# Print the definition (gloss) of the first synset.
print("dog의 정의:", synsets[0].definition())

dog의 정의: a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds


In [3]:
# Look up every synset for the word 'happy'.
happy_synsets = wordnet.synsets('happy')

# Synonyms: the name of every lemma of every synset (duplicates are kept).
print("happy의 동의어:", [lemma.name() for synset in happy_synsets for lemma in synset.lemmas()])

# Antonyms: for each lemma that has at least one antonym, the name of its
# first antonym only.
print("happy의 반의어:", [lemma.antonyms()[0].name() for synset in happy_synsets for lemma in synset.lemmas() if lemma.antonyms()])

happy의 동의어: ['happy', 'felicitous', 'happy', 'glad', 'happy', 'happy', 'well-chosen']
happy의 반의어: ['unhappy']


In [4]:
# Hypernyms (more general synsets) of the first 'dog' synset.
# `synsets` comes from the earlier cell that looked up 'dog'.
dog_synset = synsets[0]
hypernyms = dog_synset.hypernyms()
print("dog의 상위어:", [hypernym.name() for hypernym in hypernyms])

# Hyponyms (more specific synsets) of the same 'dog' synset.
hyponyms = dog_synset.hyponyms()
print("dog의 하위어:", [hyponym.name() for hyponym in hyponyms])

dog의 상위어: ['canine.n.02', 'domestic_animal.n.01']
dog의 하위어: ['basenji.n.01', 'corgi.n.01', 'cur.n.01', 'dalmatian.n.02', 'great_pyrenees.n.01', 'griffon.n.02', 'hunting_dog.n.01', 'lapdog.n.01', 'leonberg.n.01', 'mexican_hairless.n.01', 'newfoundland.n.01', 'pooch.n.01', 'poodle.n.01', 'pug.n.01', 'puppy.n.01', 'spitz.n.01', 'toy_dog.n.01', 'working_dog.n.01']


***wordnet 관련 패키지 nltk import***

In [5]:
# Imports first: standard library, then NLTK.
import sys

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# Data packages needed by the cells below, fetched in the same order as before:
# WordNet, the punkt tokenizer models, and the stop-word lists.
nltk.download('wordnet')
nltk.download('punkt')
# Trailing ';' keeps the cell from echoing the return value of the last call.
nltk.download('stopwords');

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


***주어진 단어와 그 단어가 나타난 문장에 대해 Best Sense 추출***

In [6]:
def disambiguate(word, sentence, stopwords):
    """Pick the WordNet sense of ``word`` that best matches ``sentence``.

    Implements a simplified Lesk algorithm: every sense of ``word`` is scored
    by the number of non-stop-word tokens its signature shares with the
    tokens of ``sentence``, and the highest-scoring sense wins.

    Args:
        word: The word to disambiguate.
        sentence: The sentence providing the context for ``word``.
        stopwords: Collection of tokens to ignore when counting overlap.

    Returns:
        The sense with the largest overlap. When no sense overlaps with the
        context at all, the first sense returned by WordNet. ``None`` when
        WordNet has no synsets for ``word``.
    """
    word_senses = wordnet.synsets(word)
    if not word_senses:
        # Unknown word: there is nothing to choose from, and indexing [0]
        # below would raise IndexError.
        return None

    context = set(word_tokenize(sentence))

    # Default to the first listed sense (assumed to be the most frequent one);
    # it is only replaced by a sense with a strictly larger overlap, so ties
    # keep the earlier sense.
    best_sense = word_senses[0]
    max_overlap = 0
    for sense in word_senses:
        overlap = compute_overlap(tokenized_gloss(sense), context, stopwords)
        if overlap > max_overlap:
            max_overlap, best_sense = overlap, sense

    return best_sense

***sense의 definition에 대한 모든 token 추출***

In [7]:
def tokenized_gloss(sense):
    """Build the signature (set of tokens) of a sense.

    Args:
        sense: Object exposing ``definition()`` (a string) and ``examples()``
            (an iterable of strings), such as the synsets returned by
            ``wordnet.synsets``.

    Returns:
        set: The tokens of the definition together with the tokens of every
        example sentence, as produced by ``word_tokenize``.
    """
    tokens = set(word_tokenize(sense.definition()))
    for example in sense.examples():
        # set.union() returns a new set and leaves `tokens` unchanged, so its
        # result used to be thrown away; update() merges the tokens in place.
        tokens.update(word_tokenize(example))
    return tokens

***겹치는 단어 비교***

In [8]:
def compute_overlap(signature, context, stopwords):
    """Count the tokens shared by a sense signature and a sentence context.

    Args:
        signature: Set of tokens describing one sense.
        context: Iterable of tokens taken from the sentence.
        stopwords: Iterable of tokens that must not be counted.

    Returns:
        int: Number of distinct tokens that occur in both ``signature`` and
        ``context`` and are not stop words.
    """
    shared_tokens = signature.intersection(context)
    # Drop stop words after intersecting; the surviving set is the same as
    # filtering the signature first and intersecting afterwards.
    meaningful_tokens = shared_tokens.difference(stopwords)
    return len(meaningful_tokens)

***Main***

In [9]:
# English stop words defined by NLTK (e.g. i, my, they...), kept under their
# own name. Assigning the set back to `stopwords` would replace the imported
# corpus reader, and running this cell a second time would then fail on
# `stopwords.words`.
english_stopwords = set(stopwords.words('english'))
sentence = "They eat a meal"
context = set(word_tokenize(sentence))
word = 'eat'

print("Word :", word)
# Inspect the second listed sense of the target word.
syn = wordnet.synsets(word)[1]
print("Sense :", syn.name())
print("Definition :", syn.definition())
print("Sentence :", sentence)

# Signature of that sense and its overlap with the sentence, followed by the
# sense chosen when all senses of the word are compared.
signature = tokenized_gloss(syn)
print(signature)
print(compute_overlap(signature, context, english_stopwords))
print("Best sense: ", disambiguate(word, sentence, english_stopwords))

Word : eat


LookupError: ignored