# 비지도학습 감성분석 - Lexicon 기반

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from google.colab import files
up = files.upload()

Saving labeledTrainData.tsv to labeledTrainData.tsv


### WordNet Synset 및 SentiWordNet SentiSynset 클래스

In [4]:
from nltk.corpus import wordnet

In [None]:
import nltk
nltk.download('wordnet')

In [6]:
term = 'lemon'
synsets = wordnet.synsets(term)

In [8]:
print(synsets)
type(synsets), len(synsets)

[Synset('lemon.n.01'), Synset('gamboge.n.02'), Synset('lemon.n.03'), Synset('lemon.n.04'), Synset('lemon.n.05')]


(list, 5)

In [40]:
for synset in synsets:
    print(f'##### name: {synset.name()} #####')
    print('POS:', synset.lexname())
    print('정의:', synset.definition())
    print('표제어:', synset.lemma_names())

##### name: lemon.n.01 #####
POS: noun.food
정의: yellow oval fruit with juicy acidic flesh
표제어: ['lemon']
##### name: gamboge.n.02 #####
POS: noun.attribute
정의: a strong yellow color
표제어: ['gamboge', 'lemon', 'lemon_yellow', 'maize']
##### name: lemon.n.03 #####
POS: noun.plant
정의: a small evergreen tree that originated in Asia but is widely cultivated for its fruit
표제어: ['lemon', 'lemon_tree', 'Citrus_limon']
##### name: lemon.n.04 #####
POS: noun.cognition
정의: a distinctive tart flavor characteristic of lemons
표제어: ['lemon']
##### name: lemon.n.05 #####
POS: noun.artifact
정의: an artifact (especially an automobile) that is defective or unsatisfactory
표제어: ['lemon', 'stinker']


- 어휘 유사도

In [10]:
# synsets(): 전체 단어집
for synset in wordnet.synsets('tiger'):
    print(synset.name(), synset.definition())

tiger.n.01 a fierce or audacious person
tiger.n.02 large feline of forests in most of Asia having a tawny coat with black stripes; endangered


In [11]:
# synset(): 특정 단어
tiger = wordnet.synset('tiger.n.02')
tree = wordnet.synset('tree.n.01')
lion = wordnet.synset('lion.n.01')
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')

In [None]:
# 어휘 유사도 확인
tiger.path_similarity(lion), tiger.path_similarity(dog), tiger.path_similarity(tree)

(0.3333333333333333, 0.16666666666666666, 0.07142857142857142)

In [12]:
# Pairwise path similarity between the five synsets (row = source, column = target)
entities = [tree, lion, tiger, cat, dog]
similarities = [
    [entity.path_similarity(another) for another in entities]
    for entity in entities
]

In [13]:
# Wrap the similarity matrix in a labelled DataFrame; the row/column labels are
# listed in the same order as the `entities` list used to build `similarities`.
df = pd.DataFrame(similarities, columns=['tree', 'lion', 'tiger', 'cat', 'dog'],
                  index=['tree', 'lion', 'tiger', 'cat', 'dog'])
df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.071429,0.071429,0.076923,0.125
lion,0.071429,1.0,0.333333,0.25,0.166667
tiger,0.071429,0.333333,1.0,0.25,0.166667
cat,0.076923,0.25,0.25,1.0,0.2
dog,0.125,0.166667,0.166667,0.2,1.0


- SentiSynset 클래스

In [15]:
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [66]:
from nltk.corpus import sentiwordnet
senti_synsets = list(sentiwordnet.senti_synsets('slow'))

In [67]:
senti_synsets

[SentiSynset('decelerate.v.01'),
 SentiSynset('slow.v.02'),
 SentiSynset('slow.v.03'),
 SentiSynset('slow.a.01'),
 SentiSynset('slow.a.02'),
 SentiSynset('dense.s.04'),
 SentiSynset('slow.a.04'),
 SentiSynset('boring.s.01'),
 SentiSynset('dull.s.08'),
 SentiSynset('slowly.r.01'),
 SentiSynset('behind.r.03')]

In [68]:
print(senti_synsets[0])

<decelerate.v.01: PosScore=0.0 NegScore=0.0>


In [44]:
# father 단어의 긍정/부정/객관성 지수
father = sentiwordnet.senti_synset('father.n.01')
father.pos_score(), father.neg_score(), father.obj_score()

(0.0, 0.0, 1.0)

In [45]:
# mother 단어의 긍정/부정/객관성 지수
mother = sentiwordnet.senti_synset('mother.n.01')
mother.pos_score(), mother.neg_score(), mother.obj_score()

(0.0, 0.0, 1.0)

In [46]:
# fabulous 단어의 긍정/부정/객관성 지수
fabulous = sentiwordnet.senti_synset('fabulous.a.01')
fabulous.pos_score(), fabulous.neg_score(), fabulous.obj_score()

(0.875, 0.125, 0.0)

In [47]:
# love 단어의 긍정/부정/객관성 지수
love = sentiwordnet.senti_synset('love.v.01')
love.pos_score(), love.neg_score(), love.obj_score()

(0.5, 0.0, 0.5)

- 감성지수 계산

In [None]:
nltk.download('punkt')

In [50]:
from nltk import word_tokenize, pos_tag
sentence = "It's good to see you again."
word_list = word_tokenize(sentence)
word_list

['It', "'s", 'good', 'to', 'see', 'you', 'again', '.']

In [None]:
nltk.download('averaged_perceptron_tagger')

In [53]:
pos_tag(word_list)

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('good', 'JJ'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('you', 'PRP'),
 ('again', 'RB'),
 ('.', '.')]

In [32]:
wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB

('n', 'a', 'r', 'v')

In [54]:
def penn_to_wordnet(tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant.

    Only the leading letter of the tag decides the result:
    N -> wordnet.NOUN, J -> wordnet.ADJ, R -> wordnet.ADV, V -> wordnet.VERB.

    Args:
        tag: Penn Treebank tag string (e.g. 'NN', 'JJ', 'VBZ').

    Returns:
        The matching WordNet POS constant, or None when the tag starts with
        none of the four letters above.
    """
    prefix_to_pos = (
        ('N', wordnet.NOUN),
        ('J', wordnet.ADJ),
        ('R', wordnet.ADV),
        ('V', wordnet.VERB),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    return None

In [55]:
for word, pos in pos_tag(word_list):
    print(word, penn_to_wordnet(pos))

It None
's v
good a
to None
see v
you None
again r
. None


- Sentence로부터 감성지수를 계산하는 과정

In [56]:
sentence = "It's good to see you again."
word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
word_list

['good', 'see', 'you', 'again']

In [57]:
pos_tag(word_list)

[('good', 'JJ'), ('see', 'NN'), ('you', 'PRP'), ('again', 'RB')]

In [69]:
# Print the positive/negative scores of the first SentiWordNet sense of each word
for word, pos in pos_tag(word_list):
    wn_tag = penn_to_wordnet(pos)
    if not wn_tag:  # tag has no WordNet counterpart -> nothing to look up
        continue
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
    if not synsets:  # no SentiWordNet entry for this word/POS pair; avoid IndexError
        continue
    synset = synsets[0]
    print(synset)

<good.a.01: PosScore=0.75 NegScore=0.0>
<see.n.01: PosScore=0.0 NegScore=0.0>
<again.r.01: PosScore=0.0 NegScore=0.0>


In [80]:
# Sentence sentiment score: sum of (positive - negative) over each word's first sense
sentiment = 0
for word, pos in pos_tag(word_list):
    wn_tag = penn_to_wordnet(pos)
    if not wn_tag:  # tag has no WordNet counterpart -> word is not scored
        continue
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
    if not synsets:  # no SentiWordNet entry for this word/POS pair; avoid IndexError
        continue
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()    # polarity of this word
print(sentiment)

0.75


In [71]:
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [81]:
# Sentence sentiment score, looking words up by their lemma
sentiment = 0
for word, pos in pos_tag(word_list):
    wn_tag = penn_to_wordnet(pos)
    if not wn_tag:  # tag has no WordNet counterpart -> word is not scored
        continue
    lemma = lemmatizer.lemmatize(word, wn_tag)  # reduce the word to its lemma
    synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
    if not synsets:  # no SentiWordNet entry for this lemma/POS pair; avoid IndexError
        continue
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()
print(sentiment)

0.75


- Document에서 감성지수를 계산하는 과정 및 함수

In [125]:
from nltk import sent_tokenize
document = '''
It would be easy to call this a knockoff of Indiana Jones, but in some ways it kind of knows it, has a fun self aware vibe to it.
It's merely just a satire of those treasure adventure films.
Sandra Bullock and Channing Tatum have good chemistry, they just play off each other perfectly you can tell they are having a blast.
Tatum is having fun paroding his pretty boy image. Daniel Radcliffe is quite the scene stealer has an over the top villain.
It's just a fun movie, delivers some incredible laughs, and not for the faint hearted.
'''

In [132]:
# Document sentiment score: sum of (positive - negative) over every scorable word
sentiment = 0.0
for sentence in sent_tokenize(document):    # document -> sentences
    # sentence -> word tokens, keeping only tokens longer than two characters
    word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
    for word, pos in pos_tag(word_list):
        wn_tag = penn_to_wordnet(pos)
        if not wn_tag:
            continue
        lemma = lemmatizer.lemmatize(word, wn_tag)
        synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
        if synsets:
            synset = synsets[0]
            sentiment += synset.pos_score() - synset.neg_score()
        else:
            # Empty list: SentiWordNet has no entry, so show the skipped word
            print(word)
if sentiment >= 0:
    print(f'감성지수: {sentiment}이므로 긍정입니다.')
else:
    print(f'감성지수: {sentiment}이므로 부정입니다.')

easy
fun
satire
treasure
Sandra
Channing
paroding
pretty
Radcliffe
hearted
감성지수: 0.875이므로 긍정입니다.


In [133]:
def swn_polarity(text):
    """Classify text as positive (1) or negative (0) with SentiWordNet.

    The text is split into sentences and word tokens (tokens of two characters
    or fewer are dropped), POS-tagged, lemmatized, and looked up in
    SentiWordNet. For each word that has an entry, the first sense's
    (positive - negative) score is added to a running total.

    Args:
        text: The document to score.

    Returns:
        1 when the total score is >= 0, otherwise 0.
    """
    lemmatizer = WordNetLemmatizer()
    sentiment = 0.0
    for sentence in sent_tokenize(text):
        tokens = [token for token in word_tokenize(sentence) if len(token) > 2]
        for token, penn_tag in pos_tag(tokens):
            wn_tag = penn_to_wordnet(penn_tag)
            if not wn_tag:
                # Tag has no WordNet counterpart
                continue
            lemma = lemmatizer.lemmatize(token, wn_tag)
            senses = list(sentiwordnet.senti_synsets(lemma, wn_tag))
            if senses:
                first_sense = senses[0]
                sentiment += first_sense.pos_score() - first_sense.neg_score()
    if sentiment >= 0:
        return 1
    return 0

- IMDB 영화평 감성분석

In [134]:
df = pd.read_csv('labeledTrainData.tsv', sep='\t', quoting=3)
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [135]:
# Replace HTML line breaks with a space (plain substring match, not a regex)
df.review = df.review.str.replace('<br />', ' ', regex=False)

# Strip punctuation and digits: every non-ASCII-letter character becomes a space.
# regex=True must be explicit: since pandas 2.0 str.replace defaults to a literal
# match, which would leave the text unchanged for this character-class pattern.
df.review = df.review.str.replace('[^A-Za-z]', ' ', regex=True).str.strip()
df.review[0][:1000]

'With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for  

In [136]:
df.shape

(25000, 3)

In [137]:
# Work on the first 10,000 reviews only. Take an explicit copy so that the
# prediction columns added later are written to an independent frame rather
# than to a slice of the original (avoids SettingWithCopyWarning).
df = df.head(10000).copy()

In [145]:
%time df['pred'] = df.review.apply(lambda x: swn_polarity(x))

CPU times: user 3min 12s, sys: 1.07 s, total: 3min 13s
Wall time: 3min 19s


In [150]:
df.head()

Unnamed: 0,id,sentiment,review,pred
0,"""5814_8""",1,With all this stuff going down at the moment w...,1
1,"""2381_9""",1,The Classic War of the Worlds by Timothy Hin...,1
2,"""7759_3""",0,The film starts with a manager Nicholas Bell ...,0
3,"""3630_4""",0,It must be assumed that those who praised this...,0
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...,0


In [151]:
# 정확도 계산
from sklearn.metrics import accuracy_score
accuracy_score(df.sentiment, df.pred)

0.6309

In [152]:
def swn_polarity(text):
    """Classify text as positive (1) or negative (0) via WordNet + SentiWordNet.

    Each sentence is tokenized (tokens of two characters or fewer are dropped)
    and POS-tagged with NLTK. Words tagged as noun, adjective, adverb or verb
    are lemmatized and looked up in WordNet; the first synset found is then
    resolved to its SentiWordNet entry, whose (positive - negative) score is
    added to the running total.

    Args:
        text: The document to score.

    Returns:
        1 when at least one word was scored and the total score is >= 0;
        0 otherwise (including when no word could be scored).
    """
    scorable_tags = (wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB)
    lemmatizer = WordNetLemmatizer()

    # Running sentiment total and the number of words that contributed to it
    sentiment = 0.0
    tokens_count = 0

    for raw_sentence in sent_tokenize(text):
        # NLTK-based tokenization and POS tagging of the sentence
        kept_words = [w for w in word_tokenize(raw_sentence) if len(w) > 2]
        for word, tag in pos_tag(kept_words):
            # Convert the Penn tag to a WordNet tag, then lemmatize
            wn_tag = penn_to_wordnet(tag)
            if wn_tag in scorable_tags:
                lemma = lemmatizer.lemmatize(word, pos=wn_tag)
                # Look up WordNet synsets for the lemma under that POS
                synsets = wordnet.synsets(lemma, pos=wn_tag) if lemma else None
                if synsets:
                    # Score the first sense: positive adds, negative subtracts
                    swn_synset = sentiwordnet.senti_synset(synsets[0].name())
                    sentiment += swn_synset.pos_score() - swn_synset.neg_score()
                    tokens_count += 1

    # Positive (1) only if something was scored and the total is non-negative
    return 1 if tokens_count and sentiment >= 0 else 0

### VADER Lexicon을 이용한 감성분석

In [42]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [138]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
senti_analyzer = SentimentIntensityAnalyzer()
senti_analyzer.polarity_scores(df.review[0])

{'compound': -0.7943, 'neg': 0.13, 'neu': 0.743, 'pos': 0.127}

In [139]:
def vader_polarity(document, threshold=0.1):
    """Classify a document as positive (1) or negative (0) with VADER.

    Uses the module-level `senti_analyzer` to compute polarity scores and
    compares the 'compound' score against the threshold.

    Args:
        document: The text to score.
        threshold: Minimum compound score for a positive label.

    Returns:
        1 when the compound score is >= threshold, otherwise 0.
    """
    compound = senti_analyzer.polarity_scores(document)['compound']
    if compound >= threshold:
        return 1
    return 0

In [153]:
%time df['vader'] = df.review.apply(lambda x: vader_polarity(x, 0.1))

CPU times: user 30.1 s, sys: 303 ms, total: 30.4 s
Wall time: 30.4 s


In [154]:
accuracy_score(df.sentiment, df.vader)

0.6997