In [1]:
%cd /content/drive/MyDrive/multi/0428

/content/drive/MyDrive/multi/0428


# 비지도학습 감성분석 - Lexicon

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('./train/trainData.tsv', sep='\t', quoting=3)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
import warnings
warnings.filterwarnings('ignore')

### Wordnet Synset 및 SentiWordNet SentiSynset

In [12]:
import nltk

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [19]:
from nltk.corpus import wordnet

term = 'eat'
synsets = wordnet.synsets(term)

In [20]:
type(synsets), len(synsets)

(list, 6)

In [21]:
for synset in synsets:
    print(f'#### name: {synset.name()} ####')
    print('POS: ', synset.lexname())
    print('정의: ', synset.definition())
    print('표제어: ', synset.lemma_names())

#### name: eat.v.01 ####
POS:  verb.consumption
정의:  take in solid food
표제어:  ['eat']
#### name: eat.v.02 ####
POS:  verb.consumption
정의:  eat a meal; take a meal
표제어:  ['eat']
#### name: feed.v.06 ####
POS:  verb.consumption
정의:  take in food; used of animals only
표제어:  ['feed', 'eat']
#### name: eat.v.04 ####
POS:  verb.emotion
정의:  worry or cause anxiety in a persistent way
표제어:  ['eat', 'eat_on']
#### name: consume.v.05 ####
POS:  verb.consumption
정의:  use up (resources or materials)
표제어:  ['consume', 'eat_up', 'use_up', 'eat', 'deplete', 'exhaust', 'run_through', 'wipe_out']
#### name: corrode.v.01 ####
POS:  verb.change
정의:  cause to deteriorate due to the action of water, air, or an acid
표제어:  ['corrode', 'eat', 'rust']


- 어휘간의 유사도

In [23]:
# 단어, 품사를 모를 경우에는 synsets(word)으로 알아낸다.

for synset in wordnet.synsets('tiger'):
    print(synset.name(), synset.definition())

tiger.n.01 a fierce or audacious person
tiger.n.02 large feline of forests in most of Asia having a tawny coat with black stripes; endangered


In [24]:
# 단어, 품사를 아는 경우에는 synset()하면 된다.

tiger = wordnet.synset('tiger.n.02')
tree = wordnet.synset('tree.n.01')
lion = wordnet.synset('lion.n.01')
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')

In [27]:
# 단어간의 유사도
print(f'{tiger.path_similarity(lion):.3f}')
print(f'{tiger.path_similarity(tree):.3f}')
print(f'{tiger.path_similarity(dog):.3f}')
print(f'{tiger.path_similarity(cat):.3f}')

0.333
0.071
0.167
0.250


In [33]:
# 5개 단어간의 유사도

sim = []
entities = [tree, lion, tiger, cat, dog]

# for entity in entities:
#     tmp = []
#     for another in entities:
#         tmp.append(entity.path_similarity(another))
#     sim.append(tmp)

for entity in entities:
    tmp = [entity.path_similarity(another) for another in entities]
    sim.append(tmp)

res = pd.DataFrame(sim, columns=['tree', 'lion', 'tiger', 'cat', 'dog'], 
                   index = ['tree', 'lion', 'tiger', 'cat', 'dog'])
res.head()

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.071429,0.071429,0.076923,0.125
lion,0.071429,1.0,0.333333,0.25,0.166667
tiger,0.071429,0.333333,1.0,0.25,0.166667
cat,0.076923,0.25,0.25,1.0,0.2
dog,0.125,0.166667,0.166667,0.2,1.0


- SentiSynset 클래스

In [38]:
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [39]:
from nltk.corpus import sentiwordnet
senti_synsets = list(sentiwordnet.senti_synsets('slow'))

In [40]:
senti_synsets

[SentiSynset('decelerate.v.01'),
 SentiSynset('slow.v.02'),
 SentiSynset('slow.v.03'),
 SentiSynset('slow.a.01'),
 SentiSynset('slow.a.02'),
 SentiSynset('dense.s.04'),
 SentiSynset('slow.a.04'),
 SentiSynset('boring.s.01'),
 SentiSynset('dull.s.08'),
 SentiSynset('slowly.r.01'),
 SentiSynset('behind.r.03')]

In [44]:
# father 단어의 긍/부정/객관성 지수

father = sentiwordnet.senti_synset('father.n.01')
father.pos_score(), father.neg_score(), father.obj_score()

(0.0, 0.0, 1.0)

In [45]:
mother = sentiwordnet.senti_synset('mother.n.01')
mother.pos_score(), mother.neg_score(), mother.obj_score()

(0.0, 0.0, 1.0)

In [47]:
fabulous = sentiwordnet.senti_synset('fabulous.a.01')
fabulous.pos_score(), fabulous.neg_score(), fabulous.obj_score()

(0.875, 0.125, 0.0)

In [48]:
love = sentiwordnet.senti_synset('love.v.01')
love.pos_score(), love.neg_score(), love.obj_score()

(0.5, 0.0, 0.5)

- 감성지수 계산

In [52]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [50]:
from nltk import word_tokenize, pos_tag
sentence = "It's good to see you again"
word_list = word_tokenize(sentence)
word_list

['It', "'s", 'good', 'to', 'see', 'you', 'again']

In [53]:
pos_tag(word_list)

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('good', 'JJ'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('you', 'PRP'),
 ('again', 'RB')]

In [55]:
def pen_to_wordnet(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    if tag.startswith('V'):
        return wordnet.VERB

In [56]:
for word, pos in pos_tag(word_list):
    print(word, pen_to_wordnet(pos))

It None
's v
good a
to None
see v
you None
again r


- Sentence로부터 Senti_Synset 객체를 만드는 과정

In [57]:
sentence = "It's good to see you again"

word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
word_list

['good', 'see', 'you', 'again']

In [59]:
for word, pos in pos_tag(word_list):
    wn_tag = pen_to_wordnet(pos)
    if wn_tag:
        synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
        synset = synsets[0]
        print(synset)

<good.a.01: PosScore=0.75 NegScore=0.0>
<see.n.01: PosScore=0.0 NegScore=0.0>
<again.r.01: PosScore=0.0 NegScore=0.0>


In [60]:
sentiment = 0
for word, pos in pos_tag(word_list):
    wn_tag = pen_to_wordnet(pos)
    if wn_tag:
        synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()

sentiment

0.75

In [61]:
from nltk import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [62]:
# 표제어 추출까지 고려

sentiment = 0
for word, pos in pos_tag(word_list):
    wn_tag = pen_to_wordnet(pos)
    if wn_tag:
        lemma = lemmatizer.lemmatize(word, wn_tag)
        synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()

sentiment

0.75

- Document에서 감성지수를 계산하는 과정 및 함수

In [64]:
from nltk import sent_tokenize

doc = '''
Spider-Man: No Way Home is a phenomenal conclusion to the trilogy and Holland's best outing as Spidey yet.
Starts off fun, safe and familiar and then becomes extremely emotional, satisfying and full of great callbacks.
A love letter to all things Spider-Man.
Tom Holland gives an incredible lead performance once again, reliably charming and likeable but with a lot more emotional heft this time around.
Zendaya and Jacob Batalon both have perfect chemistry with Holland and are also incredible in their own ways.
Benedict Cumberbatch reaffirms why he's such a good Doctor Strange with his excellent dry wit and gravitas.
All the returning villains give strong performances but Willem Dafoe is definitely the standout performance with an unsettling and terrifying presence.
Jon Watts' direction is superb, the action sequences are thrillingly acrobatic once again but refreshingly impactful this time around.
There's a few homages to the styles of the previous iterations and some gorgeous imagery.
It's also perfectly paced with none of its roughly 2hr 30 minute runtime feeling dull or overly long due to a strong momentum established early on.
The CG is extremely impressive with only a few weak spots.
The music by Michael Giacchino is fantastic, a nice balance of themes from the previous iterations and use of Holland's iconic motifs which are slightly altered in ways that work as well as a few new additions that are beautifully epic.
'''

In [67]:
sentiment = 0.0
for sentence in sent_tokenize(doc):
    word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
    for word, pos in pos_tag(word_list):
        wn_tag = pen_to_wordnet(pos)
        if wn_tag:
            lemma = lemmatizer.lemmatize(word, wn_tag)
            synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
            if not synsets:
                print(word)
                continue
            synset = synsets[0]
            sentiment += synset.pos_score() - synset.neg_score()

print('============= 분석 끝 =================')
print('긍정' if sentiment >= 0 else '부정')

Spider-Man
Spidey
satisfying
Spider-Man
lead
Zendaya
Batalon
Cumberbatch
Strange
Willem
Dafoe
standout
unsettling
Jon
superb
thrillingly
impactful
runtime
Giacchino
iconic
긍정


In [80]:
# 특정 문서가 긍정인지 부정인지 중립인지 판단
def swn_polarity(doc):
    lemmatizer = WordNetLemmatizer()
    sentiment = 0.0
    for sentence in sent_tokenize(doc):
        word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
        for word, pos in pos_tag(word_list):
            wn_tag = pen_to_wordnet(pos)
            if wn_tag:
                lemma = lemmatizer.lemmatize(word, wn_tag)
                synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
                if synsets:
                    synset = synsets[0]
                    sentiment += synset.pos_score() - synset.neg_score()

    return 1 if sentiment >= 0 else 0

- IMDB 영화평 감성 분석

In [73]:
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [74]:
df.review = df.review.str.replace('<br />', ' ')
df.review = df.review.str.replace('[^a-zA-Z]', ' ').str.strip()

In [75]:
df.shape

(25000, 3)

In [76]:
#상위 데이터 10000개만 추출
df = df.head(10000)

In [78]:
df['pred'] = df.review.apply(lambda x: swn_polarity(x))

In [79]:
df.head()

Unnamed: 0,id,sentiment,review,pred
0,"""5814_8""",1,With all this stuff going down at the moment w...,긍정
1,"""2381_9""",1,The Classic War of the Worlds by Timothy Hin...,긍정
2,"""7759_3""",0,The film starts with a manager Nicholas Bell ...,부정
3,"""3630_4""",0,It must be assumed that those who praised this...,부정
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...,부정


In [92]:
# 정확도 계산

from sklearn.metrics import accuracy_score

y = np.array(df.sentiment)
pred = np.array(df.pred).astype(int)

accuracy_score(y, pred)

0.6309

### VADER Lexicon을 이용한 감성 분석

In [93]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [94]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyzer = SentimentIntensityAnalyzer()
senti_analyzer.polarity_scores(df.review[0])

{'compound': -0.7943, 'neg': 0.13, 'neu': 0.743, 'pos': 0.127}

In [95]:
def vader_polarity(doc, threshold=0.1):
    score = senti_analyzer.polarity_scores(doc)
    return 1 if score['compound'] >= threshold else 0

In [96]:
df['vader'] = df.review.apply(lambda x: vader_polarity(x))

In [97]:
accuracy_score(df.sentiment, df.vader)

0.6997