In [25]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score 
from sklearn.metrics import recall_score, f1_score, roc_auc_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np
import re

In [3]:
# Word whose WordNet senses are looked up here; `synsets` is reused by the next cell.
term = 'present'

# wn.synsets returns the list of synsets WordNet holds for the term.
synsets = wn.synsets(term)
# Show the container type, how many synsets were found, and the synsets themselves.
print(type(synsets), len(synsets), synsets)

<class 'list'> 18 [Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [4]:
# For every synset of the term, print its name, its lexname (shown under the
# 'POS' label, e.g. 'noun.time'), its definition and its lemma names.
for synset in synsets:
    report_rows = (
        ('###Synset name:', synset.name(), '###'),
        ('POS :', synset.lexname()),
        ('Definition:', synset.definition()),
        ('Lemmas', synset.lemma_names()),
    )
    for row in report_rows:
        print(*row)

###Synset name: present.n.01 ###
POS : noun.time
Definition: the period of time that is happening now; any continuous stretch of time including the moment of speech
Lemmas ['present', 'nowadays']
###Synset name: present.n.02 ###
POS : noun.possession
Definition: something presented as a gift
Lemmas ['present']
###Synset name: present.n.03 ###
POS : noun.communication
Definition: a verb tense that expresses actions or states at the time of speaking
Lemmas ['present', 'present_tense']
###Synset name: show.v.01 ###
POS : verb.perception
Definition: give an exhibition of to an interested audience
Lemmas ['show', 'demo', 'exhibit', 'present', 'demonstrate']
###Synset name: present.v.02 ###
POS : verb.communication
Definition: bring forward and present to the mind
Lemmas ['present', 'represent', 'lay_out']
###Synset name: stage.v.01 ###
POS : verb.creation
Definition: perform (a play), especially on a stage
Lemmas ['stage', 'present', 'represent']
###Synset name: present.v.04 ###
POS : verb.

In [6]:
# Synsets that are compared against each other.
tree = wn.synset('tree.n.01')
lion = wn.synset('lion.n.01')
tiger = wn.synset('tiger.n.02')
cat = wn.synset('cat.n.01')
dog = wn.synset('dog.n.01')

entities = [tree, lion, tiger, cat, dog]
# Row/column labels: the word part of each synset name ('tree.n.01' -> 'tree').
entity_names = [entity.name().split('.')[0] for entity in entities]

# Measure similarity: row i holds the path similarity between entities[i] and
# every entity (itself included), rounded to two decimals.
similarities = [
    [round(row_entity.path_similarity(col_entity), 2) for col_entity in entities]
    for row_entity in entities
]

similarity_df = pd.DataFrame(similarities, index=entity_names, columns=entity_names)
similarity_df


Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.07,0.07,0.08,0.12
lion,0.07,1.0,0.33,0.25,0.17
tiger,0.07,0.33,1.0,0.25,0.17
cat,0.08,0.25,0.25,1.0,0.2
dog,0.12,0.17,0.17,0.2,1.0


In [8]:
# Collect what swn.senti_synsets yields for 'slow' into a list so it can be
# counted and printed in one go.
senti_synsets = list(swn.senti_synsets('slow'))
print(type(senti_synsets), len(senti_synsets), senti_synsets)

<class 'list'> 11 [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]


In [10]:
# 'father.n.01': print the positive, negative and objective scores, in that order.
father = swn.senti_synset('father.n.01')
for score_of in (father.pos_score, father.neg_score, father.obj_score):
    print(score_of())

# 'lovely.a.01': only the positive score is printed.
lovely = swn.senti_synset('lovely.a.01')
print(lovely.pos_score())

0.0
0.0
1.0
0.625


# # Sentiment analysis of review data

In [11]:
# POS
def penn_to_wn(tag):
    """Translate a POS tag into the corresponding WordNet POS constant.

    Tags beginning with 'J', 'N', 'R' or 'V' map to wn.ADJ, wn.NOUN, wn.ADV and
    wn.VERB respectively. Any other tag yields None.
    """
    prefix_to_pos = (
        ('J', wn.ADJ),
        ('N', wn.NOUN),
        ('R', wn.ADV),
        ('V', wn.VERB),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    return None

In [20]:
def swn_polarity(text):
    """Label a text as positive (1) or negative (0) using SentiWordNet scores.

    The text is split into sentences, each sentence is tokenised and POS-tagged,
    and every noun, adjective or adverb that lemmatises to a non-empty string and
    has at least one WordNet synset contributes (pos_score - neg_score) of its
    first synset to a running total.

    Returns:
        1 when at least one token was scored and the total is >= 0, otherwise 0
        (this includes the case where no token could be scored).
    """
    wnl = WordNetLemmatizer()
    scored_tags = (wn.NOUN, wn.ADJ, wn.ADV)
    total = 0.0
    n_scored = 0

    # Per sentence: word tokens -> POS tags -> SentiSynset -> accumulate the score.
    for sentence in sent_tokenize(text):
        for token, penn_tag in pos_tag(word_tokenize(sentence)):
            pos = penn_to_wn(penn_tag)
            if pos in scored_tags:
                base = wnl.lemmatize(token, pos=pos)
                candidates = wn.synsets(base, pos=pos) if base else []
                if candidates:
                    # Only the first synset listed for the lemma is consulted.
                    senti = swn.senti_synset(candidates[0].name())
                    total += (senti.pos_score() - senti.neg_score())
                    n_scored += 1

    return 1 if n_scored and total >= 0 else 0

In [14]:
# NOTE(review): absolute, machine-specific path — this cell only runs where that exact
# location exists; consider a path relative to a configurable data directory.
# The file is tab-separated with a header row. quoting=3 is csv.QUOTE_NONE, so quote
# characters are kept as literal text (visible in the id/review columns of the output).
review_df = pd.read_csv(r'C:/Users/AMD3600/git/MLguide/Text_Analysis/word2vec-nlp-tutorial/labeledTrainData.tsv', 
                        header=0, sep="\t", quoting=3)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [17]:
# Replace the literal '<br />' tags with a space, then replace every character that is
# not an ASCII letter with a space. Both steps use the vectorised .str accessor instead
# of a per-row Python lambda; regex= is spelled out because the default of
# Series.str.replace differs between pandas versions. Missing values pass through
# unchanged instead of raising inside re.sub.
review_df['review'] = (
    review_df['review']
    .str.replace('<br />', ' ', regex=False)
    .str.replace('[^a-zA-Z]', ' ', regex=True)
)

In [21]:
# Predict a label for every review with the SentiWordNet classifier, then pull the
# true labels and the predictions out as arrays for evaluation.
review_df['preds'] = review_df['review'].apply(swn_polarity)
y_target = review_df['sentiment'].values
preds = review_df['preds'].values

In [23]:
def get_clf_eval(y_test=None, pred=None):
    """Print the confusion matrix and summary metrics for a set of predictions.

    Args:
        y_test: True labels.
        pred: Predicted labels. They are also what roc_auc_score receives here.

    Prints the confusion matrix followed by one line with accuracy, precision,
    recall, F1 and ROC-AUC, each formatted to four decimals. Returns None.
    """
    confusion = confusion_matrix(y_test, pred)
    # Order matters: it must match the placeholders {0}..{4} in the summary line.
    scorers = (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
    scores = [scorer(y_test, pred) for scorer in scorers]

    # The four spaces before 'F1' are part of the emitted text and kept as-is.
    summary = (
        '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},'
        '    F1: {3:.4f}, AUC:{4:.4f}'
    )
    print('오차 행렬')
    print(confusion)
    print(summary.format(*scores))

In [24]:
# Print a banner, then score the SentiWordNet predictions against the true labels.
print('### SentiWordNet 예측 성능 평가 ###')
get_clf_eval(y_target, preds)

### SentiWordNet 예측 성능 평가 ###
오차 행렬
[[7668 4832]
 [3636 8864]]
정확도: 0.6613, 정밀도: 0.6472, 재현율: 0.7091,    F1: 0.6767, AUC:0.6613


# # VADER

In [27]:
# Score the review stored under index label 0 with VADER and print the returned
# dict of scores.
senti_analyzer = SentimentIntensityAnalyzer()
senti_scores = senti_analyzer.polarity_scores(review_df['review'][0])
print(senti_scores)

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}


In [28]:
def vader_polarity(review, threshold=0.1, analyzer=None):
    """Classify a review as positive (1) or negative (0) from its VADER compound score.

    Args:
        review: Text to score.
        threshold: Minimum compound score for a positive label.
        analyzer: Optional object exposing ``polarity_scores(text)``. When omitted,
            a single SentimentIntensityAnalyzer is created on first use and reused
            by every later call.

    Returns:
        1 if the compound score is >= threshold, otherwise 0.
    """
    if analyzer is None:
        # Constructing the analyzer loads the VADER lexicon, which is loop-invariant
        # work; doing it once instead of once per review keeps Series.apply cheap.
        analyzer = getattr(vader_polarity, '_shared_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            vader_polarity._shared_analyzer = analyzer

    agg_score = analyzer.polarity_scores(review)['compound']
    return 1 if agg_score >= threshold else 0

In [29]:
# Label every review with VADER using a compound-score threshold of 0.1, then
# evaluate the predictions against the true labels.
review_df['vader_preds'] = review_df['review'].apply(vader_polarity, args=(0.1,))
y_target = review_df['sentiment'].values
vader_preds = review_df['vader_preds'].values

print('### vader ###')
get_clf_eval(y_target, vader_preds)

### vader ###
오차 행렬
[[ 6736  5764]
 [ 1867 10633]]
정확도: 0.6948, 정밀도: 0.6485, 재현율: 0.8506,    F1: 0.7359, AUC:0.6948
