<a href="https://colab.research.google.com/github/chu-ise/411A-2022/blob/main/notebooks/08/01_sentiment_lexicon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

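# Sentiment Analysis

This notebook compares three lexicon-based sentiment analyzers (TextBlob, AFINN, and VADER) with a machine-learning baseline (TF-IDF features + Multinomial Naive Bayes) on the NLTK movie reviews corpus.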



In [None]:
%%capture
%pip install ekorpkit[model,visualize]==0.1.22.post0.dev10
%pip install ipython-autotime
%load_ext autotime

In [None]:
from ekorpkit.models.metrics import evaluate_classification_performance
from ekorpkit.visualize.classification import plot_confusion_matrix

from ekorpkit import eKonf

# Compose the confusion-matrix plotting configuration from ekorpkit's
# 'visualize/plot' config group; `cfg` is unpacked into plot_confusion_matrix
# by the evaluation cells of this notebook.
config_group='visualize/plot=confusion_matrix'
cfg = eKonf.compose(config_group=config_group)
# NOTE(review): if the matrix is built with labels in sorted order
# ('neg', 'pos') -- scikit-learn's default when no label order is given --
# this ordering would mislabel the axes. Confirm against
# evaluate_classification_performance before trusting the plot labels.
cfg.display_labels = ['pos', 'neg']

## Lexicon-based Sentiment Analysis

### NLTK Movie Reviews Dataset

In [None]:
# Download the NLTK movie_reviews corpus and load it into two parallel lists:
# `reviews` (raw text) and `categories` (one label per review).
import nltk
nltk.download('movie_reviews')

from nltk.corpus import movie_reviews

print('#review count:', len(movie_reviews.fileids())) # fileids() returns the ids of the movie review documents
print('#samples of file ids:', movie_reviews.fileids()[:10]) # show only the first 10 ids
print('#categories of reviews:', movie_reviews.categories()) # the labels, i.e. whether a review is positive or negative
print('#num of "neg" reviews:', len(movie_reviews.fileids(categories='neg'))) # ids of the documents labelled negative
print('#num of "pos" reviews:', len(movie_reviews.fileids(categories='pos'))) # ids of the documents labelled positive

fileid = movie_reviews.fileids()[0] # id of the first document
print('#id of the first review:', fileid)
print('#part of the first review:', movie_reviews.raw(fileid)[:500]) # show only the first 500 characters of the first document
print('#sentiment of the first review:', movie_reviews.categories(fileid)) # sentiment label(s) of the first document

fileids = movie_reviews.fileids() # all file ids in the movie review data
reviews = [movie_reviews.raw(fileid) for fileid in fileids] # raw text of each file, looked up by file id
# categories(fileid) returns a sequence; its first element is kept as the review's label.
categories = [movie_reviews.categories(fileid)[0] for fileid in fileids] 

### TextBlob

- https://textblob.readthedocs.io/en/dev/quickstart.html

In [None]:
%%capture
%pip install -U textblob

In [None]:
from textblob import TextBlob

# Sanity check: run TextBlob on the first review and print its sentiment
# (the `polarity` field of this object is what the classifier below thresholds).
result = TextBlob(reviews[0])
print(result.sentiment)

In [None]:
def sentiment_TextBlob(docs, threshold=0):
    """Label each document 'pos' or 'neg' from its TextBlob polarity.

    Args:
        docs: Iterable of text documents to classify.
        threshold: Cut-off applied to ``TextBlob(doc).sentiment.polarity``.
            A document is labelled 'pos' only when its polarity is strictly
            greater than this value. The default of 0 keeps the original
            behaviour, so a polarity of exactly 0 is labelled 'neg'.

    Returns:
        list: One 'pos'/'neg' label per document, in input order.
    """
    return [
        'pos' if TextBlob(doc).sentiment.polarity > threshold else 'neg'
        for doc in docs
    ]

# Label every movie review with the TextBlob-based classifier.
# NOTE: `predictions` is rebound by each later evaluation section of this notebook.
predictions = sentiment_TextBlob(reviews)

In [None]:
# Compare the TextBlob predictions with the gold labels and plot the result.
# `cm` is presumably a confusion matrix (it is handed straight to
# plot_confusion_matrix) -- confirm against ekorpkit's metrics module.
cm = evaluate_classification_performance(categories, predictions)
plot_confusion_matrix(cm, **cfg)

### AFINN

- https://github.com/fnielsen/afinn 
- http://corpustext.com/reference/sentiment_afinn.html

In [None]:
%%capture
%pip install afinn

In [None]:
from afinn import Afinn

def sentiment_Afinn(docs, threshold=0):
    """Label each document 'pos' or 'neg' from its AFINN score.

    Args:
        docs: Iterable of text documents to classify.
        threshold: Cut-off applied to ``Afinn.score(doc)``. A document is
            labelled 'pos' only when its score is strictly greater than this
            value. The default of 0 keeps the original behaviour, so a score
            of exactly 0 is labelled 'neg'.

    Returns:
        list: One 'pos'/'neg' label per document, in input order.
    """
    # A single scorer (with emoticon scoring enabled) is shared by all documents.
    afn = Afinn(emoticons=True)
    return ['pos' if afn.score(doc) > threshold else 'neg' for doc in docs]

In [None]:
# Classify all reviews with the AFINN-based labeller, then evaluate against
# the gold labels and plot the result (this overwrites the earlier `predictions`/`cm`).
predictions = sentiment_Afinn(reviews)
cm = evaluate_classification_performance(categories, predictions)
plot_confusion_matrix(cm, **cfg)

### VADER

- https://github.com/cjhutto/vaderSentiment

In [None]:
%%capture
import nltk
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def sentiment_vader(docs, threshold=0):
    """Label each document 'pos' or 'neg' from its VADER compound score.

    Args:
        docs: Iterable of text documents to classify.
        threshold: Cut-off applied to the 'compound' entry of
            ``SentimentIntensityAnalyzer.polarity_scores(doc)``. A document is
            labelled 'pos' only when its compound score is strictly greater
            than this value. The default of 0 keeps the original behaviour,
            so a compound score of exactly 0 is labelled 'neg'. The VADER
            README suggests treating compound scores of about 0.05 and above
            as positive; pass a threshold near that value to approximate it.

    Returns:
        list: One 'pos'/'neg' label per document, in input order.
    """
    # A single analyzer instance is shared by all documents in this call.
    analyser = SentimentIntensityAnalyzer()
    return [
        'pos' if analyser.polarity_scores(doc)['compound'] > threshold else 'neg'
        for doc in docs
    ]

In [None]:
# Classify all reviews with the VADER-based labeller, then evaluate against
# the gold labels and plot the result (this overwrites the earlier `predictions`/`cm`).
predictions = sentiment_vader(reviews)
cm = evaluate_classification_performance(categories, predictions)
plot_confusion_matrix(cm, **cfg)

## ML-based Sentiment Analysis

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(reviews, categories, test_size=0.2, random_state=7)

print('Train set count: ', len(X_train))
print('Test set count: ', len(X_test))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Fit the TF-IDF vectorizer on the training reviews only; the test reviews are
# transformed with that same fitted vectorizer.
tfidf = TfidfVectorizer().fit(X_train) 

X_train_tfidf = tfidf.transform(X_train)
print('#Train set dimension:', X_train_tfidf.shape)
X_test_tfidf = tfidf.transform(X_test)
print('#Test set dimension:', X_test_tfidf.shape)

# Multinomial Naive Bayes on the TF-IDF features; alpha is scikit-learn's
# smoothing parameter for this estimator.
NB_clf = MultinomialNB(alpha=0.01)
NB_clf.fit(X_train_tfidf, y_train)
# Report the classifier's score on both splits; a large train/test gap indicates overfitting.
print('#Train set score: {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train)))
print('#Test set score: {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test)))

In [None]:
# Predict labels for the held-out reviews only and evaluate them against y_test
# (unlike the lexicon sections, which were scored on the full corpus).
predictions = NB_clf.predict(X_test_tfidf)
cm = evaluate_classification_performance(y_test, predictions)
plot_confusion_matrix(cm, **cfg)

In [None]:
# Sign-off: prompt for the student's name and ID, then echo both.
# NOTE: input() waits for interactive input, so this cell will stall an
# unattended "Run All" until values are entered.
name = input("What is your name? ")
sid = input("What is your student ID? ")
print("Name: " + name + "\nStudent ID: " + sid)