# 4.5.4 [실습] 다항분포 나이브 베이즈를 활용한 영화 리뷰 분류

## 문제 정의

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

## 데이터 획득

In [2]:
# Labeled training corpus: five positive and five negative one-line movie
# reviews, written as (text, sentiment) pairs and expanded into records.
review_list = [
    {"movie_review": text, "type": sentiment}
    for text, sentiment in (
        ("this is great great movie. I will watch again", "positive"),
        ("I like this movie", "positive"),
        ("amazing movie in this year", "positive"),
        ("cool my boyfriend also said the movie is cool", "positive"),
        ("awesome of the awesome movie ever", "positive"),
        ("shame I wasted money and time", "negative"),
        (
            "regret on this move. I will never never what movie from this director",
            "negative",
        ),
        ("I do not like this movie", "negative"),
        ("I do not like actors in this movie", "negative"),
        ("boring boring sleeping movie", "negative"),
    )
]

# One row per review, with the columns "movie_review" and "type".
df = pd.DataFrame(review_list)
df

Unnamed: 0,movie_review,type
0,this is great great movie. I will watch again,positive
1,I like this movie,positive
2,amazing movie in this year,positive
3,cool my boyfriend also said the movie is cool,positive
4,awesome of the awesome movie ever,positive
5,shame I wasted money and time,negative
6,regret on this move. I will never never what m...,negative
7,I do not like this movie,negative
8,I do not like actors in this movie,negative
9,boring boring sleeping movie,negative


## 데이터 다듬기

In [3]:
# Encode the sentiment strings as integer class labels:
# "positive" -> 1, "negative" -> 0. Series.map yields NaN for any value that
# is not a key of the mapping, so an unexpected "type" would surface as NaN.
df["label"] = df["type"].map({"positive": 1, "negative": 0})

In [4]:
# Separate the model input (raw review text) from the target (integer label).
df_x = df["movie_review"]
df_y = df["label"]

In [6]:
# Bag-of-words encoding with CountVectorizer's default settings.
cv = CountVectorizer()
# Learn the vocabulary from the training reviews and count each term per
# review; the result has one row per review and one column per vocabulary term.
x_traincv = cv.fit_transform(df_x)
# Convert to a dense NumPy array so the counts can be inspected directly.
encoded_input = x_traincv.toarray()
encoded_input

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 2,
        0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0

In [9]:
# Decode the first encoded review back into the vocabulary terms it contains.
# reshape(1, -1) turns the 1-D row into a single-row 2-D array, which is the
# shape inverse_transform expects (one row per document).
cv.inverse_transform(encoded_input[0].reshape(1, -1))

[array(['again', 'great', 'is', 'movie', 'this', 'watch', 'will'],
       dtype='<U9')]

In [10]:
# Show the learned vocabulary; the term at position i names column i of the
# count matrix produced by the vectorizer.
cv.get_feature_names_out()

array(['actors', 'again', 'also', 'amazing', 'and', 'awesome', 'boring',
       'boyfriend', 'cool', 'director', 'do', 'ever', 'from', 'great',
       'in', 'is', 'like', 'money', 'move', 'movie', 'my', 'never', 'not',
       'of', 'on', 'regret', 'said', 'shame', 'sleeping', 'the', 'this',
       'time', 'wasted', 'watch', 'what', 'will', 'year'], dtype=object)

## 다항분포 나이브 베이즈 모델 학습하기

In [11]:
# Multinomial naive Bayes with default hyperparameters.
mnb = MultinomialNB()
# Cast the labels to plain integers before fitting.
y_train = df_y.astype("int")
# Fit on the term-count matrix of the training reviews and their labels.
mnb.fit(x_traincv, y_train)

## 테스트 데이터 다듬기

In [12]:
# Held-out reviews used only for evaluation: five positive, five negative,
# written as (text, sentiment) pairs and expanded into records.
test_feedback_list = [
    {"movie_review": text, "type": sentiment}
    for text, sentiment in (
        ("great great great movie ever", "positive"),
        ("I like this amazing movie", "positive"),
        ("my boyfriend said great movie ever", "positive"),
        ("cool cool cool", "positive"),
        ("awesome boyfriend said cool movie ever", "positive"),
        ("shame shame shame", "negative"),
        ("awesome director shame movie boring movie", "negative"),
        ("do not like this movie", "negative"),
        ("I do not like this boring movie", "negative"),
        ("aweful terrible boring movie", "negative"),
    )
]

test_df = pd.DataFrame(test_feedback_list)
# Same integer encoding as the training labels: positive -> 1, negative -> 0.
test_df["label"] = test_df["type"].map({"positive": 1, "negative": 0})

# Model input (review text) and expected label for each test review.
test_x, test_y = test_df["movie_review"], test_df["label"]

## 테스트

In [13]:
# Encode the test reviews with the vectorizer already fitted on the training
# reviews (transform, not fit_transform) so the columns match the vocabulary
# the model was trained on.
x_testcv = cv.transform(test_x)
# Predicted class label (1 = positive, 0 = negative) for each test review.
predictions = mnb.predict(x_testcv)

In [14]:
# Fraction of test reviews whose predicted label equals the expected label.
accuracy_score(test_y, predictions)

1.0