In [0]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix


In [0]:
# Multinomial logistic regression on the Rotten Tomatoes movie-review data.
# Each phrase is labelled from 0 (negative) through 2 (neutral) to 4 (positive).


# Load the data (tab-separated file).
movies = pd.read_csv('RottenTomato.tsv', sep='\t')


# EDA
# These checks sit in the middle of a cell, where the value of a bare
# expression is discarded, so each result is printed explicitly.
# info() writes its report itself and needs no print().
print(movies.head())
print(movies.tail())
movies.info()
print(movies.describe())


# Number of non-null entries per column.
print(movies.count())


# Missing values per column.
print(movies.isnull().sum())


# Class distribution of the sentiment labels.
print(movies['Sentiment'].value_counts())

# The column is passed through the `x` keyword: recent seaborn releases no
# longer accept it as the first positional argument (that slot is `data`).
sns.countplot(x='Sentiment', data=movies)
plt.show()


# Preprocessing: lower-case every phrase, then reduce it to letters separated
# by single spaces. (TF-IDF vectorization happens after the train/test split.)
data = movies['Phrase'].str.lower()   # lower-case conversion
target = movies['Sentiment']

# Step 1: replace every character that is not a-z (digits, punctuation, ...)
#         with a space.
# Step 2: collapse each run of whitespace into one space and trim the ends.
# The patterns are raw strings so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
# NOTE(review): assumes 'Phrase' has no missing values; a NaN entry would make
# re.sub raise TypeError - confirm against the isnull() check.
data = [
    re.sub(r'\s+', ' ', re.sub(r'[^a-z]', ' ', line)).strip()
    for line in data
]

# Preview the first few cleaned phrases (printed, because a bare expression in
# the middle of a cell is not displayed).
print(data[:5])


# Split into training and validation sets; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=1907081230,
)


# Convert the text into numeric TF-IDF vectors.
#   fit_transform : builds the vocabulary from the given texts, then computes TF-IDF
#   transform     : computes TF-IDF using the vocabulary that was already built
vectors = TfidfVectorizer()
vX_train = vectors.fit_transform(X_train)
vX_test = vectors.transform(X_test)



# Multinomial logistic regression on the TF-IDF features.
# `multi_class` is intentionally not passed: the parameter is deprecated in
# current scikit-learn, and with the 'saga' solver on a target with more than
# two classes the default already fits the multinomial model.
# `random_state` pins the data shuffling done by the stochastic 'saga' solver so
# that repeated runs produce the same coefficients.
lgr = LogisticRegression(solver='saga', random_state=1907081230)

lgr.fit(vX_train, y_train)

# Predicted sentiment label for every validation phrase.
pred = lgr.predict(vX_test)



# Model evaluation: accuracy on the training and validation sets, followed by
# the confusion matrix of the validation predictions.
print('훈련정확도', lgr.score(vX_train, y_train))
print('검증정확도', lgr.score(vX_test, y_test))

# Printed explicitly: as a bare expression in the middle of the cell the matrix
# would be computed and then silently discarded.
# Rows are the true labels, columns the predicted labels.
print(confusion_matrix(y_test, pred))
# Matrix recorded from an earlier run, kept for reference:
# v = ([[  325,   907,   455,    49,     4],
#       [  200,  2540,  3769,   272,    10],
#       [   47,  1053, 17493,  1303,    39],
#       [    6,   220,  3880,  3870,   306],
#       [    0,    28,   335,  1325,   579]])



# Spot-check the predictions: for validation rows 10..19 print the predicted
# label, the cleaned phrase and the true label.
yy_test = y_test.tolist()   # positional access to the true labels
for i in range(10, 20):
    print(f'{pred[i]} {X_test[i]} {yy_test[i]}')







1 generates little narrative momentum and invites unflattering comparisons to other installments in the ryan series 0
3 succeeds primarily 3
2 a point of view nor a compelling reason for being 2
3 for its originality 3
2 s not an original character siuation or joke in the entire movie 1
3 s packed to bursting with incident and with scores of characters some fictional some from history 3
2 where the plot should be 2
3 a celebration of feminine energy a tribute to the power of women to heal 3
2 third time s the charm yeah baby 4
2 about the business of making movies 2
