# 영화 리뷰 데이터 감성 분석

`Doc2Vec`, `LogisticRegression`을 이용해서 이진 분류를 수행한다.

In [None]:
# 모듈 불러 오기
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Flatten
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt

In [None]:
# Path configuration: base directory and the dataset directory beneath it.
root_path = "/content/drive/My Drive/멀티캠퍼스/[혁신성장] 인공지능 자연어처리 기반/[강의]/조성현 강사님"
data_path = root_path + "/dataset"

## _1_. 로지스틱 회귀 : 0.84528

### 데이터 준비

In [None]:
# Load 4-1.train_clean.csv from the dataset directory and render the frame.
train_csv = f"{data_path}/4-1.train_clean.csv"
df = pd.read_csv(train_csv)
display(df)

Unnamed: 0,review,sentiment
0,stuff going moment mj started listening music ...,1
1,classic war worlds timothy hines entertaining ...,1
2,film starts manager nicholas bell giving welco...,0
3,must assumed praised film greatest filmed oper...,0
4,superbly trashy wondrously unpretentious explo...,1
...,...,...
24995,seems like consideration gone imdb reviews fil...,0
24996,believe made film completely unnecessary first...,0
24997,guy loser get girls needs build picked stronge...,0
24998,minute documentary bu uel made early one spain...,0


In [None]:
# Pull the review texts and the sentiment labels out as plain Python lists.
reviews = df['review'].tolist()
sentiments = df['sentiment'].tolist()

In [None]:
# Split every review on whitespace into a list of tokens.
sentences = [review.split() for review in reviews]

print(sentences[:5])

[['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug', 'lord',

### Doc2Vec 임베딩

In [None]:
# Create the Doc2Vec model, or load it when a saved copy already exists.
import os

model_path = f"{data_path}/300features.doc2vec"
# Derive the flag from the file system. The flag used to be hard-coded to True
# even though its own comment noted that nothing had been saved yet.
model_saved = os.path.exists(model_path)

if model_saved:
    doc_model = Doc2Vec.load(model_path)
else:
    # Tag each document with its position so docvecs[i] belongs to sentences[i].
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
    doc_model = Doc2Vec(vector_size=300,
                        alpha=0.025,
                        min_alpha=0.00025,
                        min_count=10,
                        workers=4,
                        dm=1)
    doc_model.build_vocab(documents)
    # Fixed: this read `model.corpus_count`, but no `model` exists at this point;
    # the count has to come from doc_model itself.
    doc_model.train(documents, total_examples=doc_model.corpus_count, epochs=10)
    doc_model.save(model_path)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Inspect the vocabulary held by the Doc2Vec model: size and first 20 words.
keys = list(doc_model.wv.vocab)
print(f"단어 개수: {len(keys)}")
print("========= 샘플 확인 =========")
print(keys[:20])

단어 개수: 19717
['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought']


모델의 word vector 확인

In [None]:
# Fixed: every lookup in this cell went through `model`, which is not the
# Doc2Vec model (that one is bound to `doc_model`).
wv = doc_model.wv

# Vector of the word 'stuff'.
print(f"stuff:\n {wv['stuff']}")

# Word similarity as reported by the model.
print(f"dog, cat : {wv.similarity('dog', 'cat')}")
print(f"dog, cake : {wv.similarity('dog', 'cake')}")

# Word similarity as the raw dot product of the two vectors.
print(f"dog, cat : {np.dot(wv['dog'], wv['cat'])}")
print(f"dog, cake : {np.dot(wv['dog'], wv['cake'])}")

stuff:
 [-0.18611595 -0.36705476 -0.23880133 -0.38356516 -0.1215637  -0.31198502
 -0.1462326  -0.81891495 -0.02643439 -0.06654777  0.03929293 -0.8636567
  0.46589312  0.24193387 -0.15810192 -0.15600042  0.05049789 -0.59269065
 -0.03312543 -0.523009   -0.16240235  0.65150017 -0.13370064  0.13427418
 -0.3869068   0.01416051  0.8123483  -0.48103186  0.24293292 -0.3215318
 -0.41009226 -0.5711381   0.86399955  0.35488522  0.28767172  0.2832965
 -0.32490394 -0.6875084  -0.5823474  -0.2030511   0.06413027 -0.328916
  0.3617928   0.3433004   0.04010413  0.18418486  0.25569075 -0.7578971
 -0.16020748  0.5819255  -0.67376256 -0.27500638 -0.14619175 -0.18352857
 -0.0070066   0.14206877  0.04845129 -0.62029594  0.06632705  0.81629825
  1.1358607   0.01165283  0.47023997  0.29312617  0.4921738   0.5336238
  0.18848604 -0.10422531  0.43303135  0.512983    1.2141435   0.08608381
 -0.06378851  0.2943112   0.11136171  0.19842027  1.213685   -0.9718013
  0.36714703 -0.6624622   0.1054258   0.6882721  -0

  if np.issubdtype(vec.dtype, np.int):


모델의 document 벡터 확인

In [None]:
# Show the first review next to its document vector.
print(reviews[0])
# Fixed: the vector lives on doc_model, not on `model`.
print(doc_model.docvecs[0])

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

In [None]:
# Build the sentiment-analysis data set: one document vector per review
# (looked up by positional tag) and the labels as an array.
# Fixed: the vectors were read from `model` instead of doc_model.
X_data_raw = [doc_model.docvecs[i] for i in range(len(sentences))]
y_data = np.array(sentiments)

### 로지스틱 회귀 모델

In [None]:
# Hold out 25% of the samples for testing, with a fixed seed.
X_data = list(X_data_raw)

X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.25, random_state=42,
)
print(f"Train: {len(X_train)}, {y_train.shape}")
print(f"Test: {len(X_test)}, {y_test.shape}")

Train: 18750, (18750,)
Test: 6250, (6250,)


In [None]:
# Fit a logistic-regression classifier on the training vectors.
lr_model = LogisticRegression(solver='newton-cg', class_weight='balanced')
lr_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Measure accuracy on the held-out split and show a few predictions.
predicted = lr_model.predict(X_test)
test_accuracy = lr_model.score(X_test, y_test)
print(f"Sample Predicted: {predicted[:20]}")
print(f"Test Accuracy: {test_accuracy}")

Sample Predicted: [0 1 0 1 0 1 1 1 0 1 0 0 0 1 0 1 0 1 1 1]
Test Accuracy: 0.84528


In [None]:
# Predict the sentiment of a new, unseen sentence.
# Fixed: infer_vector must be called on doc_model, not on `model`.
# Fixed: the token 'repsponse' was a misspelling of 'response'.
new_sentence = doc_model.infer_vector(['system', 'response', 'cpu', 'compute'])
print(new_sentence)
# A single sample, so reshape it to one row before predicting.
new_sentence_pred = lr_model.predict(new_sentence.reshape(1, -1))
print(new_sentence_pred)

[ 4.93865088e-03  1.98567472e-02  3.61140780e-02  3.82797755e-02
 -3.80598125e-03 -3.15198675e-02  2.69053131e-03 -2.52771042e-02
 -1.97380092e-02 -8.14476609e-02 -6.34776652e-02  7.86244491e-05
  3.03491391e-02 -1.17518324e-02  1.59860216e-02 -1.96323320e-02
 -2.27632467e-02 -1.17330188e-02  2.62095705e-02  1.23778824e-02
  4.95460536e-03 -2.13385969e-02  6.64318970e-04  4.00892682e-02
 -2.29727174e-03 -8.89134873e-03  1.28722815e-02 -1.51277091e-02
 -1.44082997e-02  3.46479677e-02 -1.69262812e-02  1.79347582e-03
  1.52955633e-02 -2.22750735e-02 -4.83419280e-03  4.72374894e-02
  6.39771635e-04  2.60303058e-02 -5.02311252e-03 -2.79587507e-02
  3.08272969e-02 -1.88137882e-03 -1.31927747e-02  4.24919128e-02
 -1.57636944e-02 -1.82467997e-02  1.64566077e-02 -3.69514562e-02
 -6.04261756e-02 -3.68457958e-02 -3.09357122e-02 -2.92656869e-02
 -9.16608237e-03  4.28173095e-02  2.62064356e-02  1.51867578e-02
 -2.33184304e-02  1.95562318e-02  6.28911611e-03  8.42724927e-03
  1.46305906e-02 -1.97559

# ============ 과제 ============

## _2_. FFN : 최고 0.84704

In [None]:
# Prepare the data: document vectors as one 2-D array, 75/25 train/test split.
X_data = np.array(X_data_raw)

X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.25, random_state=42,
)

print(f"Train: {X_train.shape}, {y_train.shape}")
print(f"Test: {X_test.shape}, {y_test.shape}")

Train: (18750, 300), (18750,)
Test: (6250, 300), (6250,)


In [None]:
# Build the feed-forward network.
K.clear_session()

X_input = Input(batch_shape=(None, X_train.shape[1]))
# NOTE(review): none of the hidden Dense layers is given an `activation`, so
# the stack presumably has no non-linearity between layers; confirm whether
# e.g. 'relu' was intended. Left unchanged so the reported score stays valid.
X_hidden = Dense(128)(X_input)
X_hidden = Dropout(0.2)(X_hidden)
X_hidden = Dense(256)(X_hidden)
X_hidden = Dropout(0.2)(X_hidden)
X_hidden = Dense(128)(X_hidden)
y_output = Dense(1, activation='sigmoid')(X_hidden)

model = Model(X_input, y_output)
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001))
print("============= 모델 전체 구조 =============")
# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" after the table.
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
dense (Dense)                (None, 128)               38528     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

In [None]:
# Train the model with a batch size and epoch count entered interactively.
BATCH = int(input('배치 사이즈 설정: '))
EPOCHS = int(input('학습 에폭 설정: '))

# Stop once the validation loss has not improved for 4 epochs.
es = EarlyStopping(monitor='val_loss', patience=4, verbose=1)

# Fixed: the EarlyStopping callback was created but never handed to fit(),
# so training always ran for the full EPOCHS.
model.fit(X_train, y_train,
          batch_size=BATCH,
          validation_split=0.2,
          epochs=EPOCHS,
          callbacks=[es])

배치 사이즈 설정: 200
학습 에폭 설정: 300
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch

<tensorflow.python.keras.callbacks.History at 0x7f404a8b47b8>

In [None]:
# Predict on the test split: round the sigmoid outputs to 0/1, flatten them,
# and compare with the labels.
probabilities = model.predict(X_test)
y_pred = np.round(probabilities).reshape(-1)
accuracy = np.mean(y_test == y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.84688


## _3_. LSTM 모델
- embedding matrix 만들고 하면?
- embedding matrix 안 만들고 하면?

### embedding matrix 안 만들고 그냥 해 보기 : 최고 0.84416

In [None]:
# Prepare the data: same split as before, then insert a length-1 time axis so
# each sample has shape (1, features).
X_data = np.array(X_data_raw)

X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.25, random_state=42,
)

X_train = X_train.reshape(len(X_train), 1, -1)
X_test = X_test.reshape(len(X_test), 1, -1)
print(f"Train: {X_train.shape}, {y_train.shape}")
print(f"Test: {X_test.shape}, {y_test.shape}")

Train: (18750, 1, 300), (18750,)
Test: (6250, 1, 300), (6250,)


In [None]:
# Build the bidirectional-LSTM classifier on the (1, features) inputs.
K.clear_session()

X_input = Input(batch_shape=(None, X_train.shape[1], X_train.shape[2]))
X_lstm = Bidirectional(LSTM(128))(X_input)
X_lstm = Dropout(0.2)(X_lstm)
X_hidden = Dense(256)(X_lstm)
X_hidden = Dropout(0.3)(X_hidden)
y_output = Dense(1, activation='sigmoid')(X_hidden)

model = Model(X_input, y_output)
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.002))
print("============= 모델 전체 구조 =============")
# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" after the table.
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1, 300)]          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               439296    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 505,345
Trainable params: 505,345
Non-trainable params: 0
________________________________________________

In [None]:
# Train with interactively chosen batch size / epochs; early stopping watches
# the validation loss with a patience of 5 epochs.
BATCH = int(input('배치 사이즈 설정: '))
EPOCHS = int(input('학습 에폭 설정: '))

es = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH,
    validation_split=0.2,
    callbacks=[es],
)

배치 사이즈 설정: 200
학습 에폭 설정: 1000
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 00007: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f400b4b5d68>

In [None]:
# Predict on the test split: round the sigmoid outputs to 0/1, flatten them,
# and compare with the labels.
probabilities = model.predict(X_test)
y_pred = np.round(probabilities).reshape(-1)
accuracy = np.mean(y_test == y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.83904


### embedding matrix 만들기

In [None]:
def check_len(m, sentences):
    """Report the share of sentences whose length does not exceed ``m``.

    Args:
        m: Maximum length, inclusive.
        sentences: Sized collection of sized items (e.g. token lists).

    Returns:
        A message string containing the percentage of items with
        ``len(item) <= m``.

    Raises:
        ZeroDivisionError: If ``sentences`` is empty.
    """
    within_limit = sum(1 for tokens in sentences if len(tokens) <= m)
    percentage = (within_limit / len(sentences)) * 100
    return f'전체 문장 중 길이가 {m} 이하인 샘플의 비율: {percentage}'

def get_vector(word, pretrained):
    """Return the vector stored for ``word``, or ``None`` when it is unknown.

    Args:
        word: Key to look up.
        pretrained: Object exposing ``wv.vocab`` (a mapping of known words)
            and ``wv[word]`` item access.

    Returns:
        ``pretrained.wv[word]`` if ``word`` is in ``pretrained.wv.vocab``,
        otherwise ``None``.
    """
    known_words = pretrained.wv.vocab.keys()
    return pretrained.wv[word] if word in known_words else None

In [None]:
# Choose the sentence length: print the coverage for candidate cut-offs
# (100 to 950 in steps of 50), then read the chosen value.
coverage_lines = (check_len(limit, sentences) for limit in range(100, 1000, 50))
print(*coverage_lines, sep="\n")

max_length = int(input('문장 길이 설정: '))

전체 문장 중 길이가 100 이하인 샘플의 비율: 57.628
전체 문장 중 길이가 150 이하인 샘플의 비율: 76.264
전체 문장 중 길이가 200 이하인 샘플의 비율: 85.84
전체 문장 중 길이가 250 이하인 샘플의 비율: 91.228
전체 문장 중 길이가 300 이하인 샘플의 비율: 94.552
전체 문장 중 길이가 350 이하인 샘플의 비율: 96.5
전체 문장 중 길이가 400 이하인 샘플의 비율: 97.88
전체 문장 중 길이가 450 이하인 샘플의 비율: 98.784
전체 문장 중 길이가 500 이하인 샘플의 비율: 99.444
전체 문장 중 길이가 550 이하인 샘플의 비율: 99.804
전체 문장 중 길이가 600 이하인 샘플의 비율: 99.92
전체 문장 중 길이가 650 이하인 샘플의 비율: 99.94800000000001
전체 문장 중 길이가 700 이하인 샘플의 비율: 99.964
전체 문장 중 길이가 750 이하인 샘플의 비율: 99.97200000000001
전체 문장 중 길이가 800 이하인 샘플의 비율: 99.98
전체 문장 중 길이가 850 이하인 샘플의 비율: 99.984
전체 문장 중 길이가 900 이하인 샘플의 비율: 99.988
전체 문장 중 길이가 950 이하인 샘플의 비율: 99.996
문장 길이 설정: 300


In [None]:
# Split the tokenized reviews and their labels 75/25 with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    sentences, sentiments, test_size=0.25, random_state=42,
)

print(f"Train: {len(X_train)}, {len(y_train)}")
print(f"Test: {len(X_test)}, {len(y_test)}")

Train: 18750, 18750
Test: 6250, 6250


In [None]:
# Tokenize: turn every token list into a list of integer ids.
# Fixed: the original called texts_to_sequences on a Tokenizer that was never
# fitted and threw the result away, so X_train / X_test stayed lists of
# strings and the following pad_sequences call raised ValueError.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)  # vocabulary comes from the training split only
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
# NOTE(review): the ids follow tokenizer.word_index. Any pretrained embedding
# matrix used with these sequences must use the same word -> row mapping.

In [None]:
# Bring every sequence to exactly max_length entries. Regarding the original
# question "is this truncated?": yes, pad_sequences also cuts sequences longer
# than maxlen (from the front by default, per the Keras documentation).
X_train, X_test = (
    pad_sequences(split, maxlen=max_length) for split in (X_train, X_test)
)
print("========== 패딩 후 ==========")
print(f"훈련 데이터: {X_train.shape}")
print(f"테스트 데이터: {X_test.shape}")

ValueError: ignored

In [None]:
# Show the model description and record its vocabulary size.
print(doc_model)
keys = list(doc_model.wv.vocab)
vocab_size = len(keys)
print(vocab_size)

Doc2Vec(dm/m,d300,n5,w5,mc10,s0.001,t4)
19717


In [None]:
# Doc2Vec weight matrix: row idx holds the word vector of keys[idx].
embedding_vocab = len(keys)
embedding_dim = 300
g_embed_300 = np.zeros((embedding_vocab, embedding_dim))
# Fixed: enumerate() yields (index, item). The original unpacked the pair as
# (word, idx), so get_vector() received an int, always returned None, and the
# matrix stayed all zeros.
for idx, word in enumerate(keys):
    temp = get_vector(word, doc_model)
    if temp is not None:
        g_embed_300[idx] = temp

In [None]:
# Display the matrix shape; expected to be (embedding_vocab, embedding_dim).
g_embed_300.shape

(19717, 300)

In [None]:
# NOTE(review): looks like an unfinished scratch call. `input_` is not defined
# in any visible cell, so this presumably fails with NameError. Confirm whether
# the cell can be dropped.
Embedding(input_)

In [None]:
# Build an LSTM classifier on top of the pretrained Doc2Vec word vectors.
K.clear_session()

# Fixed: the closing parenthesis of Input(...) was missing.
X_input = Input(batch_shape=(None, X_train.shape[1]))
# Fixed: `input_length=` had no value. The trailing `model.add(Embedding(...))`
# snippet referenced undefined names (max_len, g_embed_100) and a functional
# Model has no add(); its intent (frozen pretrained weights) is folded in here.
# NOTE(review): assumes the integer ids in X_train index rows of g_embed_300
# - TODO confirm. mask_zero from the snippet is left out because the matrix is
# indexed from 0 with no row reserved for padding; revisit once the id mapping
# is settled.
X_embed = Embedding(input_dim=vocab_size,
                    output_dim=300,
                    input_length=X_train.shape[1],
                    weights=[g_embed_300],
                    trainable=False)(X_input)
X_lstm = LSTM(128)(X_embed)
y_output = Dense(1, activation='sigmoid')(X_lstm)

model = Model(X_input, y_output)
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001))
print("============= 모델 전체 구조 =============")
# summary() prints the table itself and returns None, so no print() wrapper.
model.summary()

# ============ 테스트 ============

In [None]:
# Preview the TaggedDocument built for each of the first six sentences.
for i, doc in enumerate(sentences[:6]):
    print(TaggedDocument(doc, [i]))

TaggedDocument(['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', '