<a href="https://colab.research.google.com/github/seunghyunmoon2/NLP/blob/master/NLP11_Kaggle_BOWmeetsBagsOfPopcorns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kaggle's Bags of Words Meets Bags of Popcorns

영화리뷰 데이터 분류

## 영화리뷰 데이터 전처리 

In [None]:
# 영화리뷰 데이터 전처리
# 데이터 : Kaggle의 Bags of Words Meets Bags of Popcorns
# ------------------------------------------------------
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the labeled movie-review data used in chapter 4.1.
# The file is tab-separated with a header row; quoting = 3 corresponds to
# csv.QUOTE_NONE, so quote characters are not treated specially when parsing.
train_data = pd.read_csv('dataset/4-1.labeledTrainData.tsv', 
                         header = 0, delimiter = '\t', quoting = 3)
train_data.head()

# Text cleaning
def preprocessing(review, stops, remove_stopwords = False):
    """Clean one raw review and return it as a single space-joined string.

    Args:
        review: raw review text, possibly containing HTML markup.
        stops: container of stop words; only consulted when
            ``remove_stopwords`` is truthy.
        remove_stopwords: when truthy, tokens found in ``stops`` are dropped.

    Returns:
        The lower-cased, letters-only tokens joined with single spaces.
    """
    # 1. Strip HTML tags.
    plain_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replace every character that is not an ASCII letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)

    # 3. Lower-case and split on whitespace.
    tokens = letters_only.lower().split()

    # 4. Optionally drop stop words.
    if remove_stopwords:
        tokens = (token for token in tokens if token not in stops)

    # 5. Join the remaining tokens back into one string.
    return ' '.join(tokens)

# Clean every review, dropping English stop words.
stops = set(stopwords.words("english"))
clean_train_reviews = [
    preprocessing(raw_review, stops, remove_stopwords = True)
    for raw_review in train_data['review']
]

# Peek at the first cleaned review.
clean_train_reviews[0]

# Assemble the cleaned reviews and their labels into the training DataFrame.
clean_train_columns = {
    'review': clean_train_reviews,
    'sentiment': train_data['sentiment'],
}
clean_train_df = pd.DataFrame(clean_train_columns)

# Represent each cleaned review as a sequence of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_reviews)
text_sequences = tokenizer.texts_to_sequences(clean_train_reviews)
print(text_sequences[0])
len(text_sequences[0])

# Bring every review to the same length.
# The target length is 174 (the median length, according to the original note).
# Sequences shorter than 174 are padded at the end and longer ones lose their
# tail, because both padding and truncating are set to 'post'.
MAX_SEQUENCE_LENGTH = 174 

train_inputs = pad_sequences(text_sequences, 
                             maxlen=MAX_SEQUENCE_LENGTH,
                             padding='post',
                             truncating='post')
print('Shape of train data: ', train_inputs.shape)

print(train_inputs[0])
print("길이 = ", len(train_inputs[0]))

# Fetch the label (1 or 0) of every review as a NumPy array.
train_labels = np.array(train_data['sentiment'])
print('Shape of label tensor:', train_labels.shape)
train_labels[0]

# File names of the preprocessed training artifacts, all written under
# DATA_IN_PATH. (An earlier, immediately overwritten set of names was removed.)
TRAIN_INPUT_DATA = '4-1.train_input.npy'
TRAIN_LABEL_DATA = '4-1.train_label.npy'
TRAIN_CLEAN_DATA = '4-1.train_clean.csv'
DATA_IN_PATH = './dataset/'

# Save the padded index sequences and the labels in NumPy format.
# The path is handed to np.save directly so NumPy opens and closes the file
# itself; passing an open() handle that is never closed leaked the handle.
# The names already end in '.npy', so np.save does not alter them.
np.save(DATA_IN_PATH + TRAIN_INPUT_DATA, train_inputs)
np.save(DATA_IN_PATH + TRAIN_LABEL_DATA, train_labels)

# Save the cleaned review text together with the labels as CSV.
clean_train_df.to_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA, index = False)

- output
```
[404, 70, 419, 8815, 506, 2456, 115, 54, 873, 516, 178, 18686, 178, 11242, 165, 78, 14, 662, 2457, 117, 92, 10, 499, 4074, 165, 22, 210, 581, 2333, 1194, 11242, 71, 4826, 71, 635, 2, 253, 70, 11, 302, 1663, 486, 1144, 3265, 8815, 411, 793, 3342, 17, 441, 600, 1500, 15, 4424, 1851, 998, 146, 342, 1442, 743, 2424, 4, 8815, 418, 70, 637, 69, 237, 94, 541, 8815, 26055, 26056, 120, 1, 8815, 323, 8, 47, 20, 323, 167, 10, 207, 633, 635, 2, 116, 291, 382, 121, 15535, 3315, 1501, 574, 734, 10013, 923, 11578, 822, 1239, 1408, 360, 8815, 221, 15, 576, 8815, 22224, 2274, 13426, 734, 10013, 27, 28606, 340, 16, 41, 18687, 1500, 388, 11243, 165, 3962, 8815, 115, 627, 499, 79, 4, 8815, 1430, 380, 2163, 114, 1919, 2503, 574, 17, 60, 100, 4875, 5100, 260, 1268, 26057, 15, 574, 493, 744, 637, 631, 3, 394, 164, 446, 114, 615, 3266, 1160, 684, 48, 1175, 224, 1, 16, 4, 8815, 3, 507, 62, 25, 16, 640, 133, 231, 95, 7426, 600, 3439, 8815, 37248, 1864, 1, 128, 342, 1442, 247, 3, 865, 16, 42, 1487, 997, 2333, 12, 549, 386, 717, 6920, 12, 41, 16, 158, 362, 4392, 3388, 41, 87, 225, 438, 207, 254, 117, 3, 18688, 18689, 316, 1356]
Shape of train data:  (25000, 174)
[  404    70   419  8815   506  2456   115    54   873   516   178 18686
   178 11242   165    78    14   662  2457   117    92    10   499  4074
   165    22   210   581  2333  1194 11242    71  4826    71   635     2
   253    70    11   302  1663   486  1144  3265  8815   411   793  3342
    17   441   600  1500    15  4424  1851   998   146   342  1442   743
  2424     4  8815   418    70   637    69   237    94   541  8815 26055
 26056   120     1  8815   323     8    47    20   323   167    10   207
   633   635     2   116   291   382   121 15535  3315  1501   574   734
 10013   923 11578   822  1239  1408   360  8815   221    15   576  8815
 22224  2274 13426   734 10013    27 28606   340    16    41 18687  1500
   388 11243   165  3962  8815   115   627   499    79     4  8815  1430
   380  2163   114  1919  2503   574    17    60   100  4875  5100   260
  1268 26057    15   574   493   744   637   631     3   394   164   446
   114   615  3266  1160   684    48  1175   224     1    16     4  8815
     3   507    62    25    16   640]
길이 =  174
Shape of label tensor: (25000,)
```

## Tfidf와 Logistic Regression을 이용

In [None]:
# Tfidf와 Logistic Regression을 이용한 영화리뷰 데이터 분류
# 데이터 : Kaggle의 Bags of Words Meets Bags of Popcorns
# --------------------------------------------------------
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

TRAIN_CLEAN_DATA = '4-1.train_clean.csv'
DATA_IN_PATH = './dataset/'

# Load the preprocessed training data.
train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

# Convert the review documents to TF-IDF features.
# Excerpts from the scikit-learn TfidfVectorizer documentation:
#
# min_df : float in range [0.0, 1.0] or int (default=1)
# When building the vocabulary ignore terms that have a document frequency
# strictly lower than the given threshold. This value is also called cut-off
# in the literature. If float, the parameter represents a proportion of
# documents, integer absolute counts. This parameter is ignored if vocabulary
# is not None.
#
# analyzer : string, {'word', 'char', 'char_wb'} or callable
# Whether the feature should be made of word or character n-grams.
# Option 'char_wb' creates character n-grams only from text inside word
# boundaries; n-grams at the edges of words are padded with space.
#
# ngram_range : tuple (min_n, max_n) (default=(1, 1))
# The lower and upper boundary of the range of n-values for different n-grams
# to be extracted.
# All values of n such that min_n <= n <= max_n will be used.
# ngram_range(1, 2) means unigrams and bigrams, (2, 2) means only bigrams.
#
# NOTE(review): the vectorizer is fitted on all reviews before the
# train/eval split below, so the evaluation documents contribute to the
# vocabulary and IDF weights -- confirm this is acceptable for the comparison.
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="word", sublinear_tf=True, 
                             ngram_range=(1,2), max_features=1000) 

X = vectorizer.fit_transform(reviews)
y = np.array(sentiments)

# Split into training and evaluation sets (80% / 20%, fixed seed).
RANDOM_SEED = 42
TEST_SPLIT = 0.2
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT, 
                                                    random_state=RANDOM_SEED)

# Fit a logistic-regression classifier on the training split.
lgs = LogisticRegression(class_weight='balanced', solver='lbfgs') 
lgs.fit(X_train, y_train)

# Evaluate on the held-out split: show the first 20 predictions and the accuracy.
predicted = lgs.predict(X_eval)
print(predicted[:20])
print("Accuracy: %f" % lgs.score(X_eval, y_eval))

- output
```
[0 1 0 1 0 1 1 1 0 1 0 0 0 1 1 1 0 1 1 1]
Accuracy: 0.862800
```

## Word2Vec과 Logistic Regression을 이용

In [None]:
# Word2Vec과 Logistic Regression을 이용한 영화리뷰 데이터 분류
# 데이터 : Kaggle의 Bags of Words Meets Bags of Popcorns
# pip install gensim
# ----------------------------------------------------------
import pandas as pd
import numpy as np

from gensim.models import word2vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

TRAIN_CLEAN_DATA = '4-1.train_clean.csv'
DATA_IN_PATH = './dataset/'

# Load the preprocessed training data.
train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

# Whitespace-tokenize every review: one list of tokens per review.
sentences = []
for review in reviews:
    sentences.append(review.split())

model_name = '4-1.300features.word2vec'
num_features = 300      # length of each word vector
min_word_count = 40     # minimum word frequency (passed as min_count)
num_workers = 4         # number of workers (passed as workers)
context = 10            # context window size
downsampling = 1e-3     # down-sampling rate (passed as sample)
# When True the model is loaded from DATA_IN_PATH + model_name, so that file
# must already exist; set to False to train and save it instead.
model_saved = True

if model_saved:
    model = word2vec.Word2Vec.load(DATA_IN_PATH + model_name)
else:
    # Train word vectors with gensim (Word2Vec) and save the model.
    # NOTE(review): `size=` here and `wv.vocab` below look like the gensim 3.x
    # API -- confirm the installed gensim version before running.
    model = word2vec.Word2Vec(sentences,
                              workers = num_workers,
                              size = num_features,
                              min_count = min_word_count,
                              window = context,
                              sample = downsampling)
    model.save(DATA_IN_PATH + model_name)

# Inspect the model: print the first 20 vocabulary entries.
keys = list(model.wv.vocab.keys())[:20]
print(keys)

# Look at the vector of the word 'stuff' (length 300).
model.wv['stuff']

# Try out word similarities. These bare expressions are only displayed when
# they are the last statement of a notebook cell.
model.wv.similarity("dog", "cat")
model.wv.similarity("dog", "cake")

np.dot(model.wv['dog'], model.wv['cat'])
np.dot(model.wv['dog'], model.wv['cake'])

model.wv.most_similar("dog")
# Turn one tokenized sentence into a single num_features-long vector.
# For the sentence ['dog', 'eat'] the result is the element-wise mean of
# model.wv['dog'] and model.wv['eat'].
def get_features(words, model, num_features):
    """Return the mean word vector of the in-vocabulary tokens of ``words``.

    Args:
        words: iterable of tokens making up one sentence/review.
        model: object exposing ``wv.index2word`` (the vocabulary) and
            ``wv[token]`` (vector lookup).
        num_features: length of each word vector.

    Returns:
        Array of shape ``(num_features,)`` holding the mean of the vectors of
        all tokens found in the vocabulary. When no token is in the
        vocabulary (including empty ``words``) the zero vector is returned
        instead of dividing by zero, which used to yield NaNs.
    """
    feature_vector = np.zeros((num_features),dtype=np.float32)

    # Set membership is O(1); the vocabulary list would be scanned per token.
    index2word_set = set(model.wv.index2word)

    num_words = 0
    for w in words:
        if w in index2word_set:
            num_words += 1
            feature_vector = np.add(feature_vector, model.wv[w])

    # Nothing to average: keep the zero vector rather than produce NaN.
    if num_words == 0:
        return feature_vector

    return np.divide(feature_vector, num_words)

# Vectorize every sentence in `reviews` into one num_features-long vector.
def get_dataset(reviews, model, num_features):
    """Stack the per-sentence feature vectors into one 2-D array.

    Args:
        reviews: iterable of sentences; each is passed to ``get_features``
            as its ``words`` argument.
        model: forwarded unchanged to ``get_features``.
        num_features: forwarded unchanged to ``get_features``.

    Returns:
        The result of ``np.stack`` over the per-sentence vectors, i.e. one
        row per element of ``reviews``.
    """
    per_sentence_vectors = [
        get_features(tokens, model, num_features) for tokens in reviews
    ]
    return np.stack(per_sentence_vectors)

# One averaged word vector per tokenized review. Despite the name, this holds
# the features of the whole labeled set, which is split below.
test_data_vecs = get_dataset(sentences, model, num_features)

# Split into training and evaluation sets (80% / 20%, fixed seed).
RANDOM_SEED = 42
TEST_SPLIT = 0.2
X = test_data_vecs
y = np.array(sentiments)

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT, 
                                                    random_state=RANDOM_SEED)

# Fit a logistic-regression classifier on the training split.
lgs = LogisticRegression(class_weight='balanced', solver='newton-cg') 
lgs.fit(X_train, y_train)

# Evaluate on the held-out split: show the first 20 predictions and the accuracy.
predicted = lgs.predict(X_eval)
print(predicted[:20])
print("Accuracy: %f" % lgs.score(X_eval, y_eval))

- output

```
['stuff', 'going', 'moment', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties']

C:\Users\student\.conda\envs\Python_practice_M\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)

C:\Users\student\.conda\envs\Python_practice_M\lib\site-packages\sklearn\utils\optimize.py:204: UserWarning: Line Search failed
  warnings.warn('Line Search failed')
[0 1 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 1 1 0]
Accuracy: 0.864000
```

## Doc2Vec과 Logistic Regression을 이용

In [None]:
# Doc2Vec과 Logistic Regression을 이용한 영화리뷰 데이터 분류
# 데이터 : Kaggle의 Bags of Words Meets Bags of Popcorns
# pip install gensim
# ----------------------------------------------------------
import pandas as pd
import numpy as np

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

TRAIN_CLEAN_DATA = '4-1.train_clean.csv'
DATA_IN_PATH = './dataset/'

# Load the preprocessed training data.
train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

# Whitespace-tokenize every review: one list of tokens per review.
sentences = []
for review in reviews:
    sentences.append(review.split())

model_name = '4-1.300features.doc2vec'
# When True the model is loaded from DATA_IN_PATH + model_name, so that file
# must already exist; set to False to train and save it instead.
model_saved = True

if model_saved:
    model = Doc2Vec.load(DATA_IN_PATH + model_name)
else:
    # Train document vectors with gensim (Doc2Vec) and save the model.
    # Each review is tagged with its integer position in `sentences`.
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
    model = Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.00025, 
                    min_count=10, workers=4, dm =1)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=10)
    model.save(DATA_IN_PATH + model_name)

# Inspect the model: print the first 20 vocabulary entries.
keys = list(model.wv.vocab.keys())[:20]
print(keys)

# Look at the vector of the word 'stuff' (length 300).
model.wv['stuff']

# Try out word similarities. These bare expressions are only displayed when
# they are the last statement of a notebook cell.
model.wv.similarity("dog", "cat")
model.wv.similarity("dog", "cake")

np.dot(model.wv['dog'], model.wv['cat'])
np.dot(model.wv['dog'], model.wv['cake'])

model.wv.most_similar("dog")

# Vector of the first review (300 values).
model.docvecs[0]

# Infer a vector for a new, unseen sentence.
new_sentence = model.infer_vector(["system", "response", "cpu", "compute"])

# Split into training and evaluation sets (80% / 20%, fixed seed).
# X is a list with one document vector per review, in review order.
RANDOM_SEED = 42
TEST_SPLIT = 0.2
X = [model.docvecs[i] for i in range(len(sentences))]
y = np.array(sentiments)

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT, 
                                                    random_state=RANDOM_SEED)

# Fit a logistic-regression classifier on the training split.
lgs = LogisticRegression(class_weight='balanced', solver='newton-cg') 
lgs.fit(X_train, y_train)

# Evaluate on the held-out split: show the first 20 predictions and the accuracy.
predicted = lgs.predict(X_eval)
print(predicted[:20])
print("Accuracy: %f" % lgs.score(X_eval, y_eval))

- output

```
['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought']
[0 1 0 1 0 1 1 1 0 1 0 0 0 1 0 1 0 1 1 1]
Accuracy: 0.845200
```

## Word2Vec - Doc2Vec 의 결과 비교



In [None]:
# NOTE(review): this cell does not define `model`; it inspects whichever model
# the previously executed cell left in scope (Word2Vec or Doc2Vec).

# Look at the vector of the word 'stuff' (length 300).
model.wv['stuff']

# Try out word similarities.
model.wv.similarity("dog", "cat")
model.wv.similarity("dog", "cake")

#np.dot(model.wv['dog'], model.wv['cat'])
#np.dot(model.wv['dog'], model.wv['cake'])

model.wv.most_similar("dog")

- Word2vec

```
0.7890252

0.29079747

[('rat', 0.8182035684585571),
 ('chicken', 0.8072102665901184),
 ('dude', 0.7966217994689941),
 ('eat', 0.7944402694702148),
 ('eats', 0.7918289303779602),
 ('cat', 0.7890251874923706),
 ('underwear', 0.7868368625640869),
 ('drink', 0.7856504917144775),
 ('butt', 0.7746052742004395),
 ('drinking', 0.7740159034729004)]
```
---
- Doc2Vec

```
0.5822584

0.05612491

[('chicken', 0.592168390750885),
 ('cat', 0.5822584629058838),
 ('puppy', 0.5540207624435425),
 ('cats', 0.5450679063796997),
 ('eat', 0.5350972414016724),
 ('dogs', 0.5316440463066101),
 ('bunny', 0.5241804122924805),
 ('worm', 0.5236088633537292),
 ('bite', 0.5121653079986572),
 ('bike', 0.5107408761978149)]
 ```

# gensim.models.doc2vec 들여다보기

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Six short Korean sentences used as a toy corpus.
samples = [
    '너 오늘 이뻐 보인다.',
    '나는 오늘 기분이 더러워',
    '끝내주는데, 좋은 일이 있나봐',
    '나 좋은 일이 생겼어',
    '아 오늘 진짜 짜증나',
    '환상적인데, 정말 좋은 것 같아',
]

# Whitespace-tokenize every sample.
sentences = [sample.split() for sample in samples]

# Assign a paragraph ID ('d0', 'd1', ...) to each sentence.
documents = [
    TaggedDocument(tokens, [f'd{i}'])
    for i, tokens in enumerate(sentences)
]

# Create a PV-DM model (dm=1); with dm=0 it would train PV-DBOW instead.
model = Doc2Vec(
    vector_size=5,
    alpha=0.025,
    min_alpha=0.0025,
    min_count=1,
    dm=1,
)

# Train the PV-DM model.
model.build_vocab(documents)
model.train(documents, total_examples=len(samples), epochs=100)

# Look at a word vector.
model.wv['오늘']
# e.g. array([-0.03693939, -0.04885605, -0.05943887,  0.05963083,  0.0555545 ], dtype=float32)

# Look at the paragraph vector of the first sample sentence.
model.docvecs[0]
# e.g. array([-0.05443092, -0.04604649, -0.05194417, -0.05120996,  0.02557259], dtype=float32)

# Look at the paragraph vectors of all sentences.
# With word2vec a sentence is a 2-D matrix (one vector per word) that had to
# be averaged to get a sentence vector; doc2vec yields one vector per
# sentence directly.
model.docvecs.vectors_docs
# e.g.
#array([[-0.05443092, -0.04604649, -0.05194417, -0.05120996,  0.02557259],
#       [ 0.08317213,  0.0322421 , -0.08741622,  0.08744427,  0.0764193 ],
#       [ 0.02685709, -0.03138647,  0.00841046, -0.07948915, -0.07992024],
#       [ 0.02179648,  0.03237828, -0.07769433, -0.04575829, -0.05825642],
#       [ 0.05020343, -0.07546338, -0.10376928, -0.01349617,  0.00015175],
#       [-0.04932483, -0.08273362,  0.00135178,  0.0470594 ,  0.00538549]],
#      dtype=float32)

# Infer a vector for a new sentence.
model.infer_vector('오늘 좋은 일 있을 것 같아'.split())
# e.g. array([ 0.09398302,  0.06397881, -0.08147865, -0.0935128 , -0.03737117], dtype=float32)

### Doc2Vec과 DL을 이용-2 

LR 대신 딥러닝(deep learning)을 이용한다. 분류기로 FFN(feed-forward network)을 쓴다.

In [None]:
import pandas as pd
import numpy as np

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Keras names used by the network below; they were missing before, so the cell
# failed with NameError unless an earlier session had imported them.
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

TRAIN_CLEAN_DATA = '4-1.train_clean.csv'
DATA_IN_PATH = './dataset/'

# Load the preprocessed training data.
train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

# Whitespace-tokenize every review: one list of tokens per review.
sentences = [review.split() for review in reviews]

model_name = '4-1.300features.doc2vec'
# When True the model is loaded from DATA_IN_PATH + model_name, so that file
# must already exist; set to False to train and save it instead.
model_saved = True

if model_saved:
    model = Doc2Vec.load(DATA_IN_PATH + model_name)
else:
    # Train document vectors with gensim (Doc2Vec) and save the model.
    # Each review is tagged with its integer position in `sentences`.
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
    model = Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.00025,
                    min_count=10, workers=4, dm=1)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=10)
    model.save(DATA_IN_PATH + model_name)

# Inspect the model: print the first 20 vocabulary entries.
keys = list(model.wv.vocab.keys())[:20]
print(keys)

# Look at the vector of the word 'stuff' (length 300).
model.wv['stuff']

# Try out word similarities.
model.wv.similarity("dog", "cat")
model.wv.similarity("dog", "cake")

np.dot(model.wv['dog'], model.wv['cat'])
np.dot(model.wv['dog'], model.wv['cake'])

model.wv.most_similar("dog")

# Vector of the first review (300 values).
model.docvecs[0]

# Infer a vector for a new, unseen sentence.
new_sentence = model.infer_vector(["system", "response", "cpu", "compute"])

# Split into training and evaluation sets (80% / 20%, fixed seed).
# X has one row per review holding that review's document vector.
RANDOM_SEED = 42
TEST_SPLIT = 0.2
X = np.array([model.docvecs[i] for i in range(len(sentences))])
y = np.array(sentiments)

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT,
                                                    random_state=RANDOM_SEED)

# Feed-forward classifier on top of the document vectors:
# input (one document vector) -> Dense(400) -> Dense(50) -> sigmoid output.
input_layer = Input(batch_shape=(None, X.shape[1]))
hidden_layer = Dense(400, activation='relu', kernel_regularizer=regularizers.l2(0.01))(input_layer)
hidden_layer2 = Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.05))(hidden_layer)
output_layer = Dense(1, activation='sigmoid')(hidden_layer2)

# NOTE: from here on `model` is the Keras network, not the Doc2Vec model; the
# document vectors were already copied into X above.
model = Model(input_layer, output_layer)
# `learning_rate` replaces the deprecated `lr` argument name.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0005))
model.summary()

# Train the network.
model.fit(X_train, y_train, epochs=100, batch_size=50)

# Predict on the evaluation split and measure accuracy against y_eval.
# model.predict returns shape (N, 1) while y_eval has shape (N,). Comparing
# them directly broadcasts to an (N, N) matrix, which is why the author's
# earlier run printed 50.008056 (versus 0.845200 for logistic regression):
# that figure came from the shape mismatch, not from the model. Flatten the
# predictions so the comparison is element-wise.
yHat = model.predict(X_eval)
testYhat = np.where(yHat > 0.5, 1, 0).reshape(-1)
accuracy = 100 * (y_eval == testYhat).mean()
print(accuracy)