<a href="https://colab.research.google.com/github/seunghyunmoon2/NLP/blob/master/NLP10_Similarity_CoOccurence_HashingVectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OneHot encoding

In [None]:
# One-hot encoding practice
# -------------------------
import numpy as np

data = ['남자', '여자', '아빠', '엄마', '삼촌', '이모']
values = np.array(data)
print(values)
print(sorted(values))

# One-hot encoding with sklearn's preprocessing module
import sklearn.preprocessing as sk

label_encoder = sk.LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)

# Integer encoding: one integer id per label
print(integer_encoded)

# Binary (one-hot) encoding.
# Reshape the ids into a single column (n_samples, 1) before encoding.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so neither spelling works on every version. Keep the default
# (sparse) output and densify it explicitly instead.
onehot_encoder = sk.OneHotEncoder(categories='auto')
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)

# One-hot encoding with Keras
from tensorflow.keras.utils import to_categorical
encoded = to_categorical(integer_encoded)
print(encoded)

['남자' '여자' '아빠' '엄마' '삼촌' '이모']
['남자', '삼촌', '아빠', '엄마', '여자', '이모']
[0 4 2 3 1 5]
[[1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]]
[[1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]]


# word embedding using Hashing Trick

In [None]:
# Word embedding using the hashing trick
# --------------------------------------
import tensorflow as tf
from tensorflow.keras.preprocessing.text import hashing_trick
from tensorflow.keras.layers import Input, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np

samples = ['너 오늘 이뻐 보인다',
           '나는 오늘 기분이 더러워',
           '끝내주는데, 좋은 일이 있나봐',
           '나 좋은 일이 생겼어',
           '아 오늘 진짜 짜증나',
           '환상적인데, 정말 좋은거 같아']
labels = [[1], [0], [1], [1], [0], [1]]

# Turn each document into a list of integers via a hash table.
VOCAB_SIZE = 10 # limit the vocabulary (hash table) size to 10
sequences = [hashing_trick(s, VOCAB_SIZE) for s in samples]
# Every sample above has four whitespace-separated words, so the nested list
# forms a rectangular array.
sequences = np.array(sequences)
labels = np.array(labels)
print(sequences)

# Output width of the Embedding layer, i.e. the number of latent features per word
EMB_SIZE = 8

# Build the deep-learning model.
xInput = Input(batch_shape=(None, sequences.shape[1]))
embed_input = Embedding(input_dim=VOCAB_SIZE + 1, output_dim=EMB_SIZE)(xInput)
# NOTE(review): axis=-1 averages over the EMB_SIZE features of each word, leaving
# one scalar per word. Averaging over axis=1 would instead give one EMB_SIZE-wide
# vector per sentence -- confirm which was intended.
embed_input1 = tf.reduce_mean(embed_input, axis=-1)

hidden_layer = Dense(128, activation=tf.nn.relu)(embed_input1)
output = Dense(1, activation='sigmoid')(hidden_layer)
model = Model(xInput, output)
# `lr` is a deprecated alias that newer Keras releases reject; `learning_rate`
# is the supported keyword.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01))

# Train
model.fit(sequences, labels, epochs=100)

# Predict on the training samples and round the sigmoid outputs to 0/1
pred = model.predict(sequences)
print(np.round(pred, 0))


[[8 7 7 3]
 [9 7 5 9]
 [4 1 7 5]
 [5 1 7 9]
 [6 7 7 5]
 [3 6 7 2]]
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 

# Co-Occurrence

In [None]:
# Co-occurrence encoding practice
# -------------------------------
from sklearn.feature_extraction.text import CountVectorizer

docs = ['성진과 창욱은 야구장에 갔다',
        '성진과 태균은 도서관에 갔다',
        '성진과 창욱은 공부를 좋아한다']

count_model = CountVectorizer(ngram_range=(1,1))
x = count_model.fit_transform(docs)

# Show the vocabulary (word -> column index) built from the documents.
print(count_model.vocabulary_)

# Show the document-term count matrix returned by fit_transform in its sparse form:
# (row, col) value
print(x)

# Show it as a dense matrix (documents x words), then its transpose.
print(x.toarray())
print()
print(x.T.toarray())

# Meaning of x.T
#          1 2 3  - sentence
#갔다    [[1 1 0] - the word '갔다' is used in sentence 1 and sentence 2.
#공부를   [0 0 1] - '공부를' is used only in sentence 3.
#도서관에 [0 1 0]
#성진과   [1 1 1]
#야구장에 [1 0 0]
#좋아한다 [0 0 1]
#창욱은   [1 0 1]
#태균은   [0 1 0]]

xc = x.T * x # this is co-occurrence matrix in sparse csr format
xc.setdiag(0) # sometimes you want to fill same word cooccurence to 0
print(xc.toarray())

#              0       1       2        3        4         5        6       7
#             갔다  공부를  도서관에  성진과  야구장에  좋아한다  창욱은  태균은
#0 갔다        0       0       1        2        1         0        1       1
#1 공부를      0       0       0        1        0         1        1       0
#2 도서관에    1       0       0        1        0         0        0       1
#3 성진과      2       1       1        0        1         1        2       1
#4 야구장에    1       0       0        1        0         0        1       0
#5 좋아한다    0       1       0        1        0         0        1       0
#6 창욱은      1       1       0        2        1         1        0       0
#7 태균은      1       0       1        1        0         0        0       0

# For the ngram_range (min_n = 1, max_n = 2) case, enable the lines below.
#count_model = CountVectorizer(ngram_range=(1,2))
#x = count_model.fit_transform(docs)

# Show the vocabulary built from the documents.
#print(count_model.vocabulary_)

# With the n-gram lines above commented out, `x` is unchanged, so this repeats
# the unigram co-occurrence computation.
xc = x.T * x # this is co-occurrence matrix in sparse csr format
xc.setdiag(0) # sometimes you want to fill same word cooccurence to 0
print(xc.toarray())

# Decompose the co-occurrence matrix with SVD.
# C = U.S.VT
# SVD example using numpy
import numpy as np
C = xc.toarray()
U, S, VT = np.linalg.svd(C, full_matrices = True)
print(np.round(U, 2), '\n')
print(np.round(S, 2), '\n')
print(np.round(VT, 2), '\n')

# Turn the 1-D array of singular values S into a square diagonal matrix.
s = np.diag(S)
print(np.round(s, 2))

# Compute A = U.s.VT and check that A matches C.
A = np.dot(U, np.dot(s, VT))
print(np.round(A, 1))
print(C)

# SVD example using sklearn
from sklearn.decomposition import TruncatedSVD

# Reduce the dimensionality of C, keeping the 4 largest singular values (S)
# as the principal components.
svd = TruncatedSVD(n_components=4, n_iter=7)
D = svd.fit_transform(xc.toarray())

# D is treated as U * S (see the message printed below), so dividing by the
# singular values yields U.
U = D / svd.singular_values_
S = np.diag(svd.singular_values_)
VT = svd.components_

print("\nU, S, VT :")
print(np.round(U, 2), '\n')
print(np.round(S, 2), '\n')
print(np.round(VT, 2), '\n')

print("C를 4개 차원으로 축소 : truncated (U * S)")
print(np.round(D, 2))

# Multiplying U * S * VT gives back a matrix with the original dimensions of C.
# U * S is the reduced-dimension representation, and V maps the reduced
# dimensions back to the original ones (mapping back).

{'성진과': 3, '창욱은': 6, '야구장에': 4, '갔다': 0, '태균은': 7, '도서관에': 2, '공부를': 1, '좋아한다': 5}
  (0, 3)	1
  (0, 6)	1
  (0, 4)	1
  (0, 0)	1
  (1, 3)	1
  (1, 0)	1
  (1, 7)	1
  (1, 2)	1
  (2, 3)	1
  (2, 6)	1
  (2, 1)	1
  (2, 5)	1
[[1 0 0 1 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 0 1 0 1 1 0]]

[[1 1 0]
 [0 0 1]
 [0 1 0]
 [1 1 1]
 [1 0 0]
 [0 0 1]
 [1 0 1]
 [0 1 0]]
[[0 0 1 2 1 0 1 1]
 [0 0 0 1 0 1 1 0]
 [1 0 0 1 0 0 0 1]
 [2 1 1 0 1 1 2 1]
 [1 0 0 1 0 0 1 0]
 [0 1 0 1 0 0 1 0]
 [1 1 0 2 1 1 0 0]
 [1 0 1 1 0 0 0 0]]
[[0 0 1 2 1 0 1 1]
 [0 0 0 1 0 1 1 0]
 [1 0 0 1 0 0 0 1]
 [2 1 1 0 1 1 2 1]
 [1 0 0 1 0 0 1 0]
 [0 1 0 1 0 0 1 0]
 [1 1 0 2 1 1 0 0]
 [1 0 1 1 0 0 0 0]]
[[-0.44 -0.39  0.41 -0.58  0.35  0.    0.   -0.19]
 [-0.24 -0.12  0.41  0.29 -0.24  0.62 -0.34  0.35]
 [-0.24 -0.12 -0.41 -0.29 -0.24 -0.34 -0.62  0.35]
 [-0.56  0.8  -0.    0.    0.19 -0.    0.    0.02]
 [-0.27 -0.01 -0.   -0.   -0.7   0.   -0.   -0.66]
 [-0.24 -0.12  0.41  0.29 -0.24 -0.62  0.34  0.35]
 [-0.44 -0.39 -0.41  0.58  0.35 -0.    0.

# Similarity using TF-IDF

In [None]:
# Similarity measurement example using TF-IDF
# Measures: Jaccard, cosine, Euclidean, Manhattan
# ------------------------------------------------
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

sent = ("휴일 인 오늘 도 서쪽 을 중심 으로 폭염 이 이어졌는데요, 내일 은 반가운 비 소식 이 있습니다.",
        "폭염 을 피해서 휴일 에 놀러왔다가 갑작스런 비 로 인해 망연자실 하고 있습니다.")

# One TF-IDF row per sentence, converted to a dense array.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sent).toarray()
print(np.round(tfidf_matrix, 3))

# 1. Jaccard similarity
# ---------------------
# Tokens come from a plain whitespace split, so punctuation stays attached
# to its word (e.g. '있습니다.').
sent_1 = set(sent[0].split())
sent_2 = set(sent[1].split())
print(sent_1)
print(sent_2)

# Build the union and the intersection of the two token sets.
hap_set = sent_1 | sent_2
gyo_set = sent_1 & sent_2
print(hap_set, '\n')
print(gyo_set, '\n')

# Jaccard = |intersection| / |union|
jaccard = len(gyo_set) / len(hap_set)
print(jaccard)

# 2. Cosine similarity
# --------------------
from sklearn.metrics.pairwise import cosine_similarity
# Slices [0:1] and [1:2] keep each row 2-D, as the pairwise functions require.
d = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
print(d)

# 3. Euclidean distance
# ---------------------
from sklearn.metrics.pairwise import euclidean_distances

# Distance between the raw TF-IDF rows. It is printed explicitly because a bare
# expression in the middle of a cell is evaluated and then silently discarded.
d = euclidean_distances(tfidf_matrix[0:1], tfidf_matrix[1:2])
print(d)

# 정규화
def l1_normalize(v):
    """Scale *v* so that every vector in it has unit L1 norm.

    The norm is taken along the last axis, so a 2-D matrix is normalised
    row by row (each document independently) and a 1-D vector is normalised
    as a whole. Dividing by the sum of the entire matrix, as a plain
    ``v / np.sum(v)`` does, would make all rows together sum to 1 and let
    one row's magnitude affect another row's scale.

    Args:
        v: array-like of numbers with at least one dimension.

    Returns:
        A float ndarray of the same shape as *v*. All-zero vectors are
        returned unchanged rather than turned into NaNs.
    """
    v = np.asarray(v, dtype=float)
    # The L1 norm is the sum of absolute values, so negative entries count too.
    norm = np.sum(np.abs(v), axis=-1, keepdims=True)
    # Avoid 0/0 for all-zero vectors: dividing zeros by 1 leaves them as zeros.
    norm = np.where(norm == 0, 1.0, norm)
    return v / norm

# Euclidean distance between the two sentence rows after passing the
# TF-IDF matrix through l1_normalize.
tfidf_norm_l1 = l1_normalize(tfidf_matrix)
d = euclidean_distances(tfidf_norm_l1[0:1], tfidf_norm_l1[1:2])
print(d)

# 4. Manhattan distance
# ---------------------
from sklearn.metrics.pairwise import manhattan_distances

# Computed on the same normalised rows as the Euclidean distance above.
d = manhattan_distances(tfidf_norm_l1[0:1], tfidf_norm_l1[1:2])
print(d)

[[0.    0.324 0.    0.    0.324 0.324 0.324 0.324 0.324 0.324 0.    0.231
  0.324 0.231 0.    0.    0.231]
 [0.365 0.    0.365 0.365 0.    0.    0.    0.    0.    0.    0.365 0.259
  0.    0.259 0.365 0.365 0.259]]
{'중심', '휴일', '오늘', '이', '은', '비', '도', '반가운', '인', '으로', '소식', '폭염', '서쪽', '을', '이어졌는데요,', '있습니다.', '내일'}
{'휴일', '피해서', '갑작스런', '인해', '비', '망연자실', '하고', '로', '폭염', '에', '을', '놀러왔다가', '있습니다.'}
{'오늘', '은', '갑작스런', '인해', '비', '망연자실', '하고', '로', '서쪽', '을', '있습니다.', '놀러왔다가', '중심', '휴일', '이', '피해서', '도', '반가운', '인', '으로', '소식', '폭염', '이어졌는데요,', '에', '내일'} 

{'휴일', '비', '폭염', '을', '있습니다.'} 

0.2
[[0.17952266]]
[[0.20491229]]
[[0.77865927]]


# Similarity using HashingVectorizer

In [None]:
# Similarity measurement example using HashingVectorizer
# Measures: Jaccard, cosine, Euclidean, Manhattan
# ------------------------------------------------
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer

sent = ("휴일 인 오늘 도 서쪽 을 중심 으로 폭염 이 이어졌는데요, 내일 은 반가운 비 소식 이 있습니다.",
        "폭염 을 피해서 휴일 에 놀러왔다가 갑작스런 비 로 인해 망연자실 하고 있습니다.")

# Count-based features (norm=None: no normalisation)
VOCAB_SIZE = 20
hvectorizer = HashingVectorizer(n_features=VOCAB_SIZE,norm=None,alternate_sign=False)
hash_matrix = hvectorizer.fit_transform(sent).toarray()
print(hash_matrix)

# L2-normalised features; this overwrites hash_matrix, so everything below
# works on the normalised version.
VOCAB_SIZE = 20
hvectorizer = HashingVectorizer(n_features=VOCAB_SIZE,norm='l2',alternate_sign=False)
hash_matrix = hvectorizer.fit_transform(sent).toarray()
print(np.round(hash_matrix, 3))

# 1. Jaccard similarity
# ---------------------
# Computed on whitespace tokens of the raw sentences, not on the hashed features.
sent_1 = set(sent[0].split())
sent_2 = set(sent[1].split())
print(sent_1)
print(sent_2)

# Build the union and the intersection of the two token sets.
hap_set = sent_1 | sent_2
gyo_set = sent_1 & sent_2
print(hap_set, '\n')
print(gyo_set, '\n')

# Jaccard = |intersection| / |union|
jaccard = len(gyo_set) / len(hap_set)
print(jaccard)

# 2. Cosine similarity
# --------------------
from sklearn.metrics.pairwise import cosine_similarity
# Slices [0:1] and [1:2] keep each row 2-D, as the pairwise functions require.
d = cosine_similarity(hash_matrix[0:1], hash_matrix[1:2])
print(d)

# 3. Euclidean distance
# ---------------------
from sklearn.metrics.pairwise import euclidean_distances

# Distance between the L2-normalised hash rows. It is printed explicitly because
# a bare expression in the middle of a cell is evaluated and then discarded.
d = euclidean_distances(hash_matrix[0:1], hash_matrix[1:2])
print(d)

# 정규화
def l1_normalize(v):
    """Scale *v* so that every vector in it has unit L1 norm.

    The norm is taken along the last axis, so a 2-D matrix is normalised
    row by row (each document independently) and a 1-D vector is normalised
    as a whole. Dividing by the sum of the entire matrix, as a plain
    ``v / np.sum(v)`` does, would make all rows together sum to 1 and let
    one row's magnitude affect another row's scale.

    Args:
        v: array-like of numbers with at least one dimension.

    Returns:
        A float ndarray of the same shape as *v*. All-zero vectors are
        returned unchanged rather than turned into NaNs.
    """
    v = np.asarray(v, dtype=float)
    # The L1 norm is the sum of absolute values, so negative entries count too.
    norm = np.sum(np.abs(v), axis=-1, keepdims=True)
    # Avoid 0/0 for all-zero vectors: dividing zeros by 1 leaves them as zeros.
    norm = np.where(norm == 0, 1.0, norm)
    return v / norm

# Euclidean distance between the two sentence rows after passing the hashed
# feature matrix through l1_normalize.
# NOTE(review): despite its name, tfidf_norm_l1 holds HashingVectorizer
# features here, not TF-IDF values.
tfidf_norm_l1 = l1_normalize(hash_matrix)
d = euclidean_distances(tfidf_norm_l1[0:1], tfidf_norm_l1[1:2])
print(d)

# 4. Manhattan distance
# ---------------------
from sklearn.metrics.pairwise import manhattan_distances

# Computed on the same normalised rows as the Euclidean distance above.
d = manhattan_distances(tfidf_norm_l1[0:1], tfidf_norm_l1[1:2])
print(d)


[[0. 2. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 2. 2. 0. 1. 0. 0. 0. 0.]
 [0. 2. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 2. 1. 0. 0.]]
[[0.    0.485 0.    0.    0.    0.243 0.243 0.243 0.243 0.    0.    0.
  0.485 0.485 0.    0.243 0.    0.    0.    0.   ]
 [0.    0.555 0.277 0.    0.    0.277 0.277 0.    0.    0.    0.277 0.
  0.    0.    0.    0.    0.555 0.277 0.    0.   ]]
{'중심', '휴일', '오늘', '이', '은', '비', '도', '반가운', '인', '으로', '소식', '폭염', '서쪽', '을', '이어졌는데요,', '있습니다.', '내일'}
{'휴일', '피해서', '갑작스런', '인해', '비', '망연자실', '하고', '로', '폭염', '에', '을', '놀러왔다가', '있습니다.'}
{'오늘', '은', '갑작스런', '인해', '비', '망연자실', '하고', '로', '서쪽', '을', '있습니다.', '놀러왔다가', '중심', '휴일', '이', '피해서', '도', '반가운', '인', '으로', '소식', '폭염', '이어졌는데요,', '에', '내일'} 

{'휴일', '비', '폭염', '을', '있습니다.'} 

0.2
[[0.40360368]]
[[0.21149137]]
[[0.62427015]]


# EDA

In [None]:
# 데이터 이해하기 : 탐색적 데이터 분석 (예시)
# -----------------------------------------
import os

import pandas as pd
import tensorflow as tf

# Download the movie-review (IMDB) dataset
data_set = tf.keras.utils.get_file(
      fname="imdb.tar.gz",
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
      extract=True)

# Check where the dataset was stored.
#
# Sub-folders of the path below
# aclImdb/test : test data
# aclImdb/train : training data
# aclImdb/test/neg : negative review files (txt format)
# aclImdb/test/pos : positive review files (txt format)
data_path = os.path.dirname(data_set)
print(data_path)

# List a few file names of the negative reviews in the test data.
# The folder is spelled "aclImdb", the same spelling used when the train/test
# DataFrames are loaded further down. An all-lowercase "aclimdb" only resolves
# on case-insensitive filesystems and fails elsewhere.
aclimdb = os.path.join(data_path, "aclImdb", "test", "neg")
print(aclimdb)

file_name = os.listdir(aclimdb)
print(file_name[:20])

# Print the contents of two negative review documents
for file_path in file_name[:2]:
    with open(os.path.join(aclimdb, file_path), "r", encoding='utf-8') as file:
        print(file.read())
        print()

# 학습 데이터를 읽어와서 데이터 프레임에 저장한다
def directory_data(directory):
    """Read every file in *directory* into a one-column DataFrame.

    Each entry returned by ``os.listdir`` is opened as UTF-8 text and its
    whole contents become one row of the ``review`` column, in listing order.

    Args:
        directory: path of the folder whose files are read.

    Returns:
        A pandas DataFrame with a single ``review`` column.
    """
    texts = []
    for name in os.listdir(directory):
        full_path = os.path.join(directory, name)
        with open(full_path, "r", encoding='utf-8') as handle:
            texts.append(handle.read())

    return pd.DataFrame.from_dict({"review": texts})

def data(directory):
    """Load the ``pos`` and ``neg`` sub-folders of *directory* with labels.

    Reviews read from ``pos`` get ``sentiment`` 1 and reviews read from
    ``neg`` get ``sentiment`` 0. The two frames are concatenated with the
    positive reviews first, each keeping its own row index.

    Args:
        directory: folder that contains the ``pos`` and ``neg`` sub-folders.

    Returns:
        A pandas DataFrame with ``review`` and ``sentiment`` columns.
    """
    labelled = []
    for folder, label in (("pos", 1), ("neg", 0)):
        frame = directory_data(os.path.join(directory, folder))
        frame["sentiment"] = label
        labelled.append(frame)

    return pd.concat(labelled)

# Load the train and test splits into DataFrames.
train_df = data(os.path.join(os.path.dirname(data_set), "aclImdb", "train"))
test_df = data(os.path.join(os.path.dirname(data_set), "aclImdb", "test"))
train_df.head()
reviews = list(train_df['review'])
print(reviews[0])

# Data analysis
# Tokenise each review string on whitespace.
tokenized_reviews = [review.split() for review in reviews]
print(tokenized_reviews[0])

# Number of tokens in each review
review_len_by_token = list(map(len, tokenized_reviews))
print(review_len_by_token[:10])

# Character length of each review once the spaces are removed
remove_space = [review.replace(' ', '') for review in reviews]
review_len_by_eumjeol = [len(text) for text in remove_space]
print(remove_space[0])
print(review_len_by_eumjeol[:20])

# Check the size distribution of the reviews (word-count and character-count distributions)
import matplotlib.pyplot as plt

# Figure size for the plot
# figsize: (width, height) tuple
plt.figure(figsize=(8, 4))

# Histograms
# bins: number of buckets for the histogram values
# range: range of the x-axis values
# alpha: transparency of the bars
# color: colour of the bars
# label: legend label for the series
plt.hist(review_len_by_token, bins=50, alpha=0.5, color= 'r', label='word')
plt.hist(review_len_by_eumjeol, bins=50, alpha=0.5, color='b', label='alphabet')
plt.legend()
# `nonposy` was renamed to `nonpositive` in Matplotlib 3.3 and the old keyword
# was later removed; 'clip' keeps the handling of non-positive values unchanged.
plt.yscale('log', nonpositive='clip')

# Plot title
plt.title('Review Length Histogram')

# x-axis label
plt.xlabel('Review Length')

# y-axis label
plt.ylabel('Number of Reviews')
plt.show()                          # 1

# Check descriptive statistics of the per-review word counts
import numpy as np

# max, min, mean, standard deviation and median of the token counts
print('문장 최대길이: {}'.format(np.max(review_len_by_token)))
print('문장 최소길이: {}'.format(np.min(review_len_by_token)))
print('문장 평균길이: {:.2f}'.format(np.mean(review_len_by_token)))
print('문장 길이 표준편차: {:.2f}'.format(np.std(review_len_by_token)))
print('문장 중간길이: {}'.format(np.median(review_len_by_token)))

# np.percentile takes the percentile on a 0-100 scale (25 = first quartile, 75 = third quartile)
print('제 1 사분위 길이: {}'.format(np.percentile(review_len_by_token, 25)))
print('제 3 사분위 길이: {}'.format(np.percentile(review_len_by_token, 75)))

# Check the word-count distribution with a box plot
plt.figure(figsize=(8, 4))

# Create the box plot
# first argument: list of data sets, one per distribution
# labels: labels for the supplied data sets
# showmeans: mark the mean value

plt.boxplot([review_len_by_token], labels=['token'], showmeans=True)
plt.show()                               # 2 

# Check the character-count distribution with a box plot
plt.figure(figsize=(8, 4))
plt.boxplot([review_len_by_eumjeol], labels=['Eumjeol'], showmeans=True)
plt.show()                               # 3

# Word cloud
# conda install -c conda-forge wordcloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Build one cloud from all training reviews joined into a single string,
# passing the library's STOPWORDS set as the stop-word list.
wordcloud = WordCloud(stopwords = STOPWORDS, background_color = 'black', width = 800, height = 600)\
                .generate(' '.join(train_df['review']))

# Show the cloud as an image with the axes hidden.
plt.figure(figsize = (15, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()                              # 4

# Distribution of positive vs. negative labels
import seaborn as sns
import matplotlib.pyplot as plt

# Number of reviews per sentiment label
sentiment = train_df['sentiment'].value_counts()
fig, axe = plt.subplots(ncols=1)
fig.set_size_inches(6, 3)
# Pass the column as the `x` keyword: newer seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as
# the variable to count.
sns.countplot(x=train_df['sentiment'])
plt.show()                              # 5

print(sentiment)

- output

print(data_path)
```
C:\Users\student\.keras\datasets
```
print(aclimdb)
```
C:\Users\student\.keras\datasets\aclimdb/test/neg
```
print(file_name[:20])

```
['0_2.txt', '10000_4.txt', '10001_1.txt', '10002_3.txt', '10003_3.txt', '10004_2.txt', '10005_2.txt', '10006_2.txt', '10007_4.txt', '10008_4.txt', '10009_3.txt', '1000_3.txt', '10010_2.txt', '10011_1.txt', '10012_1.txt', '10013_4.txt', '10014_2.txt', '10015_4.txt', '10016_3.txt', '10017_1.txt']
```
for file_path in file_name[:2]:
    with open(os.path.join(aclimdb, file_path), "r", encoding='utf-8') as file:
        print(file.read())
        print()
```
Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in.

This is an example of why the majority of action films are the same. Generic and boring, there's really nothing worth watching here. A complete waste of the then barely-tapped talents of Ice-T and Ice Cube, who've each proven many times over that they are capable of acting, and acting well. Don't bother with this one, go see New Jack City, Ricochet or watch New York Undercover for Ice-T, or Boyz n the Hood, Higher Learning or Friday for Ice Cube and see the real deal. Ice-T's horribly cliched dialogue alone makes this film grate at the teeth, and I'm still wondering what the heck Bill Paxton was doing in this film? And why the heck does he always play the exact same character? From Aliens onward, every film I've seen with Bill Paxton has him playing the exact same irritating character, and at least in Aliens his character died, which made it somewhat gratifying...<br /><br />Overall, this is second-rate action trash. There are countless better films to see, and if you really want to see this one, watch Judgement Night, which is practically a carbon copy but has better acting and a better script. The only thing that made this at all worth watching was a decent hand on the camera - the cinematography was almost refreshing, which comes close to making up for the horrible film itself - but not quite. 4/10.
```
print(reviews[0])
```
Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!
```
print(tokenized_reviews[0])
```
['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"Teachers".', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', "High's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"Teachers".', 'The', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High.', 'A', 'classic', 'line:', 'INSPECTOR:', "I'm", 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers.', 'STUDENT:', 'Welcome', 'to', 'Bromwell', 'High.', 'I', 'expect', 'that', 'many', 'adults', 'of', 'my', 'age', 'think', 'that', 'Bromwell', 'High', 'is', 'far', 'fetched.', 'What', 'a', 'pity', 'that', 'it', "isn't!"]
```
print(review_len_by_token[:10])
```
[140, 428, 147, 124, 120, 171, 108, 340, 436, 324]
```
print(remove_space[0])
```
BromwellHighisacartooncomedy.Itranatthesametimeassomeotherprogramsaboutschoollife,suchas"Teachers".My35yearsintheteachingprofessionleadmetobelievethatBromwellHigh'ssatireismuchclosertorealitythanis"Teachers".Thescrambletosurvivefinancially,theinsightfulstudentswhocanseerightthroughtheirpatheticteachers'pomp,thepettinessofthewholesituation,allremindmeoftheschoolsIknewandtheirstudents.WhenIsawtheepisodeinwhichastudentrepeatedlytriedtoburndowntheschool,Iimmediatelyrecalled.........at..........High.Aclassicline:INSPECTOR:I'mheretosackoneofyourteachers.STUDENT:WelcometoBromwellHigh.IexpectthatmanyadultsofmyagethinkthatBromwellHighisfarfetched.Whatapitythatitisn't!
```
print(review_len_by_eumjeol[:20])
```
[667, 1939, 695, 540, 528, 847, 521, 1845, 2170, 1537, 1299, 388, 1348, 1027, 626, 718, 908, 1554, 631, 676]
```

- review 문서의 단어 분포의 기술 통계량을 확인한다
import numpy as np

print('문장 최대길이: {}'.format(np.max(review_len_by_token)))
print('문장 최소길이: {}'.format(np.min(review_len_by_token)))
print('문장 평균길이: {:.2f}'.format(np.mean(review_len_by_token)))
print('문장 길이 표준편차: {:.2f}'.format(np.std(review_len_by_token)))
print('문장 중간길이: {}'.format(np.median(review_len_by_token)))

- 사분위의 대한 경우는 0~100 스케일로 되어있음
print('제 1 사분위 길이: {}'.format(np.percentile(review_len_by_token, 25)))
print('제 3 사분위 길이: {}'.format(np.percentile(review_len_by_token, 75)))
```
문장 최대길이: 2470
문장 최소길이: 10
문장 평균길이: 233.79
문장 길이 표준편차: 173.73
문장 중간길이: 174.0
제 1 사분위 길이: 127.0
제 3 사분위 길이: 284.0
```
print(sentiment)
```
1    12500
0    12500
Name: sentiment, dtype: int64
```

## plt.show() outputs

1. ![#1](https://drive.google.com/uc?view=export&id=1GvsxRY7LjOiVdt8qmXZ3sTsA3z2G06f7)
2. ![#2](https://drive.google.com/uc?view=export&id=1NyENSE_ERdnI7G1H-JqjiQgWvD7MsBwA)
3. ![#3](https://drive.google.com/uc?view=export&id=1glf_BbLGRTUgQnM9gXOGkkyL2Oy9NVmE)
4. ![#4](https://drive.google.com/uc?view=export&id=1glf_BbLGRTUgQnM9gXOGkkyL2Oy9NVmE)
5. ![#5](https://drive.google.com/uc?view=export&id=1p4HyqMhxYic8mj9pzkMYiXn9uFMJj0e_)