# PDF 언어 모형
### 언어 모형과 n-gram
요즘은 잘 사용하지 않지만 개념은 알아야 한다

# 신경망 언어 모형
### 텐서플로의 Embedding
* tf.keras.layers.Embedding
* 내부적으로 처리해줘서 원-핫 인코딩 필요 없음
* 절편에 해당하는 부분(y = ax+b에서 b 부분)이 없음  
  
# 실습
GPU에 문제가 있으면 코랩(Colab)에서 실행할 것
# 1. Tokenizer

## 1.1 데이터 받기

In [1]:
import requests

# Download the zipped IMDB review dataset. A timeout keeps the cell from
# hanging forever on a stalled connection.
res = requests.get(
    'https://github.com/euphoris/datasets/raw/master/imdb.zip',
    timeout=30,
)
# Fail loudly on an HTTP error instead of saving an error page as imdb.zip.
res.raise_for_status()

# Save the raw response bytes to disk.
with open('imdb.zip', 'wb') as f:
    f.write(res.content)

In [2]:
import pandas as pd
# Load the downloaded file 'imdb.zip' into a DataFrame.
df = pd.read_csv('imdb.zip')

In [3]:
df

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
995,I just got bored watching Jessice Lange take h...,0
996,"Unfortunately, any virtue in this film's produ...",0
997,"In a word, it is embarrassing.",0
998,Exceptionally bad!,0


## 1.2 토큰화

In [4]:
import tensorflow as tf

# Word tokenizer configured with num_words=2000; '<unk>' is the token used
# for out-of-vocabulary words.
tk = tf.keras.preprocessing.text.Tokenizer(num_words=2000, oov_token='<unk>')

In [5]:
# Assign an integer index to every word (more frequent words get smaller indices).
tk.fit_on_texts(df['review'])

In [6]:
tk.word_index

{'<unk>': 1,
 'the': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'is': 6,
 'this': 7,
 'i': 8,
 'it': 9,
 'to': 10,
 'in': 11,
 'was': 12,
 'movie': 13,
 'film': 14,
 'that': 15,
 'for': 16,
 'as': 17,
 'but': 18,
 'with': 19,
 'one': 20,
 'on': 21,
 'you': 22,
 'are': 23,
 'not': 24,
 'bad': 25,
 "it's": 26,
 'very': 27,
 'all': 28,
 'just': 29,
 'so': 30,
 'good': 31,
 'at': 32,
 'an': 33,
 'be': 34,
 'there': 35,
 'about': 36,
 'have': 37,
 'by': 38,
 'like': 39,
 'from': 40,
 'if': 41,
 'acting': 42,
 'time': 43,
 'out': 44,
 'his': 45,
 'or': 46,
 'really': 47,
 'great': 48,
 'even': 49,
 'he': 50,
 'who': 51,
 'were': 52,
 'has': 53,
 'see': 54,
 'my': 55,
 'characters': 56,
 'well': 57,
 'most': 58,
 'how': 59,
 'more': 60,
 'no': 61,
 'only': 62,
 'when': 63,
 'ever': 64,
 '10': 65,
 'movies': 66,
 'plot': 67,
 'story': 68,
 'made': 69,
 'some': 70,
 'they': 71,
 'best': 72,
 'because': 73,
 'your': 74,
 'can': 75,
 'also': 76,
 "don't": 77,
 'films': 78,
 'than': 79,
 'its': 80,
 'scrip

In [7]:
# Look up the integer index assigned to the word 'good'.
tk.word_index['good']

31

In [8]:
# Reverse lookup: passing index 31 returns the word 'good'.
tk.index_word[31]

'good'

In [9]:
# Persist the fitted tokenizer object to disk.
import joblib

joblib.dump(tk, 'tokenizer.pkl')

['tokenizer.pkl']

# 2. 전처리

In [11]:
# Reload the data as if this were a separate file (it is the same notebook here).
import pandas as pd

df = pd.read_csv('imdb.zip')

In [12]:
# Same idea: reload the tokenizer as if starting from a separate file.
import joblib

tk = joblib.load('tokenizer.pkl')    # tokenizer.pkl holds the word dictionary built from the IMDB data

In [13]:
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [14]:
seqs = tk.texts_to_sequences(df['review'])    # convert the words of each review into integer indices

In [15]:
seqs[0]    # show the first review as indices: A -> 4, very -> 27, ...

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [16]:
tk.index_word[4], tk.index_word[27], tk.index_word[287], tk.index_word[407]

('a', 'very', 'slow', 'moving')

In [17]:
# Group words so that the preceding few words can be used to predict the current word.
# The groups look like (4, 27, 27, 27, 287), (27, 27, 27, 287, 407), (27, 27, 287, 407, 1217), ...
seq = seqs[0]

list(range(0, len(seq)-4))    # stop 4 positions before the end so a 5-token window never runs past the list

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [19]:
# Apply the windowing shown above to every review:
# each sample is (4 context token indices, the index of the token that follows).
data = []
for seq in seqs:
    for i in range(len(seq) - 4):
        pair = (seq[i:i + 4], seq[i + 4])
        data.append(pair)
        print(*pair)

[4, 27, 27, 27] 287
[27, 27, 27, 287] 407
[27, 27, 287, 407] 1217
[27, 287, 407, 1217] 13
[287, 407, 1217, 13] 36
[407, 1217, 13, 36] 4
[1217, 13, 36, 4] 1218
[13, 36, 4, 1218] 1219
[36, 4, 1218, 1219] 408
[4, 1218, 1219, 408] 142
[24, 522, 51, 12] 60
[522, 51, 12, 60] 409
[51, 12, 60, 409] 2
[12, 60, 409, 2] 736
[60, 409, 2, 736] 56
[409, 2, 736, 56] 46
[2, 736, 56, 46] 2
[736, 56, 46, 2] 337
[56, 46, 2, 337] 1220
[46, 2, 337, 1220] 288
[2, 337, 1220, 288] 5
[337, 1220, 288, 5] 737
[1220, 288, 5, 737] 738
[288, 5, 737, 738] 44
[1221, 1222, 19, 209] 233
[1222, 19, 209, 233] 3
[19, 209, 233, 3] 338
[209, 233, 3, 338] 184
[233, 3, 338, 184] 739
[3, 338, 184, 739] 2
[338, 184, 739, 2] 13
[184, 739, 2, 13] 289
[739, 2, 13, 289] 740
[2, 13, 289, 740] 49
[13, 289, 740, 49] 60
[289, 740, 49, 60] 339
[740, 49, 60, 339] 17
[49, 60, 339, 17] 2
[60, 339, 17, 2] 42
[339, 17, 2, 42] 12
[17, 2, 42, 12] 290
[2, 42, 12, 290] 3
[42, 12, 290, 3] 2
[12, 290, 3, 2] 67
[290, 3, 2, 67] 3
[3, 2, 67, 3] 261
[

[176, 64, 85, 4] 13
[64, 85, 4, 13] 288
[85, 4, 13, 288] 17
[4, 13, 288, 17] 202
[13, 288, 17, 202] 17
[288, 17, 202, 17] 7
[17, 202, 17, 7] 275
[202, 17, 7, 275] 604
[17, 7, 275, 604] 198
[7, 275, 604, 198] 5
[275, 604, 198, 5] 1506
[9, 252, 84, 37] 163
[252, 84, 37, 163] 111
[84, 37, 163, 111] 41
[37, 163, 111, 41] 2
[163, 111, 41, 2] 204
[111, 41, 2, 204] 1507
[41, 2, 204, 1507] 605
[2, 204, 1507, 605] 58
[204, 1507, 605, 58] 5
[1507, 605, 58, 5] 2
[605, 58, 5, 2] 13
[58, 5, 2, 13] 1508
[5, 2, 13, 1508] 45
[2, 13, 1508, 45] 325
[13, 1508, 45, 325] 143
[1508, 45, 325, 143] 105
[45, 325, 143, 105] 94
[325, 143, 105, 94] 47
[143, 105, 94, 47] 606
[105, 94, 47, 606] 15
[94, 47, 606, 15] 890
[269, 153, 8, 89] 47
[153, 8, 89, 47] 39
[8, 89, 47, 39] 6
[89, 47, 39, 6] 63
[47, 39, 6, 63] 4
[39, 6, 63, 4] 83
[6, 63, 4, 83] 253
[63, 4, 83, 253] 1509
[4, 83, 253, 1509] 11
[83, 253, 1509, 11] 2
[253, 1509, 11, 2] 344
[1509, 11, 2, 344] 4
[11, 2, 344, 4] 1510
[2, 344, 4, 1510] 5
[344, 4, 1510, 5]

[568, 6, 547, 1793] 186
[6, 547, 1793, 186] 2
[547, 1793, 186, 2] 484
[1793, 186, 2, 484] 5
[186, 2, 484, 5] 2
[2, 484, 5, 2] 994
[484, 5, 2, 994] 103
[5, 2, 994, 103] 995
[2, 994, 103, 995] 5
[994, 103, 995, 5] 2
[103, 995, 5, 2] 303
[995, 5, 2, 303] 75
[5, 2, 303, 75] 996
[2, 303, 75, 996] 19
[303, 75, 996, 19] 20
[75, 996, 19, 20] 5
[996, 19, 20, 5] 2
[19, 20, 5, 2] 56
[20, 5, 2, 56] 438
[5, 2, 56, 438] 9
[2, 56, 438, 9] 34
[56, 438, 9, 34] 1794
[438, 9, 34, 1794] 1795
[9, 34, 1794, 1795] 1796
[34, 1794, 1795, 1796] 1797
[1794, 1795, 1796, 1797] 1798
[1795, 1796, 1797, 1798] 1799
[1796, 1797, 1798, 1799] 1800
[1797, 1798, 1799, 1800] 1801
[1798, 1799, 1800, 1801] 276
[1799, 1800, 1801, 276] 997
[1800, 1801, 276, 997] 1802
[1801, 276, 997, 1802] 276
[276, 997, 1802, 276] 1803
[997, 1802, 276, 1803] 46
[1802, 276, 1803, 46] 998
[276, 1803, 46, 998] 1804
[1803, 46, 998, 1804] 1805
[46, 998, 1804, 1805] 1806
[998, 1804, 1805, 1806] 669
[1804, 1805, 1806, 669] 75
[1805, 1806, 669, 75] 90

[60, 1089, 341, 5] 14
[1089, 341, 5, 14] 44
[341, 5, 14, 44] 35
[2, 513, 1, 3] 925
[513, 1, 3, 925] 1
[1, 3, 925, 1] 6
[3, 925, 1, 6] 4
[925, 1, 6, 4] 753
[276, 1, 1, 1] 1
[1, 1, 1, 1] 1
[1, 1, 1, 1] 1
[1, 1, 1, 1] 1
[1, 1, 1, 1] 763
[1, 1, 1, 763] 1
[1, 1, 763, 1] 3
[1, 763, 1, 3] 1062
[763, 1, 3, 1062] 38
[1, 3, 1062, 38] 20
[3, 1062, 38, 20] 5
[1062, 38, 20, 5] 2
[38, 20, 5, 2] 531
[20, 5, 2, 531] 1
[5, 2, 531, 1] 10
[2, 531, 1, 10] 64
[531, 1, 10, 64] 555
[1, 10, 64, 555] 2
[10, 64, 555, 2] 132
[64, 555, 2, 132] 514
[555, 2, 132, 514] 1
[2, 132, 514, 1] 1
[8, 77, 98, 35] 23
[77, 98, 35, 23] 97
[98, 35, 23, 97] 876
[35, 23, 97, 876] 1
[23, 97, 876, 1] 11
[97, 876, 1, 11] 7
[876, 1, 11, 7] 584
[7, 510, 34, 2] 62
[510, 34, 2, 62] 25
[34, 2, 62, 25] 14
[2, 62, 25, 14] 50
[62, 25, 14, 50] 64
[25, 14, 50, 64] 69
[30, 25, 57, 26] 29
[25, 57, 26, 29] 25
[7, 13, 6, 4] 390
[13, 6, 4, 390] 673
[6, 4, 390, 673] 5
[4, 390, 673, 5] 4
[390, 673, 5, 4] 25
[673, 5, 4, 25] 67
[5, 4, 25, 67] 3
[4, 25

[3, 4, 1, 16] 28
[4, 1, 16, 28] 1138
[1, 16, 28, 1138] 389
[1139, 1, 1, 1] 91
[1, 1, 1, 91] 10
[1, 1, 91, 10] 1
[1, 91, 10, 1] 19
[91, 10, 1, 19] 45
[10, 1, 19, 45] 707
[1, 19, 45, 707] 1
[19, 45, 707, 1] 44
[45, 707, 1, 44] 1140
[707, 1, 44, 1140] 1
[1, 44, 1140, 1] 44
[44, 1140, 1, 44] 50
[1140, 1, 44, 50] 363
[1, 44, 50, 363] 2
[44, 50, 363, 2] 1
[50, 363, 2, 1] 5
[363, 2, 1, 5] 1141
[2, 1, 5, 1141] 1142
[1, 5, 1141, 1142] 3
[5, 1141, 1142, 3] 1105
[1141, 1142, 3, 1105] 1143
[1142, 3, 1105, 1143] 4
[3, 1105, 1143, 4] 329
[1105, 1143, 4, 329] 344
[1143, 4, 329, 344] 94
[4, 329, 344, 94] 257
[329, 344, 94, 257] 9
[344, 94, 257, 9] 2
[94, 257, 9, 2] 60
[257, 9, 2, 60] 162
[1, 1, 6, 4] 27
[1, 6, 4, 27] 1
[6, 4, 27, 1] 1
[4, 27, 1, 1] 1139
[27, 1, 1, 1139] 51
[1, 1, 1139, 51] 75
[1, 1139, 51, 75] 34
[1139, 51, 75, 34] 27
[51, 75, 34, 27] 1
[75, 34, 27, 1] 32
[34, 27, 1, 32] 374
[27, 1, 32, 374] 3
[1, 32, 374, 3] 449
[32, 374, 3, 449] 183
[374, 3, 449, 183] 29
[3, 449, 183, 29] 705
[449, 

[34, 11, 2, 690] 1
[211, 5, 327, 11] 2
[5, 327, 11, 2] 67
[327, 11, 2, 67] 251
[11, 2, 67, 251] 135
[2, 67, 251, 135] 36
[67, 251, 135, 36] 59
[251, 135, 36, 59] 50
[135, 36, 59, 50] 740
[36, 59, 50, 740] 2
[59, 50, 740, 2] 1
[50, 740, 2, 1] 135
[740, 2, 1, 135] 36
[2, 1, 135, 36] 238
[1, 135, 36, 238] 50
[135, 36, 238, 50] 1
[36, 238, 50, 1] 681
[238, 50, 1, 681] 134
[50, 1, 681, 134] 205
[1, 681, 134, 205] 45
[681, 134, 205, 45] 1195
[134, 205, 45, 1195] 3
[205, 45, 1195, 3] 582
[45, 1195, 3, 582] 469
[181, 5, 1, 6] 20
[5, 1, 6, 20] 5
[1, 6, 20, 5] 2
[6, 20, 5, 2] 175
[20, 5, 2, 175] 199
[5, 2, 175, 199] 264
[2, 175, 199, 264] 277
[175, 199, 264, 277] 66
[199, 264, 277, 66] 176
[264, 277, 66, 176] 64
[277, 66, 176, 64] 85
[50, 1, 160, 139] 59
[1, 160, 139, 59] 10
[160, 139, 59, 10] 88
[139, 59, 10, 88] 4
[59, 10, 88, 4] 1
[10, 88, 4, 1] 277
[88, 4, 1, 277] 13
[4, 1, 277, 13] 40
[1, 277, 13, 40] 2
[277, 13, 40, 2] 600
[13, 40, 2, 600] 5
[40, 2, 600, 5] 1055
[2, 600, 5, 1055] 1
[600, 5

In [20]:
data[0]

([4, 27, 27, 27], 287)

In [21]:
import random

random.shuffle(data)    # shuffle the samples into random order (in place)
data

[([147, 7, 16, 280], 51),
 ([777, 46, 429, 73], 5),
 ([1, 11, 2, 14], 71),
 ([1, 1, 1, 270], 1),
 ([168, 66, 44, 35], 39),
 ([12, 2, 248, 1153], 1),
 ([5, 1, 4, 233], 142),
 ([46, 847, 70, 5], 2),
 ([1216, 302, 93, 1], 207),
 ([579, 27, 57, 448], 2),
 ([1766, 15, 333, 2], 229),
 ([1988, 1989, 1990, 1991], 4),
 ([5, 1, 3, 2], 1),
 ([645, 15, 440, 23], 4),
 ([379, 3, 33, 122], 14),
 ([2, 68, 2, 56], 2),
 ([979, 1, 5, 1], 1125),
 ([33, 424, 15, 35], 180),
 ([34, 1879, 19, 7], 1880),
 ([397, 256, 84, 47], 1),
 ([1652, 96, 33, 222], 14),
 ([2, 488, 78, 52], 2),
 ([22, 674, 2, 1], 1),
 ([16, 93, 10, 1771], 7),
 ([36, 2, 13, 802], 428),
 ([17, 8, 445, 10], 460),
 ([8, 271, 146, 8], 37),
 ([1290, 64, 10, 555], 2),
 ([1, 1097, 1098, 11], 693),
 ([205, 45, 1195, 3], 582),
 ([573, 16, 4, 13], 19),
 ([7, 1465, 3, 8], 875),
 ([13, 346, 1207, 433], 18),
 ([89, 88, 96, 16], 2),
 ([24, 10, 781, 524], 112),
 ([23, 24, 1314, 38], 1315),
 ([6, 44, 35, 1742], 6),
 ([241, 5, 236, 11], 1054),
 ([1198, 9, 18

In [23]:
# Separate the samples into an input array (contexts) and a label array (targets).
import numpy as np

xs = np.array([pair[0] for pair in data])
ys = np.array([pair[1] for pair in data])

In [24]:
xs

array([[147,   7,  16, 280],
       [777,  46, 429,  73],
       [  1,  11,   2,  14],
       ...,
       [398,   1,  11,   2],
       [  5,   1,  25,   9],
       [ 64,  85,  11,  55]])

In [25]:
ys

array([ 51,   5,  71, ...,   1,   6, 304])

In [26]:
# Save the preprocessed arrays to disk.
joblib.dump((xs, ys), 'Im-data.pkl')

['Im-data.pkl']

# 3. 학습

In [27]:
# Reload the saved tokenizer and data as if working in a separate file.
import joblib

tk = joblib.load('tokenizer.pkl')
xs, ys = joblib.load('Im-data.pkl')

In [28]:
import tensorflow as tf

In [31]:
tk.num_words

2000

In [30]:
NUM_WORD = tk.num_words + 1    # index_word starts at 1, so add one extra slot to account for index 0

In [32]:
tk.index_word

{1: '<unk>',
 2: 'the',
 3: 'and',
 4: 'a',
 5: 'of',
 6: 'is',
 7: 'this',
 8: 'i',
 9: 'it',
 10: 'to',
 11: 'in',
 12: 'was',
 13: 'movie',
 14: 'film',
 15: 'that',
 16: 'for',
 17: 'as',
 18: 'but',
 19: 'with',
 20: 'one',
 21: 'on',
 22: 'you',
 23: 'are',
 24: 'not',
 25: 'bad',
 26: "it's",
 27: 'very',
 28: 'all',
 29: 'just',
 30: 'so',
 31: 'good',
 32: 'at',
 33: 'an',
 34: 'be',
 35: 'there',
 36: 'about',
 37: 'have',
 38: 'by',
 39: 'like',
 40: 'from',
 41: 'if',
 42: 'acting',
 43: 'time',
 44: 'out',
 45: 'his',
 46: 'or',
 47: 'really',
 48: 'great',
 49: 'even',
 50: 'he',
 51: 'who',
 52: 'were',
 53: 'has',
 54: 'see',
 55: 'my',
 56: 'characters',
 57: 'well',
 58: 'most',
 59: 'how',
 60: 'more',
 61: 'no',
 62: 'only',
 63: 'when',
 64: 'ever',
 65: '10',
 66: 'movies',
 67: 'plot',
 68: 'story',
 69: 'made',
 70: 'some',
 71: 'they',
 72: 'best',
 73: 'because',
 74: 'your',
 75: 'can',
 76: 'also',
 77: "don't",
 78: 'films',
 79: 'than',
 80: 'its',
 81: 's

In [33]:
xs[0]

array([147,   7,  16, 280])

In [34]:
# Create the embedding layer.
emb1 = tf.keras.layers.Embedding(input_dim=NUM_WORD, output_dim=8)
# input_dim : number of words (indices) that can be fed in
# output_dim : size of the word embedding to create.
#              A larger value improves performance but can cause overfitting,
#              so a suitable value has to be found.

In [37]:
# Next-word model: embed the context words, average the embeddings,
# then produce one score per word index.
Im = tf.keras.Sequential([emb1,
                         tf.keras.layers.GlobalAveragePooling1D(),    # 4 input words produce 4 embeddings (4 * 8 = 32 values here); averaging reduces them to a single vector (8 values here)
                         tf.keras.layers.Dense(8, activation='relu'),
                         tf.keras.layers.Dense(NUM_WORD)])    # 2001 outputs / softmax is usually appended, but it can be omitted here

In [38]:
Im.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 8)           16008     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 8)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_3 (Dense)              (None, 2001)              18009     
Total params: 34,089
Trainable params: 34,089
Non-trainable params: 0
_________________________________________________________________


In [39]:
# The model above has no softmax layer, so its outputs are raw logits;
# from_logits=True tells the loss to treat them that way.
Im.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
          optimizer='adam',
          metrics=['accuracy'])

In [40]:
Im.fit(xs, ys, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x2b36a602ac0>

In [41]:
Im.save('Im.krs')

INFO:tensorflow:Assets written to: Im.krs\assets


## 3.1 단어 임베딩

In [42]:
emb1

<tensorflow.python.keras.layers.embeddings.Embedding at 0x2b368b57190>

In [44]:
emb1.embeddings

<tf.Variable 'embedding/embeddings:0' shape=(2001, 8) dtype=float32, numpy=
array([[-0.02532017, -0.036318  , -0.00613235, ...,  0.04797038,
         0.02455663,  0.00655316],
       [ 0.37152874, -0.3770591 , -0.37524793, ...,  0.3404293 ,
        -0.35393956, -0.3540851 ],
       [ 0.3364316 , -0.3565219 , -0.25342417, ...,  0.33312815,
        -0.30897972, -0.31793803],
       ...,
       [ 0.01673251, -0.00516518,  0.00940595, ...,  0.0208306 ,
        -0.04155857, -0.02227842],
       [ 0.05230597,  0.01546763, -0.06970923, ..., -0.00230557,
        -0.04919909,  0.03485477],
       [-0.04232819,  0.04266235, -0.01121739, ...,  0.03660214,
        -0.03739343,  0.0357894 ]], dtype=float32)>

In [45]:
e = emb1.embeddings.numpy()    # convert to a NumPy array to make the data easier to inspect
e

array([[-0.02532017, -0.036318  , -0.00613235, ...,  0.04797038,
         0.02455663,  0.00655316],
       [ 0.37152874, -0.3770591 , -0.37524793, ...,  0.3404293 ,
        -0.35393956, -0.3540851 ],
       [ 0.3364316 , -0.3565219 , -0.25342417, ...,  0.33312815,
        -0.30897972, -0.31793803],
       ...,
       [ 0.01673251, -0.00516518,  0.00940595, ...,  0.0208306 ,
        -0.04155857, -0.02227842],
       [ 0.05230597,  0.01546763, -0.06970923, ..., -0.00230557,
        -0.04919909,  0.03485477],
       [-0.04232819,  0.04266235, -0.01121739, ...,  0.03660214,
        -0.03739343,  0.0357894 ]], dtype=float32)

In [46]:
e.shape

(2001, 8)

In [47]:
emb1.get_weights()

[array([[-0.02532017, -0.036318  , -0.00613235, ...,  0.04797038,
          0.02455663,  0.00655316],
        [ 0.37152874, -0.3770591 , -0.37524793, ...,  0.3404293 ,
         -0.35393956, -0.3540851 ],
        [ 0.3364316 , -0.3565219 , -0.25342417, ...,  0.33312815,
         -0.30897972, -0.31793803],
        ...,
        [ 0.01673251, -0.00516518,  0.00940595, ...,  0.0208306 ,
         -0.04155857, -0.02227842],
        [ 0.05230597,  0.01546763, -0.06970923, ..., -0.00230557,
         -0.04919909,  0.03485477],
        [-0.04232819,  0.04266235, -0.01121739, ...,  0.03660214,
         -0.03739343,  0.0357894 ]], dtype=float32)]

In [48]:
w = emb1.get_weights()[0]    # same values; indexing [0] removes the enclosing list
w

array([[-0.02532017, -0.036318  , -0.00613235, ...,  0.04797038,
         0.02455663,  0.00655316],
       [ 0.37152874, -0.3770591 , -0.37524793, ...,  0.3404293 ,
        -0.35393956, -0.3540851 ],
       [ 0.3364316 , -0.3565219 , -0.25342417, ...,  0.33312815,
        -0.30897972, -0.31793803],
       ...,
       [ 0.01673251, -0.00516518,  0.00940595, ...,  0.0208306 ,
        -0.04155857, -0.02227842],
       [ 0.05230597,  0.01546763, -0.06970923, ..., -0.00230557,
        -0.04919909,  0.03485477],
       [-0.04232819,  0.04266235, -0.01121739, ...,  0.03660214,
        -0.03739343,  0.0357894 ]], dtype=float32)

In [49]:
np.array_equal(e, w)

True

In [50]:
np.savez('word-emb.npz', emb=e)    # save the embedding matrix under the key 'emb'

# 4. GlobalAveragePooling1D

In [52]:
import tensorflow as tf
import numpy as np

x = np.array([[[1, 2, 3], [3, 6, 9]]], dtype='float32')
x

array([[[1., 2., 3.],
        [3., 6., 9.]]], dtype=float32)

In [53]:
x.shape    # (1: number of samples fed to the network, 2: number of vectors (words), 3: length of one vector (word))

(1, 2, 3)

In [54]:
avg = tf.keras.layers.GlobalAveragePooling1D()
avg

<tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D at 0x2b36af6db80>

In [55]:
y = avg(x).numpy()
y

array([[2., 4., 6.]], dtype=float32)

In [56]:
y.shape

(1, 3)

In [57]:
x = np.array([[[1, 2, 3], [3, 6, 9], [3, 6, 9]]], dtype='float32')
x

array([[[1., 2., 3.],
        [3., 6., 9.],
        [3., 6., 9.]]], dtype=float32)

In [58]:
x.shape

(1, 3, 3)

In [59]:
avg = tf.keras.layers.GlobalAveragePooling1D()
avg

<tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D at 0x2b36a5167f0>

In [60]:
y = avg(x).numpy()
y

array([[2.3333333, 4.6666665, 7.       ]], dtype=float32)

In [61]:
y.shape

(1, 3)

# 5. 다음 토큰의 확률 예측

In [62]:
# Load the prepared tokenizer.
import joblib

tk = joblib.load('tokenizer.pkl')

In [63]:
# Load the text data that was already converted into a form the neural language model accepts.
xs, ys = joblib.load('Im-data.pkl')

In [64]:
xs

array([[147,   7,  16, 280],
       [777,  46, 429,  73],
       [  1,  11,   2,  14],
       ...,
       [398,   1,  11,   2],
       [  5,   1,  25,   9],
       [ 64,  85,  11,  55]])

In [65]:
ys

array([ 51,   5,  71, ...,   1,   6, 304])

In [66]:
import os
os.getcwd()    # check which folder is the current working directory

'C:\\Users\\Owner\\Desktop\\khh\\git\\ICT'

In [67]:
import tensorflow as tf
Im = tf.keras.models.load_model('Im.krs')

In [68]:
x = xs[0:1]
y = ys[0]

In [69]:
x

array([[147,   7,  16, 280]])

In [72]:
y

51

In [73]:
[tk.index_word[i] for i in x[0]]

['recommend', 'this', 'for', 'everyone']

In [74]:
tk.index_word[22]

'you'

In [75]:
x.shape

(1, 4)

In [76]:
import numpy as np

logit = Im.predict(x.astype('float32'))
logit.shape

(1, 2001)

In [77]:
logit    # values before softmax is applied

array([[-3.6577291,  3.5588021,  3.0467644, ..., -3.572786 , -3.5140846,
        -3.5220203]], dtype=float32)

In [78]:
# Pass the logits through softmax.
p = tf.nn.softmax(logit).numpy()
p

array([[4.5891360e-05, 6.2492892e-02, 3.7450314e-02, ..., 4.9959872e-05,
        5.2980380e-05, 5.2561591e-05]], dtype=float32)

In [79]:
p[0, 22]

0.0037926128

In [80]:
# Index of the most probable next token for the single input row.
# Take the argmax over row 0 explicitly: a bare p.argmax() searches the
# flattened 2-D array, which equals a token index only when there is one row.
i = p[0].argmax()
i

1

In [81]:
tk.index_word[1]

'<unk>'

In [82]:
p[0, i]

0.062492892

# 6. 전이 학습
## 6.1 실습 준비

In [83]:
import pandas as pd

df = pd.read_csv('imdb.zip')
df

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
995,I just got bored watching Jessice Lange take h...,0
996,"Unfortunately, any virtue in this film's produ...",0
997,"In a word, it is embarrassing.",0
998,Exceptionally bad!,0


In [85]:
import joblib
tk = joblib.load('tokenizer.pkl')

In [86]:
seqs = tk.texts_to_sequences(df['review'])    # convert the text into integer indices

In [87]:
seqs[0]

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [88]:
seqs[1]    # each text has a different length

[24, 522, 51, 12, 60, 409, 2, 736, 56, 46, 2, 337, 1220, 288, 5, 737, 738, 44]

In [89]:
# First approach: fill short sentences with 0 so that all lengths match (padding).
import tensorflow as tf

pads = tf.keras.preprocessing.sequence.pad_sequences(seqs)

In [92]:
pads[0], len(pads[0])

(array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    4,   27,   27,   27,  287,  407, 1217,
          13,   36,    4, 1218, 1219,  408,  142]),
 73)

## 6.2 단어 임베딩 불러오기

In [93]:
import numpy as np

z = np.load('word-emb.npz')
e = z['emb']

## 6.3 감성 분석

In [95]:
# Embedding layer for the sentiment model, initialised from the matrix `e`.
emb2 = tf.keras.layers.Embedding(input_dim = tk.num_words + 1,
                                 output_dim = 8,
                                 embeddings_initializer = tf.keras.initializers.Constant(e))    # start from the already-trained embeddings; it runs without this, but this makes training easier

In [96]:
# Sentiment classifier built on top of the emb2 embedding layer.
model = tf.keras.Sequential([
    emb2,
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')    # single sigmoid output because the task is positive vs. negative
])

In [97]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 8)           16008     
_________________________________________________________________
global_average_pooling1d_4 ( (None, 8)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 9         
Total params: 16,089
Trainable params: 16,089
Non-trainable params: 0
_________________________________________________________________


In [98]:
model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [99]:
y = df['sentiment'].values    # use the sentiment column as the labels

In [101]:
model.fit(pads, y)



<tensorflow.python.keras.callbacks.History at 0x2b36ae6a8b0>